2 * Copyright notice from original mutt:
3 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
4 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
6 * This file is part of mutt-ng, see http://www.muttng.org/.
7 * It's licensed under the GNU General Public License,
8 * please see the file GPL in the top level source directory.
30 /* If you are debugging this file, comment out the following line. */
39 #define ENCWORD_LEN_MAX 75
40 #define ENCWORD_LEN_MIN 9 /* str_len ("=?.?.?.?=") */
42 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
44 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
46 extern char RFC822Specials[];
48 typedef size_t (*encoder_t) (char *, ICONV_CONST char *, size_t,
51 static size_t convert_string (ICONV_CONST char *f, size_t flen,
52 const char *from, const char *to,
53 char **t, size_t * tlen)
60 cd = mutt_iconv_open (to, from, 0);
61 if (cd == (iconv_t) (-1))
64 ob = buf = mem_malloc (obl);
65 n = iconv (cd, &f, &flen, &ob, &obl);
66 if (n == (size_t) (-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t) (-1)) {
77 mem_realloc (&buf, ob - buf + 1);
84 char *mutt_choose_charset (const char *fromcode, const char *charsets,
85 char *u, size_t ulen, char **d, size_t * dlen)
87 char canonical_buff[LONG_STRING];
88 char *e = 0, *tocode = 0;
89 size_t elen = 0, bestn = 0;
92 for (p = charsets; p; p = q ? q + 1 : 0) {
98 n = q ? q - p : str_len (p);
101 /* Assume that we never need more than 12 characters of
102 encoded-text to encode a single character. */
103 n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12))
106 t = mem_malloc (n + 1);
110 n = convert_string (u, ulen, fromcode, t, &s, &slen);
111 if (n == (size_t) (-1))
114 if (!tocode || n < bestn) {
139 mutt_canonical_charset (canonical_buff, sizeof (canonical_buff), tocode);
140 str_replace (&tocode, canonical_buff);
145 static size_t b_encoder (char *s, ICONV_CONST char *d, size_t dlen,
150 memcpy (s, "=?", 2), s += 2;
151 memcpy (s, tocode, str_len (tocode)), s += str_len (tocode);
152 memcpy (s, "?B?", 3), s += 3;
156 else if (dlen == 1) {
157 *s++ = B64Chars[(*d >> 2) & 0x3f];
158 *s++ = B64Chars[(*d & 0x03) << 4];
163 else if (dlen == 2) {
164 *s++ = B64Chars[(*d >> 2) & 0x3f];
165 *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
166 *s++ = B64Chars[(d[1] & 0x0f) << 2];
171 *s++ = B64Chars[(*d >> 2) & 0x3f];
172 *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
173 *s++ = B64Chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
174 *s++ = B64Chars[d[2] & 0x3f];
178 memcpy (s, "?=", 2), s += 2;
182 static size_t q_encoder (char *s, ICONV_CONST char *d, size_t dlen,
185 char hex[] = "0123456789ABCDEF";
188 memcpy (s, "=?", 2), s += 2;
189 memcpy (s, tocode, str_len (tocode)), s += str_len (tocode);
190 memcpy (s, "?Q?", 3), s += 3;
192 unsigned char c = *d++;
196 else if (c >= 0x7f || c < 0x20 || c == '_' || strchr (MimeSpecials, c)) {
198 *s++ = hex[(c & 0xf0) >> 4];
199 *s++ = hex[c & 0x0f];
204 memcpy (s, "?=", 2), s += 2;
209 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
210 * be converted to an encoded word of length *wlen using *encoder.
211 * Otherwise return an upper bound on the maximum length of the data
212 * which could be converted.
213 * The data is converted from fromcode (which must be stateless) to
214 * tocode, unless fromcode is 0, in which case the data is assumed to
215 * be already in tocode, which should be 8-bit and stateless.
217 static size_t try_block (ICONV_CONST char *d, size_t dlen,
218 const char *fromcode, const char *tocode,
219 encoder_t * encoder, size_t * wlen)
221 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
223 ICONV_CONST char *ib;
226 int count, len, len_b, len_q;
229 cd = mutt_iconv_open (tocode, fromcode, 0);
230 assert (cd != (iconv_t) (-1));
231 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - str_len (tocode);
232 if (iconv (cd, &ib, &ibl, &ob, &obl) == (size_t) (-1) ||
233 iconv (cd, 0, 0, &ob, &obl) == (size_t) (-1)) {
234 assert (errno == E2BIG);
237 return (ib - d == dlen) ? dlen : ib - d + 1;
242 if (dlen > sizeof (buf1) - str_len (tocode))
243 return sizeof (buf1) - str_len (tocode) + 1;
244 memcpy (buf1, d, dlen);
249 for (p = buf1; p < ob; p++) {
250 unsigned char c = *p;
252 assert (strchr (MimeSpecials, '?'));
253 if (c >= 0x7f || c < 0x20 || *p == '_' ||
254 (c != ' ' && strchr (MimeSpecials, *p)))
258 len = ENCWORD_LEN_MIN - 2 + str_len (tocode);
259 len_b = len + (((ob - buf1) + 2) / 3) * 4;
260 len_q = len + (ob - buf1) + 2 * count;
262 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
263 if (!ascii_strcasecmp (tocode, "ISO-2022-JP"))
264 len_q = ENCWORD_LEN_MAX + 1;
266 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
267 *encoder = b_encoder;
271 else if (len_q <= ENCWORD_LEN_MAX) {
272 *encoder = q_encoder;
281 * Encode the data (d, dlen) into s using the encoder.
282 * Return the length of the encoded word.
284 static size_t encode_block (char *s, char *d, size_t dlen,
285 const char *fromcode, const char *tocode,
288 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
290 ICONV_CONST char *ib;
292 size_t ibl, obl, n1, n2;
295 cd = mutt_iconv_open (tocode, fromcode, 0);
296 assert (cd != (iconv_t) (-1));
297 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - str_len (tocode);
298 n1 = iconv (cd, &ib, &ibl, &ob, &obl);
299 n2 = iconv (cd, 0, 0, &ob, &obl);
300 assert (n1 != (size_t) (-1) && n2 != (size_t) (-1));
302 return (*encoder) (s, buf1, ob - buf1, tocode);
305 return (*encoder) (s, d, dlen, tocode);
309 * Discover how much of the data (d, dlen) can be converted into
310 * a single encoded word. Return how much data can be converted,
311 * and set the length *wlen of the encoded word and *encoder.
312 * We start in column col, which limits the length of the word.
314 static size_t choose_block (char *d, size_t dlen, int col,
315 const char *fromcode, const char *tocode,
316 encoder_t * encoder, size_t * wlen)
319 int utf8 = fromcode && !ascii_strcasecmp (fromcode, "UTF-8");
324 nn = try_block (d, n, fromcode, tocode, encoder, wlen);
325 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
327 n = (nn ? nn : n) - 1;
330 while (n > 1 && CONTINUATION_BYTE (d[n]))
337 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
338 * allocated buffer (e, elen). The input data is in charset fromcode
339 * and is converted into a charset chosen from charsets.
340 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
341 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
342 * compatible with us-ascii and the original data is used.
343 * The input data is assumed to be a single line starting at column col;
344 * if col is non-zero, the preceding character was a space.
346 static int rfc2047_encode (ICONV_CONST char *d, size_t dlen, int col,
347 const char *fromcode, const char *charsets,
348 char **e, size_t * elen, char *specials)
352 size_t bufpos, buflen;
353 char *u, *t0, *t1, *t;
355 size_t ulen, r, n, wlen;
359 char *icode = "UTF-8";
361 /* Try to convert to UTF-8. */
362 if (convert_string (d, dlen, fromcode, icode, &u, &ulen)) {
365 u = mem_malloc ((ulen = dlen) + 1);
370 /* Find earliest and latest things we must encode. */
371 s0 = s1 = t0 = t1 = 0;
372 for (t = u; t < u + ulen; t++) {
374 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
379 else if (specials && strchr (specials, *t)) {
386 /* If we have something to encode, include RFC822 specials */
387 if (t0 && s0 && s0 < t0)
389 if (t1 && s1 && s1 > t1)
393 /* No encoding is required. */
399 /* Choose target charset. */
402 if ((tocode1 = mutt_choose_charset (icode, charsets, u, ulen, 0, 0)))
408 /* Hack to avoid labelling 8-bit data as us-ascii. */
409 if (!icode && mutt_is_us_ascii (tocode))
410 tocode = "unknown-8bit";
412 /* Adjust t0 for maximum length of line. */
413 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
420 /* Adjust t0 until we can encode a character after a space. */
421 for (; t0 > u; t0--) {
422 if (!HSPACE (*(t0 - 1)))
426 while (t < u + ulen && CONTINUATION_BYTE (*t))
428 if (!try_block (t0, t - t0, icode, tocode, &encoder, &wlen) &&
429 col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
433 /* Adjust t1 until we can encode a character before a space. */
434 for (; t1 < u + ulen; t1++) {
439 while (CONTINUATION_BYTE (*t))
441 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen) &&
442 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
446 /* We shall encode the region [t0,t1). */
448 /* Initialise the output buffer with the us-ascii prefix. */
450 buf = mem_malloc (buflen);
452 memcpy (buf, u, t0 - u);
458 /* Find how much we can encode. */
459 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
461 /* See if we can fit the us-ascii suffix, too. */
462 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
466 while (CONTINUATION_BYTE (t[n]))
470 /* This should only happen in the really stupid case where the
471 only word that needs encoding is one character long, but
472 there is too much us-ascii stuff after it to use a single
473 encoded word. We add the next word to the encoded region
475 assert (t1 < u + ulen);
476 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
479 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
482 /* Add to output buffer. */
483 #define LINEBREAK "\n\t"
484 if (bufpos + wlen + str_len (LINEBREAK) > buflen) {
485 buflen = bufpos + wlen + str_len (LINEBREAK);
486 mem_realloc (&buf, buflen);
488 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
491 memcpy (buf + bufpos, LINEBREAK, str_len (LINEBREAK));
492 bufpos += str_len (LINEBREAK);
500 /* Add last encoded word and us-ascii suffix to buffer. */
501 buflen = bufpos + wlen + (u + ulen - t1);
502 mem_realloc (&buf, buflen + 1);
503 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
506 memcpy (buf + bufpos, t1, u + ulen - t1);
518 void _rfc2047_encode_string (char **pd, int encode_specials, int col)
524 if (!Charset || !*pd)
527 charsets = SendCharset;
528 if (!charsets || !*charsets)
531 rfc2047_encode (*pd, str_len (*pd), col,
532 Charset, charsets, &e, &elen,
533 encode_specials ? RFC822Specials : NULL);
539 void rfc2047_encode_adrlist (ADDRESS * addr, const char *tag)
542 int col = tag ? str_len (tag) + 2 : 32;
546 _rfc2047_encode_string (&ptr->personal, 1, col);
551 static int rfc2047_decode_word (char *d, const char *s, size_t len)
553 const char *pp, *pp1;
556 int enc = 0, count = 0;
557 char *charset = NULL;
559 pd = d0 = mem_malloc (str_len (s));
561 for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1) {
565 /* ignore language specification a la RFC 2231 */
567 if ((t1 = memchr (pp, '*', t - pp)))
569 charset = mem_malloc (t - pp + 1);
570 memcpy (charset, pp, t - pp);
571 charset[t - pp] = '\0';
574 if (toupper ((unsigned char) *pp) == 'Q')
575 enc = ENCQUOTEDPRINTABLE;
576 else if (toupper ((unsigned char) *pp) == 'B')
585 if (enc == ENCQUOTEDPRINTABLE) {
586 for (; pp < pp1; pp++) {
589 else if (*pp == '=' &&
590 (!(pp[1] & ~127) && hexval (pp[1]) != -1) &&
591 (!(pp[2] & ~127) && hexval (pp[2]) != -1)) {
592 *pd++ = (hexval (pp[1]) << 4) | hexval (pp[2]);
600 else if (enc == ENCBASE64) {
603 for (; pp < pp1; pp++) {
606 if ((*pp & ~127) || (c = base64val (*pp)) == -1)
610 *pd++ = b | (c >> k);
625 mutt_convert_string (&d0, charset, Charset, M_ICONV_HOOK_FROM);
626 strfcpy (d, d0, len);
633 * Find the start and end of the first encoded word in the string.
634 * We use the grammar in section 2 of RFC 2047, but the "encoding"
635 * must be B or Q. Also, we don't require the encoded word to be
636 * separated by linear-white-space (section 5(1)).
638 static const char *find_encoded_word (const char *s, const char **x)
643 while ((p = strstr (q, "=?"))) {
645 0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q); q++);
646 if (q[0] != '?' || !strchr ("BbQq", q[1]) || q[2] != '?')
648 for (q = q + 3; 0x20 <= *q && *q < 0x7f && *q != '?'; q++);
649 if (q[0] != '?' || q[1] != '=') {
661 /* return length of linear white space */
662 static size_t lwslen (const char *s, size_t n)
670 for (; p < s + n; p++)
671 if (!strchr (" \t\r\n", *p)) {
672 len = (size_t) (p - s);
675 if (strchr ("\r\n", *(p - 1))) /* LWS doesn't end with CRLF */
680 /* return length of linear white space, scanning in reverse from the end */
681 static size_t lwsrlen (const char *s, size_t n)
683 const char *p = s + n - 1;
689 if (strchr ("\r\n", *p)) /* LWS doesn't end with CRLF */
693 if (!strchr (" \t\r\n", *p)) {
694 len = (size_t) (s + n - 1 - p);
700 /* try to decode anything that looks like a valid RFC2047 encoded
701 * header field, ignoring RFC822 parsing rules
703 void rfc2047_decode (char **pd)
707 int found_encoded = 0;
715 dlen = 4 * str_len (s); /* should be enough */
716 d = d0 = mem_malloc (dlen + 1);
718 while (*s && dlen > 0) {
719 if (!(p = find_encoded_word (s, &q))) {
720 /* no encoded words */
721 if (!option (OPTSTRICTMIME)) {
723 if (found_encoded && (m = lwslen (s, n)) != 0) {
725 *d = ' ', d++, dlen--;
728 if (ascii_strcasecmp (AssumedCharset, "us-ascii")) {
732 t = mem_malloc (n + 1);
733 strfcpy (t, s, n + 1);
734 if (mutt_convert_nonmime_string (&t) == 0) {
736 strncpy (d, t, tlen);
747 strncpy (d, s, dlen);
753 n = (size_t) (p - s);
754 /* ignore spaces between encoded words
755 * and linear white spaces between encoded word and *text */
756 if (!option (OPTSTRICTMIME)) {
757 if (found_encoded && (m = lwslen (s, n)) != 0) {
759 *d = ' ', d++, dlen--;
763 if ((m = n - lwsrlen (s, n)) != 0) {
770 *d = ' ', d++, dlen--;
773 else if (!found_encoded || strspn (s, " \t\r\n") != n) {
782 rfc2047_decode_word (d, p, dlen);
796 void rfc2047_decode_adrlist (ADDRESS * a)
800 rfc2047_decode (&a->personal);
805 void rfc2047_decode_envelope (ENVELOPE* e) {
810 /* do RFC2047 decoding */
811 rfc2047_decode_adrlist (e->from);
812 rfc2047_decode_adrlist (e->to);
813 rfc2047_decode_adrlist (e->cc);
814 rfc2047_decode_adrlist (e->bcc);
815 rfc2047_decode_adrlist (e->reply_to);
816 rfc2047_decode_adrlist (e->mail_followup_to);
817 rfc2047_decode_adrlist (e->return_path);
818 rfc2047_decode_adrlist (e->sender);
821 regmatch_t pmatch[1];
823 rfc2047_decode (&e->subject);
825 if (regexec (ReplyRegexp.rx, e->subject, 1, pmatch, 0) == 0)
826 e->real_subj = e->subject + pmatch[0].rm_eo;
828 e->real_subj = e->subject;