2 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
3 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111, USA.
31 /* If you are debugging this file, comment out the following line. */
/* Length limits for one encoded word. ENCWORD_LEN_MIN is the length of the
 * fixed "=?.?.?.?=" skeleton, as the trailing comment on that line notes. */
40 #define ENCWORD_LEN_MAX 75
41 #define ENCWORD_LEN_MIN 9 /* strlen ("=?.?.?.?=") */
/* True for NUL, space and tab: the characters this file treats as word breaks. */
43 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
/* True when the top two bits of c are 10, i.e. the UTF-8 continuation-byte pattern. */
45 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
/* Defined elsewhere; _rfc2047_encode_string passes it as the "specials" set. */
47 extern char RFC822Specials[];
/* Function-pointer type stored through *encoder (b_encoder / q_encoder).
 * NOTE(review): the rest of the parameter list is not visible in this excerpt. */
49 typedef size_t (*encoder_t) (char *, ICONV_CONST char *, size_t,
/* Convert the flen bytes at f from charset `from` to charset `to` via iconv.
 * Callers in this file compare the result with (size_t)(-1) or test it for
 * non-zero to detect failure.
 * NOTE(review): declarations, the error paths and the final stores into
 * *t / *tlen are not visible in this excerpt. */
52 static size_t convert_string (ICONV_CONST char *f, size_t flen,
53 const char *from, const char *to,
54 char **t, size_t *tlen)
61 cd = mutt_iconv_open (to, from, 0);
/* (iconv_t)(-1) means no conversion descriptor could be opened. */
62 if (cd == (iconv_t)(-1))
65 ob = buf = safe_malloc (obl);
66 n = iconv (cd, &f, &flen, &ob, &obl);
/* The second iconv call, with null input, emits any pending shift sequence
 * and resets the conversion state; either call failing is treated as an error. */
67 if (n == (size_t)(-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1))
/* Shrink the buffer to the bytes actually written plus one extra byte. */
79 safe_realloc (&buf, ob - buf + 1);
/* Try each candidate charset named in `charsets` as a conversion target for
 * the data (u, ulen), which is in `fromcode`, and keep the preferred one.
 * NOTE(review): the list separator, the handling of d / dlen and the return
 * statement are not visible in this excerpt. */
86 char *mutt_choose_charset (const char *fromcode, const char *charsets,
87 char *u, size_t ulen, char **d, size_t *dlen)
89 char canonical_buff[LONG_STRING];
90 char *e = 0, *tocode = 0;
91 size_t elen = 0, bestn = 0;
/* p walks the candidate names; q points at the delimiter that ends the
 * current name, or is null for the last name, which ends the loop. */
94 for (p = charsets; p; p = q ? q + 1 : 0)
/* n is the length of the current candidate name. */
101 n = q ? q - p : strlen (p);
104 /* Assume that we never need more than 12 characters of
105 encoded-text to encode a single character. */
106 n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12))
/* t holds a copy of the candidate name (n bytes plus one extra byte). */
109 t = safe_malloc (n + 1);
/* From here on n is the result of the trial conversion, not a name length. */
113 n = convert_string (u, ulen, fromcode, t, &s, &slen);
114 if (n == (size_t)(-1))
/* Take this candidate if nothing is chosen yet or its result beats bestn. */
117 if (!tocode || n < bestn)
/* Replace the chosen name with its canonical spelling. */
146 mutt_canonical_charset (canonical_buff, sizeof (canonical_buff), tocode);
147 mutt_str_replace (&tocode, canonical_buff);
/* Write a "B" (base64) encoded word for the data (d, dlen) into s:
 * "=?" + tocode + "?B?" + base64 text + "?=".
 * NOTE(review): the loop structure, the padding characters and the return
 * statement are not visible in this excerpt. */
152 static size_t b_encoder (char *s, ICONV_CONST char *d, size_t dlen,
157 memcpy (s, "=?", 2), s += 2;
158 memcpy (s, tocode, strlen (tocode)), s += strlen (tocode);
159 memcpy (s, "?B?", 3), s += 3;
/* Group reading only d[0]: two base64 characters. */
166 *s++ = B64Chars[(*d >> 2) & 0x3f];
167 *s++ = B64Chars[(*d & 0x03) << 4];
/* Group reading d[0] and d[1]: three base64 characters. */
174 *s++ = B64Chars[(*d >> 2) & 0x3f];
175 *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
176 *s++ = B64Chars[(d[1] & 0x0f) << 2];
/* Group reading d[0], d[1] and d[2]: four base64 characters. */
182 *s++ = B64Chars[(*d >> 2) & 0x3f];
183 *s++ = B64Chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
184 *s++ = B64Chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
185 *s++ = B64Chars[d[2] & 0x3f];
/* Encoded-word terminator. */
189 memcpy (s, "?=", 2), s += 2;
/* Write a "Q" encoded word for the data (d, dlen) into s:
 * "=?" + tocode + "?Q?" + encoded text + "?=".
 * NOTE(review): the loop header, the first branch of the per-byte test and
 * the return statement are not visible in this excerpt. */
193 static size_t q_encoder (char *s, ICONV_CONST char *d, size_t dlen,
196 char hex[] = "0123456789ABCDEF";
199 memcpy (s, "=?", 2), s += 2;
200 memcpy (s, tocode, strlen (tocode)), s += strlen (tocode);
201 memcpy (s, "?Q?", 3), s += 3;
/* Read each byte as unsigned so the range tests below behave for bytes >= 0x80. */
204 unsigned char c = *d++;
/* DEL and above, control characters, '_' and anything in MimeSpecials are
 * written as two upper-case hex digits. */
207 else if (c >= 0x7f || c < 0x20 || c == '_' || strchr (MimeSpecials, c))
210 *s++ = hex[(c & 0xf0) >> 4];
211 *s++ = hex[c & 0x0f];
/* Encoded-word terminator. */
216 memcpy (s, "?=", 2), s += 2;
221 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
222 * be converted to an encoded word of length *wlen using *encoder.
223 * Otherwise return an upper bound on the maximum length of the data
224 * which could be converted.
225 * The data is converted from fromcode (which must be stateless) to
226 * tocode, unless fromcode is 0, in which case the data is assumed to
227 * be already in tocode, which should be 8-bit and stateless.
/* Test whether (d, dlen) fits in one encoded word and pick the encoder.
 * NOTE(review): several lines, including the fromcode test that selects
 * between the two paths below and the final returns, are not visible here. */
229 static size_t try_block (ICONV_CONST char *d, size_t dlen,
230 const char *fromcode, const char *tocode,
231 encoder_t *encoder, size_t *wlen)
/* Scratch buffer for the bytes that would go inside the encoded word. */
233 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
235 ICONV_CONST char *ib;
238 int count, len, len_b, len_q;
/* Conversion path: convert into buf1, with the output space reduced by the
 * length of the charset name. */
242 cd = mutt_iconv_open (tocode, fromcode, 0);
243 assert (cd != (iconv_t)(-1));
244 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - strlen (tocode);
245 if (iconv (cd, &ib, &ibl, &ob, &obl) == (size_t)(-1) ||
246 iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1))
/* Running out of output space is the only failure expected here. */
248 assert (errno == E2BIG);
/* Upper bound on convertible input: dlen if all input was consumed,
 * otherwise one more than the number of bytes consumed. */
251 return (ib - d == dlen) ? dlen : ib - d + 1;
/* No-conversion path: the data is copied as-is if it fits; otherwise return
 * one more than the available space as the bound. */
257 if (dlen > sizeof (buf1) - strlen (tocode))
258 return sizeof (buf1) - strlen (tocode) + 1;
259 memcpy (buf1, d, dlen);
/* Scan the bytes in buf1 against the Q-encoding escape rules; the result
 * feeds `count` in the len_q formula below. */
264 for (p = buf1; p < ob; p++)
266 unsigned char c = *p;
267 assert (strchr (MimeSpecials, '?'));
/* Unlike the test in q_encoder, a space is exempt from the MimeSpecials check. */
268 if (c >= 0x7f || c < 0x20 || *p == '_' ||
269 (c != ' ' && strchr (MimeSpecials, *p)))
/* Fixed overhead: "=?" + charset + "?X?" + "?=" (7 characters plus the name). */
273 len = ENCWORD_LEN_MIN - 2 + strlen (tocode);
/* B: four output characters per group of up to three input bytes. */
274 len_b = len + (((ob - buf1) + 2) / 3) * 4;
/* Q: one character per byte plus two more for each escaped byte. */
275 len_q = len + (ob - buf1) + 2 * count;
277 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
278 if (!ascii_strcasecmp (tocode, "ISO-2022-JP"))
/* Push len_q past the limit so the Q branch can never be selected. */
279 len_q = ENCWORD_LEN_MAX + 1;
/* Prefer B only when it is strictly shorter than Q and within the limit. */
281 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX)
283 *encoder = b_encoder;
287 else if (len_q <= ENCWORD_LEN_MAX)
289 *encoder = q_encoder;
298 * Encode the data (d, dlen) into s using the encoder.
299 * Return the length of the encoded word.
/* Produce one encoded word in s from (d, dlen) using *encoder.
 * NOTE(review): the last parameter line and the test that selects between
 * the two return statements are not visible in this excerpt. */
301 static size_t encode_block (char *s, char *d, size_t dlen,
302 const char *fromcode, const char *tocode,
/* Same size as the scratch buffer in try_block. */
305 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
307 ICONV_CONST char *ib;
309 size_t ibl, obl, n1, n2;
313 cd = mutt_iconv_open (tocode, fromcode, 0);
314 assert (cd != (iconv_t)(-1));
315 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - strlen (tocode);
316 n1 = iconv (cd, &ib, &ibl, &ob, &obl);
317 n2 = iconv (cd, 0, 0, &ob, &obl);
/* Both the conversion and the final flush are asserted to succeed. */
318 assert (n1 != (size_t)(-1) && n2 != (size_t)(-1));
/* Encode the converted bytes held in buf1. */
320 return (*encoder) (s, buf1, ob - buf1, tocode);
/* Encode the input bytes directly, without conversion. */
323 return (*encoder) (s, d, dlen, tocode);
327 * Discover how much of the data (d, dlen) can be converted into
328 * a single encoded word. Return how much data can be converted,
329 * and set the length *wlen of the encoded word and *encoder.
330 * We start in column col, which limits the length of the word.
/* Search for a prefix length n of (d, dlen) that try_block accepts and that
 * fits on the line starting at column col.
 * NOTE(review): the loop header, the initial value of n, the guard around
 * the continuation-byte loop and the return are not visible in this excerpt. */
332 static size_t choose_block (char *d, size_t dlen, int col,
333 const char *fromcode, const char *tocode,
334 encoder_t *encoder, size_t *wlen)
/* Non-zero when the source charset is UTF-8 (case-insensitive compare). */
337 int utf8 = fromcode && !ascii_strcasecmp (fromcode, "UTF-8");
343 nn = try_block (d, n, fromcode, tocode, encoder, wlen);
/* Accept when try_block returned 0 and the word fits the line, or when n
 * has already shrunk to at most 1. */
344 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
/* Otherwise retry shorter: one below try_block's bound, or one below n. */
346 n = (nn ? nn : n) - 1;
/* Condition is true while d[n] is a UTF-8 continuation byte and n > 1. */
349 while (n > 1 && CONTINUATION_BYTE(d[n]))
356 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
357 * allocated buffer (e, elen). The input data is in charset fromcode
358 * and is converted into a charset chosen from charsets.
359 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
360 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
361 * compatible with us-ascii and the original data is used.
362 * The input data is assumed to be a single line starting at column col;
363 * if col is non-zero, the preceding character was a space.
/* NOTE(review): this excerpt omits many lines of the function (braces,
 * declarations, branch bodies); the added comments describe only the
 * visible statements. */
365 static int rfc2047_encode (ICONV_CONST char *d, size_t dlen, int col,
366 const char *fromcode, const char *charsets,
367 char **e, size_t *elen, char *specials)
371 size_t bufpos, buflen;
372 char *u, *t0, *t1, *t;
374 size_t ulen, r, n, wlen;
/* Intermediate charset used for the working copy u. */
378 char *icode = "UTF-8";
380 /* Try to convert to UTF-8. */
/* A non-zero result from convert_string is treated as failure. */
381 if (convert_string (d, dlen, fromcode, icode, &u, &ulen))
/* Fallback buffer with the input length plus one extra byte. */
385 u = safe_malloc ((ulen = dlen) + 1);
390 /* Find earliest and latest things we must encode. */
391 s0 = s1 = t0 = t1 = 0;
392 for (t = u; t < u + ulen; t++)
/* A literal "=?" at the start of the data or after HSPACE is one of the
 * conditions tested here. */
395 (*t == '=' && t[1] == '?' && (t == u || HSPACE(*(t-1)))))
/* Separate test for the caller-supplied specials set (may be NULL). */
400 else if (specials && strchr (specials, *t))
407 /* If we have something to encode, include RFC822 specials */
408 if (t0 && s0 && s0 < t0)
410 if (t1 && s1 && s1 > t1)
415 /* No encoding is required. */
421 /* Choose target charset. */
425 if ((tocode1 = mutt_choose_charset (icode, charsets, u, ulen, 0, 0)))
431 /* Hack to avoid labelling 8-bit data as us-ascii. */
432 if (!icode && mutt_is_us_ascii (tocode))
433 tocode = "unknown-8bit";
435 /* Adjust t0 for maximum length of line. */
436 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
441 /* Adjust t0 until we can encode a character after a space. */
444 if (!HSPACE(*(t0-1)))
448 while (t < u + ulen && CONTINUATION_BYTE(*t))
/* The unencoded prefix (t0 - u) plus the encoded word must fit on the line. */
450 if (!try_block (t0, t - t0, icode, tocode, &encoder, &wlen) &&
451 col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
455 /* Adjust t1 until we can encode a character before a space. */
456 for (; t1 < u + ulen; t1++)
462 while (CONTINUATION_BYTE(*t))
/* The encoded word plus the unencoded suffix must fit, counted from column 1. */
464 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen) &&
465 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
469 /* We shall encode the region [t0,t1). */
471 /* Initialise the output buffer with the us-ascii prefix. */
473 buf = safe_malloc (buflen);
475 memcpy (buf, u, t0 - u);
482 /* Find how much we can encode. */
483 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
486 /* See if we can fit the us-ascii suffix, too. */
487 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
491 while (CONTINUATION_BYTE(t[n]))
496 /* This should only happen in the really stupid case where the
497 only word that needs encoding is one character long, but
498 there is too much us-ascii stuff after it to use a single
499 encoded word. We add the next word to the encoded region
501 assert (t1 < u + ulen);
502 for (t1++; t1 < u + ulen && !HSPACE(*t1); t1++)
506 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
509 /* Add to output buffer. */
/* Separator written after each intermediate encoded word: newline, then tab. */
510 #define LINEBREAK "\n\t"
/* Grow the buffer if the next word plus a LINEBREAK would not fit. */
511 if (bufpos + wlen + strlen (LINEBREAK) > buflen)
513 buflen = bufpos + wlen + strlen (LINEBREAK);
514 safe_realloc (&buf, buflen);
516 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
519 memcpy (buf + bufpos, LINEBREAK, strlen (LINEBREAK));
520 bufpos += strlen (LINEBREAK);
528 /* Add last encoded word and us-ascii suffix to buffer. */
529 buflen = bufpos + wlen + (u + ulen - t1);
/* One byte more than buflen is allocated (presumably for a terminator;
 * the line that would write it is not visible). */
530 safe_realloc (&buf, buflen + 1);
531 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
534 memcpy (buf + bufpos, t1, u + ulen - t1);
/* RFC-2047-encode the string *pd, which is taken to start at column col.
 * NOTE(review): the bodies of both guards and the code that stores the
 * result back into *pd are not visible in this excerpt. */
546 void _rfc2047_encode_string (char **pd, int encode_specials, int col)
/* Guard: no local Charset configured, or no string to encode. */
552 if (!Charset || !*pd)
/* Candidate target charsets come from SendCharset; an unset or empty value
 * is handled on a line not shown here. */
555 charsets = SendCharset;
556 if (!charsets || !*charsets)
/* Encode the whole string from Charset; RFC822Specials is passed as the
 * specials set only when encode_specials is non-zero. */
559 rfc2047_encode (*pd, strlen (*pd), col,
560 Charset, charsets, &e, &elen,
561 encode_specials ? RFC822Specials : NULL);
/* Encode the personal and val fields of addresses with specials enabled.
 * NOTE(review): the loop over the list and the conditions guarding each
 * call are not visible in this excerpt. */
567 void rfc2047_encode_adrlist (ADDRESS *addr, const char *tag)
/* Starting column: length of tag plus 2 (presumably for the ": " after the
 * header name; confirm), or 32 when no tag is given. */
570 int col = tag ? strlen (tag) + 2 : 32;
575 _rfc2047_encode_string (&ptr->personal, 1, col);
578 _rfc2047_encode_string (&ptr->val, 1, col);
/* Decode the single encoded word at s into d (capacity len).
 * NOTE(review): the switch on the field counter, several branch bodies,
 * the cleanup and the return value are not visible in this excerpt. */
584 static int rfc2047_decode_word (char *d, const char *s, size_t len)
586 const char *pp, *pp1;
589 int enc = 0, count = 0;
590 char *charset = NULL;
/* Scratch buffer for the decoded bytes, sized from the length of s. */
592 pd = d0 = safe_malloc (strlen (s));
/* Walk the '?'-separated fields of the encoded word. */
594 for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1)
600 /* ignore language specification a la RFC 2231 */
602 if ((t1 = memchr (pp, '*', t - pp)))
/* Copy the charset name (t - pp bytes) and NUL-terminate it. */
604 charset = safe_malloc (t - pp + 1);
605 memcpy (charset, pp, t - pp);
606 charset[t-pp] = '\0';
/* The encoding letter is matched case-insensitively. */
609 if (toupper ((unsigned char) *pp) == 'Q')
610 enc = ENCQUOTEDPRINTABLE;
611 else if (toupper ((unsigned char) *pp) == 'B')
621 if (enc == ENCQUOTEDPRINTABLE)
623 for (; pp < pp1; pp++)
/* "=XY" where X and Y are both ASCII hex digits decodes to one byte. */
627 else if (*pp == '=' &&
628 (!(pp[1] & ~127) && hexval(pp[1]) != -1) &&
629 (!(pp[2] & ~127) && hexval(pp[2]) != -1))
631 *pd++ = (hexval(pp[1]) << 4) | hexval(pp[2]);
639 else if (enc == ENCBASE64)
643 for (; pp < pp1; pp++)
/* Detects characters that are non-ASCII or outside the base64 alphabet;
 * what happens to them is on a line not visible here. */
647 if ((*pp & ~127) || (c = base64val(*pp)) == -1)
/* Emit a byte from the accumulated bits b and the high bits of c. */
652 *pd++ = b | (c >> k);
/* Convert the decoded bytes from the word's charset to the local Charset,
 * then copy into d, bounded by len. */
668 mutt_convert_string (&d0, charset, Charset, M_ICONV_HOOK_FROM);
669 strfcpy (d, d0, len);
676 * Find the start and end of the first encoded word in the string.
677 * We use the grammar in section 2 of RFC 2047, but the "encoding"
678 * must be B or Q. Also, we don't require the encoded word to be
679 * separated by linear-white-space (section 5(1)).
/* Locate the first "=?charset?X?text?=" pattern in s.
 * NOTE(review): the loop bodies, the use of *x and the return statements
 * are not visible in this excerpt. */
681 static const char *find_encoded_word (const char *s, const char **x)
/* Each "=?" in the string is a candidate start. */
686 while ((p = strstr (q, "=?")))
/* Charset characters: printable ASCII above space, excluding these specials. */
689 0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q);
/* Next must come '?', an encoding letter (B or Q, either case), then '?'. */
692 if (q[0] != '?' || !strchr ("BbQq", q[1]) || q[2] != '?')
/* Encoded text: any character from 0x20 to 0x7e except '?'; note that a
 * space (0x20) is accepted here. */
694 for (q = q + 3; 0x20 <= *q && *q < 0x7f && *q != '?'; q++)
/* The word must end with "?=". */
696 if (q[0] != '?' || q[1] != '=')
709 /* try to decode anything that looks like a valid RFC2047 encoded
710 * header field, ignoring RFC822 parsing rules
712 void rfc2047_decode (char **pd)
716 int found_encoded = 0;
724 dlen = 4 * strlen (s); /* should be enough */
/* NOTE(review): many lines of this function are not visible in this excerpt;
 * the added comments describe only the visible statements. */
/* Output buffer: four times the input length plus one extra byte. */
725 d = d0 = safe_malloc (dlen + 1);
/* Continue while input remains and output space is left. */
727 while (*s && dlen > 0)
/* p is the start of the next encoded word, or null if there is none. */
729 if (!(p = find_encoded_word (s, &q)))
731 /* no encoded words */
732 strncpy (d, s, dlen);
/* n is the length of the plain text before the encoded word. */
739 n = (size_t) (p - s);
740 /* ignore spaces between encoded words */
/* True unless an encoded word was already seen and the n bytes before this
 * one are all whitespace. */
741 if (!found_encoded || strspn (s, " \t\r\n") != n)
/* Decode the word at p into d, bounded by the remaining space dlen. */
751 rfc2047_decode_word (d, p, dlen);
762 mutt_str_adjust (pd);
765 void rfc2047_decode_adrlist (ADDRESS *a)
769 if (a->personal && strstr (a->personal, "=?") != NULL) {
770 rfc2047_decode (&a->personal);
773 if (a->val && strstr (a->val, "=?") != NULL)
774 rfc2047_decode (&a->val);