2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or (at
5 * your option) any later version.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * Copyright © 2006 Pierre Habouzit
21 * Copyright notice from original mutt:
22 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
23 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
25 * This file is part of mutt-ng, see http://www.muttng.org/.
26 * It's licensed under the GNU General Public License,
27 * please see the file GPL in the top level source directory.
30 #include <lib-lib/mem.h>
31 #include <lib-lib/str.h>
32 #include <lib-lib/ascii.h>
34 #include <lib-mime/mime.h>
47 /* If you are debugging this file, comment out the following line. */
55 #define ENCWORD_LEN_MAX 75
56 #define ENCWORD_LEN_MIN 9 /* m_strlen("=?.?.?.?=") */
58 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
60 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
62 /* converts f (length flen, in charset `from`)
63 into *t (length *tlen, in charset `to`)
66 returns number of converted chars from f, see iconv(3)
69 convert_string(const char *from, const char *f, ssize_t flen,
70 const char *to, char **t, ssize_t *tlen)
76 cd = mutt_iconv_open(to, from, 0);
78 if (cd == (iconv_t)(-1))
82 ob = buf = p_new(char, obl);
83 n = my_iconv(cd, &f, &flen, &ob, &obl);
85 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
101 /* choose the shortest encoding for u */
102 char *mutt_choose_charset(const char *fromcode, const char *charsets,
103 char *u, ssize_t ulen, char **dst, ssize_t *dlen)
111 const char *p = charsets;
114 char cset[SHORT_STRING];
121 n = m_strncpy(cset, sizeof(cset), p, q - p);
124 n = m_strcpy(cset, sizeof(cset), p);
128 if (!n || n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12)) {
129 /* Assume that we never need more than 12 characters of
130 encoded-text to encode a single character. */
134 n = convert_string(fromcode, u, ulen, cset, &s, &slen);
138 if (!tocode || n < bestn) {
139 m_strreplace(&tocode, cset);
153 char buf[LONG_STRING];
162 mutt_canonical_charset(buf, sizeof(buf), tocode);
163 m_strreplace(&tocode, buf);
170 /****************************************************************************/
171 /* Encoding functions */
172 /****************************************************************************/
174 typedef size_t (encoder_t)(char *, const char *, ssize_t, const char *);
177 b_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
181 s += sprintf(s, "=?%s?B?", tocode);
189 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
190 *s++ = __m_b64chars[(*d & 0x03) << 4];
196 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
197 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
198 *s++ = __m_b64chars[(d[1] & 0x0f) << 2];
203 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
204 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
205 *s++ = __m_b64chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
206 *s++ = __m_b64chars[d[2] & 0x3f];
218 q_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
222 s += sprintf(s, "=?%s?Q?", tocode);
224 unsigned char c = *d++;
229 if (c & 0x80 || c < 0x20 || c == '_' || strchr (MimeSpecials, c)) {
231 *s++ = __m_b36chars_upper[c >> 4];
232 *s++ = __m_b36chars_upper[c & 0xf];
244 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
245 * be converted to an encoded word of length *wlen using *encoder.
246 * Otherwise return an upper bound on the maximum length of the data
247 * which could be converted.
248 * The data is converted from fromcode (which must be stateless) to
249 * tocode, unless fromcode is 0, in which case the data is assumed to
250 * be already in tocode, which should be 8-bit and stateless.
252 static size_t try_block (const char *d, ssize_t dlen,
253 const char *fromcode, const char *tocode,
254 encoder_t **encoder, ssize_t *wlen)
256 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
261 int count, len, len_b, len_q;
264 cd = mutt_iconv_open (tocode, fromcode, 0);
265 assert (cd != (iconv_t) (-1));
266 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - m_strlen(tocode);
267 if (my_iconv(cd, &ib, &ibl, &ob, &obl) < 0
268 || my_iconv(cd, 0, 0, &ob, &obl) < 0)
270 assert (errno == E2BIG);
273 return (ib - d == dlen) ? dlen : ib - d + 1;
278 if (dlen > ssizeof(buf1) - m_strlen(tocode))
279 return ssizeof(buf1) - m_strlen(tocode) + 1;
280 memcpy (buf1, d, dlen);
285 for (p = buf1; p < ob; p++) {
286 unsigned char c = *p;
288 assert (strchr (MimeSpecials, '?'));
289 if (c >= 0x7f || c < 0x20 || *p == '_' ||
290 (c != ' ' && strchr (MimeSpecials, *p)))
294 len = ENCWORD_LEN_MIN - 2 + m_strlen(tocode);
295 len_b = len + (((ob - buf1) + 2) / 3) * 4;
296 len_q = len + (ob - buf1) + 2 * count;
298 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
299 if (!ascii_strcasecmp (tocode, "ISO-2022-JP"))
300 len_q = ENCWORD_LEN_MAX + 1;
302 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
303 *encoder = b_encoder;
307 else if (len_q <= ENCWORD_LEN_MAX) {
308 *encoder = q_encoder;
317 * Encode the data (d, dlen) into s using the encoder.
318 * Return the length of the encoded word.
320 static size_t encode_block (char *s, char *d, ssize_t dlen,
321 const char *fromcode, const char *tocode,
324 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
325 ssize_t ibl, obl, n1, n2;
331 cd = mutt_iconv_open (tocode, fromcode, 0);
332 assert (cd != (iconv_t) (-1));
333 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - m_strlen(tocode);
334 n1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
335 n2 = my_iconv(cd, 0, 0, &ob, &obl);
336 assert (n1 >= 0 && n2 >= 0);
338 return (*encoder) (s, buf1, ob - buf1, tocode);
340 return (*encoder) (s, d, dlen, tocode);
345 * Discover how much of the data (d, dlen) can be converted into
346 * a single encoded word. Return how much data can be converted,
347 * and set the length *wlen of the encoded word and *encoder.
348 * We start in column col, which limits the length of the word.
350 static size_t choose_block(char *d, size_t dlen, int col,
351 const char *fromcode, const char *tocode,
352 encoder_t **encoder, ssize_t *wlen)
355 int utf8 = fromcode && !ascii_strcasecmp (fromcode, "UTF-8");
360 nn = try_block (d, n, fromcode, tocode, encoder, wlen);
361 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
363 n = (nn ? nn : n) - 1;
366 while (n > 1 && CONTINUATION_BYTE (d[n]))
373 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
374 * allocated buffer (e, elen). The input data is in charset fromcode
375 * and is converted into a charset chosen from charsets.
376 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
377 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
378 * compatible with us-ascii and the original data is used.
379 * The input data is assumed to be a single line starting at column col;
380 * if col is non-zero, the preceding character was a space.
382 static int rfc2047_encode (const char *d, ssize_t dlen, int col,
383 const char *fromcode, const char *charsets,
384 char **e, ssize_t *elen, const char *specials)
388 ssize_t bufpos, buflen;
389 char *u, *t0, *t1, *t;
391 ssize_t ulen, r, n, wlen;
395 const char *icode = "UTF-8";
397 /* Try to convert to UTF-8. */
398 if (convert_string(fromcode, d, dlen, icode, &u, &ulen)) {
401 u = p_dupstr(d, ulen = dlen);
404 /* Find earliest and latest things we must encode. */
405 s0 = s1 = t0 = t1 = 0;
406 for (t = u; t < u + ulen; t++) {
408 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
413 else if (specials && strchr (specials, *t)) {
420 /* If we have something to encode, include RFC822 specials */
421 if (t0 && s0 && s0 < t0)
423 if (t1 && s1 && s1 > t1)
427 /* No encoding is required. */
433 /* Choose target charset. */
436 if ((tocode1 = mutt_choose_charset(icode, charsets, u, ulen,
443 /* Hack to avoid labelling 8-bit data as us-ascii. */
444 if (!icode && mutt_is_us_ascii (tocode))
445 tocode = "unknown-8bit";
447 /* Adjust t0 for maximum length of line. */
448 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
455 /* Adjust t0 until we can encode a character after a space. */
456 for (; t0 > u; t0--) {
457 if (!HSPACE (*(t0 - 1)))
461 while (t < u + ulen && CONTINUATION_BYTE (*t))
463 if (!try_block (t0, t - t0, icode, tocode, &encoder, &wlen) &&
464 col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
468 /* Adjust t1 until we can encode a character before a space. */
469 for (; t1 < u + ulen; t1++) {
474 while (CONTINUATION_BYTE (*t))
476 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen) &&
477 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
481 /* We shall encode the region [t0,t1). */
483 /* Initialise the output buffer with the us-ascii prefix. */
485 buf = p_new(char, buflen);
487 memcpy (buf, u, t0 - u);
493 /* Find how much we can encode. */
494 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
496 /* See if we can fit the us-ascii suffix, too. */
497 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
501 while (CONTINUATION_BYTE (t[n]))
505 /* This should only happen in the really stupid case where the
506 only word that needs encoding is one character long, but
507 there is too much us-ascii stuff after it to use a single
508 encoded word. We add the next word to the encoded region
510 assert (t1 < u + ulen);
511 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
514 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
517 /* Add to output buffer. */
518 #define LINEBREAK "\n\t"
519 if (bufpos + wlen + m_strlen(LINEBREAK) > buflen) {
520 buflen = bufpos + wlen + m_strlen(LINEBREAK);
521 p_realloc(&buf, buflen);
523 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
526 memcpy (buf + bufpos, LINEBREAK, m_strlen(LINEBREAK));
527 bufpos += m_strlen(LINEBREAK);
535 /* Add last encoded word and us-ascii suffix to buffer. */
536 buflen = bufpos + wlen + (u + ulen - t1);
537 p_realloc(&buf, buflen + 1);
538 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
541 memcpy (buf + bufpos, t1, u + ulen - t1);
553 void _rfc2047_encode_string (char **pd, int encode_specials, int col)
557 const char *charsets;
559 if (!Charset || !*pd)
562 charsets = SendCharset;
563 if (!charsets || !*charsets)
566 rfc2047_encode (*pd, m_strlen(*pd), col,
567 Charset, charsets, &e, &elen,
568 encode_specials ? RFC822Specials : NULL);
574 void rfc2047_encode_string(char **pd) {
575 _rfc2047_encode_string(pd, 0, 32);
578 void rfc2047_encode_adrlist (address_t * addr, const char *tag)
580 address_t *ptr = addr;
581 int col = tag ? m_strlen(tag) + 2 : 32;
585 _rfc2047_encode_string (&ptr->personal, 1, col);
590 static int rfc2047_decode_word (char *d, const char *s, size_t len)
592 const char *pp, *pp1;
595 int enc = 0, count = 0;
596 char *charset = NULL;
598 pd = d0 = p_new(char, m_strlen(s));
600 for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1) {
604 /* ignore language specification a la RFC 2231 */
606 if ((t1 = memchr (pp, '*', t - pp)))
608 charset = p_dupstr(pp, t - pp);
611 if (toupper ((unsigned char) *pp) == 'Q')
612 enc = ENCQUOTEDPRINTABLE;
613 else if (toupper ((unsigned char) *pp) == 'B')
622 if (enc == ENCQUOTEDPRINTABLE) {
623 for (; pp < pp1; pp++) {
626 else if (*pp == '=' && hexval(pp[1]) >= 0 && hexval(pp[2]) >= 0) {
627 *pd++ = (hexval (pp[1]) << 4) | hexval (pp[2]);
635 else if (enc == ENCBASE64) {
638 for (; pp < pp1; pp++) {
641 if ((c = base64val(*pp)) < 0)
645 *pd++ = b | (c >> k);
660 mutt_convert_string (&d0, charset, Charset, M_ICONV_HOOK_FROM);
661 m_strcpy(d, len, d0);
668 * Find the start and end of the first encoded word in the string.
669 * We use the grammar in section 2 of RFC 2047, but the "encoding"
670 * must be B or Q. Also, we don't require the encoded word to be
671 * separated by linear-white-space (section 5(1)).
673 static const char *find_encoded_word (const char *s, const char **x)
678 while ((p = strstr (q, "=?"))) {
680 0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q); q++);
681 if (q[0] != '?' || !strchr ("BbQq", q[1]) || q[2] != '?')
683 for (q = q + 3; 0x20 <= *q && *q < 0x7f && *q != '?'; q++);
684 if (q[0] != '?' || q[1] != '=') {
696 /* return length of linear white space */
697 static size_t lwslen (const char *s, size_t n)
705 for (; p < s + n; p++)
706 if (!strchr (" \t\r\n", *p)) {
707 len = (size_t) (p - s);
710 if (strchr ("\r\n", *(p - 1))) /* LWS doesn't end with CRLF */
715 /* return length of trailing linear white space (scanned backwards from the end) */
716 static size_t lwsrlen (const char *s, size_t n)
718 const char *p = s + n - 1;
724 if (strchr ("\r\n", *p)) /* LWS doesn't end with CRLF */
728 if (!strchr (" \t\r\n", *p)) {
729 len = (size_t) (s + n - 1 - p);
735 /* try to decode anything that looks like a valid RFC2047 encoded
736 * header field, ignoring RFC822 parsing rules
738 void rfc2047_decode (char **pd)
742 int found_encoded = 0;
750 dlen = 4 * m_strlen(s); /* should be enough */
751 d = d0 = p_new(char, dlen + 1);
753 while (*s && dlen > 0) {
754 if (!(p = find_encoded_word (s, &q))) {
755 /* no encoded words */
756 if (!option (OPTSTRICTMIME)) {
758 if (found_encoded && (m = lwslen (s, n)) != 0) {
760 *d = ' ', d++, dlen--;
763 if (ascii_strcasecmp (AssumedCharset, "us-ascii")) {
768 if (mutt_convert_nonmime_string (&t) == 0) {
770 strncpy (d, t, tlen);
781 strncpy (d, s, dlen);
788 /* ignore spaces between encoded words
789 * and linear white spaces between encoded word and *text */
790 if (!option (OPTSTRICTMIME)) {
791 if (found_encoded && (m = lwslen (s, n)) != 0) {
793 *d = ' ', d++, dlen--;
797 if ((m = n - lwsrlen (s, n)) != 0) {
804 *d = ' ', d++, dlen--;
807 else if (!found_encoded || strspn (s, " \t\r\n") != n) {
816 rfc2047_decode_word (d, p, dlen);
830 void rfc2047_decode_adrlist(address_t *a)
834 rfc2047_decode(&a->personal);
839 void rfc2047_decode_envelope(ENVELOPE* e)
843 /* do RFC2047 decoding */
844 rfc2047_decode_adrlist(e->from);
845 rfc2047_decode_adrlist(e->to);
846 rfc2047_decode_adrlist(e->cc);
847 rfc2047_decode_adrlist(e->bcc);
848 rfc2047_decode_adrlist(e->reply_to);
849 rfc2047_decode_adrlist(e->mail_followup_to);
850 rfc2047_decode_adrlist(e->return_path);
851 rfc2047_decode_adrlist(e->sender);
854 rfc2047_decode(&e->subject);
855 mutt_adjust_subject(e);