2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or (at
5 * your option) any later version.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * Copyright © 2006 Pierre Habouzit
21 * Copyright notice from original mutt:
22 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
23 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
25 * This file is part of mutt-ng, see http://www.muttng.org/.
26 * It's licensed under the GNU General Public License,
27 * please see the file GPL in the top level source directory.
30 #include <lib-lib/mem.h>
31 #include <lib-lib/str.h>
32 #include <lib-lib/ascii.h>
34 #include <lib-mime/mime.h>
47 /* If you are debugging this file, comment out the following line. */
55 #define ENCWORD_LEN_MAX 75
56 #define ENCWORD_LEN_MIN 9 /* m_strlen("=?.?.?.?=") */
58 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
59 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
61 /* converts f (length flen, charset "from")
62 into *t (length *tlen, charset "to")
65 returns number of converted chars from f, see iconv(3)
68 convert_string(const char *from, const char *f, ssize_t flen,
69 const char *to, char **t, ssize_t *tlen)
75 cd = mutt_iconv_open(to, from, 0);
77 if (cd == (iconv_t)(-1))
81 ob = buf = p_new(char, obl);
82 n = my_iconv(cd, &f, &flen, &ob, &obl);
84 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
100 /* choose the shortest encoding for u */
101 char *mutt_choose_charset(const char *fromcode, const char *charsets,
102 char *u, ssize_t ulen, char **dst, ssize_t *dlen)
110 const char *p = charsets;
113 char cset[SHORT_STRING];
120 n = m_strncpy(cset, sizeof(cset), p, q - p);
123 n = m_strcpy(cset, sizeof(cset), p);
127 if (!n || n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12)) {
128 /* Assume that we never need more than 12 characters of
129 encoded-text to encode a single character. */
133 n = convert_string(fromcode, u, ulen, cset, &s, &slen);
137 if (!tocode || n < bestn) {
138 m_strreplace(&tocode, cset);
152 char buf[LONG_STRING];
161 mutt_canonical_charset(buf, sizeof(buf), tocode);
162 m_strreplace(&tocode, buf);
169 /****************************************************************************/
170 /* Encoding functions */
171 /****************************************************************************/
173 static const char __qp_special[128] = {
174 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
175 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
176 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
178 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184 typedef size_t (encoder_t)(char *, const char *, ssize_t, const char *);
187 b_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
191 s += sprintf(s, "=?%s?B?", tocode);
199 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
200 *s++ = __m_b64chars[(*d & 0x03) << 4];
206 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
207 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
208 *s++ = __m_b64chars[(d[1] & 0x0f) << 2];
213 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
214 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
215 *s++ = __m_b64chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
216 *s++ = __m_b64chars[d[2] & 0x3f];
228 q_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
232 s += sprintf(s, "=?%s?Q?", tocode);
234 unsigned char c = *d++;
239 if (c & 0x80 || __qp_special[c]) {
241 *s++ = __m_b36chars_upper[c >> 4];
242 *s++ = __m_b36chars_upper[c & 0xf];
254 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
255 * be converted to an encoded word of length *wlen using *encoder.
256 * Otherwise return an upper bound on the maximum length of the data
257 * which could be converted.
258 * The data is converted from fromcode (which must be stateless) to
259 * tocode, unless fromcode is 0, in which case the data is assumed to
260 * be already in tocode, which should be 8-bit and stateless.
262 static size_t try_block(const char *d, ssize_t dlen,
263 const char *fromcode, const char *tocode,
264 encoder_t **encoder, ssize_t *wlen)
266 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
267 ssize_t obl = sizeof(buf1) - m_strlen(tocode);
273 iconv_t cd = mutt_iconv_open(tocode, fromcode, 0);
275 assert (cd != (iconv_t)(-1));
279 if (my_iconv(cd, &ib, &ibl, &ob, &obl) < 0
280 || my_iconv(cd, 0, 0, &ob, &obl) < 0)
282 assert (errno == E2BIG && ib > d);
284 return (ib - d == dlen) ? dlen : ib - d + 1;
290 memcpy(buf1, d, dlen);
296 int count, len, len_b, len_q;
299 for (p = buf1; p < ob; p++) {
300 count += (*p & 0x80 || __qp_special[(int)*p]);
303 len = ENCWORD_LEN_MIN - 2 + m_strlen(tocode);
304 len_b = len + (((ob - buf1) + 2) / 3) * 4;
305 len_q = len + (ob - buf1) + 2 * count;
307 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
308 if (!ascii_strcasecmp(tocode, "ISO-2022-JP"))
309 len_q = ENCWORD_LEN_MAX + 1;
311 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
312 *encoder = b_encoder;
316 if (len_q <= ENCWORD_LEN_MAX) {
317 *encoder = q_encoder;
327 * Encode the data (d, dlen) into s using the encoder.
328 * Return the length of the encoded word.
331 encode_block(char *s, char *d, ssize_t dlen,
332 const char *fromcode, const char *tocode, encoder_t *encoder)
334 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
335 ssize_t ibl, obl, n1, n2;
341 cd = mutt_iconv_open(tocode, fromcode, 0);
342 assert (cd != (iconv_t) (-1));
343 ib = d, ibl = dlen, ob = buf1, obl = sizeof(buf1) - m_strlen(tocode);
344 n1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
345 n2 = my_iconv(cd, 0, 0, &ob, &obl);
346 assert (n1 >= 0 && n2 >= 0);
348 return (*encoder)(s, buf1, ob - buf1, tocode);
350 return (*encoder)(s, d, dlen, tocode);
355 * Discover how much of the data (d, dlen) can be converted into
356 * a single encoded word. Return how much data can be converted,
357 * and set the length *wlen of the encoded word and *encoder.
358 * We start in column col, which limits the length of the word.
360 static size_t choose_block(char *d, size_t dlen, int col,
361 const char *fromcode, const char *tocode,
362 encoder_t **encoder, ssize_t *wlen)
365 int utf8 = fromcode && !ascii_strcasecmp(fromcode, "UTF-8");
370 nn = try_block(d, n, fromcode, tocode, encoder, wlen);
371 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
373 n = (nn ? nn : n) - 1;
376 while (n > 1 && CONTINUATION_BYTE(d[n]))
385 /*** XXX: MC: not read after that mark ***/
388 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
389 * allocated buffer (e, elen). The input data is in charset fromcode
390 * and is converted into a charset chosen from charsets.
391 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
392 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
393 * compatible with us-ascii and the original data is used.
394 * The input data is assumed to be a single line starting at column col;
395 * if col is non-zero, the preceding character was a space.
397 static int rfc2047_encode (const char *d, ssize_t dlen, int col,
398 const char *fromcode, const char *charsets,
399 char **e, ssize_t *elen, const char *specials)
403 ssize_t bufpos, buflen;
404 char *u, *t0, *t1, *t;
406 ssize_t ulen, r, n, wlen;
410 const char *icode = "UTF-8";
412 /* Try to convert to UTF-8. */
413 if (convert_string(fromcode, d, dlen, icode, &u, &ulen)) {
416 u = p_dupstr(d, ulen = dlen);
419 /* Find earliest and latest things we must encode. */
420 s0 = s1 = t0 = t1 = 0;
421 for (t = u; t < u + ulen; t++) {
423 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
428 else if (specials && strchr (specials, *t)) {
435 /* If we have something to encode, include RFC822 specials */
436 if (t0 && s0 && s0 < t0)
438 if (t1 && s1 && s1 > t1)
442 /* No encoding is required. */
448 /* Choose target charset. */
451 if ((tocode1 = mutt_choose_charset(icode, charsets, u, ulen,
458 /* Hack to avoid labelling 8-bit data as us-ascii. */
459 if (!icode && mutt_is_us_ascii (tocode))
460 tocode = "unknown-8bit";
462 /* Adjust t0 for maximum length of line. */
463 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
470 /* Adjust t0 until we can encode a character after a space. */
471 for (; t0 > u; t0--) {
472 if (!HSPACE (*(t0 - 1)))
476 while (t < u + ulen && CONTINUATION_BYTE (*t))
478 if (!try_block (t0, t - t0, icode, tocode, &encoder, &wlen) &&
479 col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
483 /* Adjust t1 until we can encode a character before a space. */
484 for (; t1 < u + ulen; t1++) {
489 while (CONTINUATION_BYTE (*t))
491 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen) &&
492 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
496 /* We shall encode the region [t0,t1). */
498 /* Initialise the output buffer with the us-ascii prefix. */
500 buf = p_new(char, buflen);
502 memcpy (buf, u, t0 - u);
508 /* Find how much we can encode. */
509 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
511 /* See if we can fit the us-ascii suffix, too. */
512 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
516 while (CONTINUATION_BYTE (t[n]))
520 /* This should only happen in the really stupid case where the
521 only word that needs encoding is one character long, but
522 there is too much us-ascii stuff after it to use a single
523 encoded word. We add the next word to the encoded region
525 assert (t1 < u + ulen);
526 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
529 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
532 /* Add to output buffer. */
533 #define LINEBREAK "\n\t"
534 if (bufpos + wlen + m_strlen(LINEBREAK) > buflen) {
535 buflen = bufpos + wlen + m_strlen(LINEBREAK);
536 p_realloc(&buf, buflen);
538 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
541 memcpy (buf + bufpos, LINEBREAK, m_strlen(LINEBREAK));
542 bufpos += m_strlen(LINEBREAK);
550 /* Add last encoded word and us-ascii suffix to buffer. */
551 buflen = bufpos + wlen + (u + ulen - t1);
552 p_realloc(&buf, buflen + 1);
553 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
556 memcpy (buf + bufpos, t1, u + ulen - t1);
568 void _rfc2047_encode_string (char **pd, int encode_specials, int col)
572 const char *charsets;
574 if (!Charset || !*pd)
577 charsets = SendCharset;
578 if (!charsets || !*charsets)
581 rfc2047_encode (*pd, m_strlen(*pd), col,
582 Charset, charsets, &e, &elen,
583 encode_specials ? RFC822Specials : NULL);
589 void rfc2047_encode_string(char **pd) {
590 _rfc2047_encode_string(pd, 0, 32);
593 void rfc2047_encode_adrlist (address_t * addr, const char *tag)
595 address_t *ptr = addr;
596 int col = tag ? m_strlen(tag) + 2 : 32;
600 _rfc2047_encode_string (&ptr->personal, 1, col);
605 static int rfc2047_decode_word (char *d, const char *s, size_t len)
607 const char *pp, *pp1;
610 int enc = 0, count = 0;
611 char *charset = NULL;
613 pd = d0 = p_new(char, m_strlen(s));
615 for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1) {
619 /* ignore language specification a la RFC 2231 */
621 if ((t1 = memchr (pp, '*', t - pp)))
623 charset = p_dupstr(pp, t - pp);
626 if (toupper ((unsigned char) *pp) == 'Q')
627 enc = ENCQUOTEDPRINTABLE;
628 else if (toupper ((unsigned char) *pp) == 'B')
637 if (enc == ENCQUOTEDPRINTABLE) {
638 for (; pp < pp1; pp++) {
641 else if (*pp == '=' && hexval(pp[1]) >= 0 && hexval(pp[2]) >= 0) {
642 *pd++ = (hexval (pp[1]) << 4) | hexval (pp[2]);
650 else if (enc == ENCBASE64) {
653 for (; pp < pp1; pp++) {
656 if ((c = base64val(*pp)) < 0)
660 *pd++ = b | (c >> k);
675 mutt_convert_string (&d0, charset, Charset, M_ICONV_HOOK_FROM);
676 m_strcpy(d, len, d0);
683 * Find the start and end of the first encoded word in the string.
684 * We use the grammar in section 2 of RFC 2047, but the "encoding"
685 * must be B or Q. Also, we don't require the encoded word to be
686 * separated by linear-white-space (section 5(1)).
688 static const char *find_encoded_word (const char *s, const char **x)
693 while ((p = strstr (q, "=?"))) {
695 0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q); q++);
696 if (q[0] != '?' || !strchr ("BbQq", q[1]) || q[2] != '?')
698 for (q = q + 3; 0x20 <= *q && *q < 0x7f && *q != '?'; q++);
699 if (q[0] != '?' || q[1] != '=') {
711 /* return length of linear white space */
712 static size_t lwslen (const char *s, size_t n)
720 for (; p < s + n; p++)
721 if (!strchr (" \t\r\n", *p)) {
722 len = (size_t) (p - s);
725 if (strchr ("\r\n", *(p - 1))) /* LWS doesn't end with CRLF */
730 /* return length of trailing linear white space (scanned backwards from the end) */
731 static size_t lwsrlen (const char *s, size_t n)
733 const char *p = s + n - 1;
739 if (strchr ("\r\n", *p)) /* LWS doesn't end with CRLF */
743 if (!strchr (" \t\r\n", *p)) {
744 len = (size_t) (s + n - 1 - p);
750 /* try to decode anything that looks like a valid RFC2047 encoded
751 * header field, ignoring RFC822 parsing rules
753 void rfc2047_decode (char **pd)
757 int found_encoded = 0;
765 dlen = 4 * m_strlen(s); /* should be enough */
766 d = d0 = p_new(char, dlen + 1);
768 while (*s && dlen > 0) {
769 if (!(p = find_encoded_word (s, &q))) {
770 /* no encoded words */
771 if (!option (OPTSTRICTMIME)) {
773 if (found_encoded && (m = lwslen (s, n)) != 0) {
775 *d = ' ', d++, dlen--;
778 if (ascii_strcasecmp (AssumedCharset, "us-ascii")) {
783 if (mutt_convert_nonmime_string (&t) == 0) {
785 strncpy (d, t, tlen);
796 strncpy (d, s, dlen);
803 /* ignore spaces between encoded words
804 * and linear white spaces between encoded word and *text */
805 if (!option (OPTSTRICTMIME)) {
806 if (found_encoded && (m = lwslen (s, n)) != 0) {
808 *d = ' ', d++, dlen--;
812 if ((m = n - lwsrlen (s, n)) != 0) {
819 *d = ' ', d++, dlen--;
822 else if (!found_encoded || strspn (s, " \t\r\n") != n) {
831 rfc2047_decode_word (d, p, dlen);
845 void rfc2047_decode_adrlist(address_t *a)
849 rfc2047_decode(&a->personal);
854 void rfc2047_decode_envelope(ENVELOPE* e)
858 /* do RFC2047 decoding */
859 rfc2047_decode_adrlist(e->from);
860 rfc2047_decode_adrlist(e->to);
861 rfc2047_decode_adrlist(e->cc);
862 rfc2047_decode_adrlist(e->bcc);
863 rfc2047_decode_adrlist(e->reply_to);
864 rfc2047_decode_adrlist(e->mail_followup_to);
865 rfc2047_decode_adrlist(e->return_path);
866 rfc2047_decode_adrlist(e->sender);
869 rfc2047_decode(&e->subject);
870 mutt_adjust_subject(e);