2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or (at
5 * your option) any later version.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * Copyright © 2006 Pierre Habouzit
21 * Copyright notice from original mutt:
22 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
23 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
25 * This file is part of mutt-ng, see http://www.muttng.org/.
26 * It's licensed under the GNU General Public License,
27 * please see the file GPL in the top level source directory.
30 #include <lib-lib/mem.h>
31 #include <lib-lib/str.h>
32 #include <lib-lib/ascii.h>
34 #include <lib-mime/mime.h>
47 /* If you are debugging this file, comment out the following line. */
55 #define ENCWORD_LEN_MAX 75
56 #define ENCWORD_LEN_MIN 9 /* m_strlen("=?.?.?.?=") */
58 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
60 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
/*
 * convert_string: opens a conversion descriptor from charset `from` to
 * charset `to` with mutt_iconv_open() and tests it against (iconv_t)(-1).
 * The input (f, flen) is converted by my_iconv() into a freshly allocated
 * buffer of obl bytes; a second my_iconv() call with null input pointers
 * follows (with plain iconv(3) that form emits any pending reset sequence;
 * my_iconv is presumably a thin wrapper -- TODO confirm).  A negative
 * result from either call selects the error branch.
 */
62 /* converts f of len flen and charset from
63 into *t of len *tlen and charset to
66 returns number of converted chars from f, see iconv(3)
69 convert_string(const char *from, const char *f, ssize_t flen,
70 const char *to, char **t, ssize_t *tlen)
76 cd = mutt_iconv_open(to, from, 0);
78 if (cd == (iconv_t)(-1))
82 ob = buf = p_new(char, obl);
83 n = my_iconv(cd, &f, &flen, &ob, &obl);
85 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
101 /* choose the shortest encoding for u */
/*
 * Candidate names are taken one at a time from `charsets` into cset:
 * m_strncpy() copies the q - p bytes of a delimited name, m_strcpy() copies
 * whatever remains (presumably the last entry of the list -- TODO confirm).
 * The chosen name ends up in tocode after being canonicalised.
 */
102 char *mutt_choose_charset(const char *fromcode, const char *charsets,
103 char *u, ssize_t ulen, char **dst, ssize_t *dlen)
111 const char *p = charsets;
114 char cset[SHORT_STRING];
121 n = m_strncpy(cset, sizeof(cset), p, q - p);
124 n = m_strcpy(cset, sizeof(cset), p);
/* Filter: empty names, and names so long that fewer than 12 bytes of
 * encoded-text would remain inside an encoded-word of ENCWORD_LEN_MAX. */
128 if (!n || n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12)) {
129 /* Assume that we never need more than 12 characters of
130 encoded-text to encode a single character. */
/* Trial conversion of u into this candidate; n is compared below. */
134 n = convert_string(fromcode, u, ulen, cset, &s, &slen);
/* Keep the first candidate, or any later one with a smaller n. */
138 if (!tocode || n < bestn) {
139 m_strreplace(&tocode, cset);
153 char buf[LONG_STRING];
/* Replace the chosen name by the spelling mutt_canonical_charset() gives. */
162 mutt_canonical_charset(buf, sizeof(buf), tocode);
163 m_strreplace(&tocode, buf);
170 /****************************************************************************/
171 /* Encoding functions */
172 /****************************************************************************/
/*
 * Per-character flags for the 128 seven-bit codes.  q_encoder() writes a
 * flagged byte as two hex digits, and try_block() counts flagged bytes when
 * estimating the length of a Q-encoded word.  Flagged here: 0x00-0x1f, the
 * double quote, parentheses, comma, period, slash, colon, semicolon,
 * less-than, equals, greater-than, question mark, at sign, square brackets,
 * backslash and underscore.
 * NOTE(review): 0x7f (DEL) is not flagged, although RFC 2047 restricts
 * encoded-text to printable ASCII -- confirm whether that is intended.
 */
174 static const char __qp_special[128] = {
175 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
176 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
177 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
179 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
182 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/*
 * Common signature of b_encoder() and q_encoder(): output buffer, input
 * bytes, input length, target charset name.  encode_block() hands the
 * encoder result back to its caller as the length of the encoded word.
 */
185 typedef size_t (encoder_t)(char *, const char *, ssize_t, const char *);
/*
 * B (base64) encoder.  Writes the =?tocode?B? prefix into s and then maps
 * the input bytes to characters of __m_b64chars, six bits at a time.
 */
188 b_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
192 s += sprintf(s, "=?%s?B?", tocode);
/* One input byte: its top six bits, then its low two bits shifted up. */
200 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
201 *s++ = __m_b64chars[(*d & 0x03) << 4];
/* Two input bytes yield three characters. */
207 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
208 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
209 *s++ = __m_b64chars[(d[1] & 0x0f) << 2];
/* Three input bytes yield four characters. */
214 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
215 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
216 *s++ = __m_b64chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
217 *s++ = __m_b64chars[d[2] & 0x3f];
/*
 * Q encoder.  Writes the =?tocode?Q? prefix into s, then handles the input
 * one byte at a time.
 */
229 q_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
233 s += sprintf(s, "=?%s?Q?", tocode);
235 unsigned char c = *d++;
/* Bytes with the high bit set, or flagged in __qp_special, are written as
 * two hex digits: high nibble first, then low nibble. */
240 if (c & 0x80 || __qp_special[c]) {
242 *s++ = __m_b36chars_upper[c >> 4];
243 *s++ = __m_b36chars_upper[c & 0xf];
255 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
256 * be converted to an encoded word of length *wlen using *encoder.
257 * Otherwise return an upper bound on the maximum length of the data
258 * which could be converted.
259 * The data is converted from fromcode (which must be stateless) to
260 * tocode, unless fromcode is 0, in which case the data is assumed to
261 * be already in tocode, which should be 8-bit and stateless.
263 static size_t try_block (const char *d, ssize_t dlen,
264 const char *fromcode, const char *tocode,
265 encoder_t **encoder, ssize_t *wlen)
267 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
/* Output budget: the size of buf1 less the length of the charset name. */
268 ssize_t obl = sizeof(buf1) - m_strlen(tocode);
274 iconv_t cd = mutt_iconv_open(tocode, fromcode, 0);
276 assert (cd != (iconv_t)(-1));
280 if (my_iconv(cd, &ib, &ibl, &ob, &obl) < 0
281 || my_iconv(cd, 0, 0, &ob, &obl) < 0)
/* The only failure expected here is running out of output space, and at
 * least one input byte must have been consumed before that happened. */
283 assert (errno == E2BIG && ib > d);
/* Report how far the conversion got as the bound for the next attempt. */
285 return (ib - d == dlen) ? dlen : ib - d + 1;
291 memcpy(buf1, d, dlen);
297 int count, len, len_b, len_q;
/* Count the bytes that Q encoding would have to write in hex. */
300 for (p = buf1; p < ob; p++) {
301 count += (*p & 0x80 || __qp_special[(int)*p]);
/* len: size of the wrapper around the encoded-text, charset name included.
 * len_b: four characters per three bytes, rounded up.
 * len_q: one character per byte plus two more for every counted byte. */
304 len = ENCWORD_LEN_MIN - 2 + m_strlen(tocode);
305 len_b = len + (((ob - buf1) + 2) / 3) * 4;
306 len_q = len + (ob - buf1) + 2 * count;
308 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
309 if (!ascii_strcasecmp(tocode, "ISO-2022-JP"))
/* Push len_q past the limit so that Q encoding cannot be selected. */
310 len_q = ENCWORD_LEN_MAX + 1;
/* B is chosen only when strictly shorter than Q and within the limit. */
312 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
313 *encoder = b_encoder;
317 if (len_q <= ENCWORD_LEN_MAX) {
318 *encoder = q_encoder;
328 * Encode the data (d, dlen) into s using the encoder.
329 * Return the length of the encoded word.
331 static size_t encode_block (char *s, char *d, ssize_t dlen,
332 const char *fromcode, const char *tocode,
335 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
336 ssize_t ibl, obl, n1, n2;
/* Convert into buf1 with the same output budget that try_block() uses, so
 * both conversion calls are expected to succeed (asserted below). */
342 cd = mutt_iconv_open (tocode, fromcode, 0);
343 assert (cd != (iconv_t) (-1));
344 ib = d, ibl = dlen, ob = buf1, obl = sizeof (buf1) - m_strlen(tocode);
345 n1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
346 n2 = my_iconv(cd, 0, 0, &ob, &obl);
347 assert (n1 >= 0 && n2 >= 0);
/* Encode the converted bytes held in buf1. */
349 return (*encoder) (s, buf1, ob - buf1, tocode);
/* Encode the caller's bytes as they are (presumably the path taken when
 * fromcode is 0 -- TODO confirm). */
351 return (*encoder) (s, d, dlen, tocode);
356 * Discover how much of the data (d, dlen) can be converted into
357 * a single encoded word. Return how much data can be converted,
358 * and set the length *wlen of the encoded word and *encoder.
359 * We start in column col, which limits the length of the word.
361 static size_t choose_block(char *d, size_t dlen, int col,
362 const char *fromcode, const char *tocode,
363 encoder_t **encoder, ssize_t *wlen)
/* Set when the data is UTF-8 (case-insensitive match on fromcode). */
366 int utf8 = fromcode && !ascii_strcasecmp (fromcode, "UTF-8");
371 nn = try_block (d, n, fromcode, tocode, encoder, wlen);
/* Accept when try_block() returned 0 and the word fits on the line that
 * starts at col; with n <= 1 the column test is waived (presumably so
 * that progress is always possible -- TODO confirm). */
372 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
/* Shrink: one below the bound try_block() gave, else one byte fewer. */
374 n = (nn ? nn : n) - 1;
/* Step over UTF-8 continuation bytes (0x80-0xbf) at the cut position. */
377 while (n > 1 && CONTINUATION_BYTE (d[n]))
384 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
385 * allocated buffer (e, elen). The input data is in charset fromcode
386 * and is converted into a charset chosen from charsets.
387 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
388 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
389 * compatible with us-ascii and the original data is used.
390 * The input data is assumed to be a single line starting at column col;
391 * if col is non-zero, the preceding character was a space.
393 static int rfc2047_encode (const char *d, ssize_t dlen, int col,
394 const char *fromcode, const char *charsets,
395 char **e, ssize_t *elen, const char *specials)
399 ssize_t bufpos, buflen;
400 char *u, *t0, *t1, *t;
402 ssize_t ulen, r, n, wlen;
/* Intermediate charset used for scanning and splitting the data. */
406 const char *icode = "UTF-8";
408 /* Try to convert to UTF-8. */
409 if (convert_string(fromcode, d, dlen, icode, &u, &ulen)) {
/* Work on an unconverted copy of the input instead (presumably the
 * conversion-failure path -- TODO confirm). */
412 u = p_dupstr(d, ulen = dlen);
415 /* Find earliest and latest things we must encode. */
416 s0 = s1 = t0 = t1 = 0;
417 for (t = u; t < u + ulen; t++) {
419 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
424 else if (specials && strchr (specials, *t)) {
431 /* If we have something to encode, include RFC822 specials */
432 if (t0 && s0 && s0 < t0)
434 if (t1 && s1 && s1 > t1)
438 /* No encoding is required. */
444 /* Choose target charset. */
447 if ((tocode1 = mutt_choose_charset(icode, charsets, u, ulen,
454 /* Hack to avoid labelling 8-bit data as us-ascii. */
455 if (!icode && mutt_is_us_ascii (tocode))
456 tocode = "unknown-8bit";
458 /* Adjust t0 for maximum length of line. */
459 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
466 /* Adjust t0 until we can encode a character after a space. */
467 for (; t0 > u; t0--) {
468 if (!HSPACE (*(t0 - 1)))
472 while (t < u + ulen && CONTINUATION_BYTE (*t))
/* try_block() returns 0 when [t0, t) fits into one encoded word; the
 * second test checks that prefix plus word still fit on the line. */
474 if (!try_block (t0, t - t0, icode, tocode, &encoder, &wlen) &&
475 col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
479 /* Adjust t1 until we can encode a character before a space. */
480 for (; t1 < u + ulen; t1++) {
485 while (CONTINUATION_BYTE (*t))
/* Same idea for [t, t1): one leading column, the word and the remaining
 * suffix together must stay within the line limit. */
487 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen) &&
488 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
492 /* We shall encode the region [t0,t1). */
494 /* Initialise the output buffer with the us-ascii prefix. */
496 buf = p_new(char, buflen);
498 memcpy (buf, u, t0 - u);
504 /* Find how much we can encode. */
505 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
507 /* See if we can fit the us-ascii suffix, too. */
508 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
512 while (CONTINUATION_BYTE (t[n]))
516 /* This should only happen in the really stupid case where the
517 only word that needs encoding is one character long, but
518 there is too much us-ascii stuff after it to use a single
519 encoded word. We add the next word to the encoded region
521 assert (t1 < u + ulen);
522 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
525 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
528 /* Add to output buffer. */
529 #define LINEBREAK "\n\t"
/* Grow buf so that the next encoded word plus a line break will fit. */
530 if (bufpos + wlen + m_strlen(LINEBREAK) > buflen) {
531 buflen = bufpos + wlen + m_strlen(LINEBREAK);
532 p_realloc(&buf, buflen);
534 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
/* Fold the header: newline followed by a tab. */
537 memcpy (buf + bufpos, LINEBREAK, m_strlen(LINEBREAK));
538 bufpos += m_strlen(LINEBREAK);
546 /* Add last encoded word and us-ascii suffix to buffer. */
547 buflen = bufpos + wlen + (u + ulen - t1);
/* One byte more than buflen is reserved (presumably for a terminating
 * NUL -- TODO confirm). */
548 p_realloc(&buf, buflen + 1);
549 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
552 memcpy (buf + bufpos, t1, u + ulen - t1);
/*
 * RFC 2047-encode the string *pd, which is taken to start at column col.
 * Charset is passed as the source charset and SendCharset as the list of
 * candidate target charsets; both an unset Charset and a NULL *pd are
 * checked for up front, as is an unset or empty SendCharset.
 * When encode_specials is non-zero, RFC822Specials is handed to
 * rfc2047_encode() as the set of special characters, otherwise NULL.
 */
564 void _rfc2047_encode_string (char **pd, int encode_specials, int col)
568 const char *charsets;
570 if (!Charset || !*pd)
573 charsets = SendCharset;
574 if (!charsets || !*charsets)
577 rfc2047_encode (*pd, m_strlen(*pd), col,
578 Charset, charsets, &e, &elen,
579 encode_specials ? RFC822Specials : NULL);
/*
 * Convenience wrapper around _rfc2047_encode_string(): no special-character
 * set is requested (encode_specials = 0) and the text is assumed to start
 * at column 32.
 */
585 void rfc2047_encode_string(char **pd) {
586 _rfc2047_encode_string(pd, 0, 32);
/*
 * RFC 2047-encode the personal field of address entries starting at addr,
 * with RFC 822 specials included (encode_specials = 1).
 */
589 void rfc2047_encode_adrlist (address_t * addr, const char *tag)
591 address_t *ptr = addr;
/* Starting column: length of tag plus 2 (presumably for the colon and
 * space after a header name -- TODO confirm), or 32 when no tag is given. */
592 int col = tag ? m_strlen(tag) + 2 : 32;
596 _rfc2047_encode_string (&ptr->personal, 1, col);
/*
 * Decode the encoded word starting at s into d, which has room for len
 * bytes.  The word is walked field by field, using the question marks as
 * separators: charset name, encoding letter (Q or B, case-insensitive) and
 * payload.  The payload is decoded into the scratch buffer d0, converted
 * from the declared charset to Charset and finally copied into d.
 */
601 static int rfc2047_decode_word (char *d, const char *s, size_t len)
603 const char *pp, *pp1;
606 int enc = 0, count = 0;
607 char *charset = NULL;
/* Scratch buffer as long as the input (relies on the decoded payload never
 * being longer than the encoded word -- TODO confirm). */
609 pd = d0 = p_new(char, m_strlen(s));
/* pp .. pp1 delimits the current field; pp1 is the next question mark. */
611 for (pp = s; (pp1 = strchr (pp, '?')); pp = pp1 + 1) {
615 /* ignore language specification a la RFC 2231 */
617 if ((t1 = memchr (pp, '*', t - pp)))
619 charset = p_dupstr(pp, t - pp);
622 if (toupper ((unsigned char) *pp) == 'Q')
623 enc = ENCQUOTEDPRINTABLE;
624 else if (toupper ((unsigned char) *pp) == 'B')
633 if (enc == ENCQUOTEDPRINTABLE) {
634 for (; pp < pp1; pp++) {
/* An equals sign followed by two hex digits becomes a single byte. */
637 else if (*pp == '=' && hexval(pp[1]) >= 0 && hexval(pp[2]) >= 0) {
638 *pd++ = (hexval (pp[1]) << 4) | hexval (pp[2]);
646 else if (enc == ENCBASE64) {
649 for (; pp < pp1; pp++) {
/* base64val() returns a negative value for a non-base64 character. */
652 if ((c = base64val(*pp)) < 0)
656 *pd++ = b | (c >> k);
671 mutt_convert_string (&d0, charset, Charset, M_ICONV_HOOK_FROM);
672 m_strcpy(d, len, d0);
679 * Find the start and end of the first encoded word in the string.
680 * We use the grammar in section 2 of RFC 2047, but the "encoding"
681 * must be B or Q. Also, we don't require the encoded word to be
682 * separated by linear-white-space (section 5(1)).
/*
 * Scans s for the two-character marker that opens an encoded word, then
 * checks the pieces that follow it in order: a charset token, a question
 * mark, an encoding letter out of BbQq, another question mark, the payload
 * and the closing question mark plus equals sign.
 * NOTE(review): strchr() also matches the terminating NUL of its first
 * argument, so q[1] == 0 passes the BbQq test and q[2] is then read --
 * confirm that this cannot run past the end of s.
 */
684 static const char *find_encoded_word (const char *s, const char **x)
689 while ((p = strstr (q, "=?"))) {
691 0x20 < *q && *q < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *q); q++);
692 if (q[0] != '?' || !strchr ("BbQq", q[1]) || q[2] != '?')
/* Payload: printable ASCII (space included) up to the next question mark. */
694 for (q = q + 3; 0x20 <= *q && *q < 0x7f && *q != '?'; q++);
695 if (q[0] != '?' || q[1] != '=') {
707 /* return length of linear white space */
/* Looks at no more than the first n bytes of s. */
708 static size_t lwslen (const char *s, size_t n)
/* The offset of the first byte that is not a space, tab, CR or LF is taken
 * as the length. */
716 for (; p < s + n; p++)
717 if (!strchr (" \t\r\n", *p)) {
718 len = (size_t) (p - s);
721 if (strchr ("\r\n", *(p - 1))) /* LWS doesn't end with CRLF */
726 /* return length of linear white space : reverse */
/* Measures white space at the end of the n bytes starting at s. */
727 static size_t lwsrlen (const char *s, size_t n)
/* p starts on the last of the n bytes. */
729 const char *p = s + n - 1;
735 if (strchr ("\r\n", *p)) /* LWS doesn't end with CRLF */
/* Number of bytes between p and the last byte of the range. */
739 if (!strchr (" \t\r\n", *p)) {
740 len = (size_t) (s + n - 1 - p);
746 /* try to decode anything that looks like a valid RFC2047 encoded
747 * header field, ignoring RFC822 parsing rules
749 void rfc2047_decode (char **pd)
753 int found_encoded = 0;
761 dlen = 4 * m_strlen(s); /* should be enough */
/* d0 is the output buffer (one byte more than dlen is allocated); d is the
 * write cursor into it and dlen the space left, s is the read cursor. */
762 d = d0 = p_new(char, dlen + 1);
764 while (*s && dlen > 0) {
765 if (!(p = find_encoded_word (s, &q))) {
766 /* no encoded words */
767 if (!option (OPTSTRICTMIME)) {
/* White space directly after an encoded word is written out as a single
 * space (as far as the visible statements show -- TODO confirm). */
769 if (found_encoded && (m = lwslen (s, n)) != 0) {
771 *d = ' ', d++, dlen--;
/* Unless AssumedCharset is us-ascii, the raw text goes through
 * mutt_convert_nonmime_string() before it is copied. */
774 if (ascii_strcasecmp (AssumedCharset, "us-ascii")) {
779 if (mutt_convert_nonmime_string (&t) == 0) {
781 strncpy (d, t, tlen);
/* Plain copy of the remaining input, bounded by the space left. */
792 strncpy (d, s, dlen);
799 /* ignore spaces between encoded words
800 * and linear white spaces between encoded word and *text */
801 if (!option (OPTSTRICTMIME)) {
802 if (found_encoded && (m = lwslen (s, n)) != 0) {
804 *d = ' ', d++, dlen--;
808 if ((m = n - lwsrlen (s, n)) != 0) {
815 *d = ' ', d++, dlen--;
818 else if (!found_encoded || strspn (s, " \t\r\n") != n) {
/* Decode the encoded word at p straight into the output at d. */
827 rfc2047_decode_word (d, p, dlen);
/* Runs rfc2047_decode() on the personal field of an address_t entry. */
841 void rfc2047_decode_adrlist(address_t *a)
845 rfc2047_decode(&a->personal);
/*
 * RFC 2047-decode an envelope: every address list it carries (from, to, cc,
 * bcc, reply_to, mail_followup_to, return_path, sender) and the subject.
 * mutt_adjust_subject() is called once the subject has been decoded.
 */
850 void rfc2047_decode_envelope(ENVELOPE* e)
854 /* do RFC2047 decoding */
855 rfc2047_decode_adrlist(e->from);
856 rfc2047_decode_adrlist(e->to);
857 rfc2047_decode_adrlist(e->cc);
858 rfc2047_decode_adrlist(e->bcc);
859 rfc2047_decode_adrlist(e->reply_to);
860 rfc2047_decode_adrlist(e->mail_followup_to);
861 rfc2047_decode_adrlist(e->return_path);
862 rfc2047_decode_adrlist(e->sender);
865 rfc2047_decode(&e->subject);
866 mutt_adjust_subject(e);