2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or (at
5 * your option) any later version.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * Copyright © 2006 Pierre Habouzit
21 * Copyright notice from original mutt:
22 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
23 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
25 * This file is part of mutt-ng, see http://www.muttng.org/.
26 * It's licensed under the GNU General Public License,
27 * please see the file GPL in the top level source directory.
30 #include <lib-lib/mem.h>
31 #include <lib-lib/str.h>
32 #include <lib-lib/ascii.h>
34 #include <lib-mime/mime.h>
47 /* If you are debugging this file, comment out the following line. */
55 #define ENCWORD_LEN_MAX 75 /* upper bound on the length of one encoded-word (RFC 2047 caps it at 75 characters) */
56 #define ENCWORD_LEN_MIN 9 /* m_strlen("=?.?.?.?="): the delimiters plus one character each for charset, encoding and text */
58 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t') /* true for NUL, space or tab */
59 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80) /* true when the top two bits are 10, i.e. a UTF-8 continuation byte */
61 /* converts f of len flen and charset from
62 into *t of len *tlen and charset to
65 returns number of converted chars from f, see iconv(3)
68 convert_string(const char *from, const char *f, ssize_t flen,
69 const char *to, char **t, ssize_t *tlen)
75 cd = mutt_iconv_open(to, from, 0);
77 if (cd == (iconv_t)(-1))
81 ob = buf = p_new(char, obl);
82 n = my_iconv(cd, &f, &flen, &ob, &obl);
84 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
100 /* choose the shortest encoding for u */
101 char *mutt_choose_charset(const char *fromcode, const char *charsets,
102 char *u, ssize_t ulen, char **dst, ssize_t *dlen)
110 const char *p = charsets;
113 char cset[SHORT_STRING];
120 n = m_strncpy(cset, sizeof(cset), p, q - p);
123 n = m_strcpy(cset, sizeof(cset), p);
127 if (!n || n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12)) {
128 /* Assume that we never need more than 12 characters of
129 encoded-text to encode a single character. */
133 n = convert_string(fromcode, u, ulen, cset, &s, &slen);
137 if (!tocode || n < bestn) {
138 m_strreplace(&tocode, cset);
152 char buf[LONG_STRING];
161 mutt_canonical_charset(buf, sizeof(buf), tocode);
162 m_strreplace(&tocode, buf);
169 /****************************************************************************/
170 /* Encoding functions */
171 /****************************************************************************/
173 static const char __qp_special[128] = {
174 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
175 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
176 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
178 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
184 typedef size_t (encoder_t)(char *, const char *, ssize_t, const char *);
187 b_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
191 s += sprintf(s, "=?%s?B?", tocode);
199 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
200 *s++ = __m_b64chars[(*d & 0x03) << 4];
206 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
207 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
208 *s++ = __m_b64chars[(d[1] & 0x0f) << 2];
213 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
214 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
215 *s++ = __m_b64chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
216 *s++ = __m_b64chars[d[2] & 0x3f];
228 q_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
232 s += sprintf(s, "=?%s?Q?", tocode);
234 unsigned char c = *d++;
239 if (c & 0x80 || __qp_special[c]) {
241 *s++ = __m_b36chars_upper[c >> 4];
242 *s++ = __m_b36chars_upper[c & 0xf];
254 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
255 * be converted to an encoded word of length *wlen using *encoder.
256 * Otherwise return an upper bound on the maximum length of the data
257 * which could be converted.
258 * The data is converted from fromcode (which must be stateless) to
259 * tocode, unless fromcode is 0, in which case the data is assumed to
260 * be already in tocode, which should be 8-bit and stateless.
262 static size_t try_block(const char *d, ssize_t dlen,
263 const char *fromcode, const char *tocode,
264 encoder_t **encoder, ssize_t *wlen)
266 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
267 ssize_t obl = sizeof(buf1) - m_strlen(tocode);
273 iconv_t cd = mutt_iconv_open(tocode, fromcode, 0);
275 assert (cd != (iconv_t)(-1));
279 if (my_iconv(cd, &ib, &ibl, &ob, &obl) < 0
280 || my_iconv(cd, 0, 0, &ob, &obl) < 0)
282 assert (errno == E2BIG && ib > d);
284 return (ib - d == dlen) ? dlen : ib - d + 1;
290 memcpy(buf1, d, dlen);
296 int count, len, len_b, len_q;
299 for (p = buf1; p < ob; p++) {
300 count += (*p & 0x80 || __qp_special[(int)*p]);
303 len = ENCWORD_LEN_MIN - 2 + m_strlen(tocode);
304 len_b = len + (((ob - buf1) + 2) / 3) * 4;
305 len_q = len + (ob - buf1) + 2 * count;
307 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
308 if (!ascii_strcasecmp(tocode, "ISO-2022-JP"))
309 len_q = ENCWORD_LEN_MAX + 1;
311 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
312 *encoder = b_encoder;
316 if (len_q <= ENCWORD_LEN_MAX) {
317 *encoder = q_encoder;
327 * Encode the data (d, dlen) into s using the encoder.
328 * Return the length of the encoded word.
331 encode_block(char *s, char *d, ssize_t dlen,
332 const char *fromcode, const char *tocode, encoder_t *encoder)
334 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
335 ssize_t ibl, obl, n1, n2;
341 cd = mutt_iconv_open(tocode, fromcode, 0);
342 assert (cd != (iconv_t) (-1));
343 ib = d, ibl = dlen, ob = buf1, obl = sizeof(buf1) - m_strlen(tocode);
344 n1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
345 n2 = my_iconv(cd, 0, 0, &ob, &obl);
346 assert (n1 >= 0 && n2 >= 0);
348 return (*encoder)(s, buf1, ob - buf1, tocode);
350 return (*encoder)(s, d, dlen, tocode);
355 * Discover how much of the data (d, dlen) can be converted into
356 * a single encoded word. Return how much data can be converted,
357 * and set the length *wlen of the encoded word and *encoder.
358 * We start in column col, which limits the length of the word.
360 static size_t choose_block(char *d, size_t dlen, int col,
361 const char *fromcode, const char *tocode,
362 encoder_t **encoder, ssize_t *wlen)
365 int utf8 = fromcode && !ascii_strcasecmp(fromcode, "UTF-8");
370 nn = try_block(d, n, fromcode, tocode, encoder, wlen);
371 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
373 n = (nn ? nn : n) - 1;
376 while (n > 1 && CONTINUATION_BYTE(d[n]))
384 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
385 * allocated buffer (e, elen). The input data is in charset fromcode
386 * and is converted into a charset chosen from charsets.
387 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
388 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
389 * compatible with us-ascii and the original data is used.
390 * The input data is assumed to be a single line starting at column col;
391 * if col is non-zero, the preceding character was a space.
393 /*** XXX: simplify that one day ***/
394 static int rfc2047_encode(const char *d, ssize_t dlen, int col,
395 const char *fromcode, const char *charsets,
396 char **e, ssize_t *elen, const char *specials)
400 ssize_t bufpos, buflen;
402 char *s0, *s1, *t0, *t1;
405 const char *icode = "UTF-8";
406 ssize_t ulen, r, n, wlen;
409 /* Try to convert to UTF-8. */
410 if (convert_string(fromcode, d, dlen, icode, &u, &ulen)) {
413 u = p_dupstr(d, ulen = dlen);
416 /* Find earliest and latest things we must encode. */
417 s0 = s1 = t0 = t1 = NULL;
418 for (t = u; t < u + ulen; t++) {
420 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
425 else if (specials && strchr (specials, *t)) {
432 /* If we have something to encode, include RFC822 specials */
433 if (t0 && s0 && s0 < t0)
435 if (t1 && s1 && s1 > t1)
439 /* No encoding is required. */
445 /* Choose target charset. */
448 if ((tocode1 = mutt_choose_charset(icode, charsets, u, ulen,
455 /* Hack to avoid labelling 8-bit data as us-ascii. */
456 if (!icode && mutt_is_us_ascii(tocode))
457 tocode = "unknown-8bit";
459 /* Adjust t0 for maximum length of line. */
460 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
467 /* Adjust t0 until we can encode a character after a space. */
468 for (; t0 > u; t0--) {
473 while (t < u + ulen && CONTINUATION_BYTE(*t))
476 if (!try_block(t0, t - t0, icode, tocode, &encoder, &wlen)
477 && col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
481 /* Adjust t1 until we can encode a character before a space. */
482 for (; t1 < u + ulen; t1++) {
487 while (CONTINUATION_BYTE(*t))
490 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen)
491 && 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
495 /* We shall encode the region [t0,t1). */
497 /* Initialise the output buffer with the us-ascii prefix. */
499 buf = p_new(char, buflen);
501 memcpy(buf, u, t0 - u);
507 /* Find how much we can encode. */
508 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
510 /* See if we can fit the us-ascii suffix, too. */
511 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
515 while (CONTINUATION_BYTE (t[n]))
519 /* This should only happen in the really stupid case where the
520 only word that needs encoding is one character long, but
521 there is too much us-ascii stuff after it to use a single
522 encoded word. We add the next word to the encoded region
524 assert (t1 < u + ulen);
525 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
528 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
531 /* Add to output buffer. */
532 #define LINEBREAK "\n\t"
533 if (bufpos + wlen + 2 > buflen) {
534 buflen = bufpos + wlen + 2;
535 p_realloc(&buf, buflen);
537 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
540 memcpy (buf + bufpos, LINEBREAK, m_strlen(LINEBREAK));
541 bufpos += m_strlen(LINEBREAK);
549 /* Add last encoded word and us-ascii suffix to buffer. */
550 buflen = bufpos + wlen + (u + ulen - t1);
551 p_realloc(&buf, buflen + 1);
552 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
555 memcpy (buf + bufpos, t1, u + ulen - t1);
568 void _rfc2047_encode_string(char **pd, int encode_specials, int col)
572 const char *charsets;
574 if (!Charset || !*pd)
577 charsets = m_strisempty(SendCharset) ? "UTF-8" : SendCharset;
579 rfc2047_encode(*pd, m_strlen(*pd), col,
580 Charset, charsets, &e, &elen,
581 encode_specials ? RFC822Specials : NULL);
587 void rfc2047_encode_string(char **pd) {
588 _rfc2047_encode_string(pd, 0, 32);
591 void rfc2047_encode_adrlist(address_t *addr, const char *tag)
593 address_t *ptr = addr;
594 int col = tag ? m_strlen(tag) + 2 : 32;
598 _rfc2047_encode_string(&ptr->personal, 1, col);
604 /****************************************************************************/
605 /* Decoding functions */
606 /****************************************************************************/
608 /* decode one word into d[len] */
609 static int rfc2047_decode_word(char *d, size_t len, const char *s)
611 const char *p, *eotoken;
612 char *charset = NULL;
613 int enc = 0, count = 0;
616 /* =?cset?[QB]?.?= */
617 for (p = s; (eotoken = strchr(p, '?')); p = eotoken + 1) {
623 /* ignore language specification a la RFC 2231 */
624 t = memchr(p, '*', eotoken - p) ?: eotoken;
625 charset = p_dupstr(p, t - p);
631 enc = ENCQUOTEDPRINTABLE;
645 d0 = q = p_new(char, m_strlen(s) + 1);
647 if (enc == ENCQUOTEDPRINTABLE) {
648 while (p < eotoken) {
649 if (*p == '=' && hexval(p[1]) >= 0 && hexval(p[2]) >= 0) {
650 *q++ = (hexval (p[1]) << 4) | hexval (p[2]);
661 } else { /* enc == ENCBASE64 */
664 while (p < eotoken) {
688 mutt_convert_string(&d0, charset, Charset, M_ICONV_HOOK_FROM);
689 m_strcpy(d, len, d0);
696 * Find the start and end of the first encoded word in the string.
697 * We use the grammar in section 2 of RFC 2047, but the "encoding"
698 * must be B or Q. Also, we don't require the encoded word to be
699 * separated by linear-white-space (section 5(1)).
701 static const char *find_encoded_word(const char *s, const char **x)
705 while ((p = strstr(s, "=?"))) {
707 while (0x20 < *s && *s < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *s)) {
711 if (s[0] != '?' || !strchr("BbQq", s[1]) || s[2] != '?')
715 while (0x20 <= *s && *s < 0x7f && *s != '?') {
719 if (s[0] != '?' || s[1] != '=') {
731 /* return length of linear white space */
732 static ssize_t lwslen(const char *s, ssize_t n)
740 for (p = s; p < s + n; p++) {
741 if (!strchr (" \t\r\n", *p)) {
747 if (p[-1] == '\r' || p[-1] == '\n') /* LWS cannot end with CRLF */
753 /* return length of trailing linear white space (scanned backwards from the end) */
754 static ssize_t lwsrlen(const char *s, ssize_t n)
756 const char *p = s + n - 1;
762 if (*p == '\r' || *p == '\n') /* LWS doesn't end with CRLF */
766 if (!strchr(" \t\r\n", *p)) {
776 /* try to decode anything that looks like a valid RFC2047 encoded
777 * header field, ignoring RFC822 parsing rules
779 void rfc2047_decode (char **pd)
783 int found_encoded = 0;
791 dlen = 4 * m_strlen(s); /* should be enough */
792 d = d0 = p_new(char, dlen + 1);
794 while (*s && dlen > 0) {
795 if (!(p = find_encoded_word (s, &q))) {
796 /* no encoded words */
797 if (!option (OPTSTRICTMIME)) {
799 if (found_encoded && (m = lwslen (s, n)) != 0) {
801 *d = ' ', d++, dlen--;
804 if (ascii_strcasecmp (AssumedCharset, "us-ascii")) {
809 if (mutt_convert_nonmime_string (&t) == 0) {
811 strncpy (d, t, tlen);
822 strncpy (d, s, dlen);
829 /* ignore spaces between encoded words
830 * and linear white spaces between encoded word and *text */
831 if (!option (OPTSTRICTMIME)) {
832 if (found_encoded && (m = lwslen (s, n)) != 0) {
834 *d = ' ', d++, dlen--;
838 if ((m = n - lwsrlen (s, n)) != 0) {
845 *d = ' ', d++, dlen--;
848 else if (!found_encoded || strspn (s, " \t\r\n") != n) {
857 rfc2047_decode_word(d, dlen, p);
871 void rfc2047_decode_adrlist(address_t *a)
875 rfc2047_decode(&a->personal);
880 void rfc2047_decode_envelope(ENVELOPE* e)
884 /* do RFC2047 decoding */
885 rfc2047_decode_adrlist(e->from);
886 rfc2047_decode_adrlist(e->to);
887 rfc2047_decode_adrlist(e->cc);
888 rfc2047_decode_adrlist(e->bcc);
889 rfc2047_decode_adrlist(e->reply_to);
890 rfc2047_decode_adrlist(e->mail_followup_to);
891 rfc2047_decode_adrlist(e->return_path);
892 rfc2047_decode_adrlist(e->sender);
895 rfc2047_decode(&e->subject);
896 mutt_adjust_subject(e);