2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or (at
5 * your option) any later version.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * Copyright © 2006 Pierre Habouzit
21 * Copyright notice from original mutt:
22 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
23 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
25 * This file is part of mutt-ng, see http://www.muttng.org/.
26 * It's licensed under the GNU General Public License,
27 * please see the file GPL in the top level source directory.
30 #include <lib-lib/mem.h>
31 #include <lib-lib/str.h>
32 #include <lib-lib/ascii.h>
34 #include <lib-mime/mime.h>
46 /* If you are debugging this file, comment out the following line. */
54 #define ENCWORD_LEN_MAX 75
55 #define ENCWORD_LEN_MIN 9 /* m_strlen("=?.?.?.?=") */
57 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
58 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
60 /* converts f (of length flen, in charset 'from')
61 into *t (of length *tlen, in charset 'to')
64 returns number of converted chars from f, see iconv(3)
67 convert_string(const char *from, const char *f, ssize_t flen,
68 const char *to, char **t, ssize_t *tlen)
74 cd = mutt_iconv_open(to, from, 0);
76 if (cd == (iconv_t)(-1))
80 ob = buf = p_new(char, obl);
81 n = my_iconv(cd, &f, &flen, &ob, &obl);
83 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
99 /* choose the shortest encoding for u */
100 char *mutt_choose_charset(const char *fromcode, const char *charsets,
101 char *u, ssize_t ulen, char **dst, ssize_t *dlen)
109 const char *p = charsets;
112 char cset[SHORT_STRING];
119 n = m_strncpy(cset, sizeof(cset), p, q - p);
122 n = m_strcpy(cset, sizeof(cset), p);
126 if (!n || n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12)) {
127 /* Assume that we never need more than 12 characters of
128 encoded-text to encode a single character. */
132 n = convert_string(fromcode, u, ulen, cset, &s, &slen);
136 if (!tocode || n < bestn) {
137 m_strreplace(&tocode, cset);
151 char buf[LONG_STRING];
160 mutt_canonical_charset(buf, sizeof(buf), tocode);
161 m_strreplace(&tocode, buf);
168 /****************************************************************************/
169 /* Encoding functions */
170 /****************************************************************************/
172 static const char __qp_special[128] = {
173 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
174 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
175 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
177 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
179 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
183 typedef size_t (encoder_t)(char *, const char *, ssize_t, const char *);
186 b_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
190 s += sprintf(s, "=?%s?B?", tocode);
198 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
199 *s++ = __m_b64chars[(*d & 0x03) << 4];
205 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
206 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
207 *s++ = __m_b64chars[(d[1] & 0x0f) << 2];
212 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
213 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
214 *s++ = __m_b64chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
215 *s++ = __m_b64chars[d[2] & 0x3f];
227 q_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
231 s += sprintf(s, "=?%s?Q?", tocode);
233 unsigned char c = *d++;
238 if (c & 0x80 || __qp_special[c]) {
240 *s++ = __m_b36chars_upper[c >> 4];
241 *s++ = __m_b36chars_upper[c & 0xf];
253 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
254 * be converted to an encoded word of length *wlen using *encoder.
255 * Otherwise return an upper bound on the maximum length of the data
256 * which could be converted.
257 * The data is converted from fromcode (which must be stateless) to
258 * tocode, unless fromcode is 0, in which case the data is assumed to
259 * be already in tocode, which should be 8-bit and stateless.
261 static size_t try_block(const char *d, ssize_t dlen,
262 const char *fromcode, const char *tocode,
263 encoder_t **encoder, ssize_t *wlen)
265 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
266 ssize_t obl = sizeof(buf1) - m_strlen(tocode);
272 iconv_t cd = mutt_iconv_open(tocode, fromcode, 0);
274 assert (cd != (iconv_t)(-1));
278 if (my_iconv(cd, &ib, &ibl, &ob, &obl) < 0
279 || my_iconv(cd, 0, 0, &ob, &obl) < 0)
281 assert (errno == E2BIG && ib > d);
283 return (ib - d == dlen) ? dlen : ib - d + 1;
289 memcpy(buf1, d, dlen);
295 int count, len, len_b, len_q;
298 for (p = buf1; p < ob; p++) {
299 count += (*p & 0x80 || __qp_special[(int)*p]);
302 len = ENCWORD_LEN_MIN - 2 + m_strlen(tocode);
303 len_b = len + (((ob - buf1) + 2) / 3) * 4;
304 len_q = len + (ob - buf1) + 2 * count;
306 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
307 if (!ascii_strcasecmp(tocode, "ISO-2022-JP"))
308 len_q = ENCWORD_LEN_MAX + 1;
310 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
311 *encoder = b_encoder;
315 if (len_q <= ENCWORD_LEN_MAX) {
316 *encoder = q_encoder;
326 * Encode the data (d, dlen) into s using the encoder.
327 * Return the length of the encoded word.
330 encode_block(char *s, char *d, ssize_t dlen,
331 const char *fromcode, const char *tocode, encoder_t *encoder)
333 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
334 ssize_t ibl, obl, n1, n2;
340 cd = mutt_iconv_open(tocode, fromcode, 0);
341 assert (cd != (iconv_t) (-1));
342 ib = d, ibl = dlen, ob = buf1, obl = sizeof(buf1) - m_strlen(tocode);
343 n1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
344 n2 = my_iconv(cd, 0, 0, &ob, &obl);
345 assert (n1 >= 0 && n2 >= 0);
347 return (*encoder)(s, buf1, ob - buf1, tocode);
349 return (*encoder)(s, d, dlen, tocode);
354 * Discover how much of the data (d, dlen) can be converted into
355 * a single encoded word. Return how much data can be converted,
356 * and set the length *wlen of the encoded word and *encoder.
357 * We start in column col, which limits the length of the word.
359 static size_t choose_block(char *d, size_t dlen, int col,
360 const char *fromcode, const char *tocode,
361 encoder_t **encoder, ssize_t *wlen)
364 int utf8 = fromcode && !ascii_strcasecmp(fromcode, "UTF-8");
369 nn = try_block(d, n, fromcode, tocode, encoder, wlen);
370 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
372 n = (nn ? nn : n) - 1;
375 while (n > 1 && CONTINUATION_BYTE(d[n]))
383 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
384 * allocated buffer (e, elen). The input data is in charset fromcode
385 * and is converted into a charset chosen from charsets.
386 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
387 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
388 * compatible with us-ascii and the original data is used.
389 * The input data is assumed to be a single line starting at column col;
390 * if col is non-zero, the preceding character was a space.
392 /*** XXX: simplify that one day ***/
393 static int rfc2047_encode(const char *d, ssize_t dlen, int col,
394 const char *fromcode, const char *charsets,
395 char **e, ssize_t *elen, const char *specials)
399 ssize_t bufpos, buflen;
401 char *s0, *s1, *t0, *t1;
404 const char *icode = "UTF-8";
405 ssize_t ulen, r, n, wlen;
408 /* Try to convert to UTF-8. */
409 if (convert_string(fromcode, d, dlen, icode, &u, &ulen)) {
412 u = p_dupstr(d, ulen = dlen);
415 /* Find earliest and latest things we must encode. */
416 s0 = s1 = t0 = t1 = NULL;
417 for (t = u; t < u + ulen; t++) {
419 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
424 else if (specials && strchr (specials, *t)) {
431 /* If we have something to encode, include RFC822 specials */
432 if (t0 && s0 && s0 < t0)
434 if (t1 && s1 && s1 > t1)
438 /* No encoding is required. */
444 /* Choose target charset. */
447 if ((tocode1 = mutt_choose_charset(icode, charsets, u, ulen,
454 /* Hack to avoid labelling 8-bit data as us-ascii. */
455 if (!icode && mutt_is_us_ascii(tocode))
456 tocode = "unknown-8bit";
458 /* Adjust t0 for maximum length of line. */
459 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
466 /* Adjust t0 until we can encode a character after a space. */
467 for (; t0 > u; t0--) {
472 while (t < u + ulen && CONTINUATION_BYTE(*t))
475 if (!try_block(t0, t - t0, icode, tocode, &encoder, &wlen)
476 && col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
480 /* Adjust t1 until we can encode a character before a space. */
481 for (; t1 < u + ulen; t1++) {
486 while (CONTINUATION_BYTE(*t))
489 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen)
490 && 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
494 /* We shall encode the region [t0,t1). */
496 /* Initialise the output buffer with the us-ascii prefix. */
498 buf = p_new(char, buflen);
500 memcpy(buf, u, t0 - u);
506 /* Find how much we can encode. */
507 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
509 /* See if we can fit the us-ascii suffix, too. */
510 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
514 while (CONTINUATION_BYTE (t[n]))
518 /* This should only happen in the really stupid case where the
519 only word that needs encoding is one character long, but
520 there is too much us-ascii stuff after it to use a single
521 encoded word. We add the next word to the encoded region
523 assert (t1 < u + ulen);
524 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
527 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
530 /* Add to output buffer. */
531 #define LINEBREAK "\n\t"
532 if (bufpos + wlen + 2 > buflen) {
533 buflen = bufpos + wlen + 2;
534 p_realloc(&buf, buflen);
536 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
539 memcpy (buf + bufpos, LINEBREAK, m_strlen(LINEBREAK));
540 bufpos += m_strlen(LINEBREAK);
548 /* Add last encoded word and us-ascii suffix to buffer. */
549 buflen = bufpos + wlen + (u + ulen - t1);
550 p_realloc(&buf, buflen + 1);
551 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
554 memcpy (buf + bufpos, t1, u + ulen - t1);
567 void _rfc2047_encode_string(char **pd, int encode_specials, int col)
571 const char *charsets;
573 if (!Charset || !*pd)
576 charsets = m_strisempty(SendCharset) ? "UTF-8" : SendCharset;
578 rfc2047_encode(*pd, m_strlen(*pd), col,
579 Charset, charsets, &e, &elen,
580 encode_specials ? RFC822Specials : NULL);
586 void rfc2047_encode_string(char **pd) {
587 _rfc2047_encode_string(pd, 0, 32);
590 void rfc2047_encode_adrlist(address_t *addr, const char *tag)
592 address_t *ptr = addr;
593 int col = tag ? m_strlen(tag) + 2 : 32;
597 _rfc2047_encode_string(&ptr->personal, 1, col);
603 /****************************************************************************/
604 /* Decoding functions */
605 /****************************************************************************/
607 /* decode one word into d[len] */
608 static int rfc2047_decode_word(char *d, size_t len, const char *s)
610 const char *p, *eotoken;
611 char *charset = NULL;
612 int enc = 0, count = 0;
615 /* =?cset?[QB]?.?= */
616 for (p = s; (eotoken = strchr(p, '?')); p = eotoken + 1) {
622 /* ignore language specification a la RFC 2231 */
623 t = memchr(p, '*', eotoken - p) ?: eotoken;
624 charset = p_dupstr(p, t - p);
630 enc = ENCQUOTEDPRINTABLE;
644 d0 = q = p_new(char, m_strlen(s) + 1);
646 if (enc == ENCQUOTEDPRINTABLE) {
647 while (p < eotoken) {
648 if (*p == '=' && hexval(p[1]) >= 0 && hexval(p[2]) >= 0) {
649 *q++ = (hexval (p[1]) << 4) | hexval (p[2]);
660 } else { /* enc == ENCBASE64 */
663 while (p < eotoken) {
687 mutt_convert_string(&d0, charset, Charset, M_ICONV_HOOK_FROM);
688 m_strcpy(d, len, d0);
695 * Find the start and end of the first encoded word in the string.
696 * We use the grammar in section 2 of RFC 2047, but the "encoding"
697 * must be B or Q. Also, we don't require the encoded word to be
698 * separated by linear-white-space (section 5(1)).
700 static const char *find_encoded_word(const char *s, const char **x)
704 while ((p = strstr(s, "=?"))) {
706 while (0x20 < *s && *s < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *s)) {
710 if (s[0] != '?' || !strchr("BbQq", s[1]) || s[2] != '?')
714 while (0x20 <= *s && *s < 0x7f && *s != '?') {
718 if (s[0] != '?' || s[1] != '=') {
730 /* return length of linear white space */
731 static ssize_t lwslen(const char *s, ssize_t n)
739 for (p = s; p < s + n; p++) {
740 if (!strchr (" \t\r\n", *p)) {
746 if (p[-1] == '\r' || p[-1] == '\n') /* LWS cannot end with CRLF */
752 /* return length of the linear white space at the end of s[0..n), scanning backwards */
753 static ssize_t lwsrlen(const char *s, ssize_t n)
755 const char *p = s + n - 1;
761 if (*p == '\r' || *p == '\n') /* LWS doesn't end with CRLF */
765 if (!strchr(" \t\r\n", *p)) {
775 /* try to decode anything that looks like a valid RFC2047 encoded
776 * header field, ignoring RFC822 parsing rules
778 void rfc2047_decode(char **pd)
780 const int strict_mime = option(OPTSTRICTMIME);
785 int found_encoded = 0;
790 dlen = 4 * m_strlen(s); /* should be enough */
791 d = d0 = p_new(char, dlen + 1);
793 while (*s && dlen > 0) {
796 p = find_encoded_word(s, &q);
799 /* no encoded words */
804 if (found_encoded && (m = lwslen(s, n)) != 0) {
810 if (ascii_strcasecmp(AssumedCharset, "us-ascii")) {
814 if (mutt_convert_nonmime_string(&t) == 0) {
815 d += m_strcpy(d, dlen, t);
817 d += m_strcpy(d, dlen, s);
823 d += m_strcpy(d, dlen, s);
831 /* ignore spaces between encoded words
832 * and linear white spaces between encoded word and *text */
834 if (found_encoded && (m = lwslen(s, n)) != 0) {
840 if ((m = n - lwsrlen(s, n)) != 0) {
841 m = m_strncpy(d, dlen, s, m);
848 if (!found_encoded || (ssize_t)strspn(s, " \t\r\n") != n) {
849 n = m_strncpy(d, dlen, s, n);
855 rfc2047_decode_word(d, dlen, p);
866 void rfc2047_decode_adrlist(address_t *a)
870 rfc2047_decode(&a->personal);
875 void rfc2047_decode_envelope(ENVELOPE* e)
879 /* do RFC2047 decoding */
880 rfc2047_decode_adrlist(e->from);
881 rfc2047_decode_adrlist(e->to);
882 rfc2047_decode_adrlist(e->cc);
883 rfc2047_decode_adrlist(e->bcc);
884 rfc2047_decode_adrlist(e->reply_to);
885 rfc2047_decode_adrlist(e->mail_followup_to);
886 rfc2047_decode_adrlist(e->return_path);
887 rfc2047_decode_adrlist(e->sender);
890 rfc2047_decode(&e->subject);
891 mutt_adjust_subject(e);