2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or (at
5 * your option) any later version.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * Copyright © 2006 Pierre Habouzit
21 * Copyright notice from original mutt:
22 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
23 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
25 * This file is part of mutt-ng, see http://www.muttng.org/.
26 * It's licensed under the GNU General Public License,
27 * please see the file GPL in the top level source directory.
30 #include <lib-lib/lib-lib.h>
32 #include <lib-mime/mime.h>
44 /* If you are debugging this file, comment out the following line. */
52 #define ENCWORD_LEN_MAX 75
53 #define ENCWORD_LEN_MIN 9 /* m_strlen("=?.?.?.?=") */
55 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
56 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
58 /* converts f (of length flen, in charset 'from')
59 into *t (of length *tlen, in charset 'to');
62 returns number of converted chars from f, see iconv(3)
65 convert_string(const char *from, const char *f, ssize_t flen,
66 const char *to, char **t, ssize_t *tlen)
72 cd = mutt_iconv_open(to, from, 0);
74 if (cd == MUTT_ICONV_ERROR)
78 ob = buf = p_new(char, obl);
79 n = my_iconv(cd, &f, &flen, &ob, &obl);
81 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
97 /* choose the shortest encoding for u */
98 char *mutt_choose_charset(const char *fromcode, const char *charsets,
99 char *u, ssize_t ulen, char **dst, ssize_t *dlen)
107 const char *p = charsets;
110 char cset[SHORT_STRING];
117 n = m_strncpy(cset, sizeof(cset), p, q - p);
120 n = m_strcpy(cset, sizeof(cset), p);
124 if (!n || n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12)) {
125 /* Assume that we never need more than 12 characters of
126 encoded-text to encode a single character. */
130 n = convert_string(fromcode, u, ulen, cset, &s, &slen);
134 if (!tocode || n < bestn) {
135 m_strreplace(&tocode, cset);
149 char buf[LONG_STRING];
158 charset_canonicalize(buf, sizeof(buf), tocode);
159 m_strreplace(&tocode, buf);
166 /****************************************************************************/
167 /* Encoding functions */
168 /****************************************************************************/
170 static const char __qp_special[128] = {
171 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
172 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
174 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
175 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
177 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
181 typedef size_t (encoder_t)(char *, const char *, ssize_t, const char *);
184 b_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
188 s += sprintf(s, "=?%s?B?", tocode);
196 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
197 *s++ = __m_b64chars[(*d & 0x03) << 4];
203 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
204 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
205 *s++ = __m_b64chars[(d[1] & 0x0f) << 2];
210 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
211 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
212 *s++ = __m_b64chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
213 *s++ = __m_b64chars[d[2] & 0x3f];
225 q_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
229 s += sprintf(s, "=?%s?Q?", tocode);
231 unsigned char c = *d++;
236 if (c & 0x80 || __qp_special[c]) {
238 *s++ = __m_b36chars_upper[c >> 4];
239 *s++ = __m_b36chars_upper[c & 0xf];
251 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
252 * be converted to an encoded word of length *wlen using *encoder.
253 * Otherwise return an upper bound on the maximum length of the data
254 * which could be converted.
255 * The data is converted from fromcode (which must be stateless) to
256 * tocode, unless fromcode is 0, in which case the data is assumed to
257 * be already in tocode, which should be 8-bit and stateless.
259 static size_t try_block(const char *d, ssize_t dlen,
260 const char *fromcode, const char *tocode,
261 encoder_t **encoder, ssize_t *wlen)
263 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
264 ssize_t obl = sizeof(buf1) - m_strlen(tocode);
270 iconv_t cd = mutt_iconv_open(tocode, fromcode, 0);
272 assert (cd != MUTT_ICONV_ERROR);
276 if (my_iconv(cd, &ib, &ibl, &ob, &obl) < 0
277 || my_iconv(cd, 0, 0, &ob, &obl) < 0)
279 assert (errno == E2BIG && ib > d);
281 return (ib - d == dlen) ? dlen : ib - d + 1;
287 memcpy(buf1, d, dlen);
293 int count, len, len_b, len_q;
296 for (p = buf1; p < ob; p++) {
297 count += (*p & 0x80 || __qp_special[(int)*p]);
300 len = ENCWORD_LEN_MIN - 2 + m_strlen(tocode);
301 len_b = len + (((ob - buf1) + 2) / 3) * 4;
302 len_q = len + (ob - buf1) + 2 * count;
304 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
305 if (mime_which_token(tocode, -1) == MIME_ISO_2022_JP)
306 len_q = ENCWORD_LEN_MAX + 1;
308 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
309 *encoder = b_encoder;
313 if (len_q <= ENCWORD_LEN_MAX) {
314 *encoder = q_encoder;
324 * Encode the data (d, dlen) into s using the encoder.
325 * Return the length of the encoded word.
328 encode_block(char *s, char *d, ssize_t dlen,
329 const char *fromcode, const char *tocode, encoder_t *encoder)
331 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
332 ssize_t ibl, obl, n1, n2;
338 cd = mutt_iconv_open(tocode, fromcode, 0);
339 assert (cd != MUTT_ICONV_ERROR);
340 ib = d, ibl = dlen, ob = buf1, obl = sizeof(buf1) - m_strlen(tocode);
341 n1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
342 n2 = my_iconv(cd, 0, 0, &ob, &obl);
343 assert (n1 >= 0 && n2 >= 0);
345 return (*encoder)(s, buf1, ob - buf1, tocode);
347 return (*encoder)(s, d, dlen, tocode);
352 * Discover how much of the data (d, dlen) can be converted into
353 * a single encoded word. Return how much data can be converted,
354 * and set the length *wlen of the encoded word and *encoder.
355 * We start in column col, which limits the length of the word.
357 static size_t choose_block(char *d, size_t dlen, int col,
358 const char *fromcode, const char *tocode,
359 encoder_t **encoder, ssize_t *wlen)
362 int utf8 = mime_which_token(fromcode, -1) == MIME_UTF_8;
367 nn = try_block(d, n, fromcode, tocode, encoder, wlen);
368 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
370 n = (nn ? nn : n) - 1;
373 while (n > 1 && CONTINUATION_BYTE(d[n]))
381 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
382 * allocated buffer (e, elen). The input data is in charset fromcode
383 * and is converted into a charset chosen from charsets.
384 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
385 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
386 * compatible with us-ascii and the original data is used.
387 * The input data is assumed to be a single line starting at column col;
388 * if col is non-zero, the preceding character was a space.
390 /*** XXX: simplify that one day ***/
391 static int rfc2047_encode(const char *d, ssize_t dlen, int col,
392 const char *fromcode, const char *charsets,
393 char **e, ssize_t *elen, const char *specials)
397 ssize_t bufpos, buflen;
399 char *s0, *s1, *t0, *t1;
402 const char *icode = "UTF-8";
403 ssize_t ulen, r, n, wlen;
406 /* Try to convert to UTF-8. */
407 if (convert_string(fromcode, d, dlen, icode, &u, &ulen)) {
410 u = p_dupstr(d, ulen = dlen);
413 /* Find earliest and latest things we must encode. */
414 s0 = s1 = t0 = t1 = NULL;
415 for (t = u; t < u + ulen; t++) {
417 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
422 else if (specials && strchr (specials, *t)) {
429 /* If we have something to encode, include RFC822 specials */
430 if (t0 && s0 && s0 < t0)
432 if (t1 && s1 && s1 > t1)
436 /* No encoding is required. */
442 /* Choose target charset. */
445 if ((tocode1 = mutt_choose_charset(icode, charsets, u, ulen,
452 /* Hack to avoid labelling 8-bit data as us-ascii. */
453 if (!icode && charset_is_us_ascii(tocode))
454 tocode = "unknown-8bit";
456 /* Adjust t0 for maximum length of line. */
457 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
464 /* Adjust t0 until we can encode a character after a space. */
465 for (; t0 > u; t0--) {
470 while (t < u + ulen && CONTINUATION_BYTE(*t))
473 if (!try_block(t0, t - t0, icode, tocode, &encoder, &wlen)
474 && col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
478 /* Adjust t1 until we can encode a character before a space. */
479 for (; t1 < u + ulen; t1++) {
484 while (CONTINUATION_BYTE(*t))
487 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen)
488 && 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
492 /* We shall encode the region [t0,t1). */
494 /* Initialise the output buffer with the us-ascii prefix. */
496 buf = p_new(char, buflen);
498 memcpy(buf, u, t0 - u);
504 /* Find how much we can encode. */
505 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
507 /* See if we can fit the us-ascii suffix, too. */
508 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
512 while (CONTINUATION_BYTE (t[n]))
516 /* This should only happen in the really stupid case where the
517 only word that needs encoding is one character long, but
518 there is too much us-ascii stuff after it to use a single
519 encoded word. We add the next word to the encoded region
521 assert (t1 < u + ulen);
522 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
525 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
528 /* Add to output buffer. */
529 #define LINEBREAK "\n\t"
530 if (bufpos + wlen + 2 > buflen) {
531 buflen = bufpos + wlen + 2;
532 p_realloc(&buf, buflen);
534 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
537 memcpy (buf + bufpos, LINEBREAK, m_strlen(LINEBREAK));
538 bufpos += m_strlen(LINEBREAK);
546 /* Add last encoded word and us-ascii suffix to buffer. */
547 buflen = bufpos + wlen + (u + ulen - t1);
548 p_realloc(&buf, buflen + 1);
549 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
552 memcpy (buf + bufpos, t1, u + ulen - t1);
565 static void _rfc2047_encode_string(char **pd, int encode_specials, int col)
569 const char *charsets;
571 if (!Charset || !*pd)
574 charsets = m_strisempty(SendCharset) ? "UTF-8" : SendCharset;
576 rfc2047_encode(*pd, m_strlen(*pd), col,
577 Charset, charsets, &e, &elen,
578 encode_specials ? RFC822Specials : NULL);
584 void rfc2047_encode_string(char **pd) {
585 _rfc2047_encode_string(pd, 0, 32);
588 void rfc2047_encode_adrlist(address_t *addr, const char *tag)
590 address_t *ptr = addr;
591 int col = tag ? m_strlen(tag) + 2 : 32;
595 _rfc2047_encode_string(&ptr->personal, 1, col);
601 /****************************************************************************/
602 /* Decoding functions */
603 /****************************************************************************/
605 /* decode one word into d[len] */
606 static int rfc2047_decode_word(char *d, size_t len, const char *s)
608 const char *p, *eotoken;
609 char *charset = NULL;
610 int enc = 0, count = 0;
613 /* =?cset?[QB]?.?= */
614 for (p = s; (eotoken = strchr(p, '?')); p = eotoken + 1) {
620 /* ignore language specification a la RFC 2231 */
621 t = memchr(p, '*', eotoken - p) ?: eotoken;
622 charset = p_dupstr(p, t - p);
628 enc = ENCQUOTEDPRINTABLE;
642 d0 = q = p_new(char, m_strlen(s) + 1);
644 if (enc == ENCQUOTEDPRINTABLE) {
645 while (p < eotoken) {
646 if (*p == '=' && hexval(p[1]) >= 0 && hexval(p[2]) >= 0) {
647 *q++ = (hexval (p[1]) << 4) | hexval (p[2]);
658 } else { /* enc == ENCBASE64 */
661 while (p < eotoken) {
685 mutt_convert_string(&d0, charset, Charset, M_ICONV_HOOK_FROM);
686 m_strcpy(d, len, d0);
693 * Find the start and end of the first encoded word in the string.
694 * We use the grammar in section 2 of RFC 2047, but the "encoding"
695 * must be B or Q. Also, we don't require the encoded word to be
696 * separated by linear-white-space (section 5(1)).
698 static const char *find_encoded_word(const char *s, const char **x)
702 while ((p = strstr(s, "=?"))) {
704 while (0x20 < *s && *s < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *s)) {
708 if (s[0] != '?' || !strchr("BbQq", s[1]) || s[2] != '?')
712 while (0x20 <= *s && *s < 0x7f && *s != '?') {
716 if (s[0] != '?' || s[1] != '=') {
728 /* return length of linear white space */
729 static ssize_t lwslen(const char *s, ssize_t n)
737 for (p = s; p < s + n; p++) {
738 if (!strchr (" \t\r\n", *p)) {
744 if (p[-1] == '\r' || p[-1] == '\n') /* LWS cannot end with CRLF */
750 /* return length of trailing linear white space (scanned backwards from the end) */
751 static ssize_t lwsrlen(const char *s, ssize_t n)
753 const char *p = s + n - 1;
759 if (*p == '\r' || *p == '\n') /* LWS doesn't end with CRLF */
763 if (!strchr(" \t\r\n", *p)) {
773 /* try to decode anything that looks like a valid RFC2047 encoded
774 * header field, ignoring RFC822 parsing rules
776 void rfc2047_decode(char **pd)
778 const int strict_mime = option(OPTSTRICTMIME);
783 int found_encoded = 0;
788 dlen = 4 * m_strlen(s); /* should be enough */
789 d = d0 = p_new(char, dlen + 1);
791 while (*s && dlen > 0) {
794 p = find_encoded_word(s, &q);
797 /* no encoded words */
802 if (found_encoded && (m = lwslen(s, n)) != 0) {
808 if (mime_which_token(AssumedCharset, -1) == MIME_US_ASCII) {
812 if (mutt_convert_nonmime_string(&t) == 0) {
813 d += m_strcpy(d, dlen, t);
815 d += m_strcpy(d, dlen, s);
821 d += m_strcpy(d, dlen, s);
829 /* ignore spaces between encoded words
830 * and linear white spaces between encoded word and *text */
832 if (found_encoded && (m = lwslen(s, n)) != 0) {
838 if ((m = n - lwsrlen(s, n)) != 0) {
839 m = m_strncpy(d, dlen, s, m);
846 if (!found_encoded || (ssize_t)strspn(s, " \t\r\n") != n) {
847 n = m_strncpy(d, dlen, s, n);
853 rfc2047_decode_word(d, dlen, p);
864 void rfc2047_decode_adrlist(address_t *a)
868 rfc2047_decode(&a->personal);
873 void rfc2047_decode_envelope(ENVELOPE* e)
877 /* do RFC2047 decoding */
878 rfc2047_decode_adrlist(e->from);
879 rfc2047_decode_adrlist(e->to);
880 rfc2047_decode_adrlist(e->cc);
881 rfc2047_decode_adrlist(e->bcc);
882 rfc2047_decode_adrlist(e->reply_to);
883 rfc2047_decode_adrlist(e->mail_followup_to);
884 rfc2047_decode_adrlist(e->return_path);
885 rfc2047_decode_adrlist(e->sender);
888 rfc2047_decode(&e->subject);
889 mutt_adjust_subject(e);