2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or (at
5 * your option) any later version.
7 * This program is distributed in the hope that it will be useful, but
8 * WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
10 * General Public License for more details.
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17 * Copyright © 2006 Pierre Habouzit
21 * Copyright notice from original mutt:
22 * Copyright (C) 1996-2000 Michael R. Elkins <me@mutt.org>
23 * Copyright (C) 2000-2001 Edmund Grimley Evans <edmundo@rano.org>
25 * This file is part of mutt-ng, see http://www.muttng.org/.
26 * It's licensed under the GNU General Public License,
27 * please see the file GPL in the top level source directory.
30 #include <lib-lib/lib-lib.h>
32 #include <lib-mime/mime.h>
37 /* If you are debugging this file, comment out the following line. */
/* Upper and lower bounds, in characters, on the length of one encoded word
   as used by the length checks in this file. */
45 #define ENCWORD_LEN_MAX 75
46 #define ENCWORD_LEN_MIN 9 /* m_strlen("=?.?.?.?=") */
/* HSPACE(x): true for NUL, space or tab.
   CONTINUATION_BYTE(c): true when the two top bits of c are 1 and 0
   (values 0x80..0xbf), the form of a UTF-8 continuation byte. */
48 #define HSPACE(x) ((x) == '\0' || (x) == ' ' || (x) == '\t')
49 #define CONTINUATION_BYTE(c) (((c) & 0xc0) == 0x80)
51 /* converts f of len flen and charset from
52 into *t of len *tlen and charset to
55 returns number of converted chars from f, see iconv(3)
/* Visible flow: open a conversion descriptor from charset `from` to charset
   `to`, allocate an output buffer of obl bytes, convert (f, flen) into it,
   then call my_iconv once more with no input.  A negative result from either
   call selects the branch opened on the last visible line.
   NOTE(review): the return type, the local declarations and the bodies of
   the success and error paths are not visible in this excerpt. */
58 convert_string(const char *from, const char *f, ssize_t flen,
59 const char *to, char **t, ssize_t *tlen)
65 cd = mutt_iconv_open(to, from, 0);
/* The descriptor could not be opened. */
67 if (cd == MUTT_ICONV_ERROR)
71 ob = buf = p_new(char, obl);
72 n = my_iconv(cd, &f, &flen, &ob, &obl);
/* The second call passes (0, 0) as input; presumably this is the iconv(3)
   flush of pending shift state -- TODO confirm against my_iconv. */
74 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
90 /* choose the shortest encoding for u */
/* Tries candidate charset names taken from `charsets`: each name is copied
   into cset, (u, ulen) is converted from fromcode to that charset with
   convert_string(), and the candidate is remembered in tocode when no choice
   exists yet or its convert_string() result is lower than bestn.  The chosen
   name is later passed through charset_canonicalize().
   NOTE(review): the list separator, the loop structure, how *dst / *dlen are
   filled and the returned value are on lines not visible in this excerpt. */
91 char *mutt_choose_charset(const char *fromcode, const char *charsets,
92 char *u, ssize_t ulen, char **dst, ssize_t *dlen)
100 const char *p = charsets;
103 char cset[SHORT_STRING];
/* Two ways of extracting the next name: a bounded copy of (q - p) bytes, or
   a copy of the remaining string (presumably the last list entry). */
110 n = m_strncpy(cset, sizeof(cset), p, q - p);
113 n = m_strcpy(cset, sizeof(cset), p);
/* Empty names and names longer than 75 - 9 + 2 - 12 = 56 characters take
   this branch; the 12 is explained by the comment just below. */
117 if (!n || n > (ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 2 - 12)) {
118 /* Assume that we never need more than 12 characters of
119 encoded-text to encode a single character. */
123 n = convert_string(fromcode, u, ulen, cset, &s, &slen);
/* Keep this candidate if it is the first one or scores lower than bestn. */
127 if (!tocode || n < bestn) {
128 m_strreplace(&tocode, cset);
142 char buf[LONG_STRING];
/* Replace the chosen name with its canonical form. */
151 charset_canonicalize(buf, sizeof(buf), tocode);
152 m_strreplace(&tocode, buf);
159 /****************************************************************************/
160 /* Encoding functions */
161 /****************************************************************************/
/* Lookup table indexed by a 7-bit character code, 16 entries per row.
   A non-zero entry marks a character that q_encoder sends through its
   hex-digit branch and that try_block counts as needing extra output.
   Marked: all of 0x00..0x1f, double quote, ( ) , . / : ; < = > ? @ [
   backslash ] and underscore.  Space (0x20) and 0x7f are not marked. */
163 static const char __qp_special[128] = {
164 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
165 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
166 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1,
167 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
168 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1,
170 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
/* Function type shared by b_encoder and q_encoder.  Arguments, in order:
   output buffer, input data, input length, charset name.  encode_block
   returns the encoder result as the length of the encoded word. */
174 typedef size_t (encoder_t)(char *, const char *, ssize_t, const char *);
/* B encoder: writes the prefix =?<tocode>?B? into s, then digits looked up
   in __m_b64chars.  Three groups of statements are visible, reading one,
   two and three input bytes respectively.
   NOTE(review): the loop and branch conditions, any padding, the closing
   delimiter and the return statement are not visible in this excerpt. */
177 b_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
181 s += sprintf(s, "=?%s?B?", tocode);
/* Group reading only *d: two digits. */
189 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
190 *s++ = __m_b64chars[(*d & 0x03) << 4];
/* Group reading *d and d[1]: three digits. */
196 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
197 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
198 *s++ = __m_b64chars[(d[1] & 0x0f) << 2];
/* Group reading *d, d[1] and d[2]: four digits.  Every index is masked to
   six bits or fewer, so the sign of char does not affect the lookup. */
203 *s++ = __m_b64chars[(*d >> 2) & 0x3f];
204 *s++ = __m_b64chars[((*d & 0x03) << 4) | ((d[1] >> 4) & 0x0f)];
205 *s++ = __m_b64chars[((d[1] & 0x0f) << 2) | ((d[2] >> 6) & 0x03)];
206 *s++ = __m_b64chars[d[2] & 0x3f];
/* Q encoder: writes the prefix =?<tocode>?Q? into s, then consumes the data
   one byte at a time.
   NOTE(review): the loop header, the other branches, the closing delimiter
   and the return statement are not visible in this excerpt. */
218 q_encoder(char *s, const char *d, ssize_t dlen, const char *tocode)
222 s += sprintf(s, "=?%s?Q?", tocode);
/* Read as unsigned char so the value can index the tables below. */
224 unsigned char c = *d++;
/* Bytes with the high bit set, or marked in __qp_special, are written as
   two digits from __m_b36chars_upper: high nibble first, then low nibble.
   The high-bit test comes first, so __qp_special is only indexed 0..127. */
229 if (c & 0x80 || __qp_special[c]) {
231 *s++ = __m_b36chars_upper[c >> 4];
232 *s++ = __m_b36chars_upper[c & 0xf];
244 * Return 0 and set *encoder and *wlen if the data (d, dlen) could
245 * be converted to an encoded word of length *wlen using *encoder.
246 * Otherwise return an upper bound on the maximum length of the data
247 * which could be converted.
248 * The data is converted from fromcode (which must be stateless) to
249 * tocode, unless fromcode is 0, in which case the data is assumed to
250 * be already in tocode, which should be 8-bit and stateless.
/* Checks whether (d, dlen) fits into a single encoded word in tocode and,
   if so, selects b_encoder or q_encoder through *encoder.
   NOTE(review): several lines (declarations, the fromcode test, the stores
   to *wlen and the final returns) are not visible in this excerpt. */
252 static size_t try_block(const char *d, ssize_t dlen,
253 const char *fromcode, const char *tocode,
254 encoder_t **encoder, ssize_t *wlen)
/* Scratch buffer for the converted bytes.  The space offered to the
   converter is reduced by the length of the charset name. */
256 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
257 ssize_t obl = sizeof(buf1) - m_strlen(tocode);
263 iconv_t cd = mutt_iconv_open(tocode, fromcode, 0);
265 assert (cd != MUTT_ICONV_ERROR);
/* Convert, then call once more with no input; either call may fail. */
269 if (my_iconv(cd, &ib, &ibl, &ob, &obl) < 0
270 || my_iconv(cd, 0, 0, &ob, &obl) < 0)
/* The only failure expected here is E2BIG after some input was consumed. */
272 assert (errno == E2BIG && ib > d);
/* Upper bound for the caller: dlen if everything was consumed, otherwise
   the number of consumed bytes plus one. */
274 return (ib - d == dlen) ? dlen : ib - d + 1;
/* Data is copied unchanged; presumably this is the path for fromcode == 0,
   where the data is assumed to be in tocode already -- TODO confirm. */
280 memcpy(buf1, d, dlen);
286 int count, len, len_b, len_q;
/* Count the bytes that have the high bit set or are marked in
   __qp_special.  The high-bit test short-circuits first, so the table is
   not indexed with such a byte. */
289 for (p = buf1; p < ob; p++) {
290 count += (*p & 0x80 || __qp_special[(int)*p]);
/* len: fixed overhead of the word (ENCWORD_LEN_MIN - 2 plus the charset
   name).  len_b: four output characters per started group of three bytes.
   len_q: one character per byte plus two more per counted byte. */
293 len = ENCWORD_LEN_MIN - 2 + m_strlen(tocode);
294 len_b = len + (((ob - buf1) + 2) / 3) * 4;
295 len_q = len + (ob - buf1) + 2 * count;
297 /* Apparently RFC 1468 says to use B encoding for iso-2022-jp. */
298 if (mime_which_token(tocode, -1) == MIME_ISO_2022_JP)
/* Setting len_q above the maximum makes the Q test below fail. */
299 len_q = ENCWORD_LEN_MAX + 1;
/* B is chosen when it is strictly shorter than Q and fits the limit. */
301 if (len_b < len_q && len_b <= ENCWORD_LEN_MAX) {
302 *encoder = b_encoder;
/* Q is chosen when it fits the limit. */
306 if (len_q <= ENCWORD_LEN_MAX) {
307 *encoder = q_encoder;
317 * Encode the data (d, dlen) into s using the encoder.
318 * Return the length of the encoded word.
/* Runs `encoder` over (d, dlen), writing into s.  Two return paths are
   visible: one encodes the bytes converted into buf1, the other encodes the
   input bytes directly.
   NOTE(review): the condition selecting between the two paths (presumably a
   test on fromcode) and the return type are not visible in this excerpt. */
321 encode_block(char *s, char *d, ssize_t dlen,
322 const char *fromcode, const char *tocode, encoder_t *encoder)
/* Same buffer size and same space reduction as in try_block. */
324 char buf1[ENCWORD_LEN_MAX - ENCWORD_LEN_MIN + 1];
325 ssize_t ibl, obl, n1, n2;
331 cd = mutt_iconv_open(tocode, fromcode, 0);
332 assert (cd != MUTT_ICONV_ERROR);
333 ib = d, ibl = dlen, ob = buf1, obl = sizeof(buf1) - m_strlen(tocode);
334 n1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
335 n2 = my_iconv(cd, 0, 0, &ob, &obl);
/* Both conversion calls are asserted to succeed; no error path is visible. */
336 assert (n1 >= 0 && n2 >= 0);
/* Encode the converted bytes: ob - buf1 is the number of bytes produced. */
338 return (*encoder)(s, buf1, ob - buf1, tocode);
/* Encode the input bytes as they are. */
340 return (*encoder)(s, d, dlen, tocode);
345 * Discover how much of the data (d, dlen) can be converted into
346 * a single encoded word. Return how much data can be converted,
347 * and set the length *wlen of the encoded word and *encoder.
348 * We start in column col, which limits the length of the word.
/* Searches for a prefix length n of (d, dlen) that try_block accepts and
   that fits on the line when the word starts in column col.
   NOTE(review): the initialisation of n, the loop header and the return
   statement are not visible in this excerpt. */
350 static size_t choose_block(char *d, size_t dlen, int col,
351 const char *fromcode, const char *tocode,
352 encoder_t **encoder, ssize_t *wlen)
/* Non-zero when fromcode names UTF-8. */
355 int utf8 = mime_which_token(fromcode, -1) == MIME_UTF_8;
360 nn = try_block(d, n, fromcode, tocode, encoder, wlen);
/* Condition holds when try_block succeeded and either the word fits within
   ENCWORD_LEN_MAX + 1 columns starting at col, or n is already 1 or less. */
361 if (!nn && (col + *wlen <= ENCWORD_LEN_MAX + 1 || n <= 1))
/* Shrink: start from the bound reported by try_block when there is one,
   otherwise from the current n, and subtract one. */
363 n = (nn ? nn : n) - 1;
/* Steps over continuation bytes at position n; presumably guarded by utf8
   and decrementing n on lines not visible here -- TODO confirm. */
366 while (n > 1 && CONTINUATION_BYTE(d[n]))
374 * Place the result of RFC-2047-encoding (d, dlen) into the dynamically
375 * allocated buffer (e, elen). The input data is in charset fromcode
376 * and is converted into a charset chosen from charsets.
377 * Return 1 if the conversion to UTF-8 failed, 2 if conversion from UTF-8
378 * failed, otherwise 0. If conversion failed, fromcode is assumed to be
379 * compatible with us-ascii and the original data is used.
380 * The input data is assumed to be a single line starting at column col;
381 * if col is non-zero, the preceding character was a space.
383 /*** XXX: simplify that one day ***/
/* Visible phases: convert the input to the intermediate charset icode, scan
   for the region that must be encoded, choose the target charset, adjust the
   region boundaries t0 and t1, then emit encoded words produced by
   choose_block / encode_block into a growing buffer, followed by the
   unencoded suffix.
   NOTE(review): many lines (declarations, loop bodies, returns, the stores
   to *e and *elen) are not visible in this excerpt; the comments added here
   describe only the visible statements. */
384 static int rfc2047_encode(const char *d, ssize_t dlen, int col,
385 const char *fromcode, const char *charsets,
386 char **e, ssize_t *elen, const char *specials)
390 ssize_t bufpos, buflen;
392 char *s0, *s1, *t0, *t1;
/* Intermediate charset.  The later test on !icode implies it can become
   NULL; presumably that happens on the conversion-failure path. */
395 const char *icode = "UTF-8";
396 ssize_t ulen, r, n, wlen;
399 /* Try to convert to UTF-8. */
400 if (convert_string(fromcode, d, dlen, icode, &u, &ulen)) {
/* Work on a copy of the raw input, with ulen set to dlen. */
403 u = p_dupstr(d, ulen = dlen);
406 /* Find earliest and latest things we must encode. */
407 s0 = s1 = t0 = t1 = NULL;
408 for (t = u; t < u + ulen; t++) {
/* Part of the test: the two characters = and ? together, at the start of
   the data or right after an HSPACE character. */
410 (*t == '=' && t[1] == '?' && (t == u || HSPACE (*(t - 1))))) {
/* Only consulted when the caller supplied a specials string. */
415 else if (specials && strchr (specials, *t)) {
422 /* If we have something to encode, include RFC822 specials */
423 if (t0 && s0 && s0 < t0)
425 if (t1 && s1 && s1 > t1)
429 /* No encoding is required. */
435 /* Choose target charset. */
438 if ((tocode1 = mutt_choose_charset(icode, charsets, u, ulen,
445 /* Hack to avoid labelling 8-bit data as us-ascii. */
446 if (!icode && charset_is_us_ascii(tocode))
447 tocode = "unknown-8bit";
449 /* Adjust t0 for maximum length of line. */
450 t = u + (ENCWORD_LEN_MAX + 1) - col - ENCWORD_LEN_MIN;
457 /* Adjust t0 until we can encode a character after a space. */
458 for (; t0 > u; t0--) {
463 while (t < u + ulen && CONTINUATION_BYTE(*t))
466 if (!try_block(t0, t - t0, icode, tocode, &encoder, &wlen)
467 && col + (t0 - u) + wlen <= ENCWORD_LEN_MAX + 1)
471 /* Adjust t1 until we can encode a character before a space. */
472 for (; t1 < u + ulen; t1++) {
477 while (CONTINUATION_BYTE(*t))
480 if (!try_block (t, t1 - t, icode, tocode, &encoder, &wlen)
481 && 1 + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
485 /* We shall encode the region [t0,t1). */
487 /* Initialise the output buffer with the us-ascii prefix. */
489 buf = p_new(char, buflen);
491 memcpy(buf, u, t0 - u);
497 /* Find how much we can encode. */
498 n = choose_block (t, t1 - t, col, icode, tocode, &encoder, &wlen);
500 /* See if we can fit the us-ascii suffix, too. */
501 if (col + wlen + (u + ulen - t1) <= ENCWORD_LEN_MAX + 1)
505 while (CONTINUATION_BYTE (t[n]))
509 /* This should only happen in the really stupid case where the
510 only word that needs encoding is one character long, but
511 there is too much us-ascii stuff after it to use a single
512 encoded word. We add the next word to the encoded region
514 assert (t1 < u + ulen);
515 for (t1++; t1 < u + ulen && !HSPACE (*t1); t1++);
518 n = choose_block (t, n, col, icode, tocode, &encoder, &wlen);
521 /* Add to output buffer. */
522 #define LINEBREAK "\n\t"
/* Grow the buffer so the next word plus two more bytes fits; two is also
   the length of LINEBREAK. */
523 if (bufpos + wlen + 2 > buflen) {
524 buflen = bufpos + wlen + 2;
525 p_realloc(&buf, buflen);
527 r = encode_block (buf + bufpos, t, n, icode, tocode, encoder);
/* Append the line break and advance the write position past it. */
530 memcpy (buf + bufpos, LINEBREAK, m_strlen(LINEBREAK));
531 bufpos += m_strlen(LINEBREAK);
539 /* Add last encoded word and us-ascii suffix to buffer. */
/* Final size: bytes written so far, the last word, and the bytes after t1.
   One extra byte is allocated, presumably for a terminator. */
540 buflen = bufpos + wlen + (u + ulen - t1);
541 p_realloc(&buf, buflen + 1);
542 r = encode_block (buf + bufpos, t, t1 - t, icode, tocode, encoder);
545 memcpy (buf + bufpos, t1, u + ulen - t1);
/* Encodes the string *pd with rfc2047_encode(), starting at column col.
   NOTE(review): what happens with the result (e, elen) and the body of the
   guard below are not visible in this excerpt. */
558 static void _rfc2047_encode_string(char **pd, int encode_specials, int col)
562 const char *charsets;
/* Guard on a missing Charset or a NULL string; presumably an early return. */
564 if (!Charset || !*pd)
/* Candidate target charsets: SendCharset, or "UTF-8" when it is empty. */
567 charsets = m_strisempty(SendCharset) ? "UTF-8" : SendCharset;
/* Source charset is Charset; RFC822Specials is passed only when
   encode_specials is non-zero. */
569 rfc2047_encode(*pd, m_strlen(*pd), col,
570 Charset, charsets, &e, &elen,
571 encode_specials ? RFC822Specials : NULL);
/* Encodes *pd without treating specials as needing encoding
   (encode_specials = 0), assuming a start column of 32. */
577 void rfc2047_encode_string(char **pd) {
578 _rfc2047_encode_string(pd, 0, 32);
/* Encodes the personal field of address entries with encode_specials = 1.
   NOTE(review): the loop over the list starting at addr is not visible in
   this excerpt. */
581 void rfc2047_encode_adrlist(address_t *addr, const char *tag)
583 address_t *ptr = addr;
/* Start column: length of tag plus 2 when a tag is given, otherwise 32. */
584 int col = tag ? m_strlen(tag) + 2 : 32;
588 _rfc2047_encode_string(&ptr->personal, 1, col);
594 /****************************************************************************/
595 /* Decoding functions */
596 /****************************************************************************/
598 /* decode one word into d[len] =?cst?[QB]?....?= */
/* Decodes the encoded word delimited by p and end into a temporary buffer,
   converts that buffer from the charset named in the word to Charset, and
   copies the result into d (at most len bytes, via m_strcpy).
   NOTE(review): the parsing that sets t and enc, most of both decoding
   loops and the cleanup are not visible in this excerpt. */
600 rfc2047_decode_word(char *d, size_t len, const char *p, const char *end)
602 char charset[STRING] = "";
/* Copy the charset name, the (t - p) bytes starting at p. */
612 m_strncpy(charset, sizeof(charset), p, t - p);
616 enc = ENCQUOTEDPRINTABLE;
630 p = t + 3; /* skip ?[QB]? */
631 d0 = q = p_new(char, end - p + 1); /* it's enough space to decode */
633 if (enc == ENCQUOTEDPRINTABLE) {
634 while (p < end - 2) {
/* An = followed by two characters that hexval accepts yields one byte:
   first digit in the high nibble, second digit in the low nibble. */
635 if (*p == '=' && hexval(p[1]) >= 0 && hexval(p[2]) >= 0) {
636 *q++ = (hexval (p[1]) << 4) | hexval (p[2]);
646 } else { /* enc == ENCBASE64 */
649 while (p < end - 2) {
/* Convert the decoded bytes from the declared charset to Charset, then
   copy them to the caller buffer. */
670 mutt_convert_string(&d0, charset, Charset, M_ICONV_HOOK_FROM);
671 m_strcpy(d, len, d0);
677 * Find the start and end of the first encoded word in the string.
678 * We use the grammar in section 2 of RFC 2047, but the "encoding"
679 * must be B or Q. Also, we don't require the encoded word to be
680 * separated by linear-white-space (section 5(1)).
/* Scans s for the first substring shaped like an encoded word.
   NOTE(review): pointer updates, loop bodies, the store through x and the
   return statements are not visible in this excerpt. */
682 static const char *find_encoded_word(const char *s, const char **x)
/* Each candidate starts at an occurrence of the two characters = and ?. */
686 while ((p = strstr(s, "=?"))) {
/* Charset part: characters strictly between 0x20 and 0x7f that are not in
   the listed delimiter set. */
688 while (0x20 < *s && *s < 0x7f && !strchr ("()<>@,;:\"/[]?.=", *s)) {
/* Encoding part: a ?, one of B b Q q, then a ?.
   NOTE(review): strchr also matches the terminating NUL of its first
   argument, so s[1] == 0 passes the middle test and s[2] is then read. */
692 if (s[0] != '?' || !strchr("BbQq", s[1]) || s[2] != '?')
/* Text part: characters from 0x20 up to but excluding 0x7f, stopping at
   the two characters ? and = together. */
696 while (0x20 <= *s && *s < 0x7f && (*s != '?' || s[1] != '=')) {
/* The text part did not end with the closing ? and =. */
700 if (s[0] != '?' || s[1] != '=') {
712 /* return length of linear white space */
/* Forward scan over the first n bytes of s for a run of space, tab, CR and
   LF.  NOTE(review): the return statements and the handling of n <= 0 are
   not visible in this excerpt. */
713 static ssize_t lwslen(const char *s, ssize_t n)
721 for (p = s; p < s + n; p++) {
/* First byte that is not one of the four white-space characters.  strchr
   also matches the terminating NUL, so a zero byte does not take this
   branch. */
722 if (!strchr (" \t\r\n", *p)) {
728 if (p[-1] == '\r' || p[-1] == '\n') /* LWS cannot end with CRLF */
734 /* return length of linear white space : reverse */
/* Backward counterpart of lwslen: examination starts at the last of the n
   bytes of s.  NOTE(review): the loop header, the return statements and the
   handling of n <= 0 are not visible in this excerpt. */
735 static ssize_t lwsrlen(const char *s, ssize_t n)
737 const char *p = s + n - 1;
743 if (*p == '\r' || *p == '\n') /* LWS doesn't end with CRLF */
/* First byte that is not space, tab, CR or LF. */
747 if (!strchr(" \t\r\n", *p)) {
757 /* try to decode anything that looks like a valid RFC2047 encoded
758 * header field, ignoring RFC822 parsing rules
/* Decodes the encoded words found in *pd into a newly allocated buffer,
   copying the text between them.
   NOTE(review): the updates of s, d and dlen between the visible statements,
   and the final replacement of *pd, are not visible in this excerpt. */
760 void rfc2047_decode(char **pd)
/* Tested below before skipping white space; presumably set once an encoded
   word has been decoded. */
765 int found_encoded = 0;
770 dlen = 4 * m_strlen(s); /* should be enough */
771 d = d0 = p_new(char, dlen + 1);
/* Continue while input remains and output space is left. */
773 while (*s && dlen > 0) {
/* p is the result of the search; q is set through the second argument and
   is passed with p to rfc2047_decode_word below. */
776 p = find_encoded_word(s, &q);
779 /* no encoded words */
783 if (found_encoded && (m = lwslen(s, n)) != 0) {
/* Two ways of copying the remaining text, selected by whether
   AssumedCharset is us-ascii; which branch converts with
   mutt_convert_nonmime_string is not visible here. */
789 if (mime_which_token(AssumedCharset, -1) == MIME_US_ASCII) {
793 if (mutt_convert_nonmime_string(&t) == 0) {
794 d += m_strcpy(d, dlen, t);
796 d += m_strcpy(d, dlen, s);
802 d += m_strcpy(d, dlen, s);
810 /* ignore spaces between encoded words
811 * and linear white spaces between encoded word and *text */
812 if (found_encoded && (m = lwslen(s, n)) != 0) {
/* Copy the text before the encoded word, leaving out the trailing white
   space measured by lwsrlen. */
818 if ((m = n - lwsrlen(s, n)) != 0) {
819 m = m_strncpy(d, dlen, s, m);
/* Decode the word delimited by p and q into the output position. */
827 rfc2047_decode_word(d, dlen, p, q);
/* Decodes the personal field of address entries with rfc2047_decode().
   NOTE(review): the iteration over the list starting at a, and any guard on
   the field, are not visible in this excerpt. */
838 void rfc2047_decode_adrlist(address_t *a)
842 rfc2047_decode(&a->personal);
/* Decodes every address list of the envelope and its subject, then calls
   mutt_adjust_subject() on the envelope.
   NOTE(review): lines between the visible statements (for example any NULL
   checks) are not visible in this excerpt. */
847 void rfc2047_decode_envelope(ENVELOPE* e)
851 /* do RFC2047 decoding */
852 rfc2047_decode_adrlist(e->from);
853 rfc2047_decode_adrlist(e->to);
854 rfc2047_decode_adrlist(e->cc);
855 rfc2047_decode_adrlist(e->bcc);
856 rfc2047_decode_adrlist(e->reply_to);
857 rfc2047_decode_adrlist(e->mail_followup_to);
858 rfc2047_decode_adrlist(e->return_path);
859 rfc2047_decode_adrlist(e->sender);
/* The subject is decoded after the address lists and before the adjustment
   call. */
862 rfc2047_decode(&e->subject);
863 mutt_adjust_subject(e);