2 * Copyright notice from original mutt:
3 * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
5 * This file is part of mutt-ng, see http://www.muttng.org/.
6 * It's licensed under the GNU General Public License,
7 * please see the file GPL in the top level source directory.
20 #include <sys/types.h>
24 #ifdef HAVE_LANGINFO_CODESET
25 # include <langinfo.h>
28 #include <lib-lib/mem.h>
29 #include <lib-lib/ascii.h>
30 #include <lib-lib/str.h>
31 #include <lib-lib/macros.h>
37 # define EILSEQ EINVAL
/* Non-zero when the active charset canonicalises to "utf-8".
 * Assigned in mutt_set_charset() and read by replacement_char(). */
41 int Charset_is_utf8 = 0;
44 * The following list has been created manually from the data under:
45 * http://www.isi.edu/in-notes/iana/assignments/character-sets
46 * Last update: 2000-09-07
48 * Note that it includes only the subset of character sets for which
49 * a preferred MIME name is given.
/* Alias table: each entry pairs a lower-case charset alias (key) with the
 * preferred MIME name (pref) that mutt_canonical_charset() copies out when
 * the normalised input matches the key exactly.  Every alias of one charset
 * must carry the same pref spelling, since callers compare canonical names
 * with strcmp(). */
55 } PreferredMIMENames[] = {
56 {"ansi_x3.4-1968", "us-ascii"},
57 {"iso-ir-6", "us-ascii"},
58 {"iso_646.irv:1991", "us-ascii"},
59 {"ascii", "us-ascii"},
60 {"iso646-us", "us-ascii"},
62 {"ibm367", "us-ascii"},
63 {"cp367", "us-ascii"},
64 {"csascii", "us-ascii"},
65 {"csiso2022kr", "iso-2022-kr"},
66 {"cseuckr", "euc-kr"},
67 {"csiso2022jp", "iso-2022-jp"},
68 {"csiso2022jp2", "iso-2022-jp-2"},
69 {"iso_8859-1:1987", "iso-8859-1"},
70 {"iso-ir-100", "iso-8859-1"},
71 {"iso_8859-1", "iso-8859-1"},
72 {"latin1", "iso-8859-1"},
74 {"ibm819", "iso-8859-1"},
75 {"cp819", "iso-8859-1"},
76 {"csisolatin1", "iso-8859-1"},
77 {"iso_8859-2:1987", "iso-8859-2"},
78 {"iso-ir-101", "iso-8859-2"},
79 {"iso_8859-2", "iso-8859-2"},
80 {"latin2", "iso-8859-2"},
82 {"csisolatin2", "iso-8859-2"},
83 {"iso_8859-3:1988", "iso-8859-3"},
84 {"iso-ir-109", "iso-8859-3"},
85 {"iso_8859-3", "iso-8859-3"},
86 {"latin3", "iso-8859-3"},
88 {"csisolatin3", "iso-8859-3"},
89 {"iso_8859-4:1988", "iso-8859-4"},
90 {"iso-ir-110", "iso-8859-4"},
91 {"iso_8859-4", "iso-8859-4"},
92 {"latin4", "iso-8859-4"},
94 {"csisolatin4", "iso-8859-4"},
95 {"iso_8859-6:1987", "iso-8859-6"},
96 {"iso-ir-127", "iso-8859-6"},
97 {"iso_8859-6", "iso-8859-6"},
98 {"ecma-114", "iso-8859-6"},
99 {"asmo-708", "iso-8859-6"},
100 {"arabic", "iso-8859-6"},
101 {"csisolatinarabic", "iso-8859-6"},
102 {"iso_8859-7:1987", "iso-8859-7"},
103 {"iso-ir-126", "iso-8859-7"},
104 {"iso_8859-7", "iso-8859-7"},
105 {"elot_928", "iso-8859-7"},
106 {"ecma-118", "iso-8859-7"},
107 {"greek", "iso-8859-7"},
108 {"greek8", "iso-8859-7"},
109 {"csisolatingreek", "iso-8859-7"},
110 {"iso_8859-8:1988", "iso-8859-8"},
111 {"iso-ir-138", "iso-8859-8"},
112 {"iso_8859-8", "iso-8859-8"},
113 {"hebrew", "iso-8859-8"},
114 {"csisolatinhebrew", "iso-8859-8"},
115 {"iso_8859-5:1988", "iso-8859-5"},
116 {"iso-ir-144", "iso-8859-5"},
117 {"iso_8859-5", "iso-8859-5"},
118 {"cyrillic", "iso-8859-5"},
119 {"csisolatincyrillic", "iso-8859-5"},
120 {"iso_8859-9:1989", "iso-8859-9"},
121 {"iso-ir-148", "iso-8859-9"},
122 {"iso_8859-9", "iso-8859-9"},
123 {"latin5", "iso-8859-9"},
124 {"l5", "iso-8859-9"},
125 {"csisolatin5", "iso-8859-9"},
126 {"iso_8859-10:1992", "iso-8859-10"},
127 {"iso-ir-157", "iso-8859-10"},
128 {"latin6", "iso-8859-10"},
129 {"l6", "iso-8859-10"},
130 {"csisolatin6", "iso-8859-10"},
131 {"cskoi8r", "koi8-r"},
132 {"ms_kanji", "shift_jis"},
133 {"csshiftjis", "shift_jis"},
134 {"extended_unix_code_packed_format_for_japanese", "euc-jp"},
135 {"cseucpkdfmtjapanese", "euc-jp"},
136 {"csgb2312", "gb2312"},
138 /* end of official brain damage.
139 what follows has been taken from glibc's localedata files. */
140 {"iso_8859-13", "iso-8859-13"},
141 {"iso-ir-179", "iso-8859-13"},
142 {"latin7", "iso-8859-13"},
143 {"l7", "iso-8859-13"},
144 {"iso_8859-14", "iso-8859-14"},
145 {"latin8", "iso-8859-14"},
146 {"l8", "iso-8859-14"},
147 {"iso_8859-15", "iso-8859-15"},
148 {"latin9", "iso-8859-15"},
149 {"latin0", "iso-8859-15"},
150 {"iso_8859-16", "iso-8859-16"},
151 {"latin10", "iso-8859-16"},
154 {"pck", "shift_jis"},
155 {"ko_kr-euc", "euc-kr"},
156 {"zh_tw-big5", "big5"},
157 {"sjis", "shift_jis"},
158 {"euc-jp-ms", "eucjp-ms"},
/* Initialise the global Charset from the locale's codeset.
 * With HAVE_LANGINFO_CODESET, the value of nl_langinfo(CODESET) is
 * canonicalised through mutt_canonical_charset() and duplicated into
 * Charset; "iso-8859-1" is the visible fallback value.
 * NOTE(review): the #else/#endif and brace lines are not visible in this
 * view, so the exact conditions guarding the fallback should be confirmed. */
162 void mutt_set_langinfo_charset (void)
164 #ifdef HAVE_LANGINFO_CODESET
165 char buff[LONG_STRING];
166 char buff2[LONG_STRING];
168 m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET));
169 mutt_canonical_charset(buff2, sizeof(buff2), buff);
171 /* finally, set $charset */
172 if (!(Charset = m_strdup(buff2)))
174 Charset = m_strdup("iso-8859-1");
/* Write the canonical form of charset `name` into `dest` (capacity `dlen`).
 * The name is copied into a scratch buffer and lower-cased, the spellings
 * "8859..." and "iso8859..." are rewritten to "iso-8859-...", and the result
 * is looked up in PreferredMIMENames.  A matching entry's pref is copied to
 * `dest`; the final m_strcpy() copies the normalised name itself.
 * NOTE(review): the line following the table-hit copy is not visible here;
 * presumably the function returns there so the final copy only runs when no
 * alias matched -- confirm against the full file. */
178 void mutt_canonical_charset(char *dest, ssize_t dlen, const char *name)
182 char scratch[LONG_STRING];
184 m_strcpy(scratch, sizeof(scratch), name);
185 m_strtolower(scratch);
187 /* catch some common iso-8859-something misspellings */
/* The suffix is taken from the original `name` (skipping one optional '-'),
 * not from scratch, which is why scratch is lower-cased again afterwards. */
188 if (!strncmp(scratch, "8859", 4)) {
189 snprintf(scratch, sizeof(scratch), "iso-8859-%s",
190 name + 4 + (name[4] == '-'));
191 m_strtolower(scratch);
193 if (!strncmp(scratch, "iso8859", 7)) {
194 snprintf(scratch, sizeof(scratch), "iso-8859-%s",
195 name + 7 + (name[7] == '-'));
196 m_strtolower(scratch);
/* The table is scanned until an entry with a null key is reached. */
199 for (i = 0; PreferredMIMENames[i].key; i++) {
200 if (!strcmp(scratch, PreferredMIMENames[i].key)) {
201 m_strcpy(dest, dlen, PreferredMIMENames[i].pref);
206 m_strcpy(dest, dlen, scratch);
/* Return non-zero when `s`, after canonicalisation, is exactly equal to
 * `chs`.  `chs` is compared as given, so it must already be in canonical
 * spelling.
 * NOTE(review): the lines between the signature and the canonicalisation
 * call (including the declaration of `buffer`) are not visible here. */
209 static int mutt_chscmp(const char *s, const char *chs)
216 mutt_canonical_charset(buffer, sizeof(buffer), s);
217 return !strcmp(buffer, chs);
/* Return non-zero when charset name `s` canonicalises to "utf-8"
 * (as decided by mutt_chscmp()). */
220 int mutt_is_utf8(const char *s)
222 return mutt_chscmp(s, "utf-8");
/* Return non-zero when charset name `s` canonicalises to "us-ascii"
 * (as decided by mutt_chscmp()). */
225 int mutt_is_us_ascii(const char *s)
227 return mutt_chscmp(s, "us-ascii");
232 * Like iconv_open, but canonicalises the charsets
/* Open a conversion descriptor from `fromcode` to `tocode`.
 * Both names are first canonicalised.  When `flags` contains
 * M_ICONV_HOOK_FROM (or M_ICONV_HOOK_TO, if that macro is defined) and
 * mutt_charset_hook() yields a substitute for the name, the substitute is
 * canonicalised and used instead.  iconv_open() is tried with the resulting
 * names; if that fails and mutt_iconv_hook() supplies alternatives for both
 * sides, iconv_open() is tried again with those.
 * The last visible statement returns (iconv_t)-1.
 * NOTE(review): the line after the first iconv_open() test is not visible;
 * presumably it returns the freshly opened descriptor -- confirm. */
235 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
237 char tocode1[SHORT_STRING];
238 char fromcode1[SHORT_STRING];
239 char *tocode2, *fromcode2;
244 mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
246 #ifdef M_ICONV_HOOK_TO
248 if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
249 mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
252 mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
253 if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
254 mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
256 if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) - 1)
/* Second chance: both names must have an iconv-hook replacement. */
258 if ((tocode2 = mutt_iconv_hook (tocode1))
259 && (fromcode2 = mutt_iconv_hook (fromcode1)))
260 return iconv_open (tocode2, fromcode2);
262 return (iconv_t) - 1;
267 * Like iconv, but keeps going even when the input is invalid
268 * If you're supplying inrepls, the source charset should be stateless;
269 * if you're supplying an outrepl, the target charset should be.
/* Convert with descriptor `cd`, attempting to continue past invalid input.
 * The in/out arguments work on local copies of the caller's cursors; the
 * updated positions and remaining byte counts are stored back through
 * `inbuf`/`inbytesleft` and `outbuf`/`outbytesleft` at the end.
 * `inrepls` is walked as a list terminated by a null entry; each entry is
 * fed through the converter as substitute input.  `outrepl` is a string
 * copied verbatim into the output buffer.
 * NOTE(review): a large part of this function (loop structure, the checks
 * around each replacement attempt, the return statement) is not visible in
 * this view; the comments below describe only the surviving lines. */
272 ssize_t mutt_iconv(iconv_t cd, const char **inbuf, ssize_t *inbytesleft,
273 char **outbuf, ssize_t *outbytesleft,
274 const char **inrepls, const char *outrepl)
276 ssize_t ret = 0, ret1;
277 const char *ib = *inbuf;
278 ssize_t ibl = *inbytesleft;
280 ssize_t obl = *outbytesleft;
283 ret1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
/* Recovery is only attempted while input and output space both remain and
 * the converter reported an invalid sequence (EILSEQ). */
286 if (ibl && obl && errno == EILSEQ) {
288 /* Try replacing the input */
291 for (t = inrepls; *t; t++) {
292 const char *ib1 = *t;
293 ssize_t ibl1 = m_strlen(*t);
/* The replacement is converted into scratch cursors (ob1/obl1); the real
 * output cursor is only advanced by the assignment below. */
297 my_iconv(cd, &ib1, &ibl1, &ob1, &obl1);
300 ob = ob1, obl = obl1;
308 /* Replace the output */
/* presumably my_iconv forwards to iconv(); a null input with a real output
 * buffer would then emit any pending shift sequence -- TODO confirm. */
311 my_iconv(cd, 0, 0, &ob, &obl);
313 ssize_t n = m_strlen(outrepl);
319 memcpy (ob, outrepl, n);
323 my_iconv(cd, 0, 0, 0, 0); /* for good measure */
327 *inbuf = ib, *inbytesleft = ibl;
328 *outbuf = ob, *outbytesleft = obl;
336 * Used in rfc2047.c and rfc2231.c
/* Convert the string referenced by `ps` from charset `from` to charset `to`.
 * Conversion is only attempted when both names are non-null and
 * mutt_iconv_open() succeeds.  The output buffer is sized as
 * MB_LEN_MAX * (len + 1) bytes plus one, and the input length passed to
 * mutt_iconv() is len + 1.
 * Replacement policy: a UTF-8 target uses U+FFFD as the output replacement;
 * otherwise a UTF-8 source takes the other branch, whose body is not
 * visible here.
 * NOTE(review): the rest of the parameter list, how `s`/`len` are derived,
 * what is stored back into *ps and the return value are not visible in this
 * view. */
339 int mutt_convert_string (char **ps, const char *from, const char *to,
/* "\357\277\275" is the UTF-8 encoding of U+FFFD (REPLACEMENT CHARACTER). */
343 const char *repls[] = { "\357\277\275", "?", 0 };
349 if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t) - 1) {
354 const char **inrepls = NULL;
355 const char *outrepl = NULL;
357 if (mutt_is_utf8 (to))
358 outrepl = "\357\277\275";
359 else if (mutt_is_utf8 (from))
365 ib = s, ibl = len + 1;
366 obl = MB_LEN_MAX * ibl;
367 ob = buf = xmalloc(obl + 1);
369 mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
384 * FGETCONV stuff for converting a file while reading it
385 * Used in sendlib.c for converting from mutt's Charset
/* Replacement input strings passed to mutt_iconv() by fgetconv(); chosen in
 * fgetconv_open().  (Member of struct fgetconv_s, per its use as
 * fc->inrepls.) */
397 const char **inrepls;
/* NOTE(review): only the opening line of this struct is visible here; its
 * members and purpose cannot be determined from this view. */
400 struct fgetconv_not {
/* Create a converting reader over `file`, translating from charset `from`
 * to charset `to`; the result is handed back as an opaque FGETCONV pointer.
 * When mutt_iconv_open() yields a descriptor, a struct fgetconv_s is
 * allocated, its output cursors are reset to the start of bufo, and the
 * replacement list is chosen: the full list (U+FFFD, then "?") when `to` is
 * UTF-8, otherwise the list starting at "?".
 * NOTE(review): the rest of the parameter list, the guard around
 * mutt_iconv_open(), the remaining field initialisation and the branch
 * structure around the second p_new() are not visible here; that second
 * allocation presumably covers the no-converter case -- confirm. */
405 FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to,
408 struct fgetconv_s *fc;
409 iconv_t cd = (iconv_t) - 1;
410 static const char *repls[] = { "\357\277\275", "?", 0 };
413 cd = mutt_iconv_open (to, from, flags);
415 if (cd != (iconv_t) - 1) {
416 fc = p_new(struct fgetconv_s, 1);
417 fc->p = fc->ob = fc->bufo;
420 fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
423 fc = p_new(struct fgetconv_s, 1);
426 return (FGETCONV *) fc;
/* Read converted characters from `_fc` into `buf`, whose capacity is `l`.
 * The loop condition (r + 1 < l) keeps one byte of `buf` in reserve, and
 * each character comes from fgetconv(), with EOF tested explicitly.
 * NOTE(review): the loop body after the EOF test, the termination of `buf`
 * and the return value are not visible in this view. */
429 char *fgetconvs (char *buf, ssize_t l, FGETCONV * _fc)
434 for (r = 0; r + 1 < l;) {
435 if ((c = fgetconv (_fc)) == EOF)
/* Return the next byte of the converted stream as an unsigned char value.
 * Visible stages, in order: pass-through via fgetc() when no converter is
 * attached; serve a byte from the output buffer (bufo); convert more of the
 * input buffer (bufi) into bufo; read more of the file into bufi; convert
 * again through mutt_iconv() with the configured replacement strings.
 * NOTE(review): the conditions guarding each stage and the value returned
 * when nothing more can be produced are not visible in this view. */
449 int fgetconv (FGETCONV * _fc)
451 struct fgetconv_s *fc = (struct fgetconv_s *) _fc;
/* No converter was opened: hand back raw bytes from the file. */
455 if (fc->cd == (iconv_t) - 1)
456 return fgetc (fc->file);
460 return (unsigned char) *(fc->p)++;
462 /* Try to convert some more */
463 fc->p = fc->ob = fc->bufo;
465 ssize_t obl = ssizeof(fc->bufo);
467 my_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
469 return (unsigned char) *(fc->p)++;
472 /* If we trusted iconv a bit more, we would at this point
473 * ask why it had stopped converting ... */
475 /* Try to read some more */
476 if (fc->ibl == sizeof (fc->bufi) ||
477 (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi))) {
/* Move the unconverted tail to the front of bufi before reading more. */
482 memcpy (fc->bufi, fc->ib, fc->ibl);
/* New data is appended directly after the unconverted bytes. */
485 fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
487 /* Try harder this time to convert some */
489 ssize_t obl = ssizeof(fc->bufo);
491 mutt_iconv (fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob,
492 &obl, fc->inrepls, 0);
494 return (unsigned char) *(fc->p)++;
497 /* Either the file has finished or one of the buffers is too small */
/* Release the reader referenced by *_fc.  The iconv descriptor is closed
 * only when one was actually opened (cd != (iconv_t)-1).
 * NOTE(review): freeing of the state object and resetting of *_fc are not
 * visible in this view. */
502 void fgetconv_close (FGETCONV ** _fc)
504 struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
506 if (fc->cd != (iconv_t) - 1)
507 iconv_close (fc->cd);
/* Return the first entry of the colon-separated charset list `charset`.
 * A list without ':' is returned unchanged.  Otherwise the text before the
 * first ':' is copied into the static buffer fcharset, so that result is
 * overwritten by the next call.
 * NOTE(review): the declaration/initialisation of c and c1, any handling of
 * an empty list and the final return are not visible in this view. */
511 const char *mutt_get_first_charset (const char *charset)
513 static char fcharset[SHORT_STRING];
519 if (!(c1 = strchr (c, ':')))
520 return ((char*) charset);
/* m_strcpy()'s second argument is the destination capacity.  Clamp it to
 * the real size of fcharset: a first entry longer than the buffer would
 * otherwise be written past its end. */
521 m_strcpy(fcharset, c1 - c + 1 < ssizeof(fcharset) ? c1 - c + 1 : ssizeof(fcharset), c);
/* Convert `flen` bytes at `f` from charset `from` to charset `to`, with
 * results delivered through `t` and `tlen`.
 * The converter is opened with flags 0.  Failure of either the conversion
 * call or the following call with null input (n < 0 or a negative result)
 * enters the error branch.  Afterwards the buffer is resized to the number
 * of bytes produced plus one.
 * NOTE(review): how obl is computed, the error-branch body, the stores into
 * *t / *tlen and the return value are not visible in this view. */
525 static ssize_t convert_string (const char *f, ssize_t flen,
526 const char *from, const char *to,
527 char **t, ssize_t * tlen)
535 cd = mutt_iconv_open (to, from, 0);
536 if (cd == (iconv_t) (-1))
539 ob = buf = xmalloc(obl);
540 n = my_iconv(cd, &f, &flen, &ob, &obl);
541 if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
552 p_realloc(&buf, ob - buf + 1);
/* Try to convert *ps to Charset, taking candidate source charsets one at a
 * time from the colon-separated AssumedCharset list.
 * Each iteration isolates the current entry (up to the next ':' or the end
 * of the list), duplicates it as `fromcode`, and calls convert_string().
 * The loop advances past the ':' or stops when none is left.
 * NOTE(review): how the result `m` is acted on, the release of `fromcode`
 * and the return values are not visible in this view. */
559 int mutt_convert_nonmime_string (char **ps)
563 for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0) {
568 ssize_t ulen = m_strlen(*ps);
574 c1 = strchr (c, ':');
575 n = c1 ? c1 - c : m_strlen(c);
578 fromcode = p_dupstr(c, n);
579 m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
/* React to a new charset name: canonicalise `charset`, record in
 * Charset_is_utf8 whether the canonical name is "utf-8", and -- when
 * HAVE_BIND_TEXTDOMAIN_CODESET is defined -- bind the PACKAGE message
 * domain's codeset to the canonical name.
 * NOTE(review): the declaration of `buffer` and any further statements are
 * not visible in this view. */
590 void mutt_set_charset (char *charset)
594 mutt_canonical_charset (buffer, sizeof (buffer), charset);
595 Charset_is_utf8 = !strcmp(buffer, "utf-8");
597 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
598 bind_textdomain_codeset (PACKAGE, buffer);
/* Return the wide character used as a stand-in for unrepresentable input:
 * 0xfffd (U+FFFD) when Charset_is_utf8 is set, '?' otherwise. */
602 wchar_t replacement_char(void)
604 return Charset_is_utf8 ? 0xfffd : '?';