charset.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 #if HAVE_CONFIG_H
  11 # include "config.h"
  12 #endif
  13
  14 #include <string.h>
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17
  18 #include <ctype.h>
  19
  20 #include <sys/types.h>
  21 #include <dirent.h>
  22 #include <unistd.h>
  23 #include <errno.h>
  24
  25 #include <lib-lib/mem.h>
  26 #include <lib-lib/ascii.h>
  27 #include <lib-lib/str.h>
  28 #include <lib-lib/macros.h>
  29
  30 #include "mutt.h"
  31 #include "charset.h"
  32
  33 #ifndef EILSEQ
  34 # define EILSEQ EINVAL
  35 #endif
  36
  37 char *Charset;
  38 int Charset_is_utf8 = 0;
  39
  40 /*
  41  * The following list has been created manually from the data under:
  42  * http://www.isi.edu/in-notes/iana/assignments/character-sets
  43  * Last update: 2000-09-07
  44  *
  45  * Note that it includes only the subset of character sets for which
  46  * a preferred MIME name is given.
  47  */
  48
  49 static struct {
  50     const char *key;
  51     const char *pref;
  52 } PreferredMIMENames[] = {
  53     {"ansi_x3.4-1968", "us-ascii"},
  54     {"iso-ir-6", "us-ascii"},
  55     {"iso_646.irv:1991", "us-ascii"},
  56     {"ascii", "us-ascii"},
  57     {"iso646-us", "us-ascii"},
  58     {"us", "us-ascii"},
  59     {"ibm367", "us-ascii"},
  60     {"cp367", "us-ascii"},
  61     {"csASCII", "us-ascii"},
  62     {"csISO2022KR", "iso-2022-kr"},
  63     {"csEUCKR", "euc-kr"},
  64     {"csISO2022JP", "iso-2022-jp"},
  65     {"csISO2022JP2", "iso-2022-jp-2"},
  66     {"ISO_8859-1:1987", "iso-8859-1"},
  67     {"iso-ir-100", "iso-8859-1"},
  68     {"iso_8859-1", "iso-8859-1"},
  69     {"latin1", "iso-8859-1"},
  70     {"l1", "iso-8859-1"},
  71     {"IBM819", "iso-8859-1"},
  72     {"CP819", "iso-8859-1"},
  73     {"csISOLatin1", "iso-8859-1"},
  74     {"ISO_8859-2:1987", "iso-8859-2"},
  75     {"iso-ir-101", "iso-8859-2"},
  76     {"iso_8859-2", "iso-8859-2"},
  77     {"latin2", "iso-8859-2"},
  78     {"l2", "iso-8859-2"},
  79     {"csISOLatin2", "iso-8859-2"},
  80     {"ISO_8859-3:1988", "iso-8859-3"},
  81     {"iso-ir-109", "iso-8859-3"},
  82     {"ISO_8859-3", "iso-8859-3"},
  83     {"latin3", "iso-8859-3"},
  84     {"l3", "iso-8859-3"},
  85     {"csISOLatin3", "iso-8859-3"},
  86     {"ISO_8859-4:1988", "iso-8859-4"},
  87     {"iso-ir-110", "iso-8859-4"},
  88     {"ISO_8859-4", "iso-8859-4"},
  89     {"latin4", "iso-8859-4"},
  90     {"l4", "iso-8859-4"},
  91     {"csISOLatin4", "iso-8859-4"},
  92     {"ISO_8859-6:1987", "iso-8859-6"},
  93     {"iso-ir-127", "iso-8859-6"},
  94     {"iso_8859-6", "iso-8859-6"},
  95     {"ECMA-114", "iso-8859-6"},
  96     {"ASMO-708", "iso-8859-6"},
  97     {"arabic", "iso-8859-6"},
  98     {"csISOLatinArabic", "iso-8859-6"},
  99     {"ISO_8859-7:1987", "iso-8859-7"},
 100     {"iso-ir-126", "iso-8859-7"},
 101     {"ISO_8859-7", "iso-8859-7"},
 102     {"ELOT_928", "iso-8859-7"},
 103     {"ECMA-118", "iso-8859-7"},
 104     {"greek", "iso-8859-7"},
 105     {"greek8", "iso-8859-7"},
 106     {"csISOLatinGreek", "iso-8859-7"},
 107     {"ISO_8859-8:1988", "iso-8859-8"},
 108     {"iso-ir-138", "iso-8859-8"},
 109     {"ISO_8859-8", "iso-8859-8"},
 110     {"hebrew", "iso-8859-8"},
 111     {"csISOLatinHebrew", "iso-8859-8"},
 112     {"ISO_8859-5:1988", "iso-8859-5"},
 113     {"iso-ir-144", "iso-8859-5"},
 114     {"ISO_8859-5", "iso-8859-5"},
 115     {"cyrillic", "iso-8859-5"},
 116     {"csISOLatinCyrillic", "iso8859-5"},
 117     {"ISO_8859-9:1989", "iso-8859-9"},
 118     {"iso-ir-148", "iso-8859-9"},
 119     {"ISO_8859-9", "iso-8859-9"},
 120     {"latin5", "iso-8859-9"},
 121     {"l5", "iso-8859-9"},
 122     {"csISOLatin5", "iso-8859-9"},
 123     {"ISO_8859-10:1992", "iso-8859-10"},
 124     {"iso-ir-157", "iso-8859-10"},
 125     {"latin6", "iso-8859-10"},
 126     {"l6", "iso-8859-10"},
 127     {"csISOLatin6", "iso-8859-10"},
 128     {"csKOI8r", "koi8-r"},
 129     {"MS_Kanji", "Shift_JIS"},
 130     {"csShiftJis", "Shift_JIS"},
 131     {"Extended_UNIX_Code_Packed_Format_for_Japanese", "EUC-JP"},
 132     {"csEUCPkdFmtJapanese", "EUC-JP"},
 133     {"csGB2312", "gb2312"},
 134     {"csbig5", "big5"},
 135     /* End of official brain damage.
 136        What follows has been taken * from glibc's localedata files.  */
 137     {"iso_8859-13", "iso-8859-13"},
 138     {"iso-ir-179", "iso-8859-13"},
 139     {"latin7", "iso-8859-13"},
 140     {"l7", "iso-8859-13"},
 141     {"iso_8859-14", "iso-8859-14"},
 142     {"latin8", "iso-8859-14"},
 143     {"l8", "iso-8859-14"},
 144     {"iso_8859-15", "iso-8859-15"},
 145     {"latin9", "iso-8859-15"},
 146     {"latin0", "iso-8859-15"},
 147     {"iso_8859-16", "iso-8859-16"},
 148     {"latin10", "iso-8859-16"},
 149     {"646", "us-ascii"},
 150     {"eucJP", "euc-jp"},
 151     {"PCK", "Shift_JIS"},
 152     {"ko_KR-euc", "euc-kr"},
 153     {"zh_TW-big5", "big5"},
 154     {"sjis", "Shift_JIS"},
 155     {"euc-jp-ms", "eucJP-ms"},
 156     {NULL, NULL}
 157 };
 158
 159 #ifdef HAVE_LANGINFO_CODESET
 160 #  include <langinfo.h>
 161 #endif
 162
 163 void mutt_set_langinfo_charset (void)
 164 {
 165 #ifdef HAVE_LANGINFO_CODESET
 166     char buff[LONG_STRING];
 167     char buff2[LONG_STRING];
 168
 169     m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET));
 170     mutt_canonical_charset (buff2, sizeof (buff2), buff);
 171
 172     /* finally, set $charset */
 173     if (!(Charset = m_strdup(buff2)))
 174 #endif
 175         Charset = m_strdup("iso-8859-1");
 176 }
 177
 178
 179 void mutt_canonical_charset(char *dest, ssize_t dlen, const char *name)
 180 {
 181   ssize_t i;
 182   char *p;
 183   char scratch[LONG_STRING];
 184
 185   /* catch some common iso-8859-something misspellings */
 186   if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-')
 187     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 4);
 188   else if (!ascii_strncasecmp (name, "8859-", 5))
 189     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5);
 190   else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-')
 191     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7);
 192   else if (!ascii_strncasecmp (name, "iso8859-", 8))
 193     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8);
 194   else
 195     m_strcpy(scratch, sizeof(scratch), NONULL(name));
 196
 197   for (i = 0; PreferredMIMENames[i].key; i++)
 198     if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
 199         !m_strcasecmp(scratch, PreferredMIMENames[i].key)) {
 200       m_strcpy(dest, dlen, PreferredMIMENames[i].pref);
 201       return;
 202     }
 203
 204   m_strcpy(dest, dlen, scratch);
 205
 206   /* for cosmetics' sake, transform to lowercase. */
 207   for (p = dest; *p; p++)
 208     *p = ascii_tolower (*p);
 209 }
 210
 211 int mutt_chscmp (const char *s, const char *chs)
 212 {
 213   char buffer[STRING];
 214
 215   if (!s)
 216     return 0;
 217
 218   mutt_canonical_charset (buffer, sizeof (buffer), s);
 219   return !ascii_strcasecmp (buffer, chs);
 220 }
 221
 222
 223 /*
 224  * Like iconv_open, but canonicalises the charsets
 225  */
 226
 227 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
 228 {
 229   char tocode1[SHORT_STRING];
 230   char fromcode1[SHORT_STRING];
 231   char *tocode2, *fromcode2;
 232   char *tmp;
 233
 234   iconv_t cd;
 235
 236   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
 237
 238 #ifdef M_ICONV_HOOK_TO
 239   /* Not used. */
 240   if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
 241     mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
 242 #endif
 243
 244   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
 245   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
 246     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
 247
 248   if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) - 1)
 249     return cd;
 250   if ((tocode2 = mutt_iconv_hook (tocode1))
 251       && (fromcode2 = mutt_iconv_hook (fromcode1)))
 252     return iconv_open (tocode2, fromcode2);
 253
 254   return (iconv_t) - 1;
 255 }
 256
 257
 258 /*
 259  * Like iconv, but keeps going even when the input is invalid
 260  * If you're supplying inrepls, the source charset should be stateless;
 261  * if you're supplying an outrepl, the target charset should be.
 262  */
 263
 264 ssize_t mutt_iconv(iconv_t cd, const char **inbuf, ssize_t *inbytesleft,
 265                    char **outbuf, ssize_t *outbytesleft,
 266                    const char **inrepls, const char *outrepl)
 267 {
 268   ssize_t ret = 0, ret1;
 269   const char *ib = *inbuf;
 270   ssize_t ibl = *inbytesleft;
 271   char *ob = *outbuf;
 272   ssize_t obl = *outbytesleft;
 273
 274   for (;;) {
 275     ret1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
 276     if (ret1 != -1)
 277       ret += ret1;
 278     if (ibl && obl && errno == EILSEQ) {
 279       if (inrepls) {
 280         /* Try replacing the input */
 281         const char **t;
 282
 283         for (t = inrepls; *t; t++) {
 284           const char *ib1 = *t;
 285           ssize_t ibl1 = m_strlen(*t);
 286           char *ob1 = ob;
 287           ssize_t obl1 = obl;
 288
 289           my_iconv(cd, &ib1, &ibl1, &ob1, &obl1);
 290           if (!ibl1) {
 291             ++ib, --ibl;
 292             ob = ob1, obl = obl1;
 293             ++ret;
 294             break;
 295           }
 296         }
 297         if (*t)
 298           continue;
 299       }
 300       /* Replace the output */
 301       if (!outrepl)
 302         outrepl = "?";
 303       my_iconv(cd, 0, 0, &ob, &obl);
 304       if (obl) {
 305         ssize_t n = m_strlen(outrepl);
 306
 307         if (n > obl) {
 308           outrepl = "?";
 309           n = 1;
 310         }
 311         memcpy (ob, outrepl, n);
 312         ++ib, --ibl;
 313         ob += n, obl -= n;
 314         ++ret;
 315         my_iconv(cd, 0, 0, 0, 0); /* for good measure */
 316         continue;
 317       }
 318     }
 319     *inbuf = ib, *inbytesleft = ibl;
 320     *outbuf = ob, *outbytesleft = obl;
 321     return ret;
 322   }
 323 }
 324
 325
 326 /*
 327  * Convert a string
 328  * Used in rfc2047.c and rfc2231.c
 329  */
 330
 331 int mutt_convert_string (char **ps, const char *from, const char *to,
 332                          int flags)
 333 {
 334   iconv_t cd;
 335   const char *repls[] = { "\357\277\275", "?", 0 };
 336   char *s = *ps;
 337
 338   if (!s || !*s)
 339     return 0;
 340
 341   if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t) - 1) {
 342     int len;
 343     const char *ib;
 344     char *buf, *ob;
 345     ssize_t ibl, obl;
 346     const char **inrepls = NULL;
 347     const char *outrepl = NULL;
 348
 349     if (mutt_is_utf8 (to))
 350       outrepl = "\357\277\275";
 351     else if (mutt_is_utf8 (from))
 352       inrepls = repls;
 353     else
 354       outrepl = "?";
 355
 356     len = m_strlen(s);
 357     ib = s, ibl = len + 1;
 358     obl = MB_LEN_MAX * ibl;
 359     ob = buf = xmalloc(obl + 1);
 360
 361     mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 362     iconv_close (cd);
 363
 364     *ob = '\0';
 365
 366     p_delete(ps);
 367     *ps = buf;
 368     return 0;
 369   }
 370   else
 371     return -1;
 372 }
 373
 374
 375 /*
 376  * FGETCONV stuff for converting a file while reading it
 377  * Used in sendlib.c for converting from mutt's Charset
 378  */
 379
 380 struct fgetconv_s {
 381   FILE *file;
 382   iconv_t cd;
 383   char bufi[512];
 384   char bufo[512];
 385   char *p;
 386   char *ob;
 387   char *ib;
 388   ssize_t ibl;
 389   const char **inrepls;
 390 };
 391
 392 struct fgetconv_not {
 393   FILE *file;
 394   iconv_t cd;
 395 };
 396
 397 FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to,
 398                          int flags)
 399 {
 400   struct fgetconv_s *fc;
 401   iconv_t cd = (iconv_t) - 1;
 402   static const char *repls[] = { "\357\277\275", "?", 0 };
 403
 404   if (from && to)
 405     cd = mutt_iconv_open (to, from, flags);
 406
 407   if (cd != (iconv_t) - 1) {
 408     fc = p_new(struct fgetconv_s, 1);
 409     fc->p = fc->ob = fc->bufo;
 410     fc->ib = fc->bufi;
 411     fc->ibl = 0;
 412     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
 413   }
 414   else
 415     fc = p_new(struct fgetconv_s, 1);
 416   fc->file = file;
 417   fc->cd = cd;
 418   return (FGETCONV *) fc;
 419 }
 420
 421 char *fgetconvs (char *buf, ssize_t l, FGETCONV * _fc)
 422 {
 423   int c;
 424   ssize_t r;
 425
 426   for (r = 0; r + 1 < l;) {
 427     if ((c = fgetconv (_fc)) == EOF)
 428       break;
 429     buf[r++] = (char) c;
 430     if (c == '\n')
 431       break;
 432   }
 433   buf[r] = '\0';
 434
 435   if (r)
 436     return buf;
 437   else
 438     return NULL;
 439 }
 440
 441 int fgetconv (FGETCONV * _fc)
 442 {
 443   struct fgetconv_s *fc = (struct fgetconv_s *) _fc;
 444
 445   if (!fc)
 446     return EOF;
 447   if (fc->cd == (iconv_t) - 1)
 448     return fgetc (fc->file);
 449   if (!fc->p)
 450     return EOF;
 451   if (fc->p < fc->ob)
 452     return (unsigned char) *(fc->p)++;
 453
 454   /* Try to convert some more */
 455   fc->p = fc->ob = fc->bufo;
 456   if (fc->ibl) {
 457     ssize_t obl = ssizeof(fc->bufo);
 458
 459     my_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
 460     if (fc->p < fc->ob)
 461       return (unsigned char) *(fc->p)++;
 462   }
 463
 464   /* If we trusted iconv a bit more, we would at this point
 465    * ask why it had stopped converting ... */
 466
 467   /* Try to read some more */
 468   if (fc->ibl == sizeof (fc->bufi) ||
 469       (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi))) {
 470     fc->p = 0;
 471     return EOF;
 472   }
 473   if (fc->ibl)
 474     memcpy (fc->bufi, fc->ib, fc->ibl);
 475   fc->ib = fc->bufi;
 476   fc->ibl +=
 477     fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
 478
 479   /* Try harder this time to convert some */
 480   if (fc->ibl) {
 481     ssize_t obl = ssizeof(fc->bufo);
 482
 483     mutt_iconv (fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob,
 484                 &obl, fc->inrepls, 0);
 485     if (fc->p < fc->ob)
 486       return (unsigned char) *(fc->p)++;
 487   }
 488
 489   /* Either the file has finished or one of the buffers is too small */
 490   fc->p = 0;
 491   return EOF;
 492 }
 493
 494 void fgetconv_close (FGETCONV ** _fc)
 495 {
 496   struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
 497
 498   if (fc->cd != (iconv_t) - 1)
 499     iconv_close (fc->cd);
 500   p_delete(_fc);
 501 }
 502
 503 const char *mutt_get_first_charset (const char *charset)
 504 {
 505   static char fcharset[SHORT_STRING];
 506   const char *c, *c1;
 507
 508   c = charset;
 509   if (!m_strlen(c))
 510     return "us-ascii";
 511   if (!(c1 = strchr (c, ':')))
 512     return ((char*) charset);
 513   m_strcpy(fcharset, c1 - c + 1, c);
 514   return fcharset;
 515 }
 516
 517 static ssize_t convert_string (const char *f, ssize_t flen,
 518                               const char *from, const char *to,
 519                               char **t, ssize_t * tlen)
 520 {
 521   iconv_t cd;
 522   char *buf, *ob;
 523   ssize_t obl;
 524   ssize_t n;
 525   int e;
 526
 527   cd = mutt_iconv_open (to, from, 0);
 528   if (cd == (iconv_t) (-1))
 529     return -1;
 530   obl = 4 * flen + 1;
 531   ob = buf = xmalloc(obl);
 532   n = my_iconv(cd, &f, &flen, &ob, &obl);
 533   if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
 534     e = errno;
 535     p_delete(&buf);
 536     iconv_close (cd);
 537     errno = e;
 538     return -1;
 539   }
 540   *ob = '\0';
 541
 542   *tlen = ob - buf;
 543
 544   p_realloc(&buf, ob - buf + 1);
 545   *t = buf;
 546   iconv_close (cd);
 547
 548   return n;
 549 }
 550
 551 int mutt_convert_nonmime_string (char **ps)
 552 {
 553   const char *c, *c1;
 554
 555   for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0) {
 556     char *u = *ps;
 557     char *s = NULL;
 558     char *fromcode;
 559     ssize_t m, n;
 560     ssize_t ulen = m_strlen(*ps);
 561     ssize_t slen;
 562
 563     if (!u || !*u)
 564       return 0;
 565
 566     c1 = strchr (c, ':');
 567     n = c1 ? c1 - c : m_strlen(c);
 568     if (!n)
 569       continue;
 570     fromcode = p_dupstr(c, n);
 571     m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
 572     p_delete(&fromcode);
 573     if (m != -1) {
 574       p_delete(ps);
 575       *ps = s;
 576       return 0;
 577     }
 578   }
 579   return -1;
 580 }
 581
 582 void mutt_set_charset (char *charset)
 583 {
 584     char buffer[STRING];
 585
 586     mutt_canonical_charset (buffer, sizeof (buffer), charset);
 587
 588     Charset_is_utf8 = 0;
 589     if (!strcmp (buffer, "utf-8"))
 590         Charset_is_utf8 = 1;
 591
 592 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
 593     bind_textdomain_codeset (PACKAGE, buffer);
 594 #endif
 595 }
 596
 597 wchar_t replacement_char(void)
 598 {
 599     return Charset_is_utf8 ? 0xfffd : '?';
 600 }