charset.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 #if HAVE_CONFIG_H
  11 # include "config.h"
  12 #endif
  13
  14 #include <string.h>
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17
  18 #include <ctype.h>
  19
  20 #include <sys/types.h>
  21 #include <dirent.h>
  22 #include <unistd.h>
  23 #include <errno.h>
  24
  25 #include <lib-lib/mem.h>
  26 #include <lib-lib/ascii.h>
  27 #include <lib-lib/str.h>
  28 #include <lib-lib/macros.h>
  29
  30 #include "mutt.h"
  31 #include "charset.h"
  32
  33
  /*
   * EILSEQ is compared against errno in mutt_iconv(); where the platform
   * does not define it, EINVAL is used in its place.
   */
  #ifndef EILSEQ
  # define EILSEQ EINVAL
  #endif

  /*
   * Non-zero while the active charset is UTF-8.  Written by
   * mutt_set_charset() and read by replacement_char().
   */
  int Charset_is_utf8 = 0;
  39
  /*
   * Alias table mapping charset names (key) to their preferred MIME
   * names (pref).  mutt_canonical_charset() walks it front to back with a
   * case-insensitive comparison on `key'; the {NULL, NULL} entry
   * terminates the walk.
   *
   * The following list has been created manually from the data under:
   * http://www.isi.edu/in-notes/iana/assignments/character-sets
   * Last update: 2000-09-07
   *
   * Note that it includes only the subset of character sets for which
   * a preferred MIME name is given.
   */

  static const struct {
    const char *key;
    const char *pref;
  } PreferredMIMENames[] = {
    {"ansi_x3.4-1968", "us-ascii"},
    {"iso-ir-6", "us-ascii"},
    {"iso_646.irv:1991", "us-ascii"},
    {"ascii", "us-ascii"},
    {"iso646-us", "us-ascii"},
    {"us", "us-ascii"},
    {"ibm367", "us-ascii"},
    {"cp367", "us-ascii"},
    {"csASCII", "us-ascii"},
    {"csISO2022KR", "iso-2022-kr"},
    {"csEUCKR", "euc-kr"},
    {"csISO2022JP", "iso-2022-jp"},
    {"csISO2022JP2", "iso-2022-jp-2"},
    {"ISO_8859-1:1987", "iso-8859-1"},
    {"iso-ir-100", "iso-8859-1"},
    {"iso_8859-1", "iso-8859-1"},
    {"latin1", "iso-8859-1"},
    {"l1", "iso-8859-1"},
    {"IBM819", "iso-8859-1"},
    {"CP819", "iso-8859-1"},
    {"csISOLatin1", "iso-8859-1"},
    {"ISO_8859-2:1987", "iso-8859-2"},
    {"iso-ir-101", "iso-8859-2"},
    {"iso_8859-2", "iso-8859-2"},
    {"latin2", "iso-8859-2"},
    {"l2", "iso-8859-2"},
    {"csISOLatin2", "iso-8859-2"},
    {"ISO_8859-3:1988", "iso-8859-3"},
    {"iso-ir-109", "iso-8859-3"},
    {"ISO_8859-3", "iso-8859-3"},
    {"latin3", "iso-8859-3"},
    {"l3", "iso-8859-3"},
    {"csISOLatin3", "iso-8859-3"},
    {"ISO_8859-4:1988", "iso-8859-4"},
    {"iso-ir-110", "iso-8859-4"},
    {"ISO_8859-4", "iso-8859-4"},
    {"latin4", "iso-8859-4"},
    {"l4", "iso-8859-4"},
    {"csISOLatin4", "iso-8859-4"},
    {"ISO_8859-6:1987", "iso-8859-6"},
    {"iso-ir-127", "iso-8859-6"},
    {"iso_8859-6", "iso-8859-6"},
    {"ECMA-114", "iso-8859-6"},
    {"ASMO-708", "iso-8859-6"},
    {"arabic", "iso-8859-6"},
    {"csISOLatinArabic", "iso-8859-6"},
    {"ISO_8859-7:1987", "iso-8859-7"},
    {"iso-ir-126", "iso-8859-7"},
    {"ISO_8859-7", "iso-8859-7"},
    {"ELOT_928", "iso-8859-7"},
    {"ECMA-118", "iso-8859-7"},
    {"greek", "iso-8859-7"},
    {"greek8", "iso-8859-7"},
    {"csISOLatinGreek", "iso-8859-7"},
    {"ISO_8859-8:1988", "iso-8859-8"},
    {"iso-ir-138", "iso-8859-8"},
    {"ISO_8859-8", "iso-8859-8"},
    {"hebrew", "iso-8859-8"},
    {"csISOLatinHebrew", "iso-8859-8"},
    {"ISO_8859-5:1988", "iso-8859-5"},
    {"iso-ir-144", "iso-8859-5"},
    {"ISO_8859-5", "iso-8859-5"},
    {"cyrillic", "iso-8859-5"},
    /* same preferred spelling as the other ISO-8859-5 aliases */
    {"csISOLatinCyrillic", "iso-8859-5"},
    {"ISO_8859-9:1989", "iso-8859-9"},
    {"iso-ir-148", "iso-8859-9"},
    {"ISO_8859-9", "iso-8859-9"},
    {"latin5", "iso-8859-9"},       /* this is not a bug */
    {"l5", "iso-8859-9"},
    {"csISOLatin5", "iso-8859-9"},
    {"ISO_8859-10:1992", "iso-8859-10"},
    {"iso-ir-157", "iso-8859-10"},
    {"latin6", "iso-8859-10"},      /* this is not a bug */
    {"l6", "iso-8859-10"},
    {"csISOLatin6", "iso-8859-10"},
    {"csKOI8r", "koi8-r"},
    {"MS_Kanji", "Shift_JIS"},      /* Note the underscore! */
    {"csShiftJis", "Shift_JIS"},
    {"Extended_UNIX_Code_Packed_Format_for_Japanese", "EUC-JP"},
    {"csEUCPkdFmtJapanese", "EUC-JP"},
    {"csGB2312", "gb2312"},
    {"csbig5", "big5"},

    /*
     * End of official brain damage.  What follows has been taken
     * from glibc's localedata files.
     */
    {"iso_8859-13", "iso-8859-13"},
    {"iso-ir-179", "iso-8859-13"},
    {"latin7", "iso-8859-13"},      /* this is not a bug */
    {"l7", "iso-8859-13"},
    {"iso_8859-14", "iso-8859-14"},
    {"latin8", "iso-8859-14"},      /* this is not a bug */
    {"l8", "iso-8859-14"},
    {"iso_8859-15", "iso-8859-15"},
    {"latin9", "iso-8859-15"},      /* this is not a bug */
    /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
    {"latin0", "iso-8859-15"},      /* this is not a bug */
    {"iso_8859-16", "iso-8859-16"},
    {"latin10", "iso-8859-16"},     /* this is not a bug */

    /*
     * David Champion <dgc@uchicago.edu> has observed this with
     * nl_langinfo under SunOS 5.8.
     */
    {"646", "us-ascii"},

    /*
     * http://www.sun.com/software/white-papers/wp-unicode/
     */
    {"eucJP", "euc-jp"},
    {"PCK", "Shift_JIS"},
    {"ko_KR-euc", "euc-kr"},
    {"zh_TW-big5", "big5"},

    /* seems to be common on some systems */
    {"sjis", "Shift_JIS"},
    {"euc-jp-ms", "eucJP-ms"},

    /*
     * If you happen to encounter system-specific brain-damage with
     * respect to character set naming, please add it above this
     * comment, and submit a patch to <mutt-dev@mutt.org>.
     */

    /* End of aliases.  Please keep this line last. */
    {NULL, NULL}
  };
 188
 189 #ifdef HAVE_LANGINFO_CODESET
 190 # include <langinfo.h>
 191
 192
 193 void mutt_set_langinfo_charset (void)
 194 {
 195   char buff[LONG_STRING];
 196   char buff2[LONG_STRING];
 197
 198   m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET));
 199   mutt_canonical_charset (buff2, sizeof (buff2), buff);
 200
 201   /* finally, set $charset */
 202   if (!(Charset = m_strdup(buff2)))
 203     Charset = m_strdup("iso-8859-1");
 204 }
 205
 206 #else
 207
 208 void mutt_set_langinfo_charset (void)
 209 {
 210   Charset = m_strdup("iso-8859-1");
 211 }
 212
 213 #endif
 214
 215 void mutt_canonical_charset (char *dest, ssize_t dlen, const char *name)
 216 {
 217   ssize_t i;
 218   char *p;
 219   char scratch[LONG_STRING];
 220
 221   /* catch some common iso-8859-something misspellings */
 222   if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-')
 223     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 4);
 224   else if (!ascii_strncasecmp (name, "8859-", 5))
 225     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5);
 226   else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-')
 227     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7);
 228   else if (!ascii_strncasecmp (name, "iso8859-", 8))
 229     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8);
 230   else
 231     m_strcpy(scratch, sizeof(scratch), NONULL(name));
 232
 233   for (i = 0; PreferredMIMENames[i].key; i++)
 234     if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
 235         !m_strcasecmp(scratch, PreferredMIMENames[i].key)) {
 236       m_strcpy(dest, dlen, PreferredMIMENames[i].pref);
 237       return;
 238     }
 239
 240   m_strcpy(dest, dlen, scratch);
 241
 242   /* for cosmetics' sake, transform to lowercase. */
 243   for (p = dest; *p; p++)
 244     *p = ascii_tolower (*p);
 245 }
 246
 247 int mutt_chscmp (const char *s, const char *chs)
 248 {
 249   char buffer[STRING];
 250
 251   if (!s)
 252     return 0;
 253
 254   mutt_canonical_charset (buffer, sizeof (buffer), s);
 255   return !ascii_strcasecmp (buffer, chs);
 256 }
 257
 258
 259 /*
 260  * Like iconv_open, but canonicalises the charsets
 261  */
 262
 263 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
 264 {
 265   char tocode1[SHORT_STRING];
 266   char fromcode1[SHORT_STRING];
 267   char *tocode2, *fromcode2;
 268   char *tmp;
 269
 270   iconv_t cd;
 271
 272   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
 273
 274 #ifdef M_ICONV_HOOK_TO
 275   /* Not used. */
 276   if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
 277     mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
 278 #endif
 279
 280   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
 281   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
 282     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
 283
 284   if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) - 1)
 285     return cd;
 286   if ((tocode2 = mutt_iconv_hook (tocode1))
 287       && (fromcode2 = mutt_iconv_hook (fromcode1)))
 288     return iconv_open (tocode2, fromcode2);
 289
 290   return (iconv_t) - 1;
 291 }
 292
 293
 /*
  * Like iconv, but keeps going even when the input is invalid
  * If you're supplying inrepls, the source charset should be stateless;
  * if you're supplying an outrepl, the target charset should be.
  *
  * Whenever conversion stops with errno == EILSEQ while input and output
  * space both remain, one input byte is skipped and a substitute is
  * emitted: first each string of the NULL-terminated list `inrepls' is
  * tried as replacement input; failing that, `outrepl' ("?" if NULL, or
  * if it does not fit) is copied verbatim into the output.
  *
  * The in/out pointers and counters are updated on return.  The result
  * is the sum of the non-error my_iconv() return values plus one for
  * every replacement made.
  */

 ssize_t mutt_iconv(iconv_t cd, const char **inbuf, ssize_t *inbytesleft,
                    char **outbuf, ssize_t *outbytesleft,
                    const char **inrepls, const char *outrepl)
 {
   ssize_t total = 0;
   const char *src = *inbuf;
   ssize_t srcleft = *inbytesleft;
   char *dst = *outbuf;
   ssize_t dstleft = *outbytesleft;

   for (;;) {
     ssize_t rc = my_iconv(cd, &src, &srcleft, &dst, &dstleft);
     ssize_t n;

     if (rc != -1)
       total += rc;

     /* anything other than "bad sequence with room left" ends the run */
     if (!srcleft || !dstleft || errno != EILSEQ)
       break;

     if (inrepls) {
       /* Try replacing the input */
       const char **cand;
       int substituted = 0;

       for (cand = inrepls; *cand && !substituted; cand++) {
         const char *rsrc = *cand;
         ssize_t rsrcleft = m_strlen(*cand);
         char *rdst = dst;
         ssize_t rdstleft = dstleft;

         my_iconv(cd, &rsrc, &rsrcleft, &rdst, &rdstleft);
         if (rsrcleft == 0) {
           /* candidate converted completely: skip the offending byte */
           src++;
           srcleft--;
           dst = rdst;
           dstleft = rdstleft;
           total++;
           substituted = 1;
         }
       }
       if (substituted)
         continue;
     }

     /* Replace the output */
     if (!outrepl)
       outrepl = "?";
     my_iconv(cd, 0, 0, &dst, &dstleft);
     if (!dstleft)
       break;

     n = m_strlen(outrepl);
     if (n > dstleft) {
       outrepl = "?";
       n = 1;
     }
     memcpy (dst, outrepl, n);
     src++;
     srcleft--;
     dst += n;
     dstleft -= n;
     total++;
     my_iconv(cd, 0, 0, 0, 0);   /* for good measure */
   }

   *inbuf = src;
   *inbytesleft = srcleft;
   *outbuf = dst;
   *outbytesleft = dstleft;
   return total;
 }
 360
 361
 /*
  * Convert a string
  * Used in rfc2047.c and rfc2231.c
  *
  * Converts *ps from charset `from' to charset `to' and replaces *ps
  * with a newly allocated result.  Unconvertible input is substituted
  * rather than treated as an error: U+FFFD when the target is UTF-8,
  * otherwise "?" (attempted as replacement input when the source is
  * UTF-8).
  *
  * Returns 0 on success or when *ps is NULL/empty (left untouched),
  * -1 when a charset name is missing or no converter could be opened.
  */

 int mutt_convert_string (char **ps, const char *from, const char *to,
                          int flags)
 {
   const char *repls[] = { "\357\277\275", "?", 0 };
   const char **inrepls = NULL;
   const char *outrepl = NULL;
   const char *src;
   char *text = *ps;
   char *out, *dst;
   ssize_t srcleft, dstleft;
   iconv_t cd;
   int len;

   if (!text || !*text)
     return 0;

   if (!to || !from)
     return -1;
   cd = mutt_iconv_open (to, from, flags);
   if (cd == (iconv_t) - 1)
     return -1;

   if (mutt_is_utf8 (to))
     outrepl = "\357\277\275";
   else if (mutt_is_utf8 (from))
     inrepls = repls;
   else
     outrepl = "?";

   /* the terminating NUL is converted along with the text */
   len = m_strlen(text);
   src = text;
   srcleft = len + 1;
   dstleft = MB_LEN_MAX * srcleft;
   dst = out = xmalloc(dstleft + 1);

   mutt_iconv (cd, &src, &srcleft, &dst, &dstleft, inrepls, outrepl);
   iconv_close (cd);

   *dst = '\0';

   p_delete(ps);
   *ps = out;
   return 0;
 }
 409
 410
 /*
  * FGETCONV stuff for converting a file while reading it
  * Used in sendlib.c for converting from mutt's Charset
  */

 /* State behind an FGETCONV handle, as used by fgetconv_open()/fgetconv(). */
 struct fgetconv_s {
   FILE *file;             /* stream the raw bytes are read from */
   iconv_t cd;             /* converter; (iconv_t) -1 means no conversion */
   char bufi[512];         /* raw input read from `file' */
   char bufo[512];         /* converted output waiting to be returned */
   char *p;                /* next byte of bufo to return; NULL once finished */
   char *ob;               /* end of the converted data in bufo */
   char *ib;               /* first not-yet-converted byte in bufi */
   ssize_t ibl;            /* number of not-yet-converted bytes at ib */
   const char **inrepls;   /* NULL-terminated replacements for bad input */
 };

 /* Only the two leading members of fgetconv_s, in the same order. */
 struct fgetconv_not {
   FILE *file;
   iconv_t cd;
 };
 432
 433 FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to,
 434                          int flags)
 435 {
 436   struct fgetconv_s *fc;
 437   iconv_t cd = (iconv_t) - 1;
 438   static const char *repls[] = { "\357\277\275", "?", 0 };
 439
 440   if (from && to)
 441     cd = mutt_iconv_open (to, from, flags);
 442
 443   if (cd != (iconv_t) - 1) {
 444     fc = p_new(struct fgetconv_s, 1);
 445     fc->p = fc->ob = fc->bufo;
 446     fc->ib = fc->bufi;
 447     fc->ibl = 0;
 448     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
 449   }
 450   else
 451     fc = p_new(struct fgetconv_s, 1);
 452   fc->file = file;
 453   fc->cd = cd;
 454   return (FGETCONV *) fc;
 455 }
 456
 457 char *fgetconvs (char *buf, ssize_t l, FGETCONV * _fc)
 458 {
 459   int c;
 460   ssize_t r;
 461
 462   for (r = 0; r + 1 < l;) {
 463     if ((c = fgetconv (_fc)) == EOF)
 464       break;
 465     buf[r++] = (char) c;
 466     if (c == '\n')
 467       break;
 468   }
 469   buf[r] = '\0';
 470
 471   if (r)
 472     return buf;
 473   else
 474     return NULL;
 475 }
 476
 477 int fgetconv (FGETCONV * _fc)
 478 {
 479   struct fgetconv_s *fc = (struct fgetconv_s *) _fc;
 480
 481   if (!fc)
 482     return EOF;
 483   if (fc->cd == (iconv_t) - 1)
 484     return fgetc (fc->file);
 485   if (!fc->p)
 486     return EOF;
 487   if (fc->p < fc->ob)
 488     return (unsigned char) *(fc->p)++;
 489
 490   /* Try to convert some more */
 491   fc->p = fc->ob = fc->bufo;
 492   if (fc->ibl) {
 493     ssize_t obl = ssizeof(fc->bufo);
 494
 495     my_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
 496     if (fc->p < fc->ob)
 497       return (unsigned char) *(fc->p)++;
 498   }
 499
 500   /* If we trusted iconv a bit more, we would at this point
 501    * ask why it had stopped converting ... */
 502
 503   /* Try to read some more */
 504   if (fc->ibl == sizeof (fc->bufi) ||
 505       (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi))) {
 506     fc->p = 0;
 507     return EOF;
 508   }
 509   if (fc->ibl)
 510     memcpy (fc->bufi, fc->ib, fc->ibl);
 511   fc->ib = fc->bufi;
 512   fc->ibl +=
 513     fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
 514
 515   /* Try harder this time to convert some */
 516   if (fc->ibl) {
 517     ssize_t obl = ssizeof(fc->bufo);
 518
 519     mutt_iconv (fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob,
 520                 &obl, fc->inrepls, 0);
 521     if (fc->p < fc->ob)
 522       return (unsigned char) *(fc->p)++;
 523   }
 524
 525   /* Either the file has finished or one of the buffers is too small */
 526   fc->p = 0;
 527   return EOF;
 528 }
 529
 530 void fgetconv_close (FGETCONV ** _fc)
 531 {
 532   struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
 533
 534   if (fc->cd != (iconv_t) - 1)
 535     iconv_close (fc->cd);
 536   p_delete(_fc);
 537 }
 538
 539 const char *mutt_get_first_charset (const char *charset)
 540 {
 541   static char fcharset[SHORT_STRING];
 542   const char *c, *c1;
 543
 544   c = charset;
 545   if (!m_strlen(c))
 546     return "us-ascii";
 547   if (!(c1 = strchr (c, ':')))
 548     return ((char*) charset);
 549   m_strcpy(fcharset, c1 - c + 1, c);
 550   return fcharset;
 551 }
 552
 /*
  * Convert the `flen' bytes at `f' from charset `from' to charset `to'.
  *
  * Unlike mutt_iconv(), any conversion error is fatal: nothing is
  * substituted.  On success *t receives a newly allocated,
  * NUL-terminated buffer, *tlen its length without the terminator, and
  * the value returned by my_iconv() is passed on.  On failure -1 is
  * returned, *t and *tlen are left untouched, and errno is the one set
  * by the failing conversion call.
  */
 static ssize_t convert_string (const char *f, ssize_t flen,
                                const char *from, const char *to,
                                char **t, ssize_t * tlen)
 {
   iconv_t cd;
   char *buf, *ob;
   ssize_t obl;
   ssize_t n;

   cd = mutt_iconv_open (to, from, 0);
   if (cd == (iconv_t) (-1))
     return -1;

   obl = 4 * flen + 1;
   /* The converter may use all obl bytes; allocate one more so that the
    * terminator written below always stays inside the buffer. */
   ob = buf = xmalloc(obl + 1);

   n = my_iconv(cd, &f, &flen, &ob, &obl);
   if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
     /* keep errno intact across the cleanup calls */
     int e = errno;

     p_delete(&buf);
     iconv_close (cd);
     errno = e;
     return -1;
   }
   *ob = '\0';

   *tlen = ob - buf;

   /* shrink the allocation to what was actually produced */
   p_realloc(&buf, ob - buf + 1);
   *t = buf;
   iconv_close (cd);

   return n;
 }
 586
 587 int mutt_convert_nonmime_string (char **ps)
 588 {
 589   const char *c, *c1;
 590
 591   for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0) {
 592     char *u = *ps;
 593     char *s = NULL;
 594     char *fromcode;
 595     ssize_t m, n;
 596     ssize_t ulen = m_strlen(*ps);
 597     ssize_t slen;
 598
 599     if (!u || !*u)
 600       return 0;
 601
 602     c1 = strchr (c, ':');
 603     n = c1 ? c1 - c : m_strlen(c);
 604     if (!n)
 605       continue;
 606     fromcode = p_dupstr(c, n);
 607     m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
 608     p_delete(&fromcode);
 609     if (m != -1) {
 610       p_delete(ps);
 611       *ps = s;
 612       return 0;
 613     }
 614   }
 615   return -1;
 616 }
 617
 618 void mutt_set_charset (char *charset)
 619 {
 620     char buffer[STRING];
 621
 622     mutt_canonical_charset (buffer, sizeof (buffer), charset);
 623
 624     Charset_is_utf8 = 0;
 625     if (!strcmp (buffer, "utf-8"))
 626         Charset_is_utf8 = 1;
 627
 628 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
 629     bind_textdomain_codeset (PACKAGE, buffer);
 630 #endif
 631 }
 632
 633 wchar_t replacement_char (void)
 634 {
 635     return Charset_is_utf8 ? 0xfffd : '?';
 636 }