charset.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 #if HAVE_CONFIG_H
  11 # include "config.h"
  12 #endif
  13
  14 #include <string.h>
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17
  18 #include <ctype.h>
  19
  20 #include <sys/types.h>
  21 #include <dirent.h>
  22 #include <unistd.h>
  23 #include <errno.h>
  24
  25 #include "mutt.h"
  26 #include "charset.h"
  27
  28 #ifndef EILSEQ
  29 # define EILSEQ EINVAL
  30 #endif
  31
  32 /*
  33  * The following list has been created manually from the data under:
  34  * http://www.isi.edu/in-notes/iana/assignments/character-sets
  35  * Last update: 2000-09-07
  36  *
  37  * Note that it includes only the subset of character sets for which
  38  * a preferred MIME name is given.
  39  */
  40
  41 static struct {
  42   char *key;
  43   char *pref;
  44 } PreferredMIMENames[] = {
  45   {
  46   "ansi_x3.4-1968", "us-ascii"}, {
  47   "iso-ir-6", "us-ascii"}, {
  48   "iso_646.irv:1991", "us-ascii"}, {
  49   "ascii", "us-ascii"}, {
  50   "iso646-us", "us-ascii"}, {
  51   "us", "us-ascii"}, {
  52   "ibm367", "us-ascii"}, {
  53   "cp367", "us-ascii"}, {
  54   "csASCII", "us-ascii"}, {
  55   "csISO2022KR", "iso-2022-kr"}, {
  56   "csEUCKR", "euc-kr"}, {
  57   "csISO2022JP", "iso-2022-jp"}, {
  58   "csISO2022JP2", "iso-2022-jp-2"}, {
  59   "ISO_8859-1:1987", "iso-8859-1"}, {
  60   "iso-ir-100", "iso-8859-1"}, {
  61   "iso_8859-1", "iso-8859-1"}, {
  62   "latin1", "iso-8859-1"}, {
  63   "l1", "iso-8859-1"}, {
  64   "IBM819", "iso-8859-1"}, {
  65   "CP819", "iso-8859-1"}, {
  66   "csISOLatin1", "iso-8859-1"}, {
  67   "ISO_8859-2:1987", "iso-8859-2"}, {
  68   "iso-ir-101", "iso-8859-2"}, {
  69   "iso_8859-2", "iso-8859-2"}, {
  70   "latin2", "iso-8859-2"}, {
  71   "l2", "iso-8859-2"}, {
  72   "csISOLatin2", "iso-8859-2"}, {
  73   "ISO_8859-3:1988", "iso-8859-3"}, {
  74   "iso-ir-109", "iso-8859-3"}, {
  75   "ISO_8859-3", "iso-8859-3"}, {
  76   "latin3", "iso-8859-3"}, {
  77   "l3", "iso-8859-3"}, {
  78   "csISOLatin3", "iso-8859-3"}, {
  79   "ISO_8859-4:1988", "iso-8859-4"}, {
  80   "iso-ir-110", "iso-8859-4"}, {
  81   "ISO_8859-4", "iso-8859-4"}, {
  82   "latin4", "iso-8859-4"}, {
  83   "l4", "iso-8859-4"}, {
  84   "csISOLatin4", "iso-8859-4"}, {
  85   "ISO_8859-6:1987", "iso-8859-6"}, {
  86   "iso-ir-127", "iso-8859-6"}, {
  87   "iso_8859-6", "iso-8859-6"}, {
  88   "ECMA-114", "iso-8859-6"}, {
  89   "ASMO-708", "iso-8859-6"}, {
  90   "arabic", "iso-8859-6"}, {
  91   "csISOLatinArabic", "iso-8859-6"}, {
  92   "ISO_8859-7:1987", "iso-8859-7"}, {
  93   "iso-ir-126", "iso-8859-7"}, {
  94   "ISO_8859-7", "iso-8859-7"}, {
  95   "ELOT_928", "iso-8859-7"}, {
  96   "ECMA-118", "iso-8859-7"}, {
  97   "greek", "iso-8859-7"}, {
  98   "greek8", "iso-8859-7"}, {
  99   "csISOLatinGreek", "iso-8859-7"}, {
 100   "ISO_8859-8:1988", "iso-8859-8"}, {
 101   "iso-ir-138", "iso-8859-8"}, {
 102   "ISO_8859-8", "iso-8859-8"}, {
 103   "hebrew", "iso-8859-8"}, {
 104   "csISOLatinHebrew", "iso-8859-8"}, {
 105   "ISO_8859-5:1988", "iso-8859-5"}, {
 106   "iso-ir-144", "iso-8859-5"}, {
 107   "ISO_8859-5", "iso-8859-5"}, {
 108   "cyrillic", "iso-8859-5"}, {
 109   "csISOLatinCyrillic", "iso8859-5"}, {
 110   "ISO_8859-9:1989", "iso-8859-9"}, {
 111   "iso-ir-148", "iso-8859-9"}, {
 112   "ISO_8859-9", "iso-8859-9"}, {
 113   "latin5", "iso-8859-9"},      /* this is not a bug */
 114   {
 115   "l5", "iso-8859-9"}, {
 116   "csISOLatin5", "iso-8859-9"}, {
 117   "ISO_8859-10:1992", "iso-8859-10"}, {
 118   "iso-ir-157", "iso-8859-10"}, {
 119   "latin6", "iso-8859-10"},     /* this is not a bug */
 120   {
 121   "l6", "iso-8859-10"}, {
 122   "csISOLatin6" "iso-8859-10"}, {
 123   "csKOI8r", "koi8-r"}, {
 124   "MS_Kanji", "Shift_JIS"},     /* Note the underscore! */
 125   {
 126   "csShiftJis", "Shift_JIS"}, {
 127   "Extended_UNIX_Code_Packed_Format_for_Japanese", "EUC-JP"}, {
 128   "csEUCPkdFmtJapanese", "EUC-JP"}, {
 129   "csGB2312", "gb2312"}, {
 130   "csbig5", "big5"},
 131     /*
 132      * End of official brain damage.  What follows has been taken
 133      * from glibc's localedata files.
 134      */
 135   {
 136   "iso_8859-13", "iso-8859-13"}, {
 137   "iso-ir-179", "iso-8859-13"}, {
 138   "latin7", "iso-8859-13"},     /* this is not a bug */
 139   {
 140   "l7", "iso-8859-13"}, {
 141   "iso_8859-14", "iso-8859-14"}, {
 142   "latin8", "iso-8859-14"},     /* this is not a bug */
 143   {
 144   "l8", "iso-8859-14"}, {
 145   "iso_8859-15", "iso-8859-15"}, {
 146   "latin9", "iso-8859-15"},     /* this is not a bug */
 147     /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
 148   {
 149   "latin0", "iso-8859-15"},     /* this is not a bug */
 150   {
 151   "iso_8859-16", "iso-8859-16"}, {
 152   "latin10", "iso-8859-16"},    /* this is not a bug */
 153     /*
 154      * David Champion <dgc@uchicago.edu> has observed this with
 155      * nl_langinfo under SunOS 5.8.
 156      */
 157   {
 158   "646", "us-ascii"},
 159     /*
 160      * http://www.sun.com/software/white-papers/wp-unicode/
 161      */
 162   {
 163   "eucJP", "euc-jp"}, {
 164   "PCK", "Shift_JIS"}, {
 165   "ko_KR-euc", "euc-kr"}, {
 166   "zh_TW-big5", "big5"},
 167     /* seems to be common on some systems */
 168   {
 169   "sjis", "Shift_JIS"}, {
 170   "euc-jp-ms", "eucJP-ms"},
 171     /*
 172      * If you happen to encounter system-specific brain-damage with
 173      * respect to character set naming, please add it above this
 174      * comment, and submit a patch to <mutt-dev@mutt.org>.
 175      */
 176     /* End of aliases.  Please keep this line last. */
 177   {
 178   NULL, NULL}
 179 };
 180
 181 #ifdef HAVE_LANGINFO_CODESET
 182 # include <langinfo.h>
 183
 184
 185 void mutt_set_langinfo_charset (void)
 186 {
 187   char buff[LONG_STRING];
 188   char buff2[LONG_STRING];
 189
 190   strfcpy (buff, nl_langinfo (CODESET), sizeof (buff));
 191   mutt_canonical_charset (buff2, sizeof (buff2), buff);
 192
 193   /* finally, set $charset */
 194   if (!(Charset = safe_strdup (buff2)))
 195     Charset = safe_strdup ("iso-8859-1");
 196 }
 197
 198 #else
 199
 200 void mutt_set_langinfo_charset (void)
 201 {
 202   Charset = safe_strdup ("iso-8859-1");
 203 }
 204
 205 #endif
 206
 207 void mutt_canonical_charset (char *dest, size_t dlen, const char *name)
 208 {
 209   size_t i;
 210   char *p;
 211   char scratch[LONG_STRING];
 212
 213   /* catch some common iso-8859-something misspellings */
 214   if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-')
 215     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 4);
 216   else if (!ascii_strncasecmp (name, "8859-", 5))
 217     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5);
 218   else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-')
 219     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7);
 220   else if (!ascii_strncasecmp (name, "iso8859-", 8))
 221     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8);
 222   else
 223     strfcpy (scratch, NONULL (name), sizeof (scratch));
 224
 225   for (i = 0; PreferredMIMENames[i].key; i++)
 226     if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
 227         !mutt_strcasecmp (scratch, PreferredMIMENames[i].key)) {
 228       strfcpy (dest, PreferredMIMENames[i].pref, dlen);
 229       return;
 230     }
 231
 232   strfcpy (dest, scratch, dlen);
 233
 234   /* for cosmetics' sake, transform to lowercase. */
 235   for (p = dest; *p; p++)
 236     *p = ascii_tolower (*p);
 237 }
 238
 239 int mutt_chscmp (const char *s, const char *chs)
 240 {
 241   char buffer[STRING];
 242
 243   if (!s)
 244     return 0;
 245
 246   mutt_canonical_charset (buffer, sizeof (buffer), s);
 247   return !ascii_strcasecmp (buffer, chs);
 248 }
 249
 250
 251 #ifndef HAVE_ICONV
 252
 253 iconv_t iconv_open (const char *tocode, const char *fromcode)
 254 {
 255   return (iconv_t) (-1);
 256 }
 257
 258 size_t iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft,
 259               char **outbuf, size_t * outbytesleft)
 260 {
 261   return 0;
 262 }
 263
 264 int iconv_close (iconv_t cd)
 265 {
 266   return 0;
 267 }
 268
 269 #endif /* !HAVE_ICONV */
 270
 271
 272 /*
 273  * Like iconv_open, but canonicalises the charsets
 274  */
 275
 276 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
 277 {
 278   char tocode1[SHORT_STRING];
 279   char fromcode1[SHORT_STRING];
 280   char *tocode2, *fromcode2;
 281   char *tmp;
 282
 283   iconv_t cd;
 284
 285   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
 286
 287 #ifdef M_ICONV_HOOK_TO
 288   /* Not used. */
 289   if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
 290     mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
 291 #endif
 292
 293   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
 294   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
 295     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
 296
 297   if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) - 1)
 298     return cd;
 299   if ((tocode2 = mutt_iconv_hook (tocode1))
 300       && (fromcode2 = mutt_iconv_hook (fromcode1)))
 301     return iconv_open (tocode2, fromcode2);
 302
 303   return (iconv_t) - 1;
 304 }
 305
 306
 307 /*
 308  * Like iconv, but keeps going even when the input is invalid
 309  * If you're supplying inrepls, the source charset should be stateless;
 310  * if you're supplying an outrepl, the target charset should be.
 311  */
 312
 313 size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft,
 314                    char **outbuf, size_t * outbytesleft,
 315                    ICONV_CONST char **inrepls, const char *outrepl)
 316 {
 317   size_t ret = 0, ret1;
 318   ICONV_CONST char *ib = *inbuf;
 319   size_t ibl = *inbytesleft;
 320   char *ob = *outbuf;
 321   size_t obl = *outbytesleft;
 322
 323   for (;;) {
 324     ret1 = iconv (cd, &ib, &ibl, &ob, &obl);
 325     if (ret1 != (size_t) - 1)
 326       ret += ret1;
 327     if (ibl && obl && errno == EILSEQ) {
 328       if (inrepls) {
 329         /* Try replacing the input */
 330         ICONV_CONST char **t;
 331
 332         for (t = inrepls; *t; t++) {
 333           ICONV_CONST char *ib1 = *t;
 334           size_t ibl1 = strlen (*t);
 335           char *ob1 = ob;
 336           size_t obl1 = obl;
 337
 338           iconv (cd, &ib1, &ibl1, &ob1, &obl1);
 339           if (!ibl1) {
 340             ++ib, --ibl;
 341             ob = ob1, obl = obl1;
 342             ++ret;
 343             break;
 344           }
 345         }
 346         if (*t)
 347           continue;
 348       }
 349       /* Replace the output */
 350       if (!outrepl)
 351         outrepl = "?";
 352       iconv (cd, 0, 0, &ob, &obl);
 353       if (obl) {
 354         int n = strlen (outrepl);
 355
 356         if (n > obl) {
 357           outrepl = "?";
 358           n = 1;
 359         }
 360         memcpy (ob, outrepl, n);
 361         ++ib, --ibl;
 362         ob += n, obl -= n;
 363         ++ret;
 364         iconv (cd, 0, 0, 0, 0); /* for good measure */
 365         continue;
 366       }
 367     }
 368     *inbuf = ib, *inbytesleft = ibl;
 369     *outbuf = ob, *outbytesleft = obl;
 370     return ret;
 371   }
 372 }
 373
 374
 375 /*
 376  * Convert a string
 377  * Used in rfc2047.c and rfc2231.c
 378  */
 379
 380 int mutt_convert_string (char **ps, const char *from, const char *to,
 381                          int flags)
 382 {
 383   iconv_t cd;
 384   ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
 385   char *s = *ps;
 386
 387   if (!s || !*s)
 388     return 0;
 389
 390   if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t) - 1) {
 391     int len;
 392     ICONV_CONST char *ib;
 393     char *buf, *ob;
 394     size_t ibl, obl;
 395     ICONV_CONST char **inrepls = 0;
 396     char *outrepl = 0;
 397
 398     if (mutt_is_utf8 (to))
 399       outrepl = "\357\277\275";
 400     else if (mutt_is_utf8 (from))
 401       inrepls = repls;
 402     else
 403       outrepl = "?";
 404
 405     len = strlen (s);
 406     ib = s, ibl = len + 1;
 407     obl = MB_LEN_MAX * ibl;
 408     ob = buf = safe_malloc (obl + 1);
 409
 410     mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 411     iconv_close (cd);
 412
 413     *ob = '\0';
 414
 415     FREE (ps);
 416     *ps = buf;
 417
 418     mutt_str_adjust (ps);
 419     return 0;
 420   }
 421   else
 422     return -1;
 423 }
 424
 425
 426 /*
 427  * FGETCONV stuff for converting a file while reading it
 428  * Used in sendlib.c for converting from mutt's Charset
 429  */
 430
 431 struct fgetconv_s {
 432   FILE *file;
 433   iconv_t cd;
 434   char bufi[512];
 435   char bufo[512];
 436   char *p;
 437   char *ob;
 438   char *ib;
 439   size_t ibl;
 440   ICONV_CONST char **inrepls;
 441 };
 442
 443 struct fgetconv_not {
 444   FILE *file;
 445   iconv_t cd;
 446 };
 447
 448 FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to,
 449                          int flags)
 450 {
 451   struct fgetconv_s *fc;
 452   iconv_t cd = (iconv_t) - 1;
 453   static ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
 454
 455   if (from && to)
 456     cd = mutt_iconv_open (to, from, flags);
 457
 458   if (cd != (iconv_t) - 1) {
 459     fc = safe_malloc (sizeof (struct fgetconv_s));
 460     fc->p = fc->ob = fc->bufo;
 461     fc->ib = fc->bufi;
 462     fc->ibl = 0;
 463     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
 464   }
 465   else
 466     fc = safe_malloc (sizeof (struct fgetconv_not));
 467   fc->file = file;
 468   fc->cd = cd;
 469   return (FGETCONV *) fc;
 470 }
 471
 472 char *fgetconvs (char *buf, size_t l, FGETCONV * _fc)
 473 {
 474   int c;
 475   size_t r;
 476
 477   for (r = 0; r + 1 < l;) {
 478     if ((c = fgetconv (_fc)) == EOF)
 479       break;
 480     buf[r++] = (char) c;
 481     if (c == '\n')
 482       break;
 483   }
 484   buf[r] = '\0';
 485
 486   if (r)
 487     return buf;
 488   else
 489     return NULL;
 490 }
 491
 492 int fgetconv (FGETCONV * _fc)
 493 {
 494   struct fgetconv_s *fc = (struct fgetconv_s *) _fc;
 495
 496   if (!fc)
 497     return EOF;
 498   if (fc->cd == (iconv_t) - 1)
 499     return fgetc (fc->file);
 500   if (!fc->p)
 501     return EOF;
 502   if (fc->p < fc->ob)
 503     return (unsigned char) *(fc->p)++;
 504
 505   /* Try to convert some more */
 506   fc->p = fc->ob = fc->bufo;
 507   if (fc->ibl) {
 508     size_t obl = sizeof (fc->bufo);
 509
 510     iconv (fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
 511     if (fc->p < fc->ob)
 512       return (unsigned char) *(fc->p)++;
 513   }
 514
 515   /* If we trusted iconv a bit more, we would at this point
 516    * ask why it had stopped converting ... */
 517
 518   /* Try to read some more */
 519   if (fc->ibl == sizeof (fc->bufi) ||
 520       (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi))) {
 521     fc->p = 0;
 522     return EOF;
 523   }
 524   if (fc->ibl)
 525     memcpy (fc->bufi, fc->ib, fc->ibl);
 526   fc->ib = fc->bufi;
 527   fc->ibl +=
 528     fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
 529
 530   /* Try harder this time to convert some */
 531   if (fc->ibl) {
 532     size_t obl = sizeof (fc->bufo);
 533
 534     mutt_iconv (fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob,
 535                 &obl, fc->inrepls, 0);
 536     if (fc->p < fc->ob)
 537       return (unsigned char) *(fc->p)++;
 538   }
 539
 540   /* Either the file has finished or one of the buffers is too small */
 541   fc->p = 0;
 542   return EOF;
 543 }
 544
 545 void fgetconv_close (FGETCONV ** _fc)
 546 {
 547   struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
 548
 549   if (fc->cd != (iconv_t) - 1)
 550     iconv_close (fc->cd);
 551   FREE (_fc);
 552 }
 553
 554 char *mutt_get_first_charset (const char *charset)
 555 {
 556   static char fcharset[SHORT_STRING];
 557   const char *c, *c1;
 558
 559   c = charset;
 560   if (!mutt_strlen (c))
 561     return "us-ascii";
 562   if (!(c1 = strchr (c, ':')))
 563     return charset;
 564   strfcpy (fcharset, c, c1 - c + 1);
 565   return fcharset;
 566 }
 567
 568 static size_t convert_string (ICONV_CONST char *f, size_t flen,
 569                               const char *from, const char *to,
 570                               char **t, size_t * tlen)
 571 {
 572   iconv_t cd;
 573   char *buf, *ob;
 574   size_t obl, n;
 575   int e;
 576
 577   cd = mutt_iconv_open (to, from, 0);
 578   if (cd == (iconv_t) (-1))
 579     return (size_t) (-1);
 580   obl = 4 * flen + 1;
 581   ob = buf = safe_malloc (obl);
 582   n = iconv (cd, &f, &flen, &ob, &obl);
 583   if (n == (size_t) (-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t) (-1)) {
 584     e = errno;
 585     FREE (&buf);
 586     iconv_close (cd);
 587     errno = e;
 588     return (size_t) (-1);
 589   }
 590   *ob = '\0';
 591
 592   *tlen = ob - buf;
 593
 594   safe_realloc (&buf, ob - buf + 1);
 595   *t = buf;
 596   iconv_close (cd);
 597
 598   return n;
 599 }
 600
 601 int mutt_convert_nonmime_string (char **ps)
 602 {
 603   const char *c, *c1;
 604
 605   for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0) {
 606     char *u = *ps;
 607     char *s;
 608     char *fromcode;
 609     size_t m, n;
 610     size_t ulen = mutt_strlen (*ps);
 611     size_t slen;
 612
 613     if (!u || !*u)
 614       return 0;
 615
 616     c1 = strchr (c, ':');
 617     n = c1 ? c1 - c : mutt_strlen (c);
 618     if (!n)
 619       continue;
 620     fromcode = safe_malloc (n + 1);
 621     strfcpy (fromcode, c, n + 1);
 622     m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
 623     FREE (&fromcode);
 624     if (m != (size_t) (-1)) {
 625       FREE (ps);
 626       *ps = s;
 627       return 0;
 628     }
 629   }
 630   return -1;
 631 }