charset.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 #if HAVE_CONFIG_H
  11 # include "config.h"
  12 #endif
  13
  14 #include <string.h>
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17
  18 #include <ctype.h>
  19
  20 #include <sys/types.h>
  21 #include <dirent.h>
  22 #include <unistd.h>
  23 #include <errno.h>
  24
  25 #include <lib-lib/mem.h>
  26
  27 #include "mutt.h"
  28 #include "charset.h"
  29 #include "ascii.h"
  30
  31 #include "lib/mem.h"
  32 #include "lib/intl.h"
  33 #include "lib/str.h"
  34
  35 #ifndef EILSEQ
  36 # define EILSEQ EINVAL
  37 #endif
  38
  39 /*
  40  * The following list has been created manually from the data under:
  41  * http://www.isi.edu/in-notes/iana/assignments/character-sets
  42  * Last update: 2000-09-07
  43  *
  44  * Note that it includes only the subset of character sets for which
  45  * a preferred MIME name is given.
  46  */
  47
  48 static struct {
  49   const char *key;
  50   const char *pref;
  51 } PreferredMIMENames[] = {
  52   {
  53   "ansi_x3.4-1968", "us-ascii"}, {
  54   "iso-ir-6", "us-ascii"}, {
  55   "iso_646.irv:1991", "us-ascii"}, {
  56   "ascii", "us-ascii"}, {
  57   "iso646-us", "us-ascii"}, {
  58   "us", "us-ascii"}, {
  59   "ibm367", "us-ascii"}, {
  60   "cp367", "us-ascii"}, {
  61   "csASCII", "us-ascii"}, {
  62   "csISO2022KR", "iso-2022-kr"}, {
  63   "csEUCKR", "euc-kr"}, {
  64   "csISO2022JP", "iso-2022-jp"}, {
  65   "csISO2022JP2", "iso-2022-jp-2"}, {
  66   "ISO_8859-1:1987", "iso-8859-1"}, {
  67   "iso-ir-100", "iso-8859-1"}, {
  68   "iso_8859-1", "iso-8859-1"}, {
  69   "latin1", "iso-8859-1"}, {
  70   "l1", "iso-8859-1"}, {
  71   "IBM819", "iso-8859-1"}, {
  72   "CP819", "iso-8859-1"}, {
  73   "csISOLatin1", "iso-8859-1"}, {
  74   "ISO_8859-2:1987", "iso-8859-2"}, {
  75   "iso-ir-101", "iso-8859-2"}, {
  76   "iso_8859-2", "iso-8859-2"}, {
  77   "latin2", "iso-8859-2"}, {
  78   "l2", "iso-8859-2"}, {
  79   "csISOLatin2", "iso-8859-2"}, {
  80   "ISO_8859-3:1988", "iso-8859-3"}, {
  81   "iso-ir-109", "iso-8859-3"}, {
  82   "ISO_8859-3", "iso-8859-3"}, {
  83   "latin3", "iso-8859-3"}, {
  84   "l3", "iso-8859-3"}, {
  85   "csISOLatin3", "iso-8859-3"}, {
  86   "ISO_8859-4:1988", "iso-8859-4"}, {
  87   "iso-ir-110", "iso-8859-4"}, {
  88   "ISO_8859-4", "iso-8859-4"}, {
  89   "latin4", "iso-8859-4"}, {
  90   "l4", "iso-8859-4"}, {
  91   "csISOLatin4", "iso-8859-4"}, {
  92   "ISO_8859-6:1987", "iso-8859-6"}, {
  93   "iso-ir-127", "iso-8859-6"}, {
  94   "iso_8859-6", "iso-8859-6"}, {
  95   "ECMA-114", "iso-8859-6"}, {
  96   "ASMO-708", "iso-8859-6"}, {
  97   "arabic", "iso-8859-6"}, {
  98   "csISOLatinArabic", "iso-8859-6"}, {
  99   "ISO_8859-7:1987", "iso-8859-7"}, {
 100   "iso-ir-126", "iso-8859-7"}, {
 101   "ISO_8859-7", "iso-8859-7"}, {
 102   "ELOT_928", "iso-8859-7"}, {
 103   "ECMA-118", "iso-8859-7"}, {
 104   "greek", "iso-8859-7"}, {
 105   "greek8", "iso-8859-7"}, {
 106   "csISOLatinGreek", "iso-8859-7"}, {
 107   "ISO_8859-8:1988", "iso-8859-8"}, {
 108   "iso-ir-138", "iso-8859-8"}, {
 109   "ISO_8859-8", "iso-8859-8"}, {
 110   "hebrew", "iso-8859-8"}, {
 111   "csISOLatinHebrew", "iso-8859-8"}, {
 112   "ISO_8859-5:1988", "iso-8859-5"}, {
 113   "iso-ir-144", "iso-8859-5"}, {
 114   "ISO_8859-5", "iso-8859-5"}, {
 115   "cyrillic", "iso-8859-5"}, {
 116   "csISOLatinCyrillic", "iso8859-5"}, {
 117   "ISO_8859-9:1989", "iso-8859-9"}, {
 118   "iso-ir-148", "iso-8859-9"}, {
 119   "ISO_8859-9", "iso-8859-9"}, {
 120   "latin5", "iso-8859-9"},      /* this is not a bug */
 121   {
 122   "l5", "iso-8859-9"}, {
 123   "csISOLatin5", "iso-8859-9"}, {
 124   "ISO_8859-10:1992", "iso-8859-10"}, {
 125   "iso-ir-157", "iso-8859-10"}, {
 126   "latin6", "iso-8859-10"},     /* this is not a bug */
 127   {
 128   "l6", "iso-8859-10"}, {
 129   "csISOLatin6", "iso-8859-10"}, {
 130   "csKOI8r", "koi8-r"}, {
 131   "MS_Kanji", "Shift_JIS"},     /* Note the underscore! */
 132   {
 133   "csShiftJis", "Shift_JIS"}, {
 134   "Extended_UNIX_Code_Packed_Format_for_Japanese", "EUC-JP"}, {
 135   "csEUCPkdFmtJapanese", "EUC-JP"}, {
 136   "csGB2312", "gb2312"}, {
 137   "csbig5", "big5"},
 138     /*
 139      * End of official brain damage.  What follows has been taken
 140      * from glibc's localedata files.
 141      */
 142   {
 143   "iso_8859-13", "iso-8859-13"}, {
 144   "iso-ir-179", "iso-8859-13"}, {
 145   "latin7", "iso-8859-13"},     /* this is not a bug */
 146   {
 147   "l7", "iso-8859-13"}, {
 148   "iso_8859-14", "iso-8859-14"}, {
 149   "latin8", "iso-8859-14"},     /* this is not a bug */
 150   {
 151   "l8", "iso-8859-14"}, {
 152   "iso_8859-15", "iso-8859-15"}, {
 153   "latin9", "iso-8859-15"},     /* this is not a bug */
 154     /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
 155   {
 156   "latin0", "iso-8859-15"},     /* this is not a bug */
 157   {
 158   "iso_8859-16", "iso-8859-16"}, {
 159   "latin10", "iso-8859-16"},    /* this is not a bug */
 160     /*
 161      * David Champion <dgc@uchicago.edu> has observed this with
 162      * nl_langinfo under SunOS 5.8.
 163      */
 164   {
 165   "646", "us-ascii"},
 166     /*
 167      * http://www.sun.com/software/white-papers/wp-unicode/
 168      */
 169   {
 170   "eucJP", "euc-jp"}, {
 171   "PCK", "Shift_JIS"}, {
 172   "ko_KR-euc", "euc-kr"}, {
 173   "zh_TW-big5", "big5"},
 174     /* seems to be common on some systems */
 175   {
 176   "sjis", "Shift_JIS"}, {
 177   "euc-jp-ms", "eucJP-ms"},
 178     /*
 179      * If you happen to encounter system-specific brain-damage with
 180      * respect to character set naming, please add it above this
 181      * comment, and submit a patch to <mutt-dev@mutt.org>.
 182      */
 183     /* End of aliases.  Please keep this line last. */
 184   {
 185   NULL, NULL}
 186 };
 187
 188 #ifdef HAVE_LANGINFO_CODESET
 189 # include <langinfo.h>
 190
 191
 192 void mutt_set_langinfo_charset (void)
 193 {
 194   char buff[LONG_STRING];
 195   char buff2[LONG_STRING];
 196
 197   strfcpy (buff, nl_langinfo (CODESET), sizeof (buff));
 198   mutt_canonical_charset (buff2, sizeof (buff2), buff);
 199
 200   /* finally, set $charset */
 201   if (!(Charset = str_dup (buff2)))
 202     Charset = str_dup ("iso-8859-1");
 203 }
 204
 205 #else
 206
 207 void mutt_set_langinfo_charset (void)
 208 {
 209   Charset = str_dup ("iso-8859-1");
 210 }
 211
 212 #endif
 213
 214 void mutt_canonical_charset (char *dest, size_t dlen, const char *name)
 215 {
 216   size_t i;
 217   char *p;
 218   char scratch[LONG_STRING];
 219
 220   /* catch some common iso-8859-something misspellings */
 221   if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-')
 222     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 4);
 223   else if (!ascii_strncasecmp (name, "8859-", 5))
 224     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5);
 225   else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-')
 226     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7);
 227   else if (!ascii_strncasecmp (name, "iso8859-", 8))
 228     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8);
 229   else
 230     strfcpy (scratch, NONULL (name), sizeof (scratch));
 231
 232   for (i = 0; PreferredMIMENames[i].key; i++)
 233     if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
 234         !str_casecmp (scratch, PreferredMIMENames[i].key)) {
 235       strfcpy (dest, PreferredMIMENames[i].pref, dlen);
 236       return;
 237     }
 238
 239   strfcpy (dest, scratch, dlen);
 240
 241   /* for cosmetics' sake, transform to lowercase. */
 242   for (p = dest; *p; p++)
 243     *p = ascii_tolower (*p);
 244 }
 245
 246 int mutt_chscmp (const char *s, const char *chs)
 247 {
 248   char buffer[STRING];
 249
 250   if (!s)
 251     return 0;
 252
 253   mutt_canonical_charset (buffer, sizeof (buffer), s);
 254   return !ascii_strcasecmp (buffer, chs);
 255 }
 256
 257
 258 /*
 259  * Like iconv_open, but canonicalises the charsets
 260  */
 261
 262 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
 263 {
 264   char tocode1[SHORT_STRING];
 265   char fromcode1[SHORT_STRING];
 266   char *tocode2, *fromcode2;
 267   char *tmp;
 268
 269   iconv_t cd;
 270
 271   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
 272
 273 #ifdef M_ICONV_HOOK_TO
 274   /* Not used. */
 275   if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
 276     mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
 277 #endif
 278
 279   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
 280   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
 281     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
 282
 283   if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) - 1)
 284     return cd;
 285   if ((tocode2 = mutt_iconv_hook (tocode1))
 286       && (fromcode2 = mutt_iconv_hook (fromcode1)))
 287     return iconv_open (tocode2, fromcode2);
 288
 289   return (iconv_t) - 1;
 290 }
 291
 292
 293 /*
 294  * Like iconv, but keeps going even when the input is invalid
 295  * If you're supplying inrepls, the source charset should be stateless;
 296  * if you're supplying an outrepl, the target charset should be.
 297  */
 298
 299 size_t mutt_iconv (iconv_t cd, const char **inbuf, size_t * inbytesleft,
 300                    char **outbuf, size_t * outbytesleft,
 301                    const char **inrepls, const char *outrepl)
 302 {
 303   size_t ret = 0, ret1;
 304   const char *ib = *inbuf;
 305   size_t ibl = *inbytesleft;
 306   char *ob = *outbuf;
 307   size_t obl = *outbytesleft;
 308
 309   for (;;) {
 310     ret1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
 311     if (ret1 != (size_t) - 1)
 312       ret += ret1;
 313     if (ibl && obl && errno == EILSEQ) {
 314       if (inrepls) {
 315         /* Try replacing the input */
 316         const char **t;
 317
 318         for (t = inrepls; *t; t++) {
 319           const char *ib1 = *t;
 320           size_t ibl1 = str_len (*t);
 321           char *ob1 = ob;
 322           size_t obl1 = obl;
 323
 324           my_iconv(cd, &ib1, &ibl1, &ob1, &obl1);
 325           if (!ibl1) {
 326             ++ib, --ibl;
 327             ob = ob1, obl = obl1;
 328             ++ret;
 329             break;
 330           }
 331         }
 332         if (*t)
 333           continue;
 334       }
 335       /* Replace the output */
 336       if (!outrepl)
 337         outrepl = "?";
 338       my_iconv(cd, 0, 0, &ob, &obl);
 339       if (obl) {
 340         int n = str_len (outrepl);
 341
 342         if (n > obl) {
 343           outrepl = "?";
 344           n = 1;
 345         }
 346         memcpy (ob, outrepl, n);
 347         ++ib, --ibl;
 348         ob += n, obl -= n;
 349         ++ret;
 350         my_iconv(cd, 0, 0, 0, 0); /* for good measure */
 351         continue;
 352       }
 353     }
 354     *inbuf = ib, *inbytesleft = ibl;
 355     *outbuf = ob, *outbytesleft = obl;
 356     return ret;
 357   }
 358 }
 359
 360
 361 /*
 362  * Convert a string
 363  * Used in rfc2047.c and rfc2231.c
 364  */
 365
 366 int mutt_convert_string (char **ps, const char *from, const char *to,
 367                          int flags)
 368 {
 369   iconv_t cd;
 370   const char *repls[] = { "\357\277\275", "?", 0 };
 371   char *s = *ps;
 372
 373   if (!s || !*s)
 374     return 0;
 375
 376   if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t) - 1) {
 377     int len;
 378     const char *ib;
 379     char *buf, *ob;
 380     size_t ibl, obl;
 381     const char **inrepls = NULL;
 382     const char *outrepl = NULL;
 383
 384     if (mutt_is_utf8 (to))
 385       outrepl = "\357\277\275";
 386     else if (mutt_is_utf8 (from))
 387       inrepls = repls;
 388     else
 389       outrepl = "?";
 390
 391     len = str_len (s);
 392     ib = s, ibl = len + 1;
 393     obl = MB_LEN_MAX * ibl;
 394     ob = buf = xmalloc(obl + 1);
 395
 396     mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 397     iconv_close (cd);
 398
 399     *ob = '\0';
 400
 401     p_delete(ps);
 402     *ps = buf;
 403
 404     str_adjust (ps);
 405     return 0;
 406   }
 407   else
 408     return -1;
 409 }
 410
 411
 412 /*
 413  * FGETCONV stuff for converting a file while reading it
 414  * Used in sendlib.c for converting from mutt's Charset
 415  */
 416
  417 struct fgetconv_s {       /* state of a converting reader; fgetconv_open() allocates it when a conversion descriptor was opened */
  418   FILE *file;             /* stream the raw bytes are read from */
  419   iconv_t cd;             /* conversion descriptor; (iconv_t) -1 makes fgetconv() fall back to plain fgetc() */
  420   char bufi[512];         /* raw input read from `file' */
  421   char bufo[512];         /* converted output handed out byte by byte */
  422   char *p;                /* next byte of bufo to return; set to 0 once fgetconv() has given up (it then keeps returning EOF) */
  423   char *ob;               /* end of the converted data in bufo */
  424   char *ib;               /* start of the not yet converted data in bufi */
  425   size_t ibl;             /* number of not yet converted bytes at ib */
  426   const char **inrepls;   /* input replacement list passed to mutt_iconv() */
  427 };
 428
  429 struct fgetconv_not {     /* reduced state allocated by fgetconv_open() when no conversion descriptor could be opened */
  430   FILE *file;             /* stream read directly with fgetc() */
  431   iconv_t cd;             /* set to (iconv_t) -1 in this case */
  432 };                        /* NOTE: accessed through a struct fgetconv_s pointer, so these members must mirror its first two */
 433
 434 FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to,
 435                          int flags)
 436 {
 437   struct fgetconv_s *fc;
 438   iconv_t cd = (iconv_t) - 1;
 439   static const char *repls[] = { "\357\277\275", "?", 0 };
 440
 441   if (from && to)
 442     cd = mutt_iconv_open (to, from, flags);
 443
 444   if (cd != (iconv_t) - 1) {
 445     fc = p_new(struct fgetconv_s, 1);
 446     fc->p = fc->ob = fc->bufo;
 447     fc->ib = fc->bufi;
 448     fc->ibl = 0;
 449     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
 450   }
 451   else
 452     fc = p_new(struct fgetconv_not, 1);
 453   fc->file = file;
 454   fc->cd = cd;
 455   return (FGETCONV *) fc;
 456 }
 457
 458 char *fgetconvs (char *buf, size_t l, FGETCONV * _fc)
 459 {
 460   int c;
 461   size_t r;
 462
 463   for (r = 0; r + 1 < l;) {
 464     if ((c = fgetconv (_fc)) == EOF)
 465       break;
 466     buf[r++] = (char) c;
 467     if (c == '\n')
 468       break;
 469   }
 470   buf[r] = '\0';
 471
 472   if (r)
 473     return buf;
 474   else
 475     return NULL;
 476 }
 477
 478 int fgetconv (FGETCONV * _fc)
 479 {
 480   struct fgetconv_s *fc = (struct fgetconv_s *) _fc;
 481
 482   if (!fc)
 483     return EOF;
 484   if (fc->cd == (iconv_t) - 1)
 485     return fgetc (fc->file);
 486   if (!fc->p)
 487     return EOF;
 488   if (fc->p < fc->ob)
 489     return (unsigned char) *(fc->p)++;
 490
 491   /* Try to convert some more */
 492   fc->p = fc->ob = fc->bufo;
 493   if (fc->ibl) {
 494     size_t obl = sizeof (fc->bufo);
 495
 496     my_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
 497     if (fc->p < fc->ob)
 498       return (unsigned char) *(fc->p)++;
 499   }
 500
 501   /* If we trusted iconv a bit more, we would at this point
 502    * ask why it had stopped converting ... */
 503
 504   /* Try to read some more */
 505   if (fc->ibl == sizeof (fc->bufi) ||
 506       (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi))) {
 507     fc->p = 0;
 508     return EOF;
 509   }
 510   if (fc->ibl)
 511     memcpy (fc->bufi, fc->ib, fc->ibl);
 512   fc->ib = fc->bufi;
 513   fc->ibl +=
 514     fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
 515
 516   /* Try harder this time to convert some */
 517   if (fc->ibl) {
 518     size_t obl = sizeof (fc->bufo);
 519
 520     mutt_iconv (fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob,
 521                 &obl, fc->inrepls, 0);
 522     if (fc->p < fc->ob)
 523       return (unsigned char) *(fc->p)++;
 524   }
 525
 526   /* Either the file has finished or one of the buffers is too small */
 527   fc->p = 0;
 528   return EOF;
 529 }
 530
 531 void fgetconv_close (FGETCONV ** _fc)
 532 {
 533   struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
 534
 535   if (fc->cd != (iconv_t) - 1)
 536     iconv_close (fc->cd);
 537   p_delete(_fc);
 538 }
 539
 540 const char *mutt_get_first_charset (const char *charset)
 541 {
 542   static char fcharset[SHORT_STRING];
 543   const char *c, *c1;
 544
 545   c = charset;
 546   if (!str_len (c))
 547     return "us-ascii";
 548   if (!(c1 = strchr (c, ':')))
 549     return ((char*) charset);
 550   strfcpy (fcharset, c, c1 - c + 1);
 551   return fcharset;
 552 }
 553
 554 static size_t convert_string (const char *f, size_t flen,
 555                               const char *from, const char *to,
 556                               char **t, size_t * tlen)
 557 {
 558   iconv_t cd;
 559   char *buf, *ob;
 560   size_t obl, n;
 561   int e;
 562
 563   cd = mutt_iconv_open (to, from, 0);
 564   if (cd == (iconv_t) (-1))
 565     return (size_t) (-1);
 566   obl = 4 * flen + 1;
 567   ob = buf = xmalloc(obl);
 568   n = my_iconv(cd, &f, &flen, &ob, &obl);
 569   if (n == (size_t) (-1) || my_iconv(cd, 0, 0, &ob, &obl) == (size_t) (-1)) {
 570     e = errno;
 571     p_delete(&buf);
 572     iconv_close (cd);
 573     errno = e;
 574     return (size_t) (-1);
 575   }
 576   *ob = '\0';
 577
 578   *tlen = ob - buf;
 579
 580   mem_realloc (&buf, ob - buf + 1);
 581   *t = buf;
 582   iconv_close (cd);
 583
 584   return n;
 585 }
 586
 587 int mutt_convert_nonmime_string (char **ps)
 588 {
 589   const char *c, *c1;
 590
 591   for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0) {
 592     char *u = *ps;
 593     char *s = NULL;
 594     char *fromcode;
 595     size_t m, n;
 596     size_t ulen = str_len (*ps);
 597     size_t slen;
 598
 599     if (!u || !*u)
 600       return 0;
 601
 602     c1 = strchr (c, ':');
 603     n = c1 ? c1 - c : str_len (c);
 604     if (!n)
 605       continue;
 606     fromcode = p_dupstr(c, n);
 607     m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
 608     p_delete(&fromcode);
 609     if (m != (size_t) (-1)) {
 610       p_delete(ps);
 611       *ps = s;
 612       return 0;
 613     }
 614   }
 615   return -1;
 616 }