charset.c

   1 /*
   2  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
   3  *
   4  *     This program is free software; you can redistribute it
   5  *     and/or modify it under the terms of the GNU General Public
   6  *     License as published by the Free Software Foundation; either
   7  *     version 2 of the License, or (at your option) any later
   8  *     version.
   9  *
  10  *     This program is distributed in the hope that it will be
  11  *     useful, but WITHOUT ANY WARRANTY; without even the implied
  12  *     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  13  *     PURPOSE.  See the GNU General Public License for more
  14  *     details.
  15  *
  16  *     You should have received a copy of the GNU General Public
  17  *     License along with this program; if not, write to the Free
  18  *     Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  *     Boston, MA  02111, USA.
  20  */
  21
  22 #if HAVE_CONFIG_H
  23 # include "config.h"
  24 #endif
  25
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29
  30 #include <ctype.h>
  31
  32 #include <sys/types.h>
  33 #include <dirent.h>
  34 #include <unistd.h>
  35 #include <errno.h>
  36
  37 #include "mutt.h"
  38 #include "charset.h"
  39
  40 #ifndef EILSEQ
  41 # define EILSEQ EINVAL
  42 #endif
  43
  44 /*
  45  * The following list has been created manually from the data under:
  46  * http://www.isi.edu/in-notes/iana/assignments/character-sets
  47  * Last update: 2000-09-07
  48  *
  49  * Note that it includes only the subset of character sets for which
  50  * a preferred MIME name is given.
  51  */
  52
  53 static struct
  54 {
  55   char *key;
  56   char *pref;
  57 }
  58 PreferredMIMENames[] =
  59 {
  60   { "ansi_x3.4-1968",   "us-ascii"      },
  61   { "iso-ir-6",         "us-ascii"      },
  62   { "iso_646.irv:1991", "us-ascii"      },
  63   { "ascii",            "us-ascii"      },
  64   { "iso646-us",        "us-ascii"      },
  65   { "us",               "us-ascii"      },
  66   { "ibm367",           "us-ascii"      },
  67   { "cp367",            "us-ascii"      },
  68   { "csASCII",          "us-ascii"      },
  69
  70   { "csISO2022KR",      "iso-2022-kr"   },
  71   { "csEUCKR",          "euc-kr"        },
  72   { "csISO2022JP",      "iso-2022-jp"   },
  73   { "csISO2022JP2",     "iso-2022-jp-2" },
  74
  75   { "ISO_8859-1:1987",  "iso-8859-1"    },
  76   { "iso-ir-100",       "iso-8859-1"    },
  77   { "iso_8859-1",       "iso-8859-1"    },
  78   { "latin1",           "iso-8859-1"    },
  79   { "l1",               "iso-8859-1"    },
  80   { "IBM819",           "iso-8859-1"    },
  81   { "CP819",            "iso-8859-1"    },
  82   { "csISOLatin1",      "iso-8859-1"    },
  83
  84   { "ISO_8859-2:1987",  "iso-8859-2"    },
  85   { "iso-ir-101",       "iso-8859-2"    },
  86   { "iso_8859-2",       "iso-8859-2"    },
  87   { "latin2",           "iso-8859-2"    },
  88   { "l2",               "iso-8859-2"    },
  89   { "csISOLatin2",      "iso-8859-2"    },
  90
  91   { "ISO_8859-3:1988",  "iso-8859-3"    },
  92   { "iso-ir-109",       "iso-8859-3"    },
  93   { "ISO_8859-3",       "iso-8859-3"    },
  94   { "latin3",           "iso-8859-3"    },
  95   { "l3",               "iso-8859-3"    },
  96   { "csISOLatin3",      "iso-8859-3"    },
  97
  98   { "ISO_8859-4:1988",  "iso-8859-4"    },
  99   { "iso-ir-110",       "iso-8859-4"    },
 100   { "ISO_8859-4",       "iso-8859-4"    },
 101   { "latin4",           "iso-8859-4"    },
 102   { "l4",               "iso-8859-4"    },
 103   { "csISOLatin4",      "iso-8859-4"    },
 104
 105   { "ISO_8859-6:1987",  "iso-8859-6"    },
 106   { "iso-ir-127",       "iso-8859-6"    },
 107   { "iso_8859-6",       "iso-8859-6"    },
 108   { "ECMA-114",         "iso-8859-6"    },
 109   { "ASMO-708",         "iso-8859-6"    },
 110   { "arabic",           "iso-8859-6"    },
 111   { "csISOLatinArabic", "iso-8859-6"    },
 112
 113   { "ISO_8859-7:1987",  "iso-8859-7"    },
 114   { "iso-ir-126",       "iso-8859-7"    },
 115   { "ISO_8859-7",       "iso-8859-7"    },
 116   { "ELOT_928",         "iso-8859-7"    },
 117   { "ECMA-118",         "iso-8859-7"    },
 118   { "greek",            "iso-8859-7"    },
 119   { "greek8",           "iso-8859-7"    },
 120   { "csISOLatinGreek",  "iso-8859-7"    },
 121
 122   { "ISO_8859-8:1988",  "iso-8859-8"    },
 123   { "iso-ir-138",       "iso-8859-8"    },
 124   { "ISO_8859-8",       "iso-8859-8"    },
 125   { "hebrew",           "iso-8859-8"    },
 126   { "csISOLatinHebrew", "iso-8859-8"    },
 127
 128   { "ISO_8859-5:1988",  "iso-8859-5"    },
 129   { "iso-ir-144",       "iso-8859-5"    },
 130   { "ISO_8859-5",       "iso-8859-5"    },
 131   { "cyrillic",         "iso-8859-5"    },
 132   { "csISOLatinCyrillic", "iso8859-5"   },
 133
 134   { "ISO_8859-9:1989",  "iso-8859-9"    },
 135   { "iso-ir-148",       "iso-8859-9"    },
 136   { "ISO_8859-9",       "iso-8859-9"    },
 137   { "latin5",           "iso-8859-9"    }, /* this is not a bug */
 138   { "l5",               "iso-8859-9"    },
 139   { "csISOLatin5",      "iso-8859-9"    },
 140
 141   { "ISO_8859-10:1992", "iso-8859-10"   },
 142   { "iso-ir-157",       "iso-8859-10"   },
 143   { "latin6",           "iso-8859-10"   }, /* this is not a bug */
 144   { "l6",               "iso-8859-10"   },
 145   { "csISOLatin6"       "iso-8859-10"   },
 146
 147   { "csKOI8r",          "koi8-r"        },
 148
 149   { "MS_Kanji",         "Shift_JIS"     }, /* Note the underscore! */
 150   { "csShiftJis",       "Shift_JIS"     },
 151
 152   { "Extended_UNIX_Code_Packed_Format_for_Japanese",
 153                         "EUC-JP"        },
 154   { "csEUCPkdFmtJapanese",
 155                         "EUC-JP"        },
 156
 157   { "csGB2312",         "gb2312"        },
 158   { "csbig5",           "big5"          },
 159
 160   /*
 161    * End of official brain damage.  What follows has been taken
 162    * from glibc's localedata files.
 163    */
 164
 165   { "iso_8859-13",      "iso-8859-13"   },
 166   { "iso-ir-179",       "iso-8859-13"   },
 167   { "latin7",           "iso-8859-13"   }, /* this is not a bug */
 168   { "l7",               "iso-8859-13"   },
 169
 170   { "iso_8859-14",      "iso-8859-14"   },
 171   { "latin8",           "iso-8859-14"   }, /* this is not a bug */
 172   { "l8",               "iso-8859-14"   },
 173
 174   { "iso_8859-15",      "iso-8859-15"   },
 175   { "latin9",           "iso-8859-15"   }, /* this is not a bug */
 176
 177   /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
 178   { "latin0",           "iso-8859-15"   }, /* this is not a bug */
 179
 180   { "iso_8859-16",      "iso-8859-16"   },
 181   { "latin10",          "iso-8859-16"   }, /* this is not a bug */
 182
 183   /*
 184    * David Champion <dgc@uchicago.edu> has observed this with
 185    * nl_langinfo under SunOS 5.8.
 186    */
 187
 188   { "646",              "us-ascii"      },
 189
 190   /*
 191    * http://www.sun.com/software/white-papers/wp-unicode/
 192    */
 193
 194   { "eucJP",            "euc-jp"        },
 195   { "PCK",              "Shift_JIS"     },
 196   { "ko_KR-euc",        "euc-kr"        },
 197   { "zh_TW-big5",       "big5"          },
 198
 199   /* seems to be common on some systems */
 200
 201   { "sjis",             "Shift_JIS"     },
 202   { "euc-jp-ms",        "eucJP-ms"      },
 203
 204
 205   /*
 206    * If you happen to encounter system-specific brain-damage with
 207    * respect to character set naming, please add it above this
 208    * comment, and submit a patch to <mutt-dev@mutt.org>.
 209    */
 210
 211   /* End of aliases.  Please keep this line last. */
 212
 213   { NULL,               NULL            }
 214 };
 215
 216 #ifdef HAVE_LANGINFO_CODESET
 217 # include <langinfo.h>
 218
 219
 220 void mutt_set_langinfo_charset (void)
 221 {
 222   char buff[LONG_STRING];
 223   char buff2[LONG_STRING];
 224
 225   strfcpy (buff, nl_langinfo (CODESET), sizeof (buff));
 226   mutt_canonical_charset (buff2, sizeof (buff2), buff);
 227
 228   /* finally, set $charset */
 229   if (!(Charset = safe_strdup (buff2)))
 230     Charset = safe_strdup ("iso-8859-1");
 231 }
 232
 233 #else
 234
 235 void mutt_set_langinfo_charset (void)
 236 {
 237   Charset = safe_strdup ("iso-8859-1");
 238 }
 239
 240 #endif
 241
 242 void mutt_canonical_charset (char *dest, size_t dlen, const char *name)
 243 {
 244   size_t i;
 245   char *p;
 246   char scratch[LONG_STRING];
 247
 248   /* catch some common iso-8859-something misspellings */
 249   if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-')
 250     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name +4);
 251   else if (!ascii_strncasecmp (name, "8859-", 5))
 252     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5);
 253   else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-')
 254     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7);
 255   else if (!ascii_strncasecmp (name, "iso8859-", 8))
 256     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8);
 257   else
 258     strfcpy (scratch, NONULL(name), sizeof (scratch));
 259
 260   for (i = 0; PreferredMIMENames[i].key; i++)
 261     if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
 262         !mutt_strcasecmp (scratch, PreferredMIMENames[i].key))
 263     {
 264       strfcpy (dest, PreferredMIMENames[i].pref, dlen);
 265       return;
 266     }
 267
 268   strfcpy (dest, scratch, dlen);
 269
 270   /* for cosmetics' sake, transform to lowercase. */
 271   for (p = dest; *p; p++)
 272     *p = ascii_tolower (*p);
 273 }
 274
 275 int mutt_chscmp (const char *s, const char *chs)
 276 {
 277   char buffer[STRING];
 278
 279   if (!s) return 0;
 280
 281   mutt_canonical_charset (buffer, sizeof (buffer), s);
 282   return !ascii_strcasecmp (buffer, chs);
 283 }
 284
 285
 286 #ifndef HAVE_ICONV
 287
 288 iconv_t iconv_open (const char *tocode, const char *fromcode)
 289 {
 290   return (iconv_t)(-1);
 291 }
 292
 293 size_t iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
 294               char **outbuf, size_t *outbytesleft)
 295 {
 296   return 0;
 297 }
 298
 299 int iconv_close (iconv_t cd)
 300 {
 301   return 0;
 302 }
 303
 304 #endif /* !HAVE_ICONV */
 305
 306
 307 /*
 308  * Like iconv_open, but canonicalises the charsets
 309  */
 310
 311 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
 312 {
 313   char tocode1[SHORT_STRING];
 314   char fromcode1[SHORT_STRING];
 315   char *tocode2, *fromcode2;
 316   char *tmp;
 317
 318   iconv_t cd;
 319
 320   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
 321
 322 #ifdef M_ICONV_HOOK_TO
 323   /* Not used. */
 324   if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
 325     mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
 326 #endif
 327
 328   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
 329   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
 330     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
 331
 332   if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) -1)
 333     return cd;
 334   if ((tocode2 = mutt_iconv_hook (tocode1)) && (fromcode2 = mutt_iconv_hook (fromcode1)))
 335     return iconv_open (tocode2, fromcode2);
 336
 337   return (iconv_t) -1;
 338 }
 339
 340
 341 /*
 342  * Like iconv, but keeps going even when the input is invalid
 343  * If you're supplying inrepls, the source charset should be stateless;
 344  * if you're supplying an outrepl, the target charset should be.
 345  */
 346
 347 size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
 348                    char **outbuf, size_t *outbytesleft,
 349                    ICONV_CONST char **inrepls, const char *outrepl)
 350 {
 351   size_t ret = 0, ret1;
 352   ICONV_CONST char *ib = *inbuf;
 353   size_t ibl = *inbytesleft;
 354   char *ob = *outbuf;
 355   size_t obl = *outbytesleft;
 356
 357   for (;;)
 358   {
 359     ret1 = iconv (cd, &ib, &ibl, &ob, &obl);
 360     if (ret1 != (size_t)-1)
 361       ret += ret1;
 362     if (ibl && obl && errno == EILSEQ)
 363     {
 364       if (inrepls)
 365       {
 366         /* Try replacing the input */
 367         ICONV_CONST char **t;
 368         for (t = inrepls; *t; t++)
 369         {
 370           ICONV_CONST char *ib1 = *t;
 371           size_t ibl1 = strlen (*t);
 372           char *ob1 = ob;
 373           size_t obl1 = obl;
 374           iconv (cd, &ib1, &ibl1, &ob1, &obl1);
 375           if (!ibl1)
 376           {
 377             ++ib, --ibl;
 378             ob = ob1, obl = obl1;
 379             ++ret;
 380             break;
 381           }
 382         }
 383         if (*t)
 384           continue;
 385       }
 386       /* Replace the output */
 387       if (!outrepl)
 388         outrepl = "?";
 389       iconv (cd, 0, 0, &ob, &obl);
 390       if (obl)
 391       {
 392         int n = strlen (outrepl);
 393         if (n > obl)
 394         {
 395           outrepl = "?";
 396           n = 1;
 397         }
 398         memcpy (ob, outrepl, n);
 399         ++ib, --ibl;
 400         ob += n, obl -= n;
 401         ++ret;
 402         iconv (cd, 0, 0, 0, 0); /* for good measure */
 403         continue;
 404       }
 405     }
 406     *inbuf = ib, *inbytesleft = ibl;
 407     *outbuf = ob, *outbytesleft = obl;
 408     return ret;
 409   }
 410 }
 411
 412
 413 /*
 414  * Convert a string
 415  * Used in rfc2047.c and rfc2231.c
 416  */
 417
 418 int mutt_convert_string (char **ps, const char *from, const char *to, int flags)
 419 {
 420   iconv_t cd;
 421   ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
 422   char *s = *ps;
 423
 424   if (!s || !*s)
 425     return 0;
 426
 427   if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t)-1)
 428   {
 429     int len;
 430     ICONV_CONST char *ib;
 431     char *buf, *ob;
 432     size_t ibl, obl;
 433     ICONV_CONST char **inrepls = 0;
 434     char *outrepl = 0;
 435
 436     if (mutt_is_utf8 (to))
 437       outrepl = "\357\277\275";
 438     else if (mutt_is_utf8 (from))
 439       inrepls = repls;
 440     else
 441       outrepl = "?";
 442
 443     len = strlen (s);
 444     ib = s, ibl = len + 1;
 445     obl = MB_LEN_MAX * ibl;
 446     ob = buf = safe_malloc (obl + 1);
 447
 448     mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 449     iconv_close (cd);
 450
 451     *ob = '\0';
 452
 453     FREE (ps);
 454     *ps = buf;
 455
 456     mutt_str_adjust (ps);
 457     return 0;
 458   }
 459   else
 460     return -1;
 461 }
 462
 463
 464 /*
 465  * FGETCONV stuff for converting a file while reading it
 466  * Used in sendlib.c for converting from mutt's Charset
 467  */
 468
 469 struct fgetconv_s
 470 {
 471   FILE *file;
 472   iconv_t cd;
 473   char bufi[512];
 474   char bufo[512];
 475   char *p;
 476   char *ob;
 477   char *ib;
 478   size_t ibl;
 479   ICONV_CONST char **inrepls;
 480 };
 481
 482 struct fgetconv_not
 483 {
 484   FILE *file;
 485   iconv_t cd;
 486 };
 487
 488 FGETCONV *fgetconv_open (FILE *file, const char *from, const char *to, int flags)
 489 {
 490   struct fgetconv_s *fc;
 491   iconv_t cd = (iconv_t)-1;
 492   static ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
 493
 494   if (from && to)
 495     cd = mutt_iconv_open (to, from, flags);
 496
 497   if (cd != (iconv_t)-1)
 498   {
 499     fc = safe_malloc (sizeof (struct fgetconv_s));
 500     fc->p = fc->ob = fc->bufo;
 501     fc->ib = fc->bufi;
 502     fc->ibl = 0;
 503     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
 504   }
 505   else
 506     fc = safe_malloc (sizeof (struct fgetconv_not));
 507   fc->file = file;
 508   fc->cd = cd;
 509   return (FGETCONV *)fc;
 510 }
 511
 512 char *fgetconvs (char *buf, size_t l, FGETCONV *_fc)
 513 {
 514   int c;
 515   size_t r;
 516
 517   for (r = 0; r + 1 < l;)
 518   {
 519     if ((c = fgetconv (_fc)) == EOF)
 520       break;
 521     buf[r++] = (char) c;
 522     if (c == '\n')
 523       break;
 524   }
 525   buf[r] = '\0';
 526
 527   if (r)
 528     return buf;
 529   else
 530     return NULL;
 531 }
 532
 533 int fgetconv (FGETCONV *_fc)
 534 {
 535   struct fgetconv_s *fc = (struct fgetconv_s *)_fc;
 536
 537   if (!fc)
 538     return EOF;
 539   if (fc->cd == (iconv_t)-1)
 540     return fgetc (fc->file);
 541   if (!fc->p)
 542     return EOF;
 543   if (fc->p < fc->ob)
 544     return (unsigned char)*(fc->p)++;
 545
 546   /* Try to convert some more */
 547   fc->p = fc->ob = fc->bufo;
 548   if (fc->ibl)
 549   {
 550     size_t obl = sizeof (fc->bufo);
 551     iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
 552     if (fc->p < fc->ob)
 553       return (unsigned char)*(fc->p)++;
 554   }
 555
 556   /* If we trusted iconv a bit more, we would at this point
 557    * ask why it had stopped converting ... */
 558
 559   /* Try to read some more */
 560   if (fc->ibl == sizeof (fc->bufi) ||
 561       (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi)))
 562   {
 563     fc->p = 0;
 564     return EOF;
 565   }
 566   if (fc->ibl)
 567     memcpy (fc->bufi, fc->ib, fc->ibl);
 568   fc->ib = fc->bufi;
 569   fc->ibl += fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
 570
 571   /* Try harder this time to convert some */
 572   if (fc->ibl)
 573   {
 574     size_t obl = sizeof (fc->bufo);
 575     mutt_iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl,
 576                 fc->inrepls, 0);
 577     if (fc->p < fc->ob)
 578       return (unsigned char)*(fc->p)++;
 579   }
 580
 581   /* Either the file has finished or one of the buffers is too small */
 582   fc->p = 0;
 583   return EOF;
 584 }
 585
 586 void fgetconv_close (FGETCONV **_fc)
 587 {
 588   struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
 589
 590   if (fc->cd != (iconv_t)-1)
 591     iconv_close (fc->cd);
 592   FREE (_fc);
 593 }
 594
 595 char *mutt_get_first_charset (const char *charset)
 596 {
 597   static char fcharset[SHORT_STRING];
 598   const char *c, *c1;
 599
 600   c = charset;
 601   if (!mutt_strlen(c))
 602     return "us-ascii";
 603   if (!(c1 = strchr (c, ':')))
 604     return charset;
 605   strfcpy (fcharset, c, c1 - c + 1);
 606   return fcharset;
 607 }
 608
 609 static size_t convert_string (ICONV_CONST char *f, size_t flen,
 610                              const char *from, const char *to,
 611                              char **t, size_t *tlen)
 612 {
 613   iconv_t cd;
 614   char *buf, *ob;
 615   size_t obl, n;
 616   int e;
 617
 618   cd = mutt_iconv_open (to, from, 0);
 619   if (cd == (iconv_t)(-1))
 620     return (size_t)(-1);
 621   obl = 4 * flen + 1;
 622   ob = buf = safe_malloc (obl);
 623   n = iconv (cd, &f, &flen, &ob, &obl);
 624   if (n == (size_t)(-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t)(-1))
 625   {
 626     e = errno;
 627     FREE (&buf);
 628     iconv_close (cd);
 629     errno = e;
 630     return (size_t)(-1);
 631   }
 632   *ob = '\0';
 633
 634   *tlen = ob - buf;
 635
 636   safe_realloc ((void **) &buf, ob - buf + 1);
 637   *t = buf;
 638   iconv_close (cd);
 639
 640   return n;
 641 }
 642
 643 int mutt_convert_nonmime_string (char **ps)
 644 {
 645   const char *c, *c1;
 646
 647   for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0)
 648   {
 649     char *u = *ps;
 650     char *s;
 651     char *fromcode;
 652     size_t m, n;
 653     size_t ulen = mutt_strlen (*ps);
 654     size_t slen;
 655
 656     if (!u || !*u)
 657       return 0;
 658
 659     c1 = strchr (c, ':');
 660     n = c1 ? c1 - c : mutt_strlen (c);
 661     if (!n)
 662       continue;
 663     fromcode = safe_malloc (n + 1);
 664     strfcpy (fromcode, c, n + 1);
 665     m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
 666     FREE (&fromcode);
 667     if (m != (size_t)(-1))
 668     {
 669       FREE (ps);
 670       *ps = s;
 671       return 0;
 672     }
 673   }
 674   return -1;
 675 }
 676