charset.c

   1 /*
   2  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
   3  *
   4  *     This program is free software; you can redistribute it
   5  *     and/or modify it under the terms of the GNU General Public
   6  *     License as published by the Free Software Foundation; either
   7  *     version 2 of the License, or (at your option) any later
   8  *     version.
   9  *
  10  *     This program is distributed in the hope that it will be
  11  *     useful, but WITHOUT ANY WARRANTY; without even the implied
  12  *     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
  13  *     PURPOSE.  See the GNU General Public License for more
  14  *     details.
  15  *
  16  *     You should have received a copy of the GNU General Public
  17  *     License along with this program; if not, write to the Free
  18  *     Software Foundation, Inc., 59 Temple Place - Suite 330,
  19  *     Boston, MA  02111, USA.
  20  */
  21
  22
  23 #include <string.h>
  24 #include <stdio.h>
  25 #include <stdlib.h>
  26
  27 #include <ctype.h>
  28
  29 #include <sys/types.h>
  30 #include <dirent.h>
  31 #include <unistd.h>
  32 #include <errno.h>
  33
  34 #include "mutt.h"
  35 #include "charset.h"
  36
  37 #ifndef EILSEQ
  38 # define EILSEQ EINVAL
  39 #endif
  40
  41 /*
  42  * The following list has been created manually from the data under:
  43  * http://www.isi.edu/in-notes/iana/assignments/character-sets
  44  * Last update: 2000-09-07
  45  *
  46  * Note that it includes only the subset of character sets for which
  47  * a preferred MIME name is given.
  48  */
  49
  50 static struct
  51 {
  52   char *key;
  53   char *pref;
  54 }
  55 PreferredMIMENames[] =
  56 {
  57   { "ansi_x3.4-1968",   "us-ascii"      },
  58   { "iso-ir-6",         "us-ascii"      },
  59   { "iso_646.irv:1991", "us-ascii"      },
  60   { "ascii",            "us-ascii"      },
  61   { "iso646-us",        "us-ascii"      },
  62   { "us",               "us-ascii"      },
  63   { "ibm367",           "us-ascii"      },
  64   { "cp367",            "us-ascii"      },
  65   { "csASCII",          "us-ascii"      },
  66
  67   { "csISO2022KR",      "iso-2022-kr"   },
  68   { "csEUCKR",          "euc-kr"        },
  69   { "csISO2022JP",      "iso-2022-jp"   },
  70   { "csISO2022JP2",     "iso-2022-jp-2" },
  71
  72   { "ISO_8859-1:1987",  "iso-8859-1"    },
  73   { "iso-ir-100",       "iso-8859-1"    },
  74   { "iso_8859-1",       "iso-8859-1"    },
  75   { "latin1",           "iso-8859-1"    },
  76   { "l1",               "iso-8859-1"    },
  77   { "IBM819",           "iso-8859-1"    },
  78   { "CP819",            "iso-8859-1"    },
  79   { "csISOLatin1",      "iso-8859-1"    },
  80
  81   { "ISO_8859-2:1987",  "iso-8859-2"    },
  82   { "iso-ir-101",       "iso-8859-2"    },
  83   { "iso_8859-2",       "iso-8859-2"    },
  84   { "latin2",           "iso-8859-2"    },
  85   { "l2",               "iso-8859-2"    },
  86   { "csISOLatin2",      "iso-8859-2"    },
  87
  88   { "ISO_8859-3:1988",  "iso-8859-3"    },
  89   { "iso-ir-109",       "iso-8859-3"    },
  90   { "ISO_8859-3",       "iso-8859-3"    },
  91   { "latin3",           "iso-8859-3"    },
  92   { "l3",               "iso-8859-3"    },
  93   { "csISOLatin3",      "iso-8859-3"    },
  94
  95   { "ISO_8859-4:1988",  "iso-8859-4"    },
  96   { "iso-ir-110",       "iso-8859-4"    },
  97   { "ISO_8859-4",       "iso-8859-4"    },
  98   { "latin4",           "iso-8859-4"    },
  99   { "l4",               "iso-8859-4"    },
 100   { "csISOLatin4",      "iso-8859-4"    },
 101
 102   { "ISO_8859-6:1987",  "iso-8859-6"    },
 103   { "iso-ir-127",       "iso-8859-6"    },
 104   { "iso_8859-6",       "iso-8859-6"    },
 105   { "ECMA-114",         "iso-8859-6"    },
 106   { "ASMO-708",         "iso-8859-6"    },
 107   { "arabic",           "iso-8859-6"    },
 108   { "csISOLatinArabic", "iso-8859-6"    },
 109
 110   { "ISO_8859-7:1987",  "iso-8859-7"    },
 111   { "iso-ir-126",       "iso-8859-7"    },
 112   { "ISO_8859-7",       "iso-8859-7"    },
 113   { "ELOT_928",         "iso-8859-7"    },
 114   { "ECMA-118",         "iso-8859-7"    },
 115   { "greek",            "iso-8859-7"    },
 116   { "greek8",           "iso-8859-7"    },
 117   { "csISOLatinGreek",  "iso-8859-7"    },
 118
 119   { "ISO_8859-8:1988",  "iso-8859-8"    },
 120   { "iso-ir-138",       "iso-8859-8"    },
 121   { "ISO_8859-8",       "iso-8859-8"    },
 122   { "hebrew",           "iso-8859-8"    },
 123   { "csISOLatinHebrew", "iso-8859-8"    },
 124
 125   { "ISO_8859-5:1988",  "iso-8859-5"    },
 126   { "iso-ir-144",       "iso-8859-5"    },
 127   { "ISO_8859-5",       "iso-8859-5"    },
 128   { "cyrillic",         "iso-8859-5"    },
 129   { "csISOLatinCyrillic", "iso8859-5"   },
 130
 131   { "ISO_8859-9:1989",  "iso-8859-9"    },
 132   { "iso-ir-148",       "iso-8859-9"    },
 133   { "ISO_8859-9",       "iso-8859-9"    },
 134   { "latin5",           "iso-8859-9"    }, /* this is not a bug */
 135   { "l5",               "iso-8859-9"    },
 136   { "csISOLatin5",      "iso-8859-9"    },
 137
 138   { "ISO_8859-10:1992", "iso-8859-10"   },
 139   { "iso-ir-157",       "iso-8859-10"   },
 140   { "latin6",           "iso-8859-10"   }, /* this is not a bug */
 141   { "l6",               "iso-8859-10"   },
 142   { "csISOLatin6"       "iso-8859-10"   },
 143
 144   { "csKOI8r",          "koi8-r"        },
 145
 146   { "MS_Kanji",         "Shift_JIS"     }, /* Note the underscore! */
 147   { "csShiftJis",       "Shift_JIS"     },
 148
 149   { "Extended_UNIX_Code_Packed_Format_for_Japanese",
 150                         "EUC-JP"        },
 151   { "csEUCPkdFmtJapanese",
 152                         "EUC-JP"        },
 153
 154   { "csGB2312",         "gb2312"        },
 155   { "csbig5",           "big5"          },
 156
 157   /*
 158    * End of official brain damage.  What follows has been taken
 159    * from glibc's localedata files.
 160    */
 161
 162   { "iso_8859-13",      "iso-8859-13"   },
 163   { "iso-ir-179",       "iso-8859-13"   },
 164   { "latin7",           "iso-8859-13"   }, /* this is not a bug */
 165   { "l7",               "iso-8859-13"   },
 166
 167   { "iso_8859-14",      "iso-8859-14"   },
 168   { "latin8",           "iso-8859-14"   }, /* this is not a bug */
 169   { "l8",               "iso-8859-14"   },
 170
 171   { "iso_8859-15",      "iso-8859-15"   },
 172   { "latin9",           "iso-8859-15"   }, /* this is not a bug */
 173
 174   /* Suggested by Ionel Mugurel Ciobica <tgakic@sg10.chem.tue.nl> */
 175   { "latin0",           "iso-8859-15"   }, /* this is not a bug */
 176
 177   { "iso_8859-16",      "iso-8859-16"   },
 178   { "latin10",          "iso-8859-16"   }, /* this is not a bug */
 179
 180   /*
 181    * David Champion <dgc@uchicago.edu> has observed this with
 182    * nl_langinfo under SunOS 5.8.
 183    */
 184
 185   { "646",              "us-ascii"      },
 186
 187   /*
 188    * http://www.sun.com/software/white-papers/wp-unicode/
 189    */
 190
 191   { "eucJP",            "euc-jp"        },
 192   { "PCK",              "Shift_JIS"     },
 193   { "ko_KR-euc",        "euc-kr"        },
 194   { "zh_TW-big5",       "big5"          },
 195
 196   /* seems to be common on some systems */
 197
 198   { "sjis",             "Shift_JIS"     },
 199
 200
 201   /*
 202    * If you happen to encounter system-specific brain-damage with
 203    * respect to character set naming, please add it above this
 204    * comment, and submit a patch to <mutt-dev@mutt.org>.
 205    */
 206
 207   /* End of aliases.  Please keep this line last. */
 208
 209   { NULL,               NULL            }
 210 };
 211
 212 #ifdef HAVE_LANGINFO_CODESET
 213 # include <langinfo.h>
 214
 215
 216 void mutt_set_langinfo_charset (void)
 217 {
 218   char buff[LONG_STRING];
 219   char buff2[LONG_STRING];
 220
 221   strfcpy (buff, nl_langinfo (CODESET), sizeof (buff));
 222   mutt_canonical_charset (buff2, sizeof (buff2), buff);
 223
 224   /* finally, set $charset */
 225   if (!(Charset = safe_strdup (buff2)))
 226     Charset = safe_strdup ("iso-8859-1");
 227 }
 228
 229 #else
 230
 231 void mutt_set_langinfo_charset (void)
 232 {
 233   Charset = safe_strdup ("iso-8859-1");
 234 }
 235
 236 #endif
 237
 238 void mutt_canonical_charset (char *dest, size_t dlen, const char *name)
 239 {
 240   size_t i;
 241   char *p;
 242   char scratch[LONG_STRING];
 243
 244   /* catch some common iso-8859-something misspellings */
 245   if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-')
 246     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name +4);
 247   else if (!ascii_strncasecmp (name, "8859-", 5))
 248     snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5);
 249   else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-')
 250     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7);
 251   else if (!ascii_strncasecmp (name, "iso8859-", 8))
 252     snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8);
 253   else
 254     strfcpy (scratch, NONULL(name), sizeof (scratch));
 255
 256   for (i = 0; PreferredMIMENames[i].key; i++)
 257     if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) ||
 258         !mutt_strcasecmp (scratch, PreferredMIMENames[i].key))
 259     {
 260       strfcpy (dest, PreferredMIMENames[i].pref, dlen);
 261       return;
 262     }
 263
 264   strfcpy (dest, scratch, dlen);
 265
 266   /* for cosmetics' sake, transform to lowercase. */
 267   for (p = dest; *p; p++)
 268     *p = ascii_tolower (*p);
 269 }
 270
 271 int mutt_chscmp (const char *s, const char *chs)
 272 {
 273   char buffer[STRING];
 274
 275   if (!s) return 0;
 276
 277   mutt_canonical_charset (buffer, sizeof (buffer), s);
 278   return !ascii_strcasecmp (buffer, chs);
 279 }
 280
 281
 282 #ifndef HAVE_ICONV
 283
 284 iconv_t iconv_open (const char *tocode, const char *fromcode)
 285 {
 286   return (iconv_t)(-1);
 287 }
 288
 289 size_t iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
 290               char **outbuf, size_t *outbytesleft)
 291 {
 292   return 0;
 293 }
 294
 295 int iconv_close (iconv_t cd)
 296 {
 297   return 0;
 298 }
 299
 300 #endif /* !HAVE_ICONV */
 301
 302
 303 /*
 304  * Like iconv_open, but canonicalises the charsets
 305  */
 306
 307 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
 308 {
 309   char tocode1[SHORT_STRING];
 310   char fromcode1[SHORT_STRING];
 311   char *tocode2, *fromcode2;
 312   char *tmp;
 313
 314   iconv_t cd;
 315
 316   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
 317
 318 #ifdef M_ICONV_HOOK_TO
 319   /* Not used. */
 320   if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
 321     mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
 322 #endif
 323
 324   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
 325   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
 326     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
 327
 328   if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) -1)
 329     return cd;
 330   if ((tocode2 = mutt_iconv_hook (tocode1)) && (fromcode2 = mutt_iconv_hook (fromcode1)))
 331     return iconv_open (tocode2, fromcode2);
 332
 333   return (iconv_t) -1;
 334 }
 335
 336
 337 /*
 338  * Like iconv, but keeps going even when the input is invalid
 339  * If you're supplying inrepls, the source charset should be stateless;
 340  * if you're supplying an outrepl, the target charset should be.
 341  */
 342
 343 size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t *inbytesleft,
 344                    char **outbuf, size_t *outbytesleft,
 345                    ICONV_CONST char **inrepls, const char *outrepl)
 346 {
 347   size_t ret = 0, ret1;
 348   ICONV_CONST char *ib = *inbuf;
 349   size_t ibl = *inbytesleft;
 350   char *ob = *outbuf;
 351   size_t obl = *outbytesleft;
 352
 353   for (;;)
 354   {
 355     ret1 = iconv (cd, &ib, &ibl, &ob, &obl);
 356     if (ret1 != (size_t)-1)
 357       ret += ret1;
 358     if (ibl && obl && errno == EILSEQ)
 359     {
 360       if (inrepls)
 361       {
 362         /* Try replacing the input */
 363         ICONV_CONST char **t;
 364         for (t = inrepls; *t; t++)
 365         {
 366           ICONV_CONST char *ib1 = *t;
 367           size_t ibl1 = strlen (*t);
 368           char *ob1 = ob;
 369           size_t obl1 = obl;
 370           iconv (cd, &ib1, &ibl1, &ob1, &obl1);
 371           if (!ibl1)
 372           {
 373             ++ib, --ibl;
 374             ob = ob1, obl = obl1;
 375             ++ret;
 376             break;
 377           }
 378         }
 379         if (*t)
 380           continue;
 381       }
 382       /* Replace the output */
 383       if (!outrepl)
 384         outrepl = "?";
 385       iconv (cd, 0, 0, &ob, &obl);
 386       if (obl)
 387       {
 388         int n = strlen (outrepl);
 389         if (n > obl)
 390         {
 391           outrepl = "?";
 392           n = 1;
 393         }
 394         memcpy (ob, outrepl, n);
 395         ++ib, --ibl;
 396         ob += n, obl -= n;
 397         ++ret;
 398         iconv (cd, 0, 0, 0, 0); /* for good measure */
 399         continue;
 400       }
 401     }
 402     *inbuf = ib, *inbytesleft = ibl;
 403     *outbuf = ob, *outbytesleft = obl;
 404     return ret;
 405   }
 406 }
 407
 408
 409 /*
 410  * Convert a string
 411  * Used in rfc2047.c and rfc2231.c
 412  */
 413
 414 int mutt_convert_string (char **ps, const char *from, const char *to, int flags)
 415 {
 416   iconv_t cd;
 417   ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
 418   char *s = *ps;
 419
 420   if (!s || !*s)
 421     return 0;
 422
 423   if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t)-1)
 424   {
 425     int len;
 426     ICONV_CONST char *ib;
 427     char *buf, *ob;
 428     size_t ibl, obl;
 429     ICONV_CONST char **inrepls = 0;
 430     char *outrepl = 0;
 431
 432     if (mutt_is_utf8 (to))
 433       outrepl = "\357\277\275";
 434     else if (mutt_is_utf8 (from))
 435       inrepls = repls;
 436     else
 437       outrepl = "?";
 438
 439     len = strlen (s);
 440     ib = s, ibl = len + 1;
 441     obl = MB_LEN_MAX * ibl;
 442     ob = buf = safe_malloc (obl + 1);
 443
 444     mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 445     iconv_close (cd);
 446
 447     *ob = '\0';
 448
 449     FREE (ps);
 450     *ps = buf;
 451
 452     mutt_str_adjust (ps);
 453     return 0;
 454   }
 455   else
 456     return -1;
 457 }
 458
 459
 460 /*
 461  * FGETCONV stuff for converting a file while reading it
 462  * Used in sendlib.c for converting from mutt's Charset
 463  */
 464
 465 struct fgetconv_s
 466 {
 467   FILE *file;
 468   iconv_t cd;
 469   char bufi[512];
 470   char bufo[512];
 471   char *p;
 472   char *ob;
 473   char *ib;
 474   size_t ibl;
 475   ICONV_CONST char **inrepls;
 476 };
 477
 478 struct fgetconv_not
 479 {
 480   FILE *file;
 481   iconv_t cd;
 482 };
 483
 484 FGETCONV *fgetconv_open (FILE *file, const char *from, const char *to, int flags)
 485 {
 486   struct fgetconv_s *fc;
 487   iconv_t cd = (iconv_t)-1;
 488   static ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 };
 489
 490   if (from && to)
 491     cd = mutt_iconv_open (to, from, flags);
 492
 493   if (cd != (iconv_t)-1)
 494   {
 495     fc = safe_malloc (sizeof (struct fgetconv_s));
 496     fc->p = fc->ob = fc->bufo;
 497     fc->ib = fc->bufi;
 498     fc->ibl = 0;
 499     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
 500   }
 501   else
 502     fc = safe_malloc (sizeof (struct fgetconv_not));
 503   fc->file = file;
 504   fc->cd = cd;
 505   return (FGETCONV *)fc;
 506 }
 507
 508 char *fgetconvs (char *buf, size_t l, FGETCONV *_fc)
 509 {
 510   int c;
 511   size_t r;
 512
 513   for (r = 0; r + 1 < l;)
 514   {
 515     if ((c = fgetconv (_fc)) == EOF)
 516       break;
 517     buf[r++] = (char) c;
 518     if (c == '\n')
 519       break;
 520   }
 521   buf[r] = '\0';
 522
 523   if (r)
 524     return buf;
 525   else
 526     return NULL;
 527 }
 528
 529 int fgetconv (FGETCONV *_fc)
 530 {
 531   struct fgetconv_s *fc = (struct fgetconv_s *)_fc;
 532
 533   if (!fc)
 534     return EOF;
 535   if (fc->cd == (iconv_t)-1)
 536     return fgetc (fc->file);
 537   if (!fc->p)
 538     return EOF;
 539   if (fc->p < fc->ob)
 540     return (unsigned char)*(fc->p)++;
 541
 542   /* Try to convert some more */
 543   fc->p = fc->ob = fc->bufo;
 544   if (fc->ibl)
 545   {
 546     size_t obl = sizeof (fc->bufo);
 547     iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
 548     if (fc->p < fc->ob)
 549       return (unsigned char)*(fc->p)++;
 550   }
 551
 552   /* If we trusted iconv a bit more, we would at this point
 553    * ask why it had stopped converting ... */
 554
 555   /* Try to read some more */
 556   if (fc->ibl == sizeof (fc->bufi) ||
 557       (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi)))
 558   {
 559     fc->p = 0;
 560     return EOF;
 561   }
 562   if (fc->ibl)
 563     memcpy (fc->bufi, fc->ib, fc->ibl);
 564   fc->ib = fc->bufi;
 565   fc->ibl += fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
 566
 567   /* Try harder this time to convert some */
 568   if (fc->ibl)
 569   {
 570     size_t obl = sizeof (fc->bufo);
 571     mutt_iconv (fc->cd, (ICONV_CONST char **)&fc->ib, &fc->ibl, &fc->ob, &obl,
 572                 fc->inrepls, 0);
 573     if (fc->p < fc->ob)
 574       return (unsigned char)*(fc->p)++;
 575   }
 576
 577   /* Either the file has finished or one of the buffers is too small */
 578   fc->p = 0;
 579   return EOF;
 580 }
 581
 582 void fgetconv_close (FGETCONV **_fc)
 583 {
 584   struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
 585
 586   if (fc->cd != (iconv_t)-1)
 587     iconv_close (fc->cd);
 588   FREE (_fc);
 589 }