charset.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 #if HAVE_CONFIG_H
  11 # include "config.h"
  12 #endif
  13
  14 #include <string.h>
  15 #include <stdio.h>
  16 #include <stdlib.h>
  17
  18 #include <ctype.h>
  19
  20 #include <sys/types.h>
  21 #include <dirent.h>
  22 #include <unistd.h>
  23 #include <errno.h>
  24 #ifdef HAVE_LANGINFO_CODESET
  25 #  include <langinfo.h>
  26 #endif
  27
  28 #include <lib-lib/mem.h>
  29 #include <lib-lib/ascii.h>
  30 #include <lib-lib/str.h>
  31 #include <lib-lib/macros.h>
  32
  33 #include "mutt.h"
  34 #include "charset.h"
  35
  36 #ifndef EILSEQ
  37 #  define EILSEQ EINVAL
  38 #endif
  39
  40 char *Charset;
  41 int Charset_is_utf8 = 0;
  42
  43 /*
  44  * The following list has been created manually from the data under:
  45  * http://www.isi.edu/in-notes/iana/assignments/character-sets
  46  * Last update: 2000-09-07
  47  *
  48  * Note that it includes only the subset of character sets for which
  49  * a preferred MIME name is given.
  50  */
  51
  52 static struct {
  53     const char *key;
  54     const char *pref;
  55 } PreferredMIMENames[] = {
  56     {"ansi_x3.4-1968", "us-ascii"},
  57     {"iso-ir-6", "us-ascii"},
  58     {"iso_646.irv:1991", "us-ascii"},
  59     {"ascii", "us-ascii"},
  60     {"iso646-us", "us-ascii"},
  61     {"us", "us-ascii"},
  62     {"ibm367", "us-ascii"},
  63     {"cp367", "us-ascii"},
  64     {"csascii", "us-ascii"},
  65     {"csiso2022kr", "iso-2022-kr"},
  66     {"cseuckr", "euc-kr"},
  67     {"csiso2022jp", "iso-2022-jp"},
  68     {"csiso2022jp2", "iso-2022-jp-2"},
  69     {"iso_8859-1:1987", "iso-8859-1"},
  70     {"iso-ir-100", "iso-8859-1"},
  71     {"iso_8859-1", "iso-8859-1"},
  72     {"latin1", "iso-8859-1"},
  73     {"l1", "iso-8859-1"},
  74     {"ibm819", "iso-8859-1"},
  75     {"cp819", "iso-8859-1"},
  76     {"csisolatin1", "iso-8859-1"},
  77     {"iso_8859-2:1987", "iso-8859-2"},
  78     {"iso-ir-101", "iso-8859-2"},
  79     {"iso_8859-2", "iso-8859-2"},
  80     {"latin2", "iso-8859-2"},
  81     {"l2", "iso-8859-2"},
  82     {"csisolatin2", "iso-8859-2"},
  83     {"iso_8859-3:1988", "iso-8859-3"},
  84     {"iso-ir-109", "iso-8859-3"},
  85     {"iso_8859-3", "iso-8859-3"},
  86     {"latin3", "iso-8859-3"},
  87     {"l3", "iso-8859-3"},
  88     {"csisolatin3", "iso-8859-3"},
  89     {"iso_8859-4:1988", "iso-8859-4"},
  90     {"iso-ir-110", "iso-8859-4"},
  91     {"iso_8859-4", "iso-8859-4"},
  92     {"latin4", "iso-8859-4"},
  93     {"l4", "iso-8859-4"},
  94     {"csisolatin4", "iso-8859-4"},
  95     {"iso_8859-6:1987", "iso-8859-6"},
  96     {"iso-ir-127", "iso-8859-6"},
  97     {"iso_8859-6", "iso-8859-6"},
  98     {"ecma-114", "iso-8859-6"},
  99     {"asmo-708", "iso-8859-6"},
 100     {"arabic", "iso-8859-6"},
 101     {"csisolatinarabic", "iso-8859-6"},
 102     {"iso_8859-7:1987", "iso-8859-7"},
 103     {"iso-ir-126", "iso-8859-7"},
 104     {"iso_8859-7", "iso-8859-7"},
 105     {"elot_928", "iso-8859-7"},
 106     {"ecma-118", "iso-8859-7"},
 107     {"greek", "iso-8859-7"},
 108     {"greek8", "iso-8859-7"},
 109     {"csisolatingreek", "iso-8859-7"},
 110     {"iso_8859-8:1988", "iso-8859-8"},
 111     {"iso-ir-138", "iso-8859-8"},
 112     {"iso_8859-8", "iso-8859-8"},
 113     {"hebrew", "iso-8859-8"},
 114     {"csisolatinhebrew", "iso-8859-8"},
 115     {"iso_8859-5:1988", "iso-8859-5"},
 116     {"iso-ir-144", "iso-8859-5"},
 117     {"iso_8859-5", "iso-8859-5"},
 118     {"cyrillic", "iso-8859-5"},
 119     {"csisolatincyrillic", "iso8859-5"},
 120     {"iso_8859-9:1989", "iso-8859-9"},
 121     {"iso-ir-148", "iso-8859-9"},
 122     {"iso_8859-9", "iso-8859-9"},
 123     {"latin5", "iso-8859-9"},
 124     {"l5", "iso-8859-9"},
 125     {"csisolatin5", "iso-8859-9"},
 126     {"iso_8859-10:1992", "iso-8859-10"},
 127     {"iso-ir-157", "iso-8859-10"},
 128     {"latin6", "iso-8859-10"},
 129     {"l6", "iso-8859-10"},
 130     {"csisolatin6", "iso-8859-10"},
 131     {"cskoi8r", "koi8-r"},
 132     {"ms_kanji", "shift_jis"},
 133     {"csshiftjis", "shift_jis"},
 134     {"extended_unix_code_packed_format_for_japanese", "euc-jp"},
 135     {"cseucpkdfmtjapanese", "euc-jp"},
 136     {"csgb2312", "gb2312"},
 137     {"csbig5", "big5"},
 138     /* end of official brain damage.
 139        what follows has been taken * from glibc's localedata files.  */
 140     {"iso_8859-13", "iso-8859-13"},
 141     {"iso-ir-179", "iso-8859-13"},
 142     {"latin7", "iso-8859-13"},
 143     {"l7", "iso-8859-13"},
 144     {"iso_8859-14", "iso-8859-14"},
 145     {"latin8", "iso-8859-14"},
 146     {"l8", "iso-8859-14"},
 147     {"iso_8859-15", "iso-8859-15"},
 148     {"latin9", "iso-8859-15"},
 149     {"latin0", "iso-8859-15"},
 150     {"iso_8859-16", "iso-8859-16"},
 151     {"latin10", "iso-8859-16"},
 152     {"646", "us-ascii"},
 153     {"eucjp", "euc-jp"},
 154     {"pck", "shift_jis"},
 155     {"ko_kr-euc", "euc-kr"},
 156     {"zh_tw-big5", "big5"},
 157     {"sjis", "shift_jis"},
 158     {"euc-jp-ms", "eucjp-ms"},
 159     {NULL, NULL}
 160 };
 161
 162 void mutt_set_langinfo_charset (void)
 163 {
 164 #ifdef HAVE_LANGINFO_CODESET
 165     char buff[LONG_STRING];
 166     char buff2[LONG_STRING];
 167
 168     m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET));
 169     mutt_canonical_charset(buff2, sizeof(buff2), buff);
 170
 171     /* finally, set $charset */
 172     if (!(Charset = m_strdup(buff2)))
 173 #endif
 174         Charset = m_strdup("iso-8859-1");
 175 }
 176
 177
 178 void mutt_canonical_charset(char *dest, ssize_t dlen, const char *name)
 179 {
 180     ssize_t i;
 181     char *p;
 182     char scratch[LONG_STRING];
 183
 184     m_strcpy(scratch, sizeof(scratch), name);
 185     m_strtolower(scratch);
 186
 187     /* catch some common iso-8859-something misspellings */
 188     if (!strncmp(scratch, "8859", 4)) {
 189         snprintf(scratch, sizeof(scratch), "iso-8859-%s",
 190                  name + 4 + (name[4] == '-'));
 191         m_strtolower(scratch);
 192     } else
 193     if (!strncmp(scratch, "iso8859", 7)) {
 194         snprintf(scratch, sizeof(scratch), "iso-8859-%s",
 195                  name + 7 + (name[7] == '-'));
 196         m_strtolower(scratch);
 197     }
 198
 199     for (i = 0; PreferredMIMENames[i].key; i++) {
 200         if (!strcmp(scratch, PreferredMIMENames[i].key)) {
 201             m_strcpy(dest, dlen, PreferredMIMENames[i].pref);
 202             return;
 203         }
 204     }
 205
 206     m_strcpy(dest, dlen, scratch);
 207 }
 208
 209 static int mutt_chscmp(const char *s, const char *chs)
 210 {
 211     char buffer[STRING];
 212
 213     if (!s)
 214         return 0;
 215
 216     mutt_canonical_charset(buffer, sizeof(buffer), s);
 217     return !strcmp(buffer, chs);
 218 }
 219
 220 int mutt_is_utf8(const char *s)
 221 {
 222     return mutt_chscmp(s, "utf-8");
 223 }
 224
 225 int mutt_is_us_ascii(const char *s)
 226 {
 227     return mutt_chscmp(s, "us-ascii");
 228 }
 229
 230
 231 /*
 232  * Like iconv_open, but canonicalises the charsets
 233  */
 234
 235 iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags)
 236 {
 237   char tocode1[SHORT_STRING];
 238   char fromcode1[SHORT_STRING];
 239   char *tocode2, *fromcode2;
 240   char *tmp;
 241
 242   iconv_t cd;
 243
 244   mutt_canonical_charset (tocode1, sizeof (tocode1), tocode);
 245
 246 #ifdef M_ICONV_HOOK_TO
 247   /* Not used. */
 248   if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1)))
 249     mutt_canonical_charset (tocode1, sizeof (tocode1), tmp);
 250 #endif
 251
 252   mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode);
 253   if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1)))
 254     mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp);
 255
 256   if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) - 1)
 257     return cd;
 258   if ((tocode2 = mutt_iconv_hook (tocode1))
 259       && (fromcode2 = mutt_iconv_hook (fromcode1)))
 260     return iconv_open (tocode2, fromcode2);
 261
 262   return (iconv_t) - 1;
 263 }
 264
 265
 266 /*
 267  * Like iconv, but keeps going even when the input is invalid
 268  * If you're supplying inrepls, the source charset should be stateless;
 269  * if you're supplying an outrepl, the target charset should be.
 270  */
 271
 272 ssize_t mutt_iconv(iconv_t cd, const char **inbuf, ssize_t *inbytesleft,
 273                    char **outbuf, ssize_t *outbytesleft,
 274                    const char **inrepls, const char *outrepl)
 275 {
 276   ssize_t ret = 0, ret1;
 277   const char *ib = *inbuf;
 278   ssize_t ibl = *inbytesleft;
 279   char *ob = *outbuf;
 280   ssize_t obl = *outbytesleft;
 281
 282   for (;;) {
 283     ret1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
 284     if (ret1 != -1)
 285       ret += ret1;
 286     if (ibl && obl && errno == EILSEQ) {
 287       if (inrepls) {
 288         /* Try replacing the input */
 289         const char **t;
 290
 291         for (t = inrepls; *t; t++) {
 292           const char *ib1 = *t;
 293           ssize_t ibl1 = m_strlen(*t);
 294           char *ob1 = ob;
 295           ssize_t obl1 = obl;
 296
 297           my_iconv(cd, &ib1, &ibl1, &ob1, &obl1);
 298           if (!ibl1) {
 299             ++ib, --ibl;
 300             ob = ob1, obl = obl1;
 301             ++ret;
 302             break;
 303           }
 304         }
 305         if (*t)
 306           continue;
 307       }
 308       /* Replace the output */
 309       if (!outrepl)
 310         outrepl = "?";
 311       my_iconv(cd, 0, 0, &ob, &obl);
 312       if (obl) {
 313         ssize_t n = m_strlen(outrepl);
 314
 315         if (n > obl) {
 316           outrepl = "?";
 317           n = 1;
 318         }
 319         memcpy (ob, outrepl, n);
 320         ++ib, --ibl;
 321         ob += n, obl -= n;
 322         ++ret;
 323         my_iconv(cd, 0, 0, 0, 0); /* for good measure */
 324         continue;
 325       }
 326     }
 327     *inbuf = ib, *inbytesleft = ibl;
 328     *outbuf = ob, *outbytesleft = obl;
 329     return ret;
 330   }
 331 }
 332
 333
 334 /*
 335  * Convert a string
 336  * Used in rfc2047.c and rfc2231.c
 337  */
 338
 339 int mutt_convert_string (char **ps, const char *from, const char *to,
 340                          int flags)
 341 {
 342   iconv_t cd;
 343   const char *repls[] = { "\357\277\275", "?", 0 };
 344   char *s = *ps;
 345
 346   if (!s || !*s)
 347     return 0;
 348
 349   if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t) - 1) {
 350     int len;
 351     const char *ib;
 352     char *buf, *ob;
 353     ssize_t ibl, obl;
 354     const char **inrepls = NULL;
 355     const char *outrepl = NULL;
 356
 357     if (mutt_is_utf8 (to))
 358       outrepl = "\357\277\275";
 359     else if (mutt_is_utf8 (from))
 360       inrepls = repls;
 361     else
 362       outrepl = "?";
 363
 364     len = m_strlen(s);
 365     ib = s, ibl = len + 1;
 366     obl = MB_LEN_MAX * ibl;
 367     ob = buf = xmalloc(obl + 1);
 368
 369     mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 370     iconv_close (cd);
 371
 372     *ob = '\0';
 373
 374     p_delete(ps);
 375     *ps = buf;
 376     return 0;
 377   }
 378   else
 379     return -1;
 380 }
 381
 382
 383 /*
 384  * FGETCONV stuff for converting a file while reading it
 385  * Used in sendlib.c for converting from mutt's Charset
 386  */
 387
 388 struct fgetconv_s {
 389   FILE *file;
 390   iconv_t cd;
 391   char bufi[512];
 392   char bufo[512];
 393   char *p;
 394   char *ob;
 395   char *ib;
 396   ssize_t ibl;
 397   const char **inrepls;
 398 };
 399
 400 struct fgetconv_not {
 401   FILE *file;
 402   iconv_t cd;
 403 };
 404
 405 FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to,
 406                          int flags)
 407 {
 408   struct fgetconv_s *fc;
 409   iconv_t cd = (iconv_t) - 1;
 410   static const char *repls[] = { "\357\277\275", "?", 0 };
 411
 412   if (from && to)
 413     cd = mutt_iconv_open (to, from, flags);
 414
 415   if (cd != (iconv_t) - 1) {
 416     fc = p_new(struct fgetconv_s, 1);
 417     fc->p = fc->ob = fc->bufo;
 418     fc->ib = fc->bufi;
 419     fc->ibl = 0;
 420     fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1;
 421   }
 422   else
 423     fc = p_new(struct fgetconv_s, 1);
 424   fc->file = file;
 425   fc->cd = cd;
 426   return (FGETCONV *) fc;
 427 }
 428
 429 char *fgetconvs (char *buf, ssize_t l, FGETCONV * _fc)
 430 {
 431   int c;
 432   ssize_t r;
 433
 434   for (r = 0; r + 1 < l;) {
 435     if ((c = fgetconv (_fc)) == EOF)
 436       break;
 437     buf[r++] = (char) c;
 438     if (c == '\n')
 439       break;
 440   }
 441   buf[r] = '\0';
 442
 443   if (r)
 444     return buf;
 445   else
 446     return NULL;
 447 }
 448
 449 int fgetconv (FGETCONV * _fc)
 450 {
 451   struct fgetconv_s *fc = (struct fgetconv_s *) _fc;
 452
 453   if (!fc)
 454     return EOF;
 455   if (fc->cd == (iconv_t) - 1)
 456     return fgetc (fc->file);
 457   if (!fc->p)
 458     return EOF;
 459   if (fc->p < fc->ob)
 460     return (unsigned char) *(fc->p)++;
 461
 462   /* Try to convert some more */
 463   fc->p = fc->ob = fc->bufo;
 464   if (fc->ibl) {
 465     ssize_t obl = ssizeof(fc->bufo);
 466
 467     my_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl);
 468     if (fc->p < fc->ob)
 469       return (unsigned char) *(fc->p)++;
 470   }
 471
 472   /* If we trusted iconv a bit more, we would at this point
 473    * ask why it had stopped converting ... */
 474
 475   /* Try to read some more */
 476   if (fc->ibl == sizeof (fc->bufi) ||
 477       (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof (fc->bufi))) {
 478     fc->p = 0;
 479     return EOF;
 480   }
 481   if (fc->ibl)
 482     memcpy (fc->bufi, fc->ib, fc->ibl);
 483   fc->ib = fc->bufi;
 484   fc->ibl +=
 485     fread (fc->ib + fc->ibl, 1, sizeof (fc->bufi) - fc->ibl, fc->file);
 486
 487   /* Try harder this time to convert some */
 488   if (fc->ibl) {
 489     ssize_t obl = ssizeof(fc->bufo);
 490
 491     mutt_iconv (fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob,
 492                 &obl, fc->inrepls, 0);
 493     if (fc->p < fc->ob)
 494       return (unsigned char) *(fc->p)++;
 495   }
 496
 497   /* Either the file has finished or one of the buffers is too small */
 498   fc->p = 0;
 499   return EOF;
 500 }
 501
 502 void fgetconv_close (FGETCONV ** _fc)
 503 {
 504   struct fgetconv_s *fc = (struct fgetconv_s *) *_fc;
 505
 506   if (fc->cd != (iconv_t) - 1)
 507     iconv_close (fc->cd);
 508   p_delete(_fc);
 509 }
 510
 511 const char *mutt_get_first_charset (const char *charset)
 512 {
 513   static char fcharset[SHORT_STRING];
 514   const char *c, *c1;
 515
 516   c = charset;
 517   if (!m_strlen(c))
 518     return "us-ascii";
 519   if (!(c1 = strchr (c, ':')))
 520     return ((char*) charset);
 521   m_strcpy(fcharset, c1 - c + 1, c);
 522   return fcharset;
 523 }
 524
 525 static ssize_t convert_string (const char *f, ssize_t flen,
 526                               const char *from, const char *to,
 527                               char **t, ssize_t * tlen)
 528 {
 529   iconv_t cd;
 530   char *buf, *ob;
 531   ssize_t obl;
 532   ssize_t n;
 533   int e;
 534
 535   cd = mutt_iconv_open (to, from, 0);
 536   if (cd == (iconv_t) (-1))
 537     return -1;
 538   obl = 4 * flen + 1;
 539   ob = buf = xmalloc(obl);
 540   n = my_iconv(cd, &f, &flen, &ob, &obl);
 541   if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
 542     e = errno;
 543     p_delete(&buf);
 544     iconv_close (cd);
 545     errno = e;
 546     return -1;
 547   }
 548   *ob = '\0';
 549
 550   *tlen = ob - buf;
 551
 552   p_realloc(&buf, ob - buf + 1);
 553   *t = buf;
 554   iconv_close (cd);
 555
 556   return n;
 557 }
 558
 559 int mutt_convert_nonmime_string (char **ps)
 560 {
 561   const char *c, *c1;
 562
 563   for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0) {
 564     char *u = *ps;
 565     char *s = NULL;
 566     char *fromcode;
 567     ssize_t m, n;
 568     ssize_t ulen = m_strlen(*ps);
 569     ssize_t slen;
 570
 571     if (!u || !*u)
 572       return 0;
 573
 574     c1 = strchr (c, ':');
 575     n = c1 ? c1 - c : m_strlen(c);
 576     if (!n)
 577       continue;
 578     fromcode = p_dupstr(c, n);
 579     m = convert_string (u, ulen, fromcode, Charset, &s, &slen);
 580     p_delete(&fromcode);
 581     if (m != -1) {
 582       p_delete(ps);
 583       *ps = s;
 584       return 0;
 585     }
 586   }
 587   return -1;
 588 }
 589
 590 void mutt_set_charset (char *charset)
 591 {
 592     char buffer[STRING];
 593
 594     mutt_canonical_charset (buffer, sizeof (buffer), charset);
 595     Charset_is_utf8 = !strcmp(buffer, "utf-8");
 596
 597 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
 598     bind_textdomain_codeset (PACKAGE, buffer);
 599 #endif
 600 }
 601
 602 wchar_t replacement_char(void)
 603 {
 604     return Charset_is_utf8 ? 0xfffd : '?';
 605 }