charset.cpkg

   1 /*
   2  *  This program is free software; you can redistribute it and/or modify
   3  *  it under the terms of the GNU General Public License as published by
   4  *  the Free Software Foundation; either version 2 of the License, or (at
   5  *  your option) any later version.
   6  *
   7  *  This program is distributed in the hope that it will be useful, but
   8  *  WITHOUT ANY WARRANTY; without even the implied warranty of
   9  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  10  *  General Public License for more details.
  11  *
  12  *  You should have received a copy of the GNU General Public License
  13  *  along with this program; if not, write to the Free Software
  14  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  15  *  MA 02110-1301, USA.
  16  *
  17  *  Copyright © 2006 Pierre Habouzit
  18  */
  19 /*
  20  * Copyright notice from original mutt:
  21  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
  22  *
  23  * This file is part of mutt-ng, see http://www.muttng.org/.
  24  * It's licensed under the GNU General Public License,
  25  * please see the file GPL in the top level source directory.
  26  */
  27
  28 #include <lib-lib/lib-lib.h>
  29
  30 #ifdef HAVE_LANGINFO_CODESET
  31 #  include <langinfo.h>
  32 #endif
  33
  34 #include "charset.h"
  35
  36 #ifndef EILSEQ
  37 #  define EILSEQ EINVAL
  38 #endif
  39 @import "lib-lua/base.cpkg"
  40
   41 static rx_t *charset_hooks = NULL;   /* rules registered through MCharset.charset_hook() */
   42 static rx_t *iconv_hooks   = NULL;   /* rules registered through MCharset.iconv_hook() */
   43
   44 @package MCharset {
   45     /*
   46      ** .pp
   47      ** This variable is a colon-separated list of character encoding
   48      ** schemes for messages without character encoding indication.
   49      ** Header field values and message body content without character encoding
   50      ** indication are assumed to be written in one of the encodings of this list.
   51      ** By default, all the header fields and message body without any charset
   52      ** indication are assumed to be in \fTus-ascii\fP.
   53      ** .pp
   54      ** For example, Japanese users might prefer this:
   55      ** .pp
   56      ** \fTset assumed_charset="iso-2022-jp:euc-jp:shift_jis:utf-8"\fP
   57      ** .pp
   58      ** However, only the first entry is valid for the message body.
   59      ** This variable is valid only if $$strict_mime is unset.
   60      */
   61     string_t assumed_charset = m_strdup("us-ascii");
   62
   63     /*
   64      ** .pp
   65      ** Character set your terminal uses to display and enter textual data.
   66      */
   67     string_t charset         = NULL;
   68
   69     /*
   70      ** .pp
   71      ** This variable is a colon-separated list of character encoding
   72      ** schemes for text file attachments.
   73      ** If \fIunset\fP, $$charset value will be used instead.
   74      ** For example, the following configuration would work for Japanese
   75      ** text handling:
   76      ** .pp
   77      ** \fTset file_charset="iso-2022-jp:euc-jp:shift_jis:utf-8"\fP
   78      ** .pp
   79      ** Note: ``\fTiso-2022-*\fP'' must be put at the head of the value as shown above
   80      ** if included.
   81      */
   82     string_t file_charset    = NULL;
   83
   84     /*
   85      ** .pp
   86      ** A list of character sets for outgoing messages. Madmutt will use the
   87      ** first character set into which the text can be converted exactly.
   88      ** If your ``$$charset'' is not \fTiso-8859-1\fP and recipients may not
   89      ** understand \fTUTF-8\fP, it is advisable to include in the list an
   90      ** appropriate widely used standard character set (such as
   91      ** \fTiso-8859-2\fP, \fTkoi8-r\fP or \fTiso-2022-jp\fP) either
   92      ** instead of or after \fTiso-8859-1\fP.
   93      */
   94     string_t send_charset    = m_strdup("us-ascii:iso-8859-1:utf-8");
   95
   96     void charset_hook(rx_t local, const string_t alias) {
   97         rx_set_template(local, alias);
   98         rx_list_append(&charset_hooks, local);
   99     };
  100
  101     void iconv_hook(rx_t local, const string_t alias) {
  102         rx_set_template(local, alias);
  103         rx_list_append(&iconv_hooks, local);
  104     };
  105 };
 106
  107 int Charset_is_utf8 = 0;             /* set by charset_initialize(): non-zero when $charset is "utf-8" */
  108 wchar_t CharsetReplacement = '?';    /* set by charset_initialize(): 0xfffd when Charset_is_utf8, else '?' */
 109
 110
 111 /****************************************************************************/
 112 /* charset functions                                                        */
 113 /****************************************************************************/
 114
 115 void charset_initialize(void)
 116 {
 117 #ifdef HAVE_LANGINFO_CODESET
 118     char buff[STRING];
 119     char buff2[STRING];
 120
 121     m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET));
 122     charset_canonicalize(buff2, sizeof(buff2), buff);
 123
 124     /* finally, set $charset */
 125     if (!m_strisempty(buff2)) {
 126         m_strreplace(&MCharset.charset, buff2);
 127     } else
 128 #endif
 129     {
 130         m_strreplace(&MCharset.charset, "iso-8859-1");
 131     }
 132
 133     Charset_is_utf8    = !m_strcmp(MCharset.charset, "utf-8");
 134     CharsetReplacement = Charset_is_utf8 ? 0xfffd : '?';
 135
 136 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
 137     bind_textdomain_codeset(PACKAGE, MCharset.charset);
 138 #endif
 139 }
 140
 141 #include "charset.gperf"
 142 void charset_canonicalize(char *dest, ssize_t dlen, const char *name)
 143 {
 144     const struct cset_pair *cp;
 145     char scratch[STRING];
 146     const char *p;
 147     int i = 0;
 148
 149     if (!name) {
 150         m_strcpy(dest, dlen, "us-ascii");
 151         return;
 152     }
 153
 154     // canonize name: only keep a-z0-9 and dots, put into lowercase
 155     for (p = name; *p && *p != ':' && i < ssizeof(scratch) - 1; p++) {
 156         if (isalnum(*p) || *p== '.') {
 157             scratch[i++] = tolower((unsigned char)*p);
 158         }
 159     }
 160     scratch[i] = '\0';
 161
 162     cp = charset_canonicalize_aux(scratch, strlen(scratch));
 163     if (cp) {
 164         m_strcpy(dest, dlen, cp->pref);
 165     } else {
 166         m_strcpy(dest, dlen, name);
 167         m_strtolower(dest);
 168     }
 169 }
 170
 171 /* XXX: MC: UGLY return of local static */
 172 const char *charset_getfirst(const char *charset)
 173 {
 174     static char fcharset[STRING];
 175     const char *p;
 176
 177     if (m_strisempty(charset))
 178         return "us-ascii";
 179
 180     p = m_strchrnul(charset, ':');
 181     m_strncpy(fcharset, sizeof(fcharset), charset, p - charset);
 182     return fcharset;
 183 }
 184
 185 int charset_is_utf8(const char *s)
 186 {
 187     char buf[STRING];
 188     charset_canonicalize(buf, sizeof(buf), s);
 189     return !m_strcmp(buf, "utf-8");
 190 }
 191
 192 int charset_is_us_ascii(const char *s)
 193 {
 194     char buf[STRING];
 195     charset_canonicalize(buf, sizeof(buf), s);
 196     return !m_strcmp(buf, "us-ascii");
 197 }
 198
 199
 200 /****************************************************************************/
  201 /* iconv-like functions                                                     */
 202 /****************************************************************************/
 203
 204 /* Like iconv_open, but canonicalises the charsets */
 205 iconv_t mutt_iconv_open(const char *tocode, const char *fromcode, int flags)
 206 {
 207     char to1[STRING];
 208     char from1[STRING];
 209     char tmp[STRING];
 210     iconv_t cd;
 211
 212     if ((flags & M_ICONV_HOOK_TO)
 213     &&  rx_list_match2(charset_hooks, tocode, tmp, sizeof(tmp))) {
 214         charset_canonicalize(to1, sizeof(to1), tmp);
 215     } else {
 216         charset_canonicalize(to1, sizeof(to1), tocode);
 217     }
 218
 219     if ((flags & M_ICONV_HOOK_FROM)
 220     &&  rx_list_match2(charset_hooks, fromcode, tmp, sizeof(tmp))) {
 221         charset_canonicalize(from1, sizeof(from1), tmp);
 222     } else {
 223         charset_canonicalize(from1, sizeof(from1), fromcode);
 224     }
 225
 226     if ((cd = iconv_open(to1, from1)) != MUTT_ICONV_ERROR)
 227         return cd;
 228
 229     {
 230         char to2[STRING];
 231         char from2[STRING];
 232
 233         if (rx_list_match2(iconv_hooks, to1, to2, sizeof(to2))
 234         &&  rx_list_match2(iconv_hooks, from1, from2, sizeof(from2)))
 235             return iconv_open(to2, from2);
 236     }
 237     return MUTT_ICONV_ERROR;
 238 }
 239
 240
 241 /* Like iconv, but keeps going even when the input is invalid
 242    If you're supplying inrepls, the source charset should be stateless;
 243    if you're supplying an outrepl, the target charset should be.  */
 244 /* XXX: MC: I do not understand what it does yet */
 245 ssize_t mutt_iconv(iconv_t cd,
 246                    const char **inbuf, ssize_t *inbytesleft,
 247                    char **outbuf, ssize_t *outbytesleft,
 248                    const char **inrepls, const char *outrepl)
 249 {
 250     ssize_t ret = 0, ret1;
 251     const char *ib = *inbuf;
 252     ssize_t ibl = *inbytesleft;
 253     char *ob = *outbuf;
 254     ssize_t obl = *outbytesleft;
 255
 256     for (;;) {
 257         ret1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
 258         if (ret1 != -1)
 259             ret += ret1;
 260
 261         if (ibl && obl && errno == EILSEQ) {
 262             if (inrepls) {
 263                 /* Try replacing the input */
 264                 const char **t;
 265
 266                 for (t = inrepls; *t; t++) {
 267                     const char *ib1 = *t;
 268                     ssize_t ibl1 = m_strlen(*t);
 269                     char *ob1 = ob;
 270                     ssize_t obl1 = obl;
 271
 272                     my_iconv(cd, &ib1, &ibl1, &ob1, &obl1);
 273                     if (!ibl1) {
 274                         ++ib, --ibl;
 275                         ob = ob1, obl = obl1;
 276                         ++ret;
 277                         break;
 278                     }
 279                 }
 280                 if (*t)
 281                     continue;
 282             }
 283             /* Replace the output */
 284             if (!outrepl)
 285                 outrepl = "?";
 286             my_iconv(cd, 0, 0, &ob, &obl);
 287             if (obl) {
 288                 ssize_t n = m_strlen(outrepl);
 289
 290                 if (n > obl) {
 291                     outrepl = "?";
 292                     n = 1;
 293                 }
 294                 memcpy(ob, outrepl, n);
 295                 ++ib, --ibl;
 296                 ob += n, obl -= n;
 297                 ++ret;
 298                 my_iconv(cd, 0, 0, 0, 0); /* for good measure */
 299                 continue;
 300             }
 301         }
 302         *inbuf = ib, *inbytesleft = ibl;
 303         *outbuf = ob, *outbytesleft = obl;
 304         return ret;
 305     }
 306 }
 307
 308 /* Convert a string */
 309 int
 310 mutt_convert_string(char **ps, const char *from, const char *to, int flags)
 311 {
 312     iconv_t cd;
 313     const char *repls[] = { "\357\277\275", "?", 0 };
 314
 315     if (m_strisempty(*ps))
 316         return 0;
 317
 318     cd = mutt_iconv_open(to, from, flags);
 319     if (cd != MUTT_ICONV_ERROR) {
 320         const char **inrepls = NULL;
 321         const char *outrepl = NULL;
 322         const char *ib;
 323         char *buf, *ob;
 324         ssize_t ibl, obl;
 325
 326         if (charset_is_utf8(to))
 327             outrepl = "\357\277\275";
 328         else
 329         if (charset_is_utf8(from))
 330             inrepls = repls;
 331         else
 332             outrepl = "?";
 333
 334         ibl = m_strlen(*ps) + 1;
 335         ib  = *ps;
 336
 337         obl = MB_LEN_MAX * ibl;
 338         ob  = buf = p_new(char, obl + 1);
 339
 340         mutt_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 341         iconv_close(cd);
 342
 343         *ob = '\0';
 344
 345         p_delete(ps);
 346         *ps = buf;
 347         return 0;
 348     }
 349
 350     return -1;
 351 }
 352
 353 static ssize_t convert_string(const char *f, ssize_t flen,
 354                               const char *from, const char *to,
 355                               char **t, ssize_t * tlen)
 356 {
 357     iconv_t cd;
 358     char *buf, *ob;
 359     ssize_t obl;
 360     ssize_t n;
 361     int e;
 362
 363     if ((cd = mutt_iconv_open(to, from, 0)) == MUTT_ICONV_ERROR)
 364         return -1;
 365
 366     obl = 4 * flen + 1;
 367     ob  = buf = p_new(char, obl);
 368     n   = my_iconv(cd, &f, &flen, &ob, &obl);
 369
 370     if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
 371         e = errno;
 372         p_delete(&buf);
 373         iconv_close(cd);
 374         errno = e;
 375         return -1;
 376     }
 377
 378     *ob   = '\0';
 379     *tlen = ob - buf;
 380     *t    = buf;
 381     iconv_close(cd);
 382     return n;
 383 }
 384
 385 int mutt_convert_nonmime_string(char **ps)
 386 {
 387     const char *p = MCharset.assumed_charset;
 388     ssize_t ulen = m_strlen(*ps);
 389     char *u = *ps;
 390
 391     while (*p) {
 392         const char *q;
 393         char fromcode[LONG_STRING], *s = NULL;
 394         ssize_t slen;
 395
 396         if (!ulen)
 397             return 0;
 398
 399         while (*p == ':')
 400             *p++;
 401
 402         q = m_strchrnul(p, ':');
 403         m_strncpy(fromcode, sizeof(fromcode), p, q - p);
 404         p = q;
 405
 406         if (convert_string(u, ulen, fromcode, MCharset.charset, &s, &slen) >= 0) {
 407             p_delete(ps);
 408             *ps = s;
 409             return 0;
 410         }
 411     }
 412
 413     return -1;
 414 }
 415
 416 /****************************************************************************/
 417 /* fgetconv functions                                                       */
 418 /****************************************************************************/
 419
 420 /* fgetconv_t stuff for converting a file while reading it
 421    Used in sendlib.c for converting from mutt's charset */
 422
  423 struct fgetconv_t {          /* state of a FILE being converted while it is read */
  424     FILE *file;              /* stream the raw bytes are read from */
  425     iconv_t cd;              /* conversion descriptor; MUTT_ICONV_ERROR makes fgetconv() a plain fgetc() */
  426     char bufi[BUFSIZ];       /* raw input read from file */
  427     char bufo[BUFSIZ];       /* converted output handed out byte by byte */
  428     char *p;                 /* next byte of bufo to return; NULL once fgetconv() has reported EOF */
  429     char *ob;                /* end of the converted data in bufo */
  430     char *ib;                /* start of the not yet converted input in bufi */
  431     ssize_t ibl;             /* number of not yet converted bytes at ib */
  432     const char **inrepls;    /* replacement list passed to mutt_iconv() */
  433 };
 434
 435 fgetconv_t *
 436 fgetconv_open(FILE *file, const char *from, const char *to, int flags)
 437 {
 438     static const char *repls[] = { "\357\277\275", "?", 0 };
 439
 440     struct fgetconv_t *fc = p_new(struct fgetconv_t, 1);
 441
 442     fc->file = file;
 443     fc->cd   = MUTT_ICONV_ERROR;
 444     if (from && to)
 445         fc->cd = mutt_iconv_open(to, from, flags);
 446
 447     if (fc->cd != MUTT_ICONV_ERROR) {
 448         fc->p  = fc->ob = fc->bufo;
 449         fc->ib = fc->bufi;
 450         fc->ibl = 0;
 451         fc->inrepls = repls + charset_is_utf8(to);
 452     }
 453     return fc;
 454 }
 455
 456 void fgetconv_close(fgetconv_t **fcp)
 457 {
 458     struct fgetconv_t *fc = *fcp;
 459
 460     if (fc->cd != MUTT_ICONV_ERROR)
 461         iconv_close (fc->cd);
 462     p_delete(fcp);
 463 }
 464
 465
 466 int fgetconv(fgetconv_t *fc)
 467 {
 468     if (!fc)
 469         return EOF;
 470
 471     if (fc->cd == MUTT_ICONV_ERROR)
 472         return fgetc(fc->file);
 473
 474     if (!fc->p)
 475         return EOF;
 476     if (fc->p < fc->ob)
 477         return (unsigned char)*(fc->p)++;
 478
 479     /* Try to convert some more */
 480     fc->p = fc->ob = fc->bufo;
 481     if (fc->ibl) {
 482         ssize_t obl = ssizeof(fc->bufo);
 483
 484         my_iconv(fc->cd, (const char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
 485         if (fc->p < fc->ob)
 486             return (unsigned char)*(fc->p)++;
 487     }
 488
 489     /* If we trusted iconv a bit more, we would at this point
 490      * ask why it had stopped converting ... */
 491
 492     /* Try to read some more */
 493     if (fc->ibl == sizeof(fc->bufi)
 494     || (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))) {
 495         fc->p = NULL;
 496         return EOF;
 497     }
 498
 499     if (fc->ibl) {
 500         memcpy(fc->bufi, fc->ib, fc->ibl);
 501     }
 502     fc->ib = fc->bufi;
 503     fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl,
 504                      fc->file);
 505
 506     /* Try harder this time to convert some */
 507     if (fc->ibl) {
 508         ssize_t obl = ssizeof(fc->bufo);
 509
 510         mutt_iconv(fc->cd, (const char **)&fc->ib, &fc->ibl, &fc->ob, &obl,
 511                    fc->inrepls, 0);
 512         if (fc->p < fc->ob) {
 513             return (unsigned char)*(fc->p)++;
 514         }
 515     }
 516
 517     /* Either the file has finished or one of the buffers is too small */
 518     fc->p = NULL;
 519     return EOF;
 520 }
 521
 522 char *fgetconvs(char *buf, ssize_t len, fgetconv_t *fc)
 523 {
 524     ssize_t pos = 0;
 525
 526     while (pos < len - 1) {
 527         int c = fgetconv(fc);
 528         if (c == EOF)
 529             break;
 530
 531         buf[pos++] = c;
 532         if (c == '\n')
 533             break;
 534     }
 535     buf[pos] = '\0';
 536
 537     return pos ? buf : NULL;
 538 }
 539
 540 /* vim:set ft=c: */