charset.cpkg

   1 /*
   2  *  This program is free software; you can redistribute it and/or modify
   3  *  it under the terms of the GNU General Public License as published by
   4  *  the Free Software Foundation; either version 2 of the License, or (at
   5  *  your option) any later version.
   6  *
   7  *  This program is distributed in the hope that it will be useful, but
   8  *  WITHOUT ANY WARRANTY; without even the implied warranty of
   9  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  10  *  General Public License for more details.
  11  *
  12  *  You should have received a copy of the GNU General Public License
  13  *  along with this program; if not, write to the Free Software
  14  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  15  *  MA 02110-1301, USA.
  16  *
  17  *  Copyright © 2006 Pierre Habouzit
  18  */
  19 /*
  20  * Copyright notice from original mutt:
  21  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
  22  *
  23  * This file is part of mutt-ng, see http://www.muttng.org/.
  24  * It's licensed under the GNU General Public License,
  25  * please see the file GPL in the top level source directory.
  26  */
  27
  28 #include <lib-lib/lib-lib.h>
  29
  30 #ifdef HAVE_LANGINFO_H
  31 #  include <langinfo.h>
  32 #endif
  33
  34 #include "charset.h"
  35
  36 #ifndef EILSEQ
  37 #  define EILSEQ EINVAL
  38 #endif
  39 @import "lib-lua/base.cpkg"
  40
/* Non-zero while the current charset is UTF-8; refreshed by charset_onchange(). */
int     Charset_is_utf8    = 0;
/* Substitute character: charset_onchange() sets it to U+FFFD for UTF-8, '?' otherwise. */
wchar_t CharsetReplacement = '?';

/* Regex lists filled by the charset_hook and iconv_hook package commands. */
static rx_t *charset_hooks = NULL;
static rx_t *iconv_hooks   = NULL;
  46
  47 static char *charset_init(void)
  48 {
  49     const char *res = "iso-8859-1";
  50 #ifdef HAVE_LANGINFO_H
  51     char buff[STRING];
  52     char buff2[STRING];
  53
  54     m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET));
  55     charset_canonicalize(buff2, sizeof(buff2), buff);
  56
  57     /* finally, set $charset */
  58     if (!m_strisempty(buff2)) {
  59         res = buff2;
  60     }
  61 #endif
  62     bind_textdomain_codeset(PACKAGE, res);
  63     return m_strdup(res);
  64 }
  65
  66 static void charset_onchange(const char *cset)
  67 {
  68     Charset_is_utf8    = charset_is_utf8(cset);
  69     CharsetReplacement = Charset_is_utf8 ? 0xfffd : '?';
  70 }
  71
/*
 * mod_cset: package declaring the charset-related variables
 * (assumed_charset, charset, file_charset, send_charset) and the
 * charset_hook / iconv_hook commands, which set the alias as template on
 * the regex and append it to charset_hooks / iconv_hooks respectively.
 */
@package mod_cset {
    /*
     ** .pp
     ** This variable is a colon-separated list of character encoding
     ** schemes for messages without character encoding indication.
     ** Header field values and message body content without character encoding
     ** indication would be assumed that they are written in one of this list.
     ** By default, all the header fields and message body without any charset
     ** indication are assumed to be in \fTus-ascii\fP.
     ** .pp
     ** For example, Japanese users might prefer this:
     ** .pp
     ** \fTset assumed_charset="iso-2022-jp:euc-jp:shift_jis:utf-8"\fP
     ** .pp
     ** However, only the first content is valid for the message body.
     ** This variable is valid only if $$strict_mime is unset.
     */
    string_t assumed_charset = m_strdup("us-ascii");

    /*
     ** .pp
     ** Character set your terminal uses to display and enter textual data.
     */
    string_t charset = {
        .init     = charset_init();
        .onchange = charset_onchange($$);
    };

    /*
     ** .pp
     ** This variable is a colon-separated list of character encoding
     ** schemes for text file attachments.
     ** If \fIunset\fP, $$charset value will be used instead.
     ** For example, the following configuration would work for Japanese
     ** text handling:
     ** .pp
     ** \fTset file_charset="iso-2022-jp:euc-jp:shift_jis:utf-8"\fP
     ** .pp
     ** Note: ``\fTiso-2022-*\fP'' must be put at the head of the value as shown above
     ** if included.
     */
    string_t file_charset    = NULL;

    /*
     ** .pp
     ** A list of character sets for outgoing messages. Madmutt will use the
     ** first character set into which the text can be converted exactly.
     ** If your ``$$charset'' is not \fTiso-8859-1\fP and recipients may not
     ** understand \fTUTF-8\fP, it is advisable to include in the list an
     ** appropriate widely used standard character set (such as
     ** \fTiso-8859-2\fP, \fTkoi8-r\fP or \fTiso-2022-jp\fP) either
     ** instead of or after \fTiso-8859-1\fP.
     */
    string_t send_charset    = m_strdup("us-ascii:iso-8859-1:utf-8");

    void charset_hook(rx_t local, const string_t alias) {
        rx_set_template(local, alias);
        rx_list_add2(&charset_hooks, &local);
        RETURN();
    };

    void iconv_hook(rx_t local, const string_t alias) {
        rx_set_template(local, alias);
        rx_list_add2(&iconv_hooks, &local);
        RETURN();
    };
};
 139
 140 /****************************************************************************/
 141 /* charset functions                                                        */
 142 /****************************************************************************/
 143
 144 #include "charset.gperf"
 145 void charset_canonicalize(char *dest, ssize_t dlen, const char *name)
 146 {
 147     const struct cset_pair *cp;
 148     char scratch[STRING];
 149     const char *p;
 150     int i = 0;
 151
 152     if (!name) {
 153         m_strcpy(dest, dlen, "us-ascii");
 154         return;
 155     }
 156
 157     // canonize name: only keep a-z0-9 and dots, put into lowercase
 158     for (p = name; *p && *p != ':' && i < ssizeof(scratch) - 1; p++) {
 159         if (isalnum(*p) || *p== '.') {
 160             scratch[i++] = tolower((unsigned char)*p);
 161         }
 162     }
 163     scratch[i] = '\0';
 164
 165     cp = charset_canonicalize_aux(scratch, strlen(scratch));
 166     if (cp) {
 167         m_strcpy(dest, dlen, cp->pref);
 168     } else {
 169         m_strcpy(dest, dlen, name);
 170         m_strtolower(dest);
 171     }
 172 }
 173
 174 /* XXX: MC: UGLY return of local static */
 175 const char *charset_getfirst(const char *charset)
 176 {
 177     static char fcharset[STRING];
 178     const char *p;
 179
 180     if (m_strisempty(charset))
 181         return "us-ascii";
 182
 183     p = m_strchrnul(charset, ':');
 184     m_strncpy(fcharset, sizeof(fcharset), charset, p - charset);
 185     return fcharset;
 186 }
 187
 188 int charset_is_utf8(const char *s)
 189 {
 190     char buf[STRING];
 191     charset_canonicalize(buf, sizeof(buf), s);
 192     return !m_strcmp(buf, "utf-8");
 193 }
 194
 195 int charset_is_us_ascii(const char *s)
 196 {
 197     char buf[STRING];
 198     charset_canonicalize(buf, sizeof(buf), s);
 199     return !m_strcmp(buf, "us-ascii");
 200 }
 201
 202
 203 /****************************************************************************/
/* iconv-like functions                                                     */
 205 /****************************************************************************/
 206
 207 /* Like iconv_open, but canonicalises the charsets */
 208 iconv_t mutt_iconv_open(const char *tocode, const char *fromcode, int flags)
 209 {
 210     char to1[STRING],   to2[STRING];
 211     char from1[STRING], from2[STRING];
 212     char tmp[STRING];
 213     iconv_t cd;
 214
 215     if ((flags & M_ICONV_HOOK_TO)
 216     &&  rx_list_match2(charset_hooks, tocode, tmp, sizeof(tmp))) {
 217         charset_canonicalize(to1, sizeof(to1), tmp);
 218     } else {
 219         charset_canonicalize(to1, sizeof(to1), tocode);
 220     }
 221
 222     if ((flags & M_ICONV_HOOK_FROM)
 223     &&  rx_list_match2(charset_hooks, fromcode, tmp, sizeof(tmp))) {
 224         charset_canonicalize(from1, sizeof(from1), tmp);
 225     } else {
 226         charset_canonicalize(from1, sizeof(from1), fromcode);
 227     }
 228
 229     if ((cd = iconv_open(to1, from1)) != MUTT_ICONV_ERROR)
 230         return cd;
 231
 232     if (rx_list_match2(iconv_hooks, to1, to2, sizeof(to2))
 233     &&  rx_list_match2(iconv_hooks, from1, from2, sizeof(from2)))
 234         return iconv_open(to2, from2);
 235
 236     return MUTT_ICONV_ERROR;
 237 }
 238
 239
 240 /* Like iconv, but keeps going even when the input is invalid
 241    If you're supplying inrepls, the source charset should be stateless;
 242    if you're supplying an outrepl, the target charset should be.  */
 243 /* XXX: MC: I do not understand what it does yet */
 244 ssize_t mutt_iconv(iconv_t cd,
 245                    const char **inbuf, ssize_t *inbytesleft,
 246                    char **outbuf, ssize_t *outbytesleft,
 247                    const char **inrepls, const char *outrepl)
 248 {
 249     ssize_t ret = 0, ret1;
 250     const char *ib = *inbuf;
 251     ssize_t ibl = *inbytesleft;
 252     char *ob = *outbuf;
 253     ssize_t obl = *outbytesleft;
 254
 255     for (;;) {
 256         ret1 = my_iconv(cd, &ib, &ibl, &ob, &obl);
 257         if (ret1 != -1)
 258             ret += ret1;
 259
 260         if (ibl && obl && errno == EILSEQ) {
 261             if (inrepls) {
 262                 /* Try replacing the input */
 263                 const char **t;
 264
 265                 for (t = inrepls; *t; t++) {
 266                     const char *ib1 = *t;
 267                     ssize_t ibl1 = m_strlen(*t);
 268                     char *ob1 = ob;
 269                     ssize_t obl1 = obl;
 270
 271                     my_iconv(cd, &ib1, &ibl1, &ob1, &obl1);
 272                     if (!ibl1) {
 273                         ++ib, --ibl;
 274                         ob = ob1, obl = obl1;
 275                         ++ret;
 276                         break;
 277                     }
 278                 }
 279                 if (*t)
 280                     continue;
 281             }
 282             /* Replace the output */
 283             if (!outrepl)
 284                 outrepl = "?";
 285             my_iconv(cd, 0, 0, &ob, &obl);
 286             if (obl) {
 287                 ssize_t n = m_strlen(outrepl);
 288
 289                 if (n > obl) {
 290                     outrepl = "?";
 291                     n = 1;
 292                 }
 293                 memcpy(ob, outrepl, n);
 294                 ++ib, --ibl;
 295                 ob += n, obl -= n;
 296                 ++ret;
 297                 my_iconv(cd, 0, 0, 0, 0); /* for good measure */
 298                 continue;
 299             }
 300         }
 301         *inbuf = ib, *inbytesleft = ibl;
 302         *outbuf = ob, *outbytesleft = obl;
 303         return ret;
 304     }
 305 }
 306
 307 /* Convert a string */
 308 int
 309 mutt_convert_string(char **ps, const char *from, const char *to, int flags)
 310 {
 311     iconv_t cd;
 312     const char *repls[] = { "\357\277\275", "?", 0 };
 313
 314     if (m_strisempty(*ps))
 315         return 0;
 316
 317     cd = mutt_iconv_open(to, from, flags);
 318     if (cd != MUTT_ICONV_ERROR) {
 319         const char **inrepls = NULL;
 320         const char *outrepl = NULL;
 321         const char *ib;
 322         char *buf, *ob;
 323         ssize_t ibl, obl;
 324
 325         if (charset_is_utf8(to))
 326             outrepl = "\357\277\275";
 327         else
 328         if (charset_is_utf8(from))
 329             inrepls = repls;
 330         else
 331             outrepl = "?";
 332
 333         ibl = m_strlen(*ps) + 1;
 334         ib  = *ps;
 335
 336         obl = MB_LEN_MAX * ibl;
 337         ob  = buf = p_new(char, obl + 1);
 338
 339         mutt_iconv(cd, &ib, &ibl, &ob, &obl, inrepls, outrepl);
 340         iconv_close(cd);
 341
 342         *ob = '\0';
 343
 344         p_delete(ps);
 345         *ps = buf;
 346         return 0;
 347     }
 348
 349     return -1;
 350 }
 351
 352 static ssize_t convert_string(const char *f, ssize_t flen,
 353                               const char *from, const char *to,
 354                               char **t, ssize_t * tlen)
 355 {
 356     iconv_t cd;
 357     char *buf, *ob;
 358     ssize_t obl;
 359     ssize_t n;
 360     int e;
 361
 362     if ((cd = mutt_iconv_open(to, from, 0)) == MUTT_ICONV_ERROR)
 363         return -1;
 364
 365     obl = 4 * flen + 1;
 366     ob  = buf = p_new(char, obl);
 367     n   = my_iconv(cd, &f, &flen, &ob, &obl);
 368
 369     if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) {
 370         e = errno;
 371         p_delete(&buf);
 372         iconv_close(cd);
 373         errno = e;
 374         return -1;
 375     }
 376
 377     *ob   = '\0';
 378     *tlen = ob - buf;
 379     *t    = buf;
 380     iconv_close(cd);
 381     return n;
 382 }
 383
 384 int mutt_convert_nonmime_string(char **ps)
 385 {
 386     const char *p = mod_cset.assumed_charset;
 387     ssize_t ulen = m_strlen(*ps);
 388     char *u = *ps;
 389
 390     while (*p) {
 391         const char *q;
 392         char fromcode[LONG_STRING], *s = NULL;
 393         ssize_t slen;
 394
 395         if (!ulen)
 396             return 0;
 397
 398         while (*p == ':')
 399             p++;
 400
 401         q = m_strchrnul(p, ':');
 402         m_strncpy(fromcode, sizeof(fromcode), p, q - p);
 403         p = q;
 404
 405         if (convert_string(u, ulen, fromcode, mod_cset.charset, &s, &slen) >= 0) {
 406             p_delete(ps);
 407             *ps = s;
 408             return 0;
 409         }
 410     }
 411
 412     return -1;
 413 }
 414
 415 /****************************************************************************/
 416 /* fgetconv functions                                                       */
 417 /****************************************************************************/
 418
 419 /* fgetconv_t stuff for converting a file while reading it
 420    Used in sendlib.c for converting from mutt's charset */
 421
/* State of a stream that is converted between charsets while being read. */
struct fgetconv_t {
    FILE *file;             /* stream the raw bytes are read from */
    iconv_t cd;             /* converter; MUTT_ICONV_ERROR means pass-through */
    char bufi[BUFSIZ];      /* raw input read from `file` */
    char bufo[BUFSIZ];      /* converted output waiting to be handed out */
    char *p;                /* next byte of bufo to return; NULL once EOF was hit */
    char *ob;               /* end of the converted data in bufo */
    char *ib;               /* start of the not yet converted input in bufi */
    ssize_t ibl;            /* number of not yet converted input bytes */
    const char **inrepls;   /* input replacements handed to mutt_iconv() */
};
 433
 434 fgetconv_t *
 435 fgetconv_open(FILE *file, const char *from, const char *to, int flags)
 436 {
 437     static const char *repls[] = { "\357\277\275", "?", 0 };
 438
 439     struct fgetconv_t *fc = p_new(struct fgetconv_t, 1);
 440
 441     fc->file = file;
 442     fc->cd   = MUTT_ICONV_ERROR;
 443     if (from && to)
 444         fc->cd = mutt_iconv_open(to, from, flags);
 445
 446     if (fc->cd != MUTT_ICONV_ERROR) {
 447         fc->p  = fc->ob = fc->bufo;
 448         fc->ib = fc->bufi;
 449         fc->ibl = 0;
 450         fc->inrepls = repls + charset_is_utf8(to);
 451     }
 452     return fc;
 453 }
 454
 455 void fgetconv_close(fgetconv_t **fcp)
 456 {
 457     struct fgetconv_t *fc = *fcp;
 458
 459     if (fc->cd != MUTT_ICONV_ERROR)
 460         iconv_close (fc->cd);
 461     p_delete(fcp);
 462 }
 463
 464
 465 int fgetconv(fgetconv_t *fc)
 466 {
 467     if (!fc)
 468         return EOF;
 469
 470     if (fc->cd == MUTT_ICONV_ERROR)
 471         return fgetc(fc->file);
 472
 473     if (!fc->p)
 474         return EOF;
 475     if (fc->p < fc->ob)
 476         return (unsigned char)*(fc->p)++;
 477
 478     /* Try to convert some more */
 479     fc->p = fc->ob = fc->bufo;
 480     if (fc->ibl) {
 481         ssize_t obl = ssizeof(fc->bufo);
 482
 483         my_iconv(fc->cd, (const char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
 484         if (fc->p < fc->ob)
 485             return (unsigned char)*(fc->p)++;
 486     }
 487
 488     /* If we trusted iconv a bit more, we would at this point
 489      * ask why it had stopped converting ... */
 490
 491     /* Try to read some more */
 492     if (fc->ibl == sizeof(fc->bufi)
 493     || (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))) {
 494         fc->p = NULL;
 495         return EOF;
 496     }
 497
 498     if (fc->ibl) {
 499         memcpy(fc->bufi, fc->ib, fc->ibl);
 500     }
 501     fc->ib = fc->bufi;
 502     fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl,
 503                      fc->file);
 504
 505     /* Try harder this time to convert some */
 506     if (fc->ibl) {
 507         ssize_t obl = ssizeof(fc->bufo);
 508
 509         mutt_iconv(fc->cd, (const char **)&fc->ib, &fc->ibl, &fc->ob, &obl,
 510                    fc->inrepls, 0);
 511         if (fc->p < fc->ob) {
 512             return (unsigned char)*(fc->p)++;
 513         }
 514     }
 515
 516     /* Either the file has finished or one of the buffers is too small */
 517     fc->p = NULL;
 518     return EOF;
 519 }
 520
 521 char *fgetconvs(char *buf, ssize_t len, fgetconv_t *fc)
 522 {
 523     ssize_t pos = 0;
 524
 525     while (pos < len - 1) {
 526         int c = fgetconv(fc);
 527         if (c == EOF)
 528             break;
 529
 530         buf[pos++] = c;
 531         if (c == '\n')
 532             break;
 533     }
 534     buf[pos] = '\0';
 535
 536     return pos ? buf : NULL;
 537 }
 538
 539 /* vim:set ft=c: */