charset.cpkg

   1 /*
   2  *  This program is free software; you can redistribute it and/or modify
   3  *  it under the terms of the GNU General Public License as published by
   4  *  the Free Software Foundation; either version 2 of the License, or (at
   5  *  your option) any later version.
   6  *
   7  *  This program is distributed in the hope that it will be useful, but
   8  *  WITHOUT ANY WARRANTY; without even the implied warranty of
   9  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  10  *  General Public License for more details.
  11  *
  12  *  You should have received a copy of the GNU General Public License
  13  *  along with this program; if not, write to the Free Software
  14  *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
  15  *  MA 02110-1301, USA.
  16  *
  17  *  Copyright © 2006 Pierre Habouzit
  18  */
  19 /*
  20  * Copyright notice from original mutt:
  21  * Copyright (C) 1999-2000 Thomas Roessler <roessler@does-not-exist.org>
  22  *
  23  * This file is part of mutt-ng, see http://www.muttng.org/.
  24  * It's licensed under the GNU General Public License,
  25  * please see the file GPL in the top level source directory.
  26  */
  27
  28 #include <lib-lib/lib-lib.h>
  29
  30 #ifdef HAVE_LANGINFO_H
  31 #  include <langinfo.h>
  32 #endif
  33
  34 #include "charset.h"
  35
  36 #ifndef EILSEQ
  37 #  define EILSEQ EINVAL
  38 #endif
  39 @import "lib-lua/base.cpkg"
  40
  41 int     Charset_is_utf8    = 0;
  42 wchar_t CharsetReplacement = '?';
  43
  44 static rx_t *charset_hooks = NULL;
  45 static rx_t *iconv_hooks   = NULL;
  46
  47 static char *charset_init(void)
  48 {
  49     const char *res = "iso-8859-1";
  50 #ifdef HAVE_LANGINFO_H
  51     char buff[STRING];
  52     char buff2[STRING];
  53
  54     m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET));
  55     charset_canonicalize(buff2, sizeof(buff2), buff);
  56
  57     /* finally, set $charset */
  58     if (!m_strisempty(buff2)) {
  59         res = buff2;
  60     }
  61 #endif
  62     bind_textdomain_codeset(PACKAGE, res);
  63     return m_strdup(res);
  64 }
  65
  66 static void charset_onchange(const char *cset)
  67 {
  68     Charset_is_utf8    = charset_is_utf8(cset);
  69     CharsetReplacement = Charset_is_utf8 ? 0xfffd : '?';
  70 }
  71
   /*
    * Package "mod_cset": the character-set related settings
    * (assumed_charset, charset, file_charset, send_charset) and the two
    * hook registration functions (charset_hook, iconv_hook).
    */
   @package mod_cset {
       /*
        ** .pp
        ** This variable is a colon-separated list of character encoding
        ** schemes for messages without character encoding indication.
        ** Header field values and message body content without character encoding
        ** indication are assumed to be written in one of the encodings of this list.
        ** By default, all the header fields and message body without any charset
        ** indication are assumed to be in \fTus-ascii\fP.
        ** .pp
        ** For example, Japanese users might prefer this:
        ** .pp
        ** \fTset assumed_charset="iso-2022-jp:euc-jp:shift_jis:utf-8"\fP
        ** .pp
        ** However, only the first content is valid for the message body.
        ** This variable is valid only if $$strict_mime is unset.
        */
       string_t assumed_charset = m_strdup("us-ascii");

       /*
        ** .pp
        ** Character set your terminal uses to display and enter textual data.
        */
       string_t charset = {
           .init     = charset_init();
           .onchange = charset_onchange($$);
       };

       /*
        ** .pp
        ** This variable is a colon-separated list of character encoding
        ** schemes for text file attachments.
        ** If \fIunset\fP, $$charset value will be used instead.
        ** For example, the following configuration would work for Japanese
        ** text handling:
        ** .pp
        ** \fTset file_charset="iso-2022-jp:euc-jp:shift_jis:utf-8"\fP
        ** .pp
        ** Note: ``\fTiso-2022-*\fP'' must be put at the head of the value as shown above
        ** if included.
        */
       string_t file_charset    = NULL;

       /*
        ** .pp
        ** A list of character sets for outgoing messages. Madmutt will use the
        ** first character set into which the text can be converted exactly.
        ** If your ``$$charset'' is not \fTiso-8859-1\fP and recipients may not
        ** understand \fTUTF-8\fP, it is advisable to include in the list an
        ** appropriate widely used standard character set (such as
        ** \fTiso-8859-2\fP, \fTkoi8-r\fP or \fTiso-2022-jp\fP) either
        ** instead of or after \fTiso-8859-1\fP.
        */
       string_t send_charset    = m_strdup("us-ascii:iso-8859-1:utf-8");

       void charset_hook(rx_t local, const string_t alias) {
           /* attach the alias as template of the regex, then append it to
            * the charset_hooks list read by mutt_iconv_open() */
           rx_set_template(local, alias);
           rx_list_add2(&charset_hooks, &local);
           RETURN();
       };

       void iconv_hook(rx_t local, const string_t alias) {
           /* same as charset_hook, but feeds the iconv_hooks list that
            * mutt_iconv_open() falls back to when iconv_open() fails */
           rx_set_template(local, alias);
           rx_list_add2(&iconv_hooks, &local);
           RETURN();
       };
   };
 139
 140 /****************************************************************************/
 141 /* charset functions                                                        */
 142 /****************************************************************************/
 143
 144 #include "charset.gperf"
 145 void charset_canonicalize(char *dest, ssize_t dlen, const char *name)
 146 {
 147     const struct cset_pair *cp;
 148     char scratch[STRING];
 149     const char *p;
 150     int i = 0;
 151
 152     if (!name) {
 153         m_strcpy(dest, dlen, "us-ascii");
 154         return;
 155     }
 156
 157     // canonize name: only keep a-z0-9 and dots, put into lowercase
 158     for (p = name; *p && *p != ':' && i < ssizeof(scratch) - 1; p++) {
 159         if (isalnum(*p) || *p== '.') {
 160             scratch[i++] = tolower((unsigned char)*p);
 161         }
 162     }
 163     scratch[i] = '\0';
 164
 165     cp = charset_canonicalize_aux(scratch, strlen(scratch));
 166     if (cp) {
 167         m_strcpy(dest, dlen, cp->pref);
 168     } else {
 169         m_strcpy(dest, dlen, name);
 170         m_strtolower(dest);
 171     }
 172 }
 173
 174 /* XXX: MC: UGLY return of local static */
 175 const char *charset_getfirst(const char *charset)
 176 {
 177     static char fcharset[STRING];
 178     const char *p;
 179
 180     if (m_strisempty(charset))
 181         return "us-ascii";
 182
 183     p = m_strchrnul(charset, ':');
 184     m_strncpy(fcharset, sizeof(fcharset), charset, p - charset);
 185     return fcharset;
 186 }
 187
 188 int charset_is_utf8(const char *s)
 189 {
 190     char buf[STRING];
 191     charset_canonicalize(buf, sizeof(buf), s);
 192     return !m_strcmp(buf, "utf-8");
 193 }
 194
 195 int charset_is_us_ascii(const char *s)
 196 {
 197     char buf[STRING];
 198     charset_canonicalize(buf, sizeof(buf), s);
 199     return !m_strcmp(buf, "us-ascii");
 200 }
 201
 202
 203 /****************************************************************************/
  /* iconv-like functions                                                     */
 205 /****************************************************************************/
 206
 207 /* Like iconv_open, but canonicalises the charsets */
 208 iconv_t mutt_iconv_open(const char *tocode, const char *fromcode, int flags)
 209 {
 210     char to1[STRING],   to2[STRING];
 211     char from1[STRING], from2[STRING];
 212     char tmp[STRING];
 213     iconv_t cd;
 214
 215     if ((flags & M_ICONV_HOOK_TO)
 216     &&  rx_list_match2(charset_hooks, tocode, tmp, sizeof(tmp))) {
 217         charset_canonicalize(to1, sizeof(to1), tmp);
 218     } else {
 219         charset_canonicalize(to1, sizeof(to1), tocode);
 220     }
 221
 222     if ((flags & M_ICONV_HOOK_FROM)
 223     &&  rx_list_match2(charset_hooks, fromcode, tmp, sizeof(tmp))) {
 224         charset_canonicalize(from1, sizeof(from1), tmp);
 225     } else {
 226         charset_canonicalize(from1, sizeof(from1), fromcode);
 227     }
 228
 229     m_strcat(to1, sizeof(to1), "//TRANSLIT");
 230     if ((cd = iconv_open(to1, from1)) != MUTT_ICONV_ERROR)
 231         return cd;
 232
 233     if (rx_list_match2(iconv_hooks, to1, to2, sizeof(to2))
 234     &&  rx_list_match2(iconv_hooks, from1, from2, sizeof(from2))) {
 235         m_strcat(to2, sizeof(to2), "//TRANSLIT");
 236         return iconv_open(to2, from2);
 237     }
 238
 239     return MUTT_ICONV_ERROR;
 240 }
 241
 242
 243 /* Convert a string */
 244 int
 245 mutt_convert_string(char **ps, const char *from, const char *to, int flags)
 246 {
 247     iconv_t cd;
 248     if (m_strisempty(*ps))
 249         return 0;
 250
 251     cd = mutt_iconv_open(to, from, flags);
 252     if (cd != MUTT_ICONV_ERROR) {
 253         const char *ib;
 254         char *buf, *ob;
 255         ssize_t ibl, obl;
 256
 257         ibl = m_strlen(*ps) + 1;
 258         ib  = *ps;
 259
 260         obl = MB_LEN_MAX * ibl;
 261         ob  = buf = p_new(char, obl + 1);
 262
 263         mutt_iconv(cd, &ib, &ibl, &ob, &obl);
 264         iconv_close(cd);
 265
 266         *ob = '\0';
 267
 268         p_delete(ps);
 269         *ps = buf;
 270         return 0;
 271     }
 272
 273     return -1;
 274 }
 275
 276 static ssize_t convert_string(const char *f, ssize_t flen,
 277                               const char *from, const char *to,
 278                               char **t, ssize_t * tlen)
 279 {
 280     iconv_t cd;
 281     char *buf, *ob;
 282     ssize_t obl;
 283     ssize_t n;
 284     int e;
 285
 286     if ((cd = mutt_iconv_open(to, from, 0)) == MUTT_ICONV_ERROR)
 287         return -1;
 288
 289     obl = 4 * flen + 1;
 290     ob  = buf = p_new(char, obl);
 291     n   = mutt_iconv(cd, &f, &flen, &ob, &obl);
 292
 293     if (n < 0 || mutt_iconv(cd, 0, 0, &ob, &obl) < 0) {
 294         e = errno;
 295         p_delete(&buf);
 296         iconv_close(cd);
 297         errno = e;
 298         return -1;
 299     }
 300
 301     *ob   = '\0';
 302     *tlen = ob - buf;
 303     *t    = buf;
 304     iconv_close(cd);
 305     return n;
 306 }
 307
 308 int mutt_convert_nonmime_string(char **ps)
 309 {
 310     const char *p = mod_cset.assumed_charset;
 311     ssize_t ulen = m_strlen(*ps);
 312     char *u = *ps;
 313
 314     while (*p) {
 315         const char *q;
 316         char fromcode[LONG_STRING], *s = NULL;
 317         ssize_t slen;
 318
 319         if (!ulen)
 320             return 0;
 321
 322         while (*p == ':')
 323             p++;
 324
 325         q = m_strchrnul(p, ':');
 326         m_strncpy(fromcode, sizeof(fromcode), p, q - p);
 327         p = q;
 328
 329         if (convert_string(u, ulen, fromcode, mod_cset.charset, &s, &slen) >= 0) {
 330             p_delete(ps);
 331             *ps = s;
 332             return 0;
 333         }
 334     }
 335
 336     return -1;
 337 }
 338
 339 /****************************************************************************/
 340 /* fgetconv functions                                                       */
 341 /****************************************************************************/
 342
  /* fgetconv_t stuff for converting a file while reading it
     Used in sendlib.c for converting from mutt's charset */

  /* State of one converting reader; created by fgetconv_open(), consumed by
   * fgetconv() and released by fgetconv_close(). */
  struct fgetconv_t {
      FILE *file;         /* stream the raw bytes are read from */
      iconv_t cd;         /* converter, or MUTT_ICONV_ERROR for pass-through */
      char bufi[BUFSIZ];  /* raw input read from file */
      char bufo[BUFSIZ];  /* converted output waiting to be handed out */
      char *p;            /* next byte of bufo to return; NULL once finished */
      char *ob;           /* end of the converted data in bufo */
      char *ib;           /* start of the not yet converted input in bufi */
      ssize_t ibl;        /* number of not yet converted bytes at ib */
  };
 356
 357 fgetconv_t *
 358 fgetconv_open(FILE *file, const char *from, const char *to, int flags)
 359 {
 360     struct fgetconv_t *fc = p_new(struct fgetconv_t, 1);
 361
 362     fc->file = file;
 363     fc->cd   = MUTT_ICONV_ERROR;
 364     if (from && to)
 365         fc->cd = mutt_iconv_open(to, from, flags);
 366
 367     if (fc->cd != MUTT_ICONV_ERROR) {
 368         fc->p  = fc->ob = fc->bufo;
 369         fc->ib = fc->bufi;
 370         fc->ibl = 0;
 371     }
 372     return fc;
 373 }
 374
 375 void fgetconv_close(fgetconv_t **fcp)
 376 {
 377     struct fgetconv_t *fc = *fcp;
 378
 379     if (fc->cd != MUTT_ICONV_ERROR)
 380         iconv_close(fc->cd);
 381     p_delete(fcp);
 382 }
 383
 384
 385 int fgetconv(fgetconv_t *fc)
 386 {
 387     if (!fc)
 388         return EOF;
 389
 390     if (fc->cd == MUTT_ICONV_ERROR)
 391         return fgetc(fc->file);
 392
 393     if (!fc->p)
 394         return EOF;
 395     if (fc->p < fc->ob)
 396         return (unsigned char)*(fc->p)++;
 397
 398     /* Try to convert some more */
 399     fc->p = fc->ob = fc->bufo;
 400     if (fc->ibl) {
 401         ssize_t obl = ssizeof(fc->bufo);
 402
 403         mutt_iconv(fc->cd, (const char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
 404         if (fc->p < fc->ob)
 405             return (unsigned char)*(fc->p)++;
 406     }
 407
 408     /* If we trusted iconv a bit more, we would at this point
 409      * ask why it had stopped converting ... */
 410
 411     /* Try to read some more */
 412     if (fc->ibl == sizeof(fc->bufi)
 413     || (fc->ibl && fc->ib + fc->ibl < fc->bufi + sizeof(fc->bufi))) {
 414         fc->p = NULL;
 415         return EOF;
 416     }
 417
 418     if (fc->ibl) {
 419         memcpy(fc->bufi, fc->ib, fc->ibl);
 420     }
 421     fc->ib = fc->bufi;
 422     fc->ibl += fread(fc->ib + fc->ibl, 1, sizeof(fc->bufi) - fc->ibl,
 423                      fc->file);
 424
 425     /* Try harder this time to convert some */
 426     if (fc->ibl) {
 427         ssize_t obl = ssizeof(fc->bufo);
 428
 429         mutt_iconv(fc->cd, (const char **)&fc->ib, &fc->ibl, &fc->ob, &obl);
 430         if (fc->p < fc->ob) {
 431             return (unsigned char)*(fc->p)++;
 432         }
 433     }
 434
 435     /* Either the file has finished or one of the buffers is too small */
 436     fc->p = NULL;
 437     return EOF;
 438 }
 439
 440 char *fgetconvs(char *buf, ssize_t len, fgetconv_t *fc)
 441 {
 442     ssize_t pos = 0;
 443
 444     while (pos < len - 1) {
 445         int c = fgetconv(fc);
 446         if (c == EOF)
 447             break;
 448
 449         buf[pos++] = c;
 450         if (c == '\n')
 451             break;
 452     }
 453     buf[pos] = '\0';
 454
 455     return pos ? buf : NULL;
 456 }
 457
 458 /* vim:set ft=c: */