mbyte.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 /*
  11  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  12  */
  13
  14 #if HAVE_CONFIG_H
  15 # include "config.h"
  16 #endif
  17
  18 #include "mutt.h"
  19 #include "mbyte.h"
  20 #include "charset.h"
  21
  22 #include "lib/intl.h"
  23
  24 #include <errno.h>
  25
  26 #include <ctype.h>
  27
  28 #ifndef EILSEQ
  29 #define EILSEQ EINVAL
  30 #endif
  31
/* Non-zero while the canonical name of the current charset is "utf-8";
 * updated by mutt_set_charset(). */
int Charset_is_utf8 = 0;

#ifndef HAVE_WC_FUNCS
/* Non-zero while the current charset is one of the Japanese encodings
 * recognised by mutt_set_charset(). */
static int charset_is_ja = 0;
/* iconv descriptors between the current charset and UTF-8, opened by
 * mutt_set_charset() for Japanese charsets; (iconv_t)(-1) means "not open". */
static iconv_t charset_to_utf8 = (iconv_t) (-1);
static iconv_t charset_from_utf8 = (iconv_t) (-1);
#endif
  39
  40 void mutt_set_charset (char *charset)
  41 {
  42   char buffer[STRING];
  43
  44   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  45
  46   Charset_is_utf8 = 0;
  47 #ifndef HAVE_WC_FUNCS
  48   charset_is_ja = 0;
  49   if (charset_to_utf8 != (iconv_t) (-1)) {
  50     iconv_close (charset_to_utf8);
  51     charset_to_utf8 = (iconv_t) (-1);
  52   }
  53   if (charset_from_utf8 != (iconv_t) (-1)) {
  54     iconv_close (charset_from_utf8);
  55     charset_from_utf8 = (iconv_t) (-1);
  56   }
  57 #endif
  58
  59   if (!strcmp (buffer, "utf-8"))
  60     Charset_is_utf8 = 1;
  61 #ifndef HAVE_WC_FUNCS
  62   else if (!ascii_strcasecmp (buffer, "euc-jp")
  63            || !ascii_strcasecmp (buffer, "shift_jis")
  64            || !ascii_strcasecmp (buffer, "cp932")
  65            || !ascii_strcasecmp (buffer, "eucJP-ms")) {
  66     charset_is_ja = 1;
  67     charset_to_utf8 = iconv_open ("UTF-8", charset);
  68     charset_from_utf8 = iconv_open (charset, "UTF-8");
  69   }
  70 #endif
  71
  72 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
  73   bind_textdomain_codeset (PACKAGE, buffer);
  74 #endif
  75 }
  76
  77 #ifndef HAVE_WC_FUNCS
  78
/*
 * For systems that don't have them, we provide here our own
 * implementations of wcrtomb(), mbrtowc(), iswprint(), iswspace(),
 * iswalnum(), towupper(), towlower() and wcwidth().
 * Instead of using the locale, as these functions normally would,
 * we use Mutt's Charset variable. We support 3 types of charset:
 * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
 * (2) For UTF-8, wchar_t uses UCS.
 * (3) For stateless Japanese encodings, we use UCS and convert
 *     via UTF-8 using iconv.
 * Unfortunately, we can't handle stateful encodings.
 */
  90
  91 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
  92 {
  93   char buf[MB_LEN_MAX];
  94   ICONV_CONST char *ib;
  95   char *ob;
  96   size_t ibl, obl, r;
  97
  98   if (s) {
  99     ibl = mutt_wctoutf8 (buf, wc);
 100     if (ibl == (size_t) (-1))
 101       return (size_t) (-1);
 102     ib = buf;
 103     ob = s;
 104     obl = MB_LEN_MAX;
 105     r = iconv (cd, &ib, &ibl, &ob, &obl);
 106   }
 107   else {
 108     ib = "";
 109     ibl = 1;
 110     ob = buf;
 111     obl = sizeof (buf);
 112     r = iconv (cd, &ib, &ibl, &ob, &obl);
 113   }
 114   return ob - s;
 115 }
 116
 117 size_t wcrtomb (char *s, wchar_t wc, mbstate_t * ps)
 118 {
 119   /* We only handle stateless encodings, so we can ignore ps. */
 120
 121   if (Charset_is_utf8)
 122     return mutt_wctoutf8 (s, wc);
 123   else if (charset_from_utf8 != (iconv_t) (-1))
 124     return wcrtomb_iconv (s, wc, charset_from_utf8);
 125   else {
 126     if (!s)
 127       return 1;
 128     if (wc < 0x100) {
 129       *s = wc;
 130       return 1;
 131     }
 132     errno = EILSEQ;
 133     return (size_t) (-1);
 134   }
 135 }
 136
 137 size_t mbrtowc_iconv (wchar_t * pwc, const char *s, size_t n,
 138                       mbstate_t * ps, iconv_t cd)
 139 {
 140   static mbstate_t mbstate;
 141   ICONV_CONST char *ib, *ibmax;
 142   char *ob, *t;
 143   size_t ibl, obl, k, r;
 144   char bufi[8], bufo[6];
 145
 146   if (!n)
 147     return (size_t) (-2);
 148
 149   t = memchr (ps, 0, sizeof (*ps));
 150   k = t ? (t - (char *) ps) : sizeof (*ps);
 151   if (k > sizeof (bufi))
 152     k = 0;
 153   if (k) {
 154     /* use the buffer for input */
 155     memcpy (bufi, ps, k);
 156     ib = bufi;
 157     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 158     memcpy (bufi + k, s, ibmax - bufi - k);
 159   }
 160   else {
 161     /* use the real input */
 162     ib = s;
 163     ibmax = s + n;
 164   }
 165
 166   ob = bufo;
 167   obl = sizeof (bufo);
 168   ibl = 1;
 169
 170   for (;;) {
 171     r = iconv (cd, &ib, &ibl, &ob, &obl);
 172     if (ob > bufo && (!k || ib > bufi + k)) {
 173       /* we have a character */
 174       memset (ps, 0, sizeof (*ps));
 175       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 176       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 177     }
 178     else if (!r || (r == (size_t) (-1) && errno == EINVAL)) {
 179       if (ib + ibl < ibmax)
 180         /* try using more input */
 181         ++ibl;
 182       else if (k && ib > bufi + k && bufi + k + n > ibmax) {
 183         /* switch to using real input */
 184         ib = s + (ib - bufi - k);
 185         ibmax = s + n;
 186         k = 0;
 187         ++ibl;
 188       }
 189       else {
 190         /* save the state and give up */
 191         memset (ps, 0, sizeof (*ps));
 192         if (ibl <= sizeof (mbstate_t))  /* need extra condition here! */
 193           memcpy (ps, ib, ibl);
 194         return (size_t) (-2);
 195       }
 196     }
 197     else {
 198       /* bad input */
 199       errno = EILSEQ;
 200       return (size_t) (-1);
 201     }
 202   }
 203 }
 204
 205 size_t mbrtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * ps)
 206 {
 207   static mbstate_t mbstate;
 208
 209   if (!ps)
 210     ps = &mbstate;
 211
 212   if (Charset_is_utf8)
 213     return utf8rtowc (pwc, s, n, ps);
 214   else if (charset_to_utf8 != (iconv_t) (-1))
 215     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 216   else {
 217     if (!s) {
 218       memset (ps, 0, sizeof (*ps));
 219       return 0;
 220     }
 221     if (!n)
 222       return (size_t) - 2;
 223     if (pwc)
 224       *pwc = (wchar_t) (unsigned char) *s;
 225     return (*s != 0);
 226   }
 227 }
 228
 229 int iswprint (wint_t wc)
 230 {
 231   if (Charset_is_utf8 || charset_is_ja)
 232     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 233   else
 234     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 235 }
 236
 237 int iswspace (wint_t wc)
 238 {
 239   if (Charset_is_utf8 || charset_is_ja)
 240     return (9 <= wc && wc <= 13) || wc == 32;
 241   else
 242     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 243 }
 244
 245 static wint_t towupper_ucs (wint_t x)
 246 {
 247   /* Only works for x < 0x130 */
 248   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 249     return x - 32;
 250   else if (0x100 <= x && x < 0x130)
 251     return x & ~1;
 252   else if (x == 0xb5)
 253     return 0x39c;
 254   else if (x == 0xff)
 255     return 0x178;
 256   else
 257     return x;
 258 }
 259
 260 static wint_t towlower_ucs (wint_t x)
 261 {
 262   /* Only works for x < 0x130 */
 263   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 264     return x + 32;
 265   else if (0x100 <= x && x < 0x130)
 266     return x | 1;
 267   else
 268     return x;
 269 }
 270
 271 static int iswalnum_ucs (wint_t wc)
 272 {
 273   /* Only works for x < 0x220 */
 274   if (wc >= 0x100)
 275     return 1;
 276   else if (wc < 0x30)
 277     return 0;
 278   else if (wc < 0x3a)
 279     return 1;
 280   else if (wc < 0xa0)
 281     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 282   else if (wc < 0xc0)
 283     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 284   else
 285     return !(wc == 0xd7 || wc == 0xf7);
 286 }
 287
 288 wint_t towupper (wint_t wc)
 289 {
 290   if (Charset_is_utf8 || charset_is_ja)
 291     return towupper_ucs (wc);
 292   else
 293     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 294 }
 295
 296 wint_t towlower (wint_t wc)
 297 {
 298   if (Charset_is_utf8 || charset_is_ja)
 299     return towlower_ucs (wc);
 300   else
 301     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 302 }
 303
 304 int iswalnum (wint_t wc)
 305 {
 306   if (Charset_is_utf8 || charset_is_ja)
 307     return iswalnum_ucs (wc);
 308   else
 309     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 310 }
 311
 312 /*
 313  * l10n for Japanese:
 314  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 315  *   Character Set, have a column width of 2.
 316  */
 317 int wcwidth_ja (wchar_t ucs)
 318 {
 319   if (ucs >= 0x3021)
 320     return -1;                  /* continue with the normal check */
 321   /* a rough range for quick check */
 322   if ((ucs >= 0x00a1 && ucs <= 0x00fe) ||       /* Latin-1 Supplement */
 323       (ucs >= 0x0391 && ucs <= 0x0451) ||       /* Greek and Cyrillic */
 324       (ucs >= 0x2010 && ucs <= 0x266f) ||       /* Symbols */
 325       (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */
 326     return 2;
 327   else
 328     return -1;
 329 }
 330
 331 int wcwidth_ucs (wchar_t ucs);
 332
 333 int wcwidth (wchar_t wc)
 334 {
 335   if (!Charset_is_utf8) {
 336     if (!charset_is_ja) {
 337       /* 8-bit case */
 338       if (!wc)
 339         return 0;
 340       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 341         return 1;
 342       else
 343         return -1;
 344     }
 345     else {
 346       /* Japanese */
 347       int k = wcwidth_ja (wc);
 348
 349       if (k != -1)
 350         return k;
 351     }
 352   }
 353   return wcwidth_ucs (wc);
 354 }
 355
 356 size_t utf8rtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * _ps)
 357 {
 358   static wchar_t mbstate;
 359   wchar_t *ps = (wchar_t *) _ps;
 360   size_t k = 1;
 361   unsigned char c;
 362   wchar_t wc;
 363   int count;
 364
 365   if (!ps)
 366     ps = &mbstate;
 367
 368   if (!s) {
 369     *ps = 0;
 370     return 0;
 371   }
 372   if (!n)
 373     return (size_t) - 2;
 374
 375   if (!*ps) {
 376     c = (unsigned char) *s;
 377     if (c < 0x80) {
 378       if (pwc)
 379         *pwc = c;
 380       return (c != 0);
 381     }
 382     else if (c < 0xc2) {
 383       errno = EILSEQ;
 384       return (size_t) - 1;
 385     }
 386     else if (c < 0xe0)
 387       wc = ((c & 0x1f) << 6) + (count = 0);
 388     else if (c < 0xf0)
 389       wc = ((c & 0x0f) << 12) + (count = 1);
 390     else if (c < 0xf8)
 391       wc = ((c & 0x07) << 18) + (count = 2);
 392     else if (c < 0xfc)
 393       wc = ((c & 0x03) << 24) + (count = 3);
 394     else if (c < 0xfe)
 395       wc = ((c & 0x01) << 30) + (count = 4);
 396     else {
 397       errno = EILSEQ;
 398       return (size_t) - 1;
 399     }
 400     ++s, --n, ++k;
 401   }
 402   else {
 403     wc = *ps & 0x7fffffff;
 404     count = wc & 7;             /* if count > 4 it will be caught below */
 405   }
 406
 407   for (; n; ++s, --n, ++k) {
 408     c = (unsigned char) *s;
 409     if (0x80 <= c && c < 0xc0) {
 410       wc |= (c & 0x3f) << (6 * count);
 411       if (!count) {
 412         if (pwc)
 413           *pwc = wc;
 414         *ps = 0;
 415         return wc ? k : 0;
 416       }
 417       --count, --wc;
 418       if (!(wc >> (11 + count * 5))) {
 419         errno = count < 4 ? EILSEQ : EINVAL;
 420         return (size_t) - 1;
 421       }
 422     }
 423     else {
 424       errno = EILSEQ;
 425       return (size_t) - 1;
 426     }
 427   }
 428   *ps = wc;
 429   return (size_t) - 2;
 430 }
 431
 432 #endif /* !HAVE_WC_FUNCS */
 433
 434 wchar_t replacement_char ()
 435 {
 436   return Charset_is_utf8 ? 0xfffd : '?';
 437 }