mbyte.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 /*
  11  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  12  */
  13
  14 #if HAVE_CONFIG_H
  15 # include "config.h"
  16 #endif
  17
  18 #include <lib-lib/macros.h>
  19
  20 #include "mutt.h"
  21 #include "mbyte.h"
  22 #include "charset.h"
  23
  24
  25 #include <errno.h>
  26
  27 #include <ctype.h>
  28
  29 #ifndef EILSEQ
  30 #define EILSEQ EINVAL
  31 #endif
  32
  33 int Charset_is_utf8 = 0;
  34
  35 #ifndef HAVE_WC_FUNCS
  36 static int charset_is_ja = 0;
  37 static iconv_t charset_to_utf8 = (iconv_t) (-1);
  38 static iconv_t charset_from_utf8 = (iconv_t) (-1);
  39 #endif
  40
  41 void mutt_set_charset (char *charset)
  42 {
  43   char buffer[STRING];
  44
  45   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  46
  47   Charset_is_utf8 = 0;
  48 #ifndef HAVE_WC_FUNCS
  49   charset_is_ja = 0;
  50   if (charset_to_utf8 != (iconv_t) (-1)) {
  51     iconv_close (charset_to_utf8);
  52     charset_to_utf8 = (iconv_t) (-1);
  53   }
  54   if (charset_from_utf8 != (iconv_t) (-1)) {
  55     iconv_close (charset_from_utf8);
  56     charset_from_utf8 = (iconv_t) (-1);
  57   }
  58 #endif
  59
  60   if (!strcmp (buffer, "utf-8"))
  61     Charset_is_utf8 = 1;
  62 #ifndef HAVE_WC_FUNCS
  63   else if (!ascii_strcasecmp (buffer, "euc-jp")
  64            || !ascii_strcasecmp (buffer, "shift_jis")
  65            || !ascii_strcasecmp (buffer, "cp932")
  66            || !ascii_strcasecmp (buffer, "eucJP-ms")) {
  67     charset_is_ja = 1;
  68     charset_to_utf8 = iconv_open ("UTF-8", charset);
  69     charset_from_utf8 = iconv_open (charset, "UTF-8");
  70   }
  71 #endif
  72
  73 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
  74   bind_textdomain_codeset (PACKAGE, buffer);
  75 #endif
  76 }
  77
  78 #ifndef HAVE_WC_FUNCS
  79
  80 /*
  81  * For systems that don't have them, we provide here our own
  82  * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
  83  * Instead of using the locale, as these functions normally would,
  84  * we use Mutt's Charset variable. We support 3 types of charset:
  85  * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
  86  * (2) For UTF-8, wchar_t uses UCS.
  87  * (3) For stateless Japanese encodings, we use UCS and convert
  88  *     via UTF-8 using iconv.
  89  * Unfortunately, we can't handle non-stateless encodings.
  90  */
  91
  92 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
  93 {
  94   char buf[MB_LEN_MAX];
  95   const char *ib;
  96   char *ob;
  97   size_t ibl, obl, r;
  98
  99   if (s) {
 100     ibl = mutt_wctoutf8 (buf, wc);
 101     if (ibl == (size_t) (-1))
 102       return (size_t) (-1);
 103     ib = buf;
 104     ob = s;
 105     obl = MB_LEN_MAX;
 106     r = my_iconv(cd, &ib, &ibl, &ob, &obl);
 107   }
 108   else {
 109     ib = "";
 110     ibl = 1;
 111     ob = buf;
 112     obl = sizeof (buf);
 113     r = my_iconv(cd, &ib, &ibl, &ob, &obl);
 114   }
 115   return ob - s;
 116 }
 117
 118 size_t wcrtomb (char *s, wchar_t wc, mbstate_t * ps)
 119 {
 120   /* We only handle stateless encodings, so we can ignore ps. */
 121
 122   if (Charset_is_utf8)
 123     return mutt_wctoutf8 (s, wc);
 124   else if (charset_from_utf8 != (iconv_t) (-1))
 125     return wcrtomb_iconv (s, wc, charset_from_utf8);
 126   else {
 127     if (!s)
 128       return 1;
 129     if (wc < 0x100) {
 130       *s = wc;
 131       return 1;
 132     }
 133     errno = EILSEQ;
 134     return (size_t) (-1);
 135   }
 136 }
 137
 138 size_t mbrtowc_iconv (wchar_t * pwc, const char *s, size_t n,
 139                       mbstate_t * ps, iconv_t cd)
 140 {
 141   static mbstate_t mbstate;
 142   const char *ib, *ibmax;
 143   char *ob, *t;
 144   size_t ibl, obl, k, r;
 145   char bufi[8], bufo[6];
 146
 147   if (!n)
 148     return (size_t) (-2);
 149
 150   t = memchr (ps, 0, sizeof (*ps));
 151   k = t ? (t - (char *) ps) : sizeof (*ps);
 152   if (k > sizeof (bufi))
 153     k = 0;
 154   if (k) {
 155     /* use the buffer for input */
 156     memcpy (bufi, ps, k);
 157     ib = bufi;
 158     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 159     memcpy (bufi + k, s, ibmax - bufi - k);
 160   }
 161   else {
 162     /* use the real input */
 163     ib = s;
 164     ibmax = s + n;
 165   }
 166
 167   ob = bufo;
 168   obl = sizeof (bufo);
 169   ibl = 1;
 170
 171   for (;;) {
 172     r = my_iconv(cd, &ib, &ibl, &ob, &obl);
 173     if (ob > bufo && (!k || ib > bufi + k)) {
 174       /* we have a character */
 175       p_clear(ps, 1);
 176       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 177       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 178     }
 179     else if (!r || (r == (size_t) (-1) && errno == EINVAL)) {
 180       if (ib + ibl < ibmax)
 181         /* try using more input */
 182         ++ibl;
 183       else if (k && ib > bufi + k && bufi + k + n > ibmax) {
 184         /* switch to using real input */
 185         ib = s + (ib - bufi - k);
 186         ibmax = s + n;
 187         k = 0;
 188         ++ibl;
 189       }
 190       else {
 191         /* save the state and give up */
 192         p_clear(ps, 1);
 193         if (ibl <= sizeof (mbstate_t))  /* need extra condition here! */
 194           memcpy (ps, ib, ibl);
 195         return (size_t) (-2);
 196       }
 197     }
 198     else {
 199       /* bad input */
 200       errno = EILSEQ;
 201       return (size_t) (-1);
 202     }
 203   }
 204 }
 205
 206 size_t mbrtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * ps)
 207 {
 208   static mbstate_t mbstate;
 209
 210   if (!ps)
 211     ps = &mbstate;
 212
 213   if (Charset_is_utf8)
 214     return utf8rtowc (pwc, s, n, ps);
 215   else if (charset_to_utf8 != (iconv_t) (-1))
 216     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 217   else {
 218     if (!s) {
 219       p_clear(ps, 1);
 220       return 0;
 221     }
 222     if (!n)
 223       return (size_t) - 2;
 224     if (pwc)
 225       *pwc = (wchar_t) (unsigned char) *s;
 226     return (*s != 0);
 227   }
 228 }
 229
 230 int iswprint (wint_t wc)
 231 {
 232   if (Charset_is_utf8 || charset_is_ja)
 233     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 234   else
 235     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 236 }
 237
 238 int iswspace (wint_t wc)
 239 {
 240   if (Charset_is_utf8 || charset_is_ja)
 241     return (9 <= wc && wc <= 13) || wc == 32;
 242   else
 243     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 244 }
 245
 246 static wint_t towupper_ucs (wint_t x)
 247 {
 248   /* Only works for x < 0x130 */
 249   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 250     return x - 32;
 251   else if (0x100 <= x && x < 0x130)
 252     return x & ~1;
 253   else if (x == 0xb5)
 254     return 0x39c;
 255   else if (x == 0xff)
 256     return 0x178;
 257   else
 258     return x;
 259 }
 260
 261 static wint_t towlower_ucs (wint_t x)
 262 {
 263   /* Only works for x < 0x130 */
 264   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 265     return x + 32;
 266   else if (0x100 <= x && x < 0x130)
 267     return x | 1;
 268   else
 269     return x;
 270 }
 271
 272 static int iswalnum_ucs (wint_t wc)
 273 {
 274   /* Only works for x < 0x220 */
 275   if (wc >= 0x100)
 276     return 1;
 277   else if (wc < 0x30)
 278     return 0;
 279   else if (wc < 0x3a)
 280     return 1;
 281   else if (wc < 0xa0)
 282     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 283   else if (wc < 0xc0)
 284     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 285   else
 286     return !(wc == 0xd7 || wc == 0xf7);
 287 }
 288
 289 wint_t towupper (wint_t wc)
 290 {
 291   if (Charset_is_utf8 || charset_is_ja)
 292     return towupper_ucs (wc);
 293   else
 294     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 295 }
 296
 297 wint_t towlower (wint_t wc)
 298 {
 299   if (Charset_is_utf8 || charset_is_ja)
 300     return towlower_ucs (wc);
 301   else
 302     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 303 }
 304
 305 int iswalnum (wint_t wc)
 306 {
 307   if (Charset_is_utf8 || charset_is_ja)
 308     return iswalnum_ucs (wc);
 309   else
 310     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 311 }
 312
 313 /*
 314  * l10n for Japanese:
 315  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 316  *   Character Set, have a column width of 2.
 317  */
 318 int wcwidth_ja (wchar_t ucs)
 319 {
 320   if (ucs >= 0x3021)
 321     return -1;                  /* continue with the normal check */
 322   /* a rough range for quick check */
 323   if ((ucs >= 0x00a1 && ucs <= 0x00fe) ||       /* Latin-1 Supplement */
 324       (ucs >= 0x0391 && ucs <= 0x0451) ||       /* Greek and Cyrillic */
 325       (ucs >= 0x2010 && ucs <= 0x266f) ||       /* Symbols */
 326       (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */
 327     return 2;
 328   else
 329     return -1;
 330 }
 331
 332 int wcwidth_ucs (wchar_t ucs);
 333
 334 int wcwidth (wchar_t wc)
 335 {
 336   if (!Charset_is_utf8) {
 337     if (!charset_is_ja) {
 338       /* 8-bit case */
 339       if (!wc)
 340         return 0;
 341       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 342         return 1;
 343       else
 344         return -1;
 345     }
 346     else {
 347       /* Japanese */
 348       int k = wcwidth_ja (wc);
 349
 350       if (k != -1)
 351         return k;
 352     }
 353   }
 354   return wcwidth_ucs (wc);
 355 }
 356
 357 size_t utf8rtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * _ps)
 358 {
 359   static wchar_t mbstate;
 360   wchar_t *ps = (wchar_t *) _ps;
 361   size_t k = 1;
 362   unsigned char c;
 363   wchar_t wc;
 364   int count;
 365
 366   if (!ps)
 367     ps = &mbstate;
 368
 369   if (!s) {
 370     *ps = 0;
 371     return 0;
 372   }
 373   if (!n)
 374     return (size_t) - 2;
 375
 376   if (!*ps) {
 377     c = (unsigned char) *s;
 378     if (c < 0x80) {
 379       if (pwc)
 380         *pwc = c;
 381       return (c != 0);
 382     }
 383     else if (c < 0xc2) {
 384       errno = EILSEQ;
 385       return (size_t) - 1;
 386     }
 387     else if (c < 0xe0)
 388       wc = ((c & 0x1f) << 6) + (count = 0);
 389     else if (c < 0xf0)
 390       wc = ((c & 0x0f) << 12) + (count = 1);
 391     else if (c < 0xf8)
 392       wc = ((c & 0x07) << 18) + (count = 2);
 393     else if (c < 0xfc)
 394       wc = ((c & 0x03) << 24) + (count = 3);
 395     else if (c < 0xfe)
 396       wc = ((c & 0x01) << 30) + (count = 4);
 397     else {
 398       errno = EILSEQ;
 399       return (size_t) - 1;
 400     }
 401     ++s, --n, ++k;
 402   }
 403   else {
 404     wc = *ps & 0x7fffffff;
 405     count = wc & 7;             /* if count > 4 it will be caught below */
 406   }
 407
 408   for (; n; ++s, --n, ++k) {
 409     c = (unsigned char) *s;
 410     if (0x80 <= c && c < 0xc0) {
 411       wc |= (c & 0x3f) << (6 * count);
 412       if (!count) {
 413         if (pwc)
 414           *pwc = wc;
 415         *ps = 0;
 416         return wc ? k : 0;
 417       }
 418       --count, --wc;
 419       if (!(wc >> (11 + count * 5))) {
 420         errno = count < 4 ? EILSEQ : EINVAL;
 421         return (size_t) - 1;
 422       }
 423     }
 424     else {
 425       errno = EILSEQ;
 426       return (size_t) - 1;
 427     }
 428   }
 429   *ps = wc;
 430   return (size_t) - 2;
 431 }
 432
 433 #endif /* !HAVE_WC_FUNCS */
 434
 435 wchar_t replacement_char (void)
 436 {
 437   return Charset_is_utf8 ? 0xfffd : '?';
 438 }