mbyte.c

   1 /*
   2  * Copyright notice from original mutt:
   3  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   4  *
   5  * This file is part of mutt-ng, see http://www.muttng.org/.
   6  * It's licensed under the GNU General Public License,
   7  * please see the file GPL in the top level source directory.
   8  */
   9
  10 /*
  11  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  12  */
  13
  14 #if HAVE_CONFIG_H
  15 # include "config.h"
  16 #endif
  17
  18 #include "mutt.h"
  19 #include "mbyte.h"
  20 #include "charset.h"
  21
  22 #include <errno.h>
  23
  24 #include <ctype.h>
  25
  26 #ifndef EILSEQ
  27 #define EILSEQ EINVAL
  28 #endif
  29
/* Non-zero while the charset selected by mutt_set_charset() is "utf-8". */
int Charset_is_utf8 = 0;

#ifndef HAVE_WC_FUNCS
/* Non-zero while the selected charset is one of the Japanese encodings
 * recognised by mutt_set_charset(). */
static int charset_is_ja = 0;
/* iconv descriptors used by the fallback wcrtomb()/mbrtowc() below;
 * (iconv_t) (-1) marks a descriptor that is not open. */
static iconv_t charset_to_utf8 = (iconv_t) (-1);
static iconv_t charset_from_utf8 = (iconv_t) (-1);
#endif
  37
  38 void mutt_set_charset (char *charset)
  39 {
  40   char buffer[STRING];
  41
  42   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  43
  44   Charset_is_utf8 = 0;
  45 #ifndef HAVE_WC_FUNCS
  46   charset_is_ja = 0;
  47   if (charset_to_utf8 != (iconv_t) (-1)) {
  48     iconv_close (charset_to_utf8);
  49     charset_to_utf8 = (iconv_t) (-1);
  50   }
  51   if (charset_from_utf8 != (iconv_t) (-1)) {
  52     iconv_close (charset_from_utf8);
  53     charset_from_utf8 = (iconv_t) (-1);
  54   }
  55 #endif
  56
  57   if (!strcmp (buffer, "utf-8"))
  58     Charset_is_utf8 = 1;
  59 #ifndef HAVE_WC_FUNCS
  60   else if (!ascii_strcasecmp (buffer, "euc-jp")
  61            || !ascii_strcasecmp (buffer, "shift_jis")
  62            || !ascii_strcasecmp (buffer, "cp932")
  63            || !ascii_strcasecmp (buffer, "eucJP-ms")) {
  64     charset_is_ja = 1;
  65     charset_to_utf8 = iconv_open ("UTF-8", charset);
  66     charset_from_utf8 = iconv_open (charset, "UTF-8");
  67   }
  68 #endif
  69
  70 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
  71   bind_textdomain_codeset (PACKAGE, buffer);
  72 #endif
  73 }
  74
  75 #ifndef HAVE_WC_FUNCS
  76
  77 /*
  78  * For systems that don't have them, we provide here our own
  79  * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
  80  * Instead of using the locale, as these functions normally would,
  81  * we use Mutt's Charset variable. We support 3 types of charset:
  82  * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
  83  * (2) For UTF-8, wchar_t uses UCS.
  84  * (3) For stateless Japanese encodings, we use UCS and convert
  85  *     via UTF-8 using iconv.
  86  * Unfortunately, we can't handle non-stateless encodings.
  87  */
  88
  89 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
  90 {
  91   char buf[MB_LEN_MAX];
  92   ICONV_CONST char *ib;
  93   char *ob;
  94   size_t ibl, obl, r;
  95
  96   if (s) {
  97     ibl = mutt_wctoutf8 (buf, wc);
  98     if (ibl == (size_t) (-1))
  99       return (size_t) (-1);
 100     ib = buf;
 101     ob = s;
 102     obl = MB_LEN_MAX;
 103     r = iconv (cd, &ib, &ibl, &ob, &obl);
 104   }
 105   else {
 106     ib = "";
 107     ibl = 1;
 108     ob = buf;
 109     obl = sizeof (buf);
 110     r = iconv (cd, &ib, &ibl, &ob, &obl);
 111   }
 112   return ob - s;
 113 }
 114
 115 size_t wcrtomb (char *s, wchar_t wc, mbstate_t * ps)
 116 {
 117   /* We only handle stateless encodings, so we can ignore ps. */
 118
 119   if (Charset_is_utf8)
 120     return mutt_wctoutf8 (s, wc);
 121   else if (charset_from_utf8 != (iconv_t) (-1))
 122     return wcrtomb_iconv (s, wc, charset_from_utf8);
 123   else {
 124     if (!s)
 125       return 1;
 126     if (wc < 0x100) {
 127       *s = wc;
 128       return 1;
 129     }
 130     errno = EILSEQ;
 131     return (size_t) (-1);
 132   }
 133 }
 134
 135 size_t mbrtowc_iconv (wchar_t * pwc, const char *s, size_t n,
 136                       mbstate_t * ps, iconv_t cd)
 137 {
 138   static mbstate_t mbstate;
 139   ICONV_CONST char *ib, *ibmax;
 140   char *ob, *t;
 141   size_t ibl, obl, k, r;
 142   char bufi[8], bufo[6];
 143
 144   if (!n)
 145     return (size_t) (-2);
 146
 147   t = memchr (ps, 0, sizeof (*ps));
 148   k = t ? (t - (char *) ps) : sizeof (*ps);
 149   if (k > sizeof (bufi))
 150     k = 0;
 151   if (k) {
 152     /* use the buffer for input */
 153     memcpy (bufi, ps, k);
 154     ib = bufi;
 155     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 156     memcpy (bufi + k, s, ibmax - bufi - k);
 157   }
 158   else {
 159     /* use the real input */
 160     ib = s;
 161     ibmax = s + n;
 162   }
 163
 164   ob = bufo;
 165   obl = sizeof (bufo);
 166   ibl = 1;
 167
 168   for (;;) {
 169     r = iconv (cd, &ib, &ibl, &ob, &obl);
 170     if (ob > bufo && (!k || ib > bufi + k)) {
 171       /* we have a character */
 172       memset (ps, 0, sizeof (*ps));
 173       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 174       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 175     }
 176     else if (!r || (r == (size_t) (-1) && errno == EINVAL)) {
 177       if (ib + ibl < ibmax)
 178         /* try using more input */
 179         ++ibl;
 180       else if (k && ib > bufi + k && bufi + k + n > ibmax) {
 181         /* switch to using real input */
 182         ib = s + (ib - bufi - k);
 183         ibmax = s + n;
 184         k = 0;
 185         ++ibl;
 186       }
 187       else {
 188         /* save the state and give up */
 189         memset (ps, 0, sizeof (*ps));
 190         if (ibl <= sizeof (mbstate_t))  /* need extra condition here! */
 191           memcpy (ps, ib, ibl);
 192         return (size_t) (-2);
 193       }
 194     }
 195     else {
 196       /* bad input */
 197       errno = EILSEQ;
 198       return (size_t) (-1);
 199     }
 200   }
 201 }
 202
 203 size_t mbrtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * ps)
 204 {
 205   static mbstate_t mbstate;
 206
 207   if (!ps)
 208     ps = &mbstate;
 209
 210   if (Charset_is_utf8)
 211     return utf8rtowc (pwc, s, n, ps);
 212   else if (charset_to_utf8 != (iconv_t) (-1))
 213     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 214   else {
 215     if (!s) {
 216       memset (ps, 0, sizeof (*ps));
 217       return 0;
 218     }
 219     if (!n)
 220       return (size_t) - 2;
 221     if (pwc)
 222       *pwc = (wchar_t) (unsigned char) *s;
 223     return (*s != 0);
 224   }
 225 }
 226
 227 int iswprint (wint_t wc)
 228 {
 229   if (Charset_is_utf8 || charset_is_ja)
 230     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 231   else
 232     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 233 }
 234
 235 int iswspace (wint_t wc)
 236 {
 237   if (Charset_is_utf8 || charset_is_ja)
 238     return (9 <= wc && wc <= 13) || wc == 32;
 239   else
 240     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 241 }
 242
 243 static wint_t towupper_ucs (wint_t x)
 244 {
 245   /* Only works for x < 0x130 */
 246   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 247     return x - 32;
 248   else if (0x100 <= x && x < 0x130)
 249     return x & ~1;
 250   else if (x == 0xb5)
 251     return 0x39c;
 252   else if (x == 0xff)
 253     return 0x178;
 254   else
 255     return x;
 256 }
 257
 258 static wint_t towlower_ucs (wint_t x)
 259 {
 260   /* Only works for x < 0x130 */
 261   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 262     return x + 32;
 263   else if (0x100 <= x && x < 0x130)
 264     return x | 1;
 265   else
 266     return x;
 267 }
 268
 269 static int iswalnum_ucs (wint_t wc)
 270 {
 271   /* Only works for x < 0x220 */
 272   if (wc >= 0x100)
 273     return 1;
 274   else if (wc < 0x30)
 275     return 0;
 276   else if (wc < 0x3a)
 277     return 1;
 278   else if (wc < 0xa0)
 279     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 280   else if (wc < 0xc0)
 281     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 282   else
 283     return !(wc == 0xd7 || wc == 0xf7);
 284 }
 285
 286 wint_t towupper (wint_t wc)
 287 {
 288   if (Charset_is_utf8 || charset_is_ja)
 289     return towupper_ucs (wc);
 290   else
 291     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 292 }
 293
 294 wint_t towlower (wint_t wc)
 295 {
 296   if (Charset_is_utf8 || charset_is_ja)
 297     return towlower_ucs (wc);
 298   else
 299     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 300 }
 301
 302 int iswalnum (wint_t wc)
 303 {
 304   if (Charset_is_utf8 || charset_is_ja)
 305     return iswalnum_ucs (wc);
 306   else
 307     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 308 }
 309
 310 /*
 311  * l10n for Japanese:
 312  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 313  *   Character Set, have a column width of 2.
 314  */
 315 int wcwidth_ja (wchar_t ucs)
 316 {
 317   if (ucs >= 0x3021)
 318     return -1;                  /* continue with the normal check */
 319   /* a rough range for quick check */
 320   if ((ucs >= 0x00a1 && ucs <= 0x00fe) ||       /* Latin-1 Supplement */
 321       (ucs >= 0x0391 && ucs <= 0x0451) ||       /* Greek and Cyrillic */
 322       (ucs >= 0x2010 && ucs <= 0x266f) ||       /* Symbols */
 323       (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */
 324     return 2;
 325   else
 326     return -1;
 327 }
 328
 329 int wcwidth_ucs (wchar_t ucs);
 330
 331 int wcwidth (wchar_t wc)
 332 {
 333   if (!Charset_is_utf8) {
 334     if (!charset_is_ja) {
 335       /* 8-bit case */
 336       if (!wc)
 337         return 0;
 338       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 339         return 1;
 340       else
 341         return -1;
 342     }
 343     else {
 344       /* Japanese */
 345       int k = wcwidth_ja (wc);
 346
 347       if (k != -1)
 348         return k;
 349     }
 350   }
 351   return wcwidth_ucs (wc);
 352 }
 353
 354 size_t utf8rtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * _ps)
 355 {
 356   static wchar_t mbstate;
 357   wchar_t *ps = (wchar_t *) _ps;
 358   size_t k = 1;
 359   unsigned char c;
 360   wchar_t wc;
 361   int count;
 362
 363   if (!ps)
 364     ps = &mbstate;
 365
 366   if (!s) {
 367     *ps = 0;
 368     return 0;
 369   }
 370   if (!n)
 371     return (size_t) - 2;
 372
 373   if (!*ps) {
 374     c = (unsigned char) *s;
 375     if (c < 0x80) {
 376       if (pwc)
 377         *pwc = c;
 378       return (c != 0);
 379     }
 380     else if (c < 0xc2) {
 381       errno = EILSEQ;
 382       return (size_t) - 1;
 383     }
 384     else if (c < 0xe0)
 385       wc = ((c & 0x1f) << 6) + (count = 0);
 386     else if (c < 0xf0)
 387       wc = ((c & 0x0f) << 12) + (count = 1);
 388     else if (c < 0xf8)
 389       wc = ((c & 0x07) << 18) + (count = 2);
 390     else if (c < 0xfc)
 391       wc = ((c & 0x03) << 24) + (count = 3);
 392     else if (c < 0xfe)
 393       wc = ((c & 0x01) << 30) + (count = 4);
 394     else {
 395       errno = EILSEQ;
 396       return (size_t) - 1;
 397     }
 398     ++s, --n, ++k;
 399   }
 400   else {
 401     wc = *ps & 0x7fffffff;
 402     count = wc & 7;             /* if count > 4 it will be caught below */
 403   }
 404
 405   for (; n; ++s, --n, ++k) {
 406     c = (unsigned char) *s;
 407     if (0x80 <= c && c < 0xc0) {
 408       wc |= (c & 0x3f) << (6 * count);
 409       if (!count) {
 410         if (pwc)
 411           *pwc = wc;
 412         *ps = 0;
 413         return wc ? k : 0;
 414       }
 415       --count, --wc;
 416       if (!(wc >> (11 + count * 5))) {
 417         errno = count < 4 ? EILSEQ : EINVAL;
 418         return (size_t) - 1;
 419       }
 420     }
 421     else {
 422       errno = EILSEQ;
 423       return (size_t) - 1;
 424     }
 425   }
 426   *ps = wc;
 427   return (size_t) - 2;
 428 }
 429
 430 #endif /* !HAVE_WC_FUNCS */
 431
 432 wchar_t replacement_char ()
 433 {
 434   return Charset_is_utf8 ? 0xfffd : '?';
 435 }