mbyte.c

   1 /*
   2  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  *     This program is free software; you can redistribute it and/or modify
   5  *     it under the terms of the GNU General Public License as published by
   6  *     the Free Software Foundation; either version 2 of the License, or
   7  *     (at your option) any later version.
   8  *
   9  *     This program is distributed in the hope that it will be useful,
  10  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *     GNU General Public License for more details.
  13  *
  14  *     You should have received a copy of the GNU General Public License
  15  *     along with this program; if not, write to the Free Software
  16  *     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  17  */
  18
  19 /*
  20  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  21  */
  22
  23 #if HAVE_CONFIG_H
  24 # include "config.h"
  25 #endif
  26
  27 #include "mutt.h"
  28 #include "mbyte.h"
  29 #include "charset.h"
  30
  31 #include <errno.h>
  32
  33 #include <ctype.h>
  34
  35 #ifndef EILSEQ
  36 #define EILSEQ EINVAL
  37 #endif
  38
/* Non-zero while the current charset is UTF-8; updated by mutt_set_charset(). */
int Charset_is_utf8 = 0;

#ifndef HAVE_WC_FUNCS
/* Non-zero while the current charset is one of the Japanese encodings
 * recognised by mutt_set_charset(). */
static int charset_is_ja = 0;
/* iconv descriptors between the current charset and UTF-8, opened by
 * mutt_set_charset() for Japanese charsets; (iconv_t) (-1) when not open. */
static iconv_t charset_to_utf8 = (iconv_t) (-1);
static iconv_t charset_from_utf8 = (iconv_t) (-1);
#endif
  46
  47 void mutt_set_charset (char *charset)
  48 {
  49   char buffer[STRING];
  50
  51   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  52
  53   Charset_is_utf8 = 0;
  54 #ifndef HAVE_WC_FUNCS
  55   charset_is_ja = 0;
  56   if (charset_to_utf8 != (iconv_t) (-1)) {
  57     iconv_close (charset_to_utf8);
  58     charset_to_utf8 = (iconv_t) (-1);
  59   }
  60   if (charset_from_utf8 != (iconv_t) (-1)) {
  61     iconv_close (charset_from_utf8);
  62     charset_from_utf8 = (iconv_t) (-1);
  63   }
  64 #endif
  65
  66   if (!strcmp (buffer, "utf-8"))
  67     Charset_is_utf8 = 1;
  68 #ifndef HAVE_WC_FUNCS
  69   else if (!ascii_strcasecmp (buffer, "euc-jp")
  70            || !ascii_strcasecmp (buffer, "shift_jis")
  71            || !ascii_strcasecmp (buffer, "cp932")
  72            || !ascii_strcasecmp (buffer, "eucJP-ms")) {
  73     charset_is_ja = 1;
  74     charset_to_utf8 = iconv_open ("UTF-8", charset);
  75     charset_from_utf8 = iconv_open (charset, "UTF-8");
  76   }
  77 #endif
  78
  79 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
  80   bind_textdomain_codeset (PACKAGE, buffer);
  81 #endif
  82 }
  83
  84 #ifndef HAVE_WC_FUNCS
  85
  86 /*
  87  * For systems that don't have them, we provide here our own
  88  * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
  89  * Instead of using the locale, as these functions normally would,
  90  * we use Mutt's Charset variable. We support 3 types of charset:
  91  * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
  92  * (2) For UTF-8, wchar_t uses UCS.
  93  * (3) For stateless Japanese encodings, we use UCS and convert
  94  *     via UTF-8 using iconv.
  95  * Unfortunately, we can't handle non-stateless encodings.
  96  */
  97
  98 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
  99 {
 100   char buf[MB_LEN_MAX];
 101   ICONV_CONST char *ib;
 102   char *ob;
 103   size_t ibl, obl, r;
 104
 105   if (s) {
 106     ibl = mutt_wctoutf8 (buf, wc);
 107     if (ibl == (size_t) (-1))
 108       return (size_t) (-1);
 109     ib = buf;
 110     ob = s;
 111     obl = MB_LEN_MAX;
 112     r = iconv (cd, &ib, &ibl, &ob, &obl);
 113   }
 114   else {
 115     ib = "";
 116     ibl = 1;
 117     ob = buf;
 118     obl = sizeof (buf);
 119     r = iconv (cd, &ib, &ibl, &ob, &obl);
 120   }
 121   return ob - s;
 122 }
 123
 124 size_t wcrtomb (char *s, wchar_t wc, mbstate_t * ps)
 125 {
 126   /* We only handle stateless encodings, so we can ignore ps. */
 127
 128   if (Charset_is_utf8)
 129     return mutt_wctoutf8 (s, wc);
 130   else if (charset_from_utf8 != (iconv_t) (-1))
 131     return wcrtomb_iconv (s, wc, charset_from_utf8);
 132   else {
 133     if (!s)
 134       return 1;
 135     if (wc < 0x100) {
 136       *s = wc;
 137       return 1;
 138     }
 139     errno = EILSEQ;
 140     return (size_t) (-1);
 141   }
 142 }
 143
 144 size_t mbrtowc_iconv (wchar_t * pwc, const char *s, size_t n,
 145                       mbstate_t * ps, iconv_t cd)
 146 {
 147   static mbstate_t mbstate;
 148   ICONV_CONST char *ib, *ibmax;
 149   char *ob, *t;
 150   size_t ibl, obl, k, r;
 151   char bufi[8], bufo[6];
 152
 153   if (!n)
 154     return (size_t) (-2);
 155
 156   t = memchr (ps, 0, sizeof (*ps));
 157   k = t ? (t - (char *) ps) : sizeof (*ps);
 158   if (k > sizeof (bufi))
 159     k = 0;
 160   if (k) {
 161     /* use the buffer for input */
 162     memcpy (bufi, ps, k);
 163     ib = bufi;
 164     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 165     memcpy (bufi + k, s, ibmax - bufi - k);
 166   }
 167   else {
 168     /* use the real input */
 169     ib = s;
 170     ibmax = s + n;
 171   }
 172
 173   ob = bufo;
 174   obl = sizeof (bufo);
 175   ibl = 1;
 176
 177   for (;;) {
 178     r = iconv (cd, &ib, &ibl, &ob, &obl);
 179     if (ob > bufo && (!k || ib > bufi + k)) {
 180       /* we have a character */
 181       memset (ps, 0, sizeof (*ps));
 182       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 183       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 184     }
 185     else if (!r || (r == (size_t) (-1) && errno == EINVAL)) {
 186       if (ib + ibl < ibmax)
 187         /* try using more input */
 188         ++ibl;
 189       else if (k && ib > bufi + k && bufi + k + n > ibmax) {
 190         /* switch to using real input */
 191         ib = s + (ib - bufi - k);
 192         ibmax = s + n;
 193         k = 0;
 194         ++ibl;
 195       }
 196       else {
 197         /* save the state and give up */
 198         memset (ps, 0, sizeof (*ps));
 199         if (ibl <= sizeof (mbstate_t))  /* need extra condition here! */
 200           memcpy (ps, ib, ibl);
 201         return (size_t) (-2);
 202       }
 203     }
 204     else {
 205       /* bad input */
 206       errno = EILSEQ;
 207       return (size_t) (-1);
 208     }
 209   }
 210 }
 211
 212 size_t mbrtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * ps)
 213 {
 214   static mbstate_t mbstate;
 215
 216   if (!ps)
 217     ps = &mbstate;
 218
 219   if (Charset_is_utf8)
 220     return utf8rtowc (pwc, s, n, ps);
 221   else if (charset_to_utf8 != (iconv_t) (-1))
 222     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 223   else {
 224     if (!s) {
 225       memset (ps, 0, sizeof (*ps));
 226       return 0;
 227     }
 228     if (!n)
 229       return (size_t) - 2;
 230     if (pwc)
 231       *pwc = (wchar_t) (unsigned char) *s;
 232     return (*s != 0);
 233   }
 234 }
 235
 236 int iswprint (wint_t wc)
 237 {
 238   if (Charset_is_utf8 || charset_is_ja)
 239     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 240   else
 241     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 242 }
 243
 244 int iswspace (wint_t wc)
 245 {
 246   if (Charset_is_utf8 || charset_is_ja)
 247     return (9 <= wc && wc <= 13) || wc == 32;
 248   else
 249     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 250 }
 251
 252 static wint_t towupper_ucs (wint_t x)
 253 {
 254   /* Only works for x < 0x130 */
 255   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 256     return x - 32;
 257   else if (0x100 <= x && x < 0x130)
 258     return x & ~1;
 259   else if (x == 0xb5)
 260     return 0x39c;
 261   else if (x == 0xff)
 262     return 0x178;
 263   else
 264     return x;
 265 }
 266
 267 static wint_t towlower_ucs (wint_t x)
 268 {
 269   /* Only works for x < 0x130 */
 270   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 271     return x + 32;
 272   else if (0x100 <= x && x < 0x130)
 273     return x | 1;
 274   else
 275     return x;
 276 }
 277
 278 static int iswalnum_ucs (wint_t wc)
 279 {
 280   /* Only works for x < 0x220 */
 281   if (wc >= 0x100)
 282     return 1;
 283   else if (wc < 0x30)
 284     return 0;
 285   else if (wc < 0x3a)
 286     return 1;
 287   else if (wc < 0xa0)
 288     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 289   else if (wc < 0xc0)
 290     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 291   else
 292     return !(wc == 0xd7 || wc == 0xf7);
 293 }
 294
 295 wint_t towupper (wint_t wc)
 296 {
 297   if (Charset_is_utf8 || charset_is_ja)
 298     return towupper_ucs (wc);
 299   else
 300     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 301 }
 302
 303 wint_t towlower (wint_t wc)
 304 {
 305   if (Charset_is_utf8 || charset_is_ja)
 306     return towlower_ucs (wc);
 307   else
 308     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 309 }
 310
 311 int iswalnum (wint_t wc)
 312 {
 313   if (Charset_is_utf8 || charset_is_ja)
 314     return iswalnum_ucs (wc);
 315   else
 316     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 317 }
 318
 319 /*
 320  * l10n for Japanese:
 321  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 322  *   Character Set, have a column width of 2.
 323  */
 324 int wcwidth_ja (wchar_t ucs)
 325 {
 326   if (ucs >= 0x3021)
 327     return -1;                  /* continue with the normal check */
 328   /* a rough range for quick check */
 329   if ((ucs >= 0x00a1 && ucs <= 0x00fe) ||       /* Latin-1 Supplement */
 330       (ucs >= 0x0391 && ucs <= 0x0451) ||       /* Greek and Cyrillic */
 331       (ucs >= 0x2010 && ucs <= 0x266f) ||       /* Symbols */
 332       (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */
 333     return 2;
 334   else
 335     return -1;
 336 }
 337
 338 int wcwidth_ucs (wchar_t ucs);
 339
 340 int wcwidth (wchar_t wc)
 341 {
 342   if (!Charset_is_utf8) {
 343     if (!charset_is_ja) {
 344       /* 8-bit case */
 345       if (!wc)
 346         return 0;
 347       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 348         return 1;
 349       else
 350         return -1;
 351     }
 352     else {
 353       /* Japanese */
 354       int k = wcwidth_ja (wc);
 355
 356       if (k != -1)
 357         return k;
 358     }
 359   }
 360   return wcwidth_ucs (wc);
 361 }
 362
 363 size_t utf8rtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * _ps)
 364 {
 365   static wchar_t mbstate;
 366   wchar_t *ps = (wchar_t *) _ps;
 367   size_t k = 1;
 368   unsigned char c;
 369   wchar_t wc;
 370   int count;
 371
 372   if (!ps)
 373     ps = &mbstate;
 374
 375   if (!s) {
 376     *ps = 0;
 377     return 0;
 378   }
 379   if (!n)
 380     return (size_t) - 2;
 381
 382   if (!*ps) {
 383     c = (unsigned char) *s;
 384     if (c < 0x80) {
 385       if (pwc)
 386         *pwc = c;
 387       return (c != 0);
 388     }
 389     else if (c < 0xc2) {
 390       errno = EILSEQ;
 391       return (size_t) - 1;
 392     }
 393     else if (c < 0xe0)
 394       wc = ((c & 0x1f) << 6) + (count = 0);
 395     else if (c < 0xf0)
 396       wc = ((c & 0x0f) << 12) + (count = 1);
 397     else if (c < 0xf8)
 398       wc = ((c & 0x07) << 18) + (count = 2);
 399     else if (c < 0xfc)
 400       wc = ((c & 0x03) << 24) + (count = 3);
 401     else if (c < 0xfe)
 402       wc = ((c & 0x01) << 30) + (count = 4);
 403     else {
 404       errno = EILSEQ;
 405       return (size_t) - 1;
 406     }
 407     ++s, --n, ++k;
 408   }
 409   else {
 410     wc = *ps & 0x7fffffff;
 411     count = wc & 7;             /* if count > 4 it will be caught below */
 412   }
 413
 414   for (; n; ++s, --n, ++k) {
 415     c = (unsigned char) *s;
 416     if (0x80 <= c && c < 0xc0) {
 417       wc |= (c & 0x3f) << (6 * count);
 418       if (!count) {
 419         if (pwc)
 420           *pwc = wc;
 421         *ps = 0;
 422         return wc ? k : 0;
 423       }
 424       --count, --wc;
 425       if (!(wc >> (11 + count * 5))) {
 426         errno = count < 4 ? EILSEQ : EINVAL;
 427         return (size_t) - 1;
 428       }
 429     }
 430     else {
 431       errno = EILSEQ;
 432       return (size_t) - 1;
 433     }
 434   }
 435   *ps = wc;
 436   return (size_t) - 2;
 437 }
 438
 439 #endif /* !HAVE_WC_FUNCS */
 440
 441 wchar_t replacement_char ()
 442 {
 443   return Charset_is_utf8 ? 0xfffd : '?';
 444 }