mbyte.c

   1 /*
   2  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  *     This program is free software; you can redistribute it and/or modify
   5  *     it under the terms of the GNU General Public License as published by
   6  *     the Free Software Foundation; either version 2 of the License, or
   7  *     (at your option) any later version.
   8  *
   9  *     This program is distributed in the hope that it will be useful,
  10  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *     GNU General Public License for more details.
  13  *
  14  *     You should have received a copy of the GNU General Public License
  15  *     along with this program; if not, write to the Free Software
  16  *     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  17  */
  18
  19 /*
  20  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  21  */
  22
  23 #if HAVE_CONFIG_H
  24 # include "config.h"
  25 #endif
  26
  27 #include "mutt.h"
  28 #include "mbyte.h"
  29 #include "charset.h"
  30
  31 #include <errno.h>
  32
  33 #include <ctype.h>
  34
  35 #ifndef EILSEQ
  36 #define EILSEQ EINVAL
  37 #endif
  38
   39 int Charset_is_utf8 = 0; /* set to 1 by mutt_set_charset() when the canonical charset is "utf-8" */
   40 #ifndef HAVE_WC_FUNCS
   41 static int charset_is_ja = 0; /* set to 1 for euc-jp, shift_jis, cp932 and eucJP-ms */
   42 static iconv_t charset_to_utf8 = (iconv_t)(-1); /* iconv descriptor charset -> UTF-8; (iconv_t)(-1) when not open */
   43 static iconv_t charset_from_utf8 = (iconv_t)(-1); /* iconv descriptor UTF-8 -> charset; (iconv_t)(-1) when not open */
   44 #endif
  45
  46 void mutt_set_charset (char *charset)
  47 {
  48   char buffer[STRING];
  49
  50   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  51
  52   Charset_is_utf8 = 0;
  53 #ifndef HAVE_WC_FUNCS
  54   charset_is_ja = 0;
  55   if (charset_to_utf8 != (iconv_t)(-1))
  56   {
  57     iconv_close (charset_to_utf8);
  58     charset_to_utf8 = (iconv_t)(-1);
  59   }
  60   if (charset_from_utf8 != (iconv_t)(-1))
  61   {
  62     iconv_close (charset_from_utf8);
  63     charset_from_utf8 = (iconv_t)(-1);
  64   }
  65 #endif
  66
  67   if (!strcmp(buffer, "utf-8"))
  68     Charset_is_utf8 = 1;
  69 #ifndef HAVE_WC_FUNCS
  70   else if (!ascii_strcasecmp(buffer, "euc-jp") || !ascii_strcasecmp(buffer, "shift_jis")
  71         || !ascii_strcasecmp(buffer, "cp932") || !ascii_strcasecmp(buffer, "eucJP-ms"))
  72   {
  73     charset_is_ja = 1;
  74     charset_to_utf8 = iconv_open ("UTF-8", charset);
  75     charset_from_utf8 = iconv_open (charset, "UTF-8");
  76   }
  77 #endif
  78
  79 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
  80   bind_textdomain_codeset(PACKAGE, buffer);
  81 #endif
  82 }
  83
  84 #ifndef HAVE_WC_FUNCS
  85
  86 /*
  87  * For systems that don't have them, we provide here our own
  88  * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
  89  * Instead of using the locale, as these functions normally would,
  90  * we use Mutt's Charset variable. We support 3 types of charset:
  91  * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
  92  * (2) For UTF-8, wchar_t uses UCS.
  93  * (3) For stateless Japanese encodings, we use UCS and convert
  94  *     via UTF-8 using iconv.
  95  * Unfortunately, we can't handle non-stateless encodings.
  96  */
  97
  98 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
  99 {
 100   char buf[MB_LEN_MAX];
 101   ICONV_CONST char *ib;
 102   char *ob;
 103   size_t ibl, obl, r;
 104
 105   if (s)
 106   {
 107     ibl = mutt_wctoutf8 (buf, wc);
 108     if (ibl == (size_t)(-1))
 109       return (size_t)(-1);
 110     ib = buf;
 111     ob = s;
 112     obl = MB_LEN_MAX;
 113     r = iconv (cd, &ib, &ibl, &ob, &obl);
 114   }
 115   else
 116   {
 117     ib = "";
 118     ibl = 1;
 119     ob = buf;
 120     obl = sizeof (buf);
 121     r = iconv (cd, &ib, &ibl, &ob, &obl);
 122   }
 123   return ob - s;
 124 }
 125
 126 size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
 127 {
 128   /* We only handle stateless encodings, so we can ignore ps. */
 129
 130   if (Charset_is_utf8)
 131     return mutt_wctoutf8 (s, wc);
 132   else if (charset_from_utf8 != (iconv_t)(-1))
 133     return wcrtomb_iconv (s, wc, charset_from_utf8);
 134   else
 135   {
 136     if (!s)
 137       return 1;
 138     if (wc < 0x100)
 139     {
 140       *s = wc;
 141       return 1;
 142     }
 143     errno = EILSEQ;
 144     return (size_t)(-1);
 145   }
 146 }
 147
 148 size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n,
 149                       mbstate_t *ps, iconv_t cd)
 150 {
 151   static mbstate_t mbstate;
 152   ICONV_CONST char *ib, *ibmax;
 153   char *ob, *t;
 154   size_t ibl, obl, k, r;
 155   char bufi[8], bufo[6];
 156
 157   if (!n)
 158     return (size_t)(-2);
 159
 160   t = memchr (ps, 0, sizeof (*ps));
 161   k = t ? (t - (char *)ps) : sizeof (*ps);
 162   if (k > sizeof (bufi))
 163     k = 0;
 164   if (k)
 165   {
 166     /* use the buffer for input */
 167     memcpy (bufi, ps, k);
 168     ib = bufi;
 169     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 170     memcpy (bufi + k, s, ibmax - bufi - k);
 171   }
 172   else
 173   {
 174     /* use the real input */
 175     ib = s;
 176     ibmax = s + n;
 177   }
 178
 179   ob = bufo;
 180   obl = sizeof (bufo);
 181   ibl = 1;
 182
 183   for (;;)
 184   {
 185     r = iconv (cd, &ib, &ibl, &ob, &obl);
 186     if (ob > bufo && (!k || ib > bufi + k))
 187     {
 188       /* we have a character */
 189       memset (ps, 0, sizeof (*ps));
 190       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 191       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 192     }
 193     else if (!r || (r == (size_t)(-1) && errno == EINVAL))
 194     {
 195       if (ib + ibl < ibmax)
 196         /* try using more input */
 197         ++ibl;
 198       else if (k && ib > bufi + k && bufi + k + n > ibmax)
 199       {
 200         /* switch to using real input */
 201         ib = s + (ib - bufi - k);
 202         ibmax = s + n;
 203         k = 0;
 204         ++ibl;
 205       }
 206       else
 207       {
 208         /* save the state and give up */
 209         memset (ps, 0, sizeof (*ps));
 210         if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
 211           memcpy (ps, ib, ibl);
 212         return (size_t)(-2);
 213       }
 214     }
 215     else
 216     {
 217       /* bad input */
 218       errno = EILSEQ;
 219       return (size_t)(-1);
 220     }
 221   }
 222 }
 223
 224 size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 225 {
 226   static mbstate_t mbstate;
 227
 228   if (!ps)
 229     ps = &mbstate;
 230
 231   if (Charset_is_utf8)
 232     return utf8rtowc (pwc, s, n, ps);
 233   else if (charset_to_utf8 != (iconv_t)(-1))
 234     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 235   else
 236   {
 237     if (!s)
 238     {
 239       memset(ps, 0, sizeof(*ps));
 240       return 0;
 241     }
 242     if (!n)
 243       return (size_t)-2;
 244     if (pwc)
 245       *pwc = (wchar_t)(unsigned char)*s;
 246     return (*s != 0);
 247   }
 248 }
 249
 250 int iswprint (wint_t wc)
 251 {
 252   if (Charset_is_utf8 || charset_is_ja)
 253     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 254   else
 255     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 256 }
 257
 258 int iswspace (wint_t wc)
 259 {
 260   if (Charset_is_utf8 || charset_is_ja)
 261     return (9 <= wc && wc <= 13) || wc == 32;
 262   else
 263     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 264 }
 265
 266 static wint_t towupper_ucs (wint_t x)
 267 {
 268   /* Only works for x < 0x130 */
 269   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 270     return x - 32;
 271   else if (0x100 <= x && x < 0x130)
 272     return x & ~1;
 273   else if (x == 0xb5)
 274     return 0x39c;
 275   else if (x == 0xff)
 276     return 0x178;
 277   else
 278     return x;
 279 }
 280
 281 static wint_t towlower_ucs (wint_t x)
 282 {
 283   /* Only works for x < 0x130 */
 284   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 285     return x + 32;
 286   else if (0x100 <= x && x < 0x130)
 287     return x | 1;
 288   else
 289     return x;
 290 }
 291
 292 static int iswalnum_ucs (wint_t wc)
 293 {
 294   /* Only works for x < 0x220 */
 295   if (wc >= 0x100)
 296     return 1;
 297   else if (wc < 0x30)
 298     return 0;
 299   else if (wc < 0x3a)
 300     return 1;
 301   else if (wc < 0xa0)
 302     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 303   else if (wc < 0xc0)
 304     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 305   else
 306     return !(wc == 0xd7 || wc == 0xf7);
 307 }
 308
 309 wint_t towupper (wint_t wc)
 310 {
 311   if (Charset_is_utf8 || charset_is_ja)
 312     return towupper_ucs (wc);
 313   else
 314     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 315 }
 316
 317 wint_t towlower (wint_t wc)
 318 {
 319   if (Charset_is_utf8 || charset_is_ja)
 320     return towlower_ucs (wc);
 321   else
 322     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 323 }
 324
 325 int iswalnum (wint_t wc)
 326 {
 327   if (Charset_is_utf8 || charset_is_ja)
 328     return iswalnum_ucs (wc);
 329   else
 330     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 331 }
 332
 333 /*
 334  * l10n for Japanese:
 335  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 336  *   Character Set, have a column width of 2.
 337  */
 338 int wcwidth_ja (wchar_t ucs)
 339 {
 340   if (ucs >= 0x3021)
 341     return -1; /* continue with the normal check */
 342   /* a rough range for quick check */
 343   if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */
 344       (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */
 345       (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */
 346       (ucs >= 0x3000 && ucs <= 0x3020))   /* CJK Symbols and Punctuation */
 347     return 2;
 348   else
 349     return -1;
 350 }
 351
 352 int wcwidth_ucs(wchar_t ucs);
 353
 354 int wcwidth (wchar_t wc)
 355 {
 356   if (!Charset_is_utf8)
 357   {
 358     if (!charset_is_ja)
 359     {
 360       /* 8-bit case */
 361       if (!wc)
 362         return 0;
 363       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 364         return 1;
 365       else
 366         return -1;
 367     }
 368     else
 369     {
 370       /* Japanese */
 371       int k = wcwidth_ja (wc);
 372       if (k != -1)
 373         return k;
 374     }
 375   }
 376   return wcwidth_ucs (wc);
 377 }
 378
 379 size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
 380 {
 381   static wchar_t mbstate;
 382   wchar_t *ps = (wchar_t *)_ps;
 383   size_t k = 1;
 384   unsigned char c;
 385   wchar_t wc;
 386   int count;
 387
 388   if (!ps)
 389     ps = &mbstate;
 390
 391   if (!s)
 392   {
 393     *ps = 0;
 394     return 0;
 395   }
 396   if (!n)
 397     return (size_t)-2;
 398
 399   if (!*ps)
 400   {
 401     c = (unsigned char)*s;
 402     if (c < 0x80)
 403     {
 404       if (pwc)
 405         *pwc = c;
 406       return (c != 0);
 407     }
 408     else if (c < 0xc2)
 409     {
 410       errno = EILSEQ;
 411       return (size_t)-1;
 412     }
 413     else if (c < 0xe0)
 414       wc = ((c & 0x1f) << 6) + (count = 0);
 415     else if (c < 0xf0)
 416       wc = ((c & 0x0f) << 12) + (count = 1);
 417     else if (c < 0xf8)
 418       wc = ((c & 0x07) << 18) + (count = 2);
 419     else if (c < 0xfc)
 420       wc = ((c & 0x03) << 24) + (count = 3);
 421     else if (c < 0xfe)
 422       wc = ((c & 0x01) << 30) + (count = 4);
 423     else
 424     {
 425       errno = EILSEQ;
 426       return (size_t)-1;
 427     }
 428     ++s, --n, ++k;
 429   }
 430   else
 431   {
 432     wc = *ps & 0x7fffffff;
 433     count = wc & 7; /* if count > 4 it will be caught below */
 434   }
 435
 436   for (; n; ++s, --n, ++k)
 437   {
 438     c = (unsigned char)*s;
 439     if (0x80 <= c && c < 0xc0)
 440     {
 441       wc |= (c & 0x3f) << (6 * count);
 442       if (!count)
 443       {
 444         if (pwc)
 445           *pwc = wc;
 446         *ps = 0;
 447         return wc ? k : 0;
 448       }
 449       --count, --wc;
 450       if (!(wc >> (11+count*5)))
 451       {
 452         errno = count < 4 ? EILSEQ : EINVAL;
 453         return (size_t)-1;
 454       }
 455     }
 456     else
 457     {
 458       errno = EILSEQ;
 459       return (size_t)-1;
 460     }
 461   }
 462   *ps = wc;
 463   return (size_t)-2;
 464 }
 465
 466 #endif /* !HAVE_WC_FUNCS */
 467
 468 wchar_t replacement_char ()
 469 {
 470   return Charset_is_utf8 ? 0xfffd : '?';
 471 }