mbyte.c

   1 /*
   2  * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
   3  *
   4  *     This program is free software; you can redistribute it and/or modify
   5  *     it under the terms of the GNU General Public License as published by
   6  *     the Free Software Foundation; either version 2 of the License, or
   7  *     (at your option) any later version.
   8  *
   9  *     This program is distributed in the hope that it will be useful,
  10  *     but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  *     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  *     GNU General Public License for more details.
  13  *
  14  *     You should have received a copy of the GNU General Public License
  15  *     along with this program; if not, write to the Free Software
  16  *     Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111, USA.
  17  */
  18
  19 /*
  20  * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
  21  */
  22
  23 #include "mutt.h"
  24 #include "mbyte.h"
  25 #include "charset.h"
  26
  27 #include <errno.h>
  28
  29 #include <ctype.h>
  30
  31 #ifndef EILSEQ
  32 #define EILSEQ EINVAL
  33 #endif
  34
  35 int Charset_is_utf8 = 0;
  36 #ifndef HAVE_WC_FUNCS
  37 static int charset_is_ja = 0;
  38 static iconv_t charset_to_utf8 = (iconv_t)(-1);
  39 static iconv_t charset_from_utf8 = (iconv_t)(-1);
  40 #endif
  41
  42 void mutt_set_charset (char *charset)
  43 {
  44   char buffer[STRING];
  45
  46   mutt_canonical_charset (buffer, sizeof (buffer), charset);
  47
  48   Charset_is_utf8 = 0;
  49 #ifndef HAVE_WC_FUNCS
  50   charset_is_ja = 0;
  51   if (charset_to_utf8 != (iconv_t)(-1))
  52   {
  53     iconv_close (charset_to_utf8);
  54     charset_to_utf8 = (iconv_t)(-1);
  55   }
  56   if (charset_from_utf8 != (iconv_t)(-1))
  57   {
  58     iconv_close (charset_from_utf8);
  59     charset_from_utf8 = (iconv_t)(-1);
  60   }
  61 #endif
  62
  63   if (!strcmp(buffer, "utf-8"))
  64     Charset_is_utf8 = 1;
  65 #ifndef HAVE_WC_FUNCS
  66   else if (!strcmp(buffer, "euc-jp") || !strcmp(buffer, "shift_jis")
  67         || !strcmp(buffer, "cp932"))
  68   {
  69     charset_is_ja = 1;
  70     charset_to_utf8 = iconv_open ("UTF-8", charset);
  71     charset_from_utf8 = iconv_open (charset, "UTF-8");
  72   }
  73 #endif
  74
  75 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
  76   bind_textdomain_codeset(PACKAGE, buffer);
  77 #endif
  78 }
  79
  80 #ifndef HAVE_WC_FUNCS
  81
  82 /*
  83  * For systems that don't have them, we provide here our own
  84  * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
  85  * Instead of using the locale, as these functions normally would,
  86  * we use Mutt's Charset variable. We support 3 types of charset:
  87  * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
  88  * (2) For UTF-8, wchar_t uses UCS.
  89  * (3) For stateless Japanese encodings, we use UCS and convert
  90  *     via UTF-8 using iconv.
  91  * Unfortunately, we can't handle non-stateless encodings.
  92  */
  93
  94 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
  95 {
  96   char buf[MB_LEN_MAX];
  97   ICONV_CONST char *ib;
  98   char *ob;
  99   size_t ibl, obl, r;
 100
 101   if (s)
 102   {
 103     ibl = mutt_wctoutf8 (buf, wc);
 104     if (ibl == (size_t)(-1))
 105       return (size_t)(-1);
 106     ib = buf;
 107     ob = s;
 108     obl = MB_LEN_MAX;
 109     r = iconv (cd, &ib, &ibl, &ob, &obl);
 110   }
 111   else
 112   {
 113     ib = "";
 114     ibl = 1;
 115     ob = buf;
 116     obl = sizeof (buf);
 117     r = iconv (cd, &ib, &ibl, &ob, &obl);
 118   }
 119   return ob - s;
 120 }
 121
 122 size_t wcrtomb (char *s, wchar_t wc, mbstate_t *ps)
 123 {
 124   /* We only handle stateless encodings, so we can ignore ps. */
 125
 126   if (Charset_is_utf8)
 127     return mutt_wctoutf8 (s, wc);
 128   else if (charset_from_utf8 != (iconv_t)(-1))
 129     return wcrtomb_iconv (s, wc, charset_from_utf8);
 130   else
 131   {
 132     if (!s)
 133       return 1;
 134     if (wc < 0x100)
 135     {
 136       *s = wc;
 137       return 1;
 138     }
 139     errno = EILSEQ;
 140     return (size_t)(-1);
 141   }
 142 }
 143
 144 size_t mbrtowc_iconv (wchar_t *pwc, const char *s, size_t n,
 145                       mbstate_t *ps, iconv_t cd)
 146 {
 147   static mbstate_t mbstate;
 148   ICONV_CONST char *ib, *ibmax;
 149   char *ob, *t;
 150   size_t ibl, obl, k, r;
 151   char bufi[8], bufo[6];
 152
 153   if (!n)
 154     return (size_t)(-2);
 155
 156   t = memchr (ps, 0, sizeof (*ps));
 157   k = t ? (t - (char *)ps) : sizeof (*ps);
 158   if (k > sizeof (bufi))
 159     k = 0;
 160   if (k)
 161   {
 162     /* use the buffer for input */
 163     memcpy (bufi, ps, k);
 164     ib = bufi;
 165     ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
 166     memcpy (bufi + k, s, ibmax - bufi - k);
 167   }
 168   else
 169   {
 170     /* use the real input */
 171     ib = s;
 172     ibmax = s + n;
 173   }
 174
 175   ob = bufo;
 176   obl = sizeof (bufo);
 177   ibl = 1;
 178
 179   for (;;)
 180   {
 181     r = iconv (cd, &ib, &ibl, &ob, &obl);
 182     if (ob > bufo && (!k || ib > bufi + k))
 183     {
 184       /* we have a character */
 185       memset (ps, 0, sizeof (*ps));
 186       utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
 187       return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
 188     }
 189     else if (!r || (r == (size_t)(-1) && errno == EINVAL))
 190     {
 191       if (ib + ibl < ibmax)
 192         /* try using more input */
 193         ++ibl;
 194       else if (k && ib > bufi + k && bufi + k + n > ibmax)
 195       {
 196         /* switch to using real input */
 197         ib = s + (ib - bufi - k);
 198         ibmax = s + n;
 199         k = 0;
 200         ++ibl;
 201       }
 202       else
 203       {
 204         /* save the state and give up */
 205         memset (ps, 0, sizeof (*ps));
 206         if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
 207           memcpy (ps, ib, ibl);
 208         return (size_t)(-2);
 209       }
 210     }
 211     else
 212     {
 213       /* bad input */
 214       errno = EILSEQ;
 215       return (size_t)(-1);
 216     }
 217   }
 218 }
 219
 220 size_t mbrtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *ps)
 221 {
 222   static mbstate_t mbstate;
 223
 224   if (!ps)
 225     ps = &mbstate;
 226
 227   if (Charset_is_utf8)
 228     return utf8rtowc (pwc, s, n, ps);
 229   else if (charset_to_utf8 != (iconv_t)(-1))
 230     return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
 231   else
 232   {
 233     if (!s)
 234     {
 235       memset(ps, 0, sizeof(*ps));
 236       return 0;
 237     }
 238     if (!n)
 239       return (size_t)-2;
 240     if (pwc)
 241       *pwc = (wchar_t)(unsigned char)*s;
 242     return (*s != 0);
 243   }
 244 }
 245
 246 int iswprint (wint_t wc)
 247 {
 248   if (Charset_is_utf8 || charset_is_ja)
 249     return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
 250   else
 251     return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
 252 }
 253
 254 int iswspace (wint_t wc)
 255 {
 256   if (Charset_is_utf8 || charset_is_ja)
 257     return (9 <= wc && wc <= 13) || wc == 32;
 258   else
 259     return (0 <= wc && wc < 256) ? isspace (wc) : 0;
 260 }
 261
 262 static wint_t towupper_ucs (wint_t x)
 263 {
 264   /* Only works for x < 0x130 */
 265   if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
 266     return x - 32;
 267   else if (0x100 <= x && x < 0x130)
 268     return x & ~1;
 269   else if (x == 0xb5)
 270     return 0x39c;
 271   else if (x == 0xff)
 272     return 0x178;
 273   else
 274     return x;
 275 }
 276
 277 static wint_t towlower_ucs (wint_t x)
 278 {
 279   /* Only works for x < 0x130 */
 280   if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
 281     return x + 32;
 282   else if (0x100 <= x && x < 0x130)
 283     return x | 1;
 284   else
 285     return x;
 286 }
 287
 288 static int iswalnum_ucs (wint_t wc)
 289 {
 290   /* Only works for x < 0x220 */
 291   if (wc >= 0x100)
 292     return 1;
 293   else if (wc < 0x30)
 294     return 0;
 295   else if (wc < 0x3a)
 296     return 1;
 297   else if (wc < 0xa0)
 298     return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
 299   else if (wc < 0xc0)
 300     return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
 301   else
 302     return !(wc == 0xd7 || wc == 0xf7);
 303 }
 304
 305 wint_t towupper (wint_t wc)
 306 {
 307   if (Charset_is_utf8 || charset_is_ja)
 308     return towupper_ucs (wc);
 309   else
 310     return (0 <= wc && wc < 256) ? toupper (wc) : wc;
 311 }
 312
 313 wint_t towlower (wint_t wc)
 314 {
 315   if (Charset_is_utf8 || charset_is_ja)
 316     return towlower_ucs (wc);
 317   else
 318     return (0 <= wc && wc < 256) ? tolower (wc) : wc;
 319 }
 320
 321 int iswalnum (wint_t wc)
 322 {
 323   if (Charset_is_utf8 || charset_is_ja)
 324     return iswalnum_ucs (wc);
 325   else
 326     return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
 327 }
 328
 329 /*
 330  * l10n for Japanese:
 331  *   Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
 332  *   Character Set, have a column width of 2.
 333  */
 334 int wcwidth_ja (wchar_t ucs)
 335 {
 336   if (ucs >= 0x3021)
 337     return -1; /* continue with the normal check */
 338   /* a rough range for quick check */
 339   if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */
 340       (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */
 341       (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */
 342       (ucs >= 0x3000 && ucs <= 0x3020))   /* CJK Symbols and Punctuation */
 343     return 2;
 344   else
 345     return -1;
 346 }
 347
 348 int wcwidth_ucs(wchar_t ucs);
 349
 350 int wcwidth (wchar_t wc)
 351 {
 352   if (!Charset_is_utf8)
 353   {
 354     if (!charset_is_ja)
 355     {
 356       /* 8-bit case */
 357       if (!wc)
 358         return 0;
 359       else if ((0 <= wc && wc < 256) && IsPrint (wc))
 360         return 1;
 361       else
 362         return -1;
 363     }
 364     else
 365     {
 366       /* Japanese */
 367       int k = wcwidth_ja (wc);
 368       if (k != -1)
 369         return k;
 370     }
 371   }
 372   return wcwidth_ucs (wc);
 373 }
 374
 375 size_t utf8rtowc (wchar_t *pwc, const char *s, size_t n, mbstate_t *_ps)
 376 {
 377   static wchar_t mbstate;
 378   wchar_t *ps = (wchar_t *)_ps;
 379   size_t k = 1;
 380   unsigned char c;
 381   wchar_t wc;
 382   int count;
 383
 384   if (!ps)
 385     ps = &mbstate;
 386
 387   if (!s)
 388   {
 389     *ps = 0;
 390     return 0;
 391   }
 392   if (!n)
 393     return (size_t)-2;
 394
 395   if (!*ps)
 396   {
 397     c = (unsigned char)*s;
 398     if (c < 0x80)
 399     {
 400       if (pwc)
 401         *pwc = c;
 402       return (c != 0);
 403     }
 404     else if (c < 0xc2)
 405     {
 406       errno = EILSEQ;
 407       return (size_t)-1;
 408     }
 409     else if (c < 0xe0)
 410       wc = ((c & 0x1f) << 6) + (count = 0);
 411     else if (c < 0xf0)
 412       wc = ((c & 0x0f) << 12) + (count = 1);
 413     else if (c < 0xf8)
 414       wc = ((c & 0x07) << 18) + (count = 2);
 415     else if (c < 0xfc)
 416       wc = ((c & 0x03) << 24) + (count = 3);
 417     else if (c < 0xfe)
 418       wc = ((c & 0x01) << 30) + (count = 4);
 419     else
 420     {
 421       errno = EILSEQ;
 422       return (size_t)-1;
 423     }
 424     ++s, --n, ++k;
 425   }
 426   else
 427   {
 428     wc = *ps & 0x7fffffff;
 429     count = wc & 7; /* if count > 4 it will be caught below */
 430   }
 431
 432   for (; n; ++s, --n, ++k)
 433   {
 434     c = (unsigned char)*s;
 435     if (0x80 <= c && c < 0xc0)
 436     {
 437       wc |= (c & 0x3f) << (6 * count);
 438       if (!count)
 439       {
 440         if (pwc)
 441           *pwc = wc;
 442         *ps = 0;
 443         return wc ? k : 0;
 444       }
 445       --count, --wc;
 446       if (!(wc >> (11+count*5)))
 447       {
 448         errno = count < 4 ? EILSEQ : EINVAL;
 449         return (size_t)-1;
 450       }
 451     }
 452     else
 453     {
 454       errno = EILSEQ;
 455       return (size_t)-1;
 456     }
 457   }
 458   *ps = wc;
 459   return (size_t)-2;
 460 }
 461
 462 #endif /* !HAVE_WC_FUNCS */
 463
 464 wchar_t replacement_char ()
 465 {
 466   return Charset_is_utf8 ? 0xfffd : '?';
 467 }