2 * Copyright notice from original mutt:
3 * Copyright (C) 2000 Edmund Grimley Evans <edmundo@rano.org>
5 * This file is part of mutt-ng, see http://www.muttng.org/.
6 * It's licensed under the GNU General Public License,
7 * please see the file GPL in the top level source directory.
11 * Japanese support by TAKIZAWA Takashi <taki@luna.email.ne.jp>.
/* Flag tested by the wide-character helpers in this file to select their
 * UCS code paths; starts out as 0. */
32 int Charset_is_utf8 = 0;
/* File-local companion flag, tested together with Charset_is_utf8 by the
 * helpers in this file; starts out as 0. */
35 static int charset_is_ja = 0;
/* iconv descriptors for conversions to and from UTF-8.  The value
 * (iconv_t) (-1) is used throughout this file to mean "not open". */
36 static iconv_t charset_to_utf8 = (iconv_t) (-1);
37 static iconv_t charset_from_utf8 = (iconv_t) (-1);
/*
 * mutt_set_charset: make `charset` the charset used by this file.
 * The visible lines canonicalise the name into `buffer`, close any iconv
 * descriptors that are still open, and open fresh ones when the
 * canonical name is one of the listed Japanese charsets.
 * NOTE(review): this excerpt omits several lines of the function (local
 * declarations, braces, flag assignments); the comments below describe
 * the visible lines only.
 */
40 void mutt_set_charset (char *charset)
44 mutt_canonical_charset (buffer, sizeof (buffer), charset);
/* Release descriptors that are still open and mark them closed again. */
49 if (charset_to_utf8 != (iconv_t) (-1)) {
50 iconv_close (charset_to_utf8);
51 charset_to_utf8 = (iconv_t) (-1);
53 if (charset_from_utf8 != (iconv_t) (-1)) {
54 iconv_close (charset_from_utf8);
55 charset_from_utf8 = (iconv_t) (-1);
/* Case-sensitive match on the canonical name; presumably
 * mutt_canonical_charset produces a lower-case name -- TODO confirm. */
59 if (!strcmp (buffer, "utf-8"))
/* Japanese charsets, matched case-insensitively. */
62 else if (!ascii_strcasecmp (buffer, "euc-jp")
63 || !ascii_strcasecmp (buffer, "shift_jis")
64 || !ascii_strcasecmp (buffer, "cp932")
65 || !ascii_strcasecmp (buffer, "eucJP-ms")) {
/* Note: the descriptors are opened with the caller's `charset` string,
 * not with the canonicalised `buffer`. */
67 charset_to_utf8 = iconv_open ("UTF-8", charset);
68 charset_from_utf8 = iconv_open (charset, "UTF-8");
/* Where available, pass the canonical name to bind_textdomain_codeset. */
72 #ifdef HAVE_BIND_TEXTDOMAIN_CODESET
73 bind_textdomain_codeset (PACKAGE, buffer);
80 * For systems that don't have them, we provide here our own
81 * implementations of wcrtomb(), mbrtowc(), iswprint() and wcwidth().
82 * Instead of using the locale, as these functions normally would,
83 * we use Mutt's Charset variable. We support 3 types of charset:
84 * (1) For 8-bit charsets, wchar_t uses the same encoding as char.
85 * (2) For UTF-8, wchar_t uses UCS.
86 * (3) For stateless Japanese encodings, we use UCS and convert
87 * via UTF-8 using iconv.
88 * Unfortunately, we can't handle non-stateless encodings.
/*
 * wcrtomb_iconv: convert wide character `wc` through iconv descriptor `cd`.
 * `wc` is first encoded with mutt_wctoutf8 into `buf`; the result is then
 * fed to iconv.  Returns (size_t) (-1) when mutt_wctoutf8 fails.
 * NOTE(review): buffer setup, the handling of iconv's result and the
 * normal return value are not visible in this excerpt.
 */
91 static size_t wcrtomb_iconv (char *s, wchar_t wc, iconv_t cd)
99 ibl = mutt_wctoutf8 (buf, wc);
100 if (ibl == (size_t) (-1))
101 return (size_t) (-1);
/* Two iconv calls with identical argument expressions are visible; the
 * code between them is not shown here. */
105 r = iconv (cd, &ib, &ibl, &ob, &obl);
112 r = iconv (cd, &ib, &ibl, &ob, &obl);
/*
 * wcrtomb: replacement for the C library function, driven by this file's
 * charset state instead of the locale.  `ps` is ignored.
 * Visible paths: one returns mutt_wctoutf8 (s, wc) (its guarding condition
 * is not visible -- presumably the UTF-8 case, TODO confirm), one
 * delegates to wcrtomb_iconv when charset_from_utf8 is open, and one
 * returns (size_t) (-1).
 */
117 size_t wcrtomb (char *s, wchar_t wc, mbstate_t * ps)
119 /* We only handle stateless encodings, so we can ignore ps. */
122 return mutt_wctoutf8 (s, wc);
123 else if (charset_from_utf8 != (iconv_t) (-1))
124 return wcrtomb_iconv (s, wc, charset_from_utf8);
133 return (size_t) (-1);
/*
 * mbrtowc_iconv: decode one multibyte character from `s` (at most `n`
 * bytes) by converting it through iconv descriptor `cd` into `bufo` and
 * then decoding that with utf8rtowc into *pwc.
 * Bytes of an incomplete character are kept in *ps between calls.
 * Visible return values: the number of bytes consumed from `s` (or 0, see
 * below) on success, (size_t) (-2) and (size_t) (-1) on the other paths.
 * NOTE(review): many lines of this function are not visible in this
 * excerpt (loop structure, pointer/length setup, several branch bodies);
 * the comments below describe the visible lines only.
 */
137 size_t mbrtowc_iconv (wchar_t * pwc, const char *s, size_t n,
138 mbstate_t * ps, iconv_t cd)
140 static mbstate_t mbstate;
141 ICONV_CONST char *ib, *ibmax;
143 size_t ibl, obl, k, r;
144 char bufi[8], bufo[6];
147 return (size_t) (-2);
/* k = number of bytes saved in *ps: the offset of its first zero byte,
 * or sizeof (*ps) when it contains none. */
149 t = memchr (ps, 0, sizeof (*ps));
150 k = t ? (t - (char *) ps) : sizeof (*ps);
151 if (k > sizeof (bufi))
154 /* use the buffer for input */
155 memcpy (bufi, ps, k);
/* Append as much of `s` as fits: saved plus new bytes are capped at
 * sizeof (bufi). */
157 ibmax = bufi + (k + n < sizeof (bufi) ? k + n : sizeof (bufi));
158 memcpy (bufi + k, s, ibmax - bufi - k);
161 /* use the real input */
171 r = iconv (cd, &ib, &ibl, &ob, &obl);
/* Output was produced and, when saved bytes were in play, input beyond
 * those saved bytes was consumed. */
172 if (ob > bufo && (!k || ib > bufi + k)) {
173 /* we have a character */
174 memset (ps, 0, sizeof (*ps));
175 utf8rtowc (pwc, bufo, ob - bufo, &mbstate);
/* Bytes consumed from `s` (the k saved bytes are not counted); 0 when
 * pwc is NULL or the decoded character is the null wide character. */
176 return (pwc && *pwc) ? (ib - (k ? bufi + k : s)) : 0;
/* iconv returned 0, or failed with EINVAL (POSIX: incomplete input). */
178 else if (!r || (r == (size_t) (-1) && errno == EINVAL)) {
179 if (ib + ibl < ibmax)
180 /* try using more input */
182 else if (k && ib > bufi + k && bufi + k + n > ibmax) {
183 /* switch to using real input */
184 ib = s + (ib - bufi - k);
190 /* save the state and give up */
191 memset (ps, 0, sizeof (*ps));
192 if (ibl <= sizeof (mbstate_t)) /* need extra condition here! */
193 memcpy (ps, ib, ibl);
194 return (size_t) (-2);
200 return (size_t) (-1);
/*
 * mbrtowc: replacement for the C library function, driven by this file's
 * charset state instead of the locale.
 * Visible paths: one returns utf8rtowc (pwc, s, n, ps) (its guarding
 * condition is not visible -- presumably the UTF-8 case, TODO confirm),
 * one delegates to mbrtowc_iconv when charset_to_utf8 is open, and the
 * remaining visible lines clear *ps and store the single byte *s,
 * widened via unsigned char, into *pwc.
 * NOTE(review): NULL handling for `s`/`ps`/`pwc` and the return value of
 * the last path are not visible in this excerpt.
 */
205 size_t mbrtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * ps)
207 static mbstate_t mbstate;
213 return utf8rtowc (pwc, s, n, ps);
214 else if (charset_to_utf8 != (iconv_t) (-1))
215 return mbrtowc_iconv (pwc, s, n, ps, charset_to_utf8);
218 memset (ps, 0, sizeof (*ps));
224 *pwc = (wchar_t) (unsigned char) *s;
/*
 * iswprint: replacement for the C library function.
 * With Charset_is_utf8 or charset_is_ja set, `wc` counts as printable when
 * it lies in [0x20, 0x7f) or is >= 0xa0.  Otherwise values 0..255 are
 * passed to IsPrint and everything else yields 0.
 */
229 int iswprint (wint_t wc)
231 if (Charset_is_utf8 || charset_is_ja)
232 return ((0x20 <= wc && wc < 0x7f) || 0xa0 <= wc);
234 return (0 <= wc && wc < 256) ? IsPrint (wc) : 0;
/*
 * iswspace: replacement for the C library function.
 * With Charset_is_utf8 or charset_is_ja set, only code points 9..13 and 32
 * count as white space.  Otherwise values 0..255 are passed to isspace
 * and everything else yields 0.
 */
237 int iswspace (wint_t wc)
239 if (Charset_is_utf8 || charset_is_ja)
240 return (9 <= wc && wc <= 13) || wc == 32;
242 return (0 <= wc && wc < 256) ? isspace (wc) : 0;
/*
 * towupper_ucs: upper-case mapping for UCS code points.
 * NOTE(review): only the range tests are visible in this excerpt; the
 * values returned for each range are not shown.
 */
245 static wint_t towupper_ucs (wint_t x)
247 /* Only works for x < 0x130 */
/* Selects 0x61..0x7a and 0xe0..0xfe, with 0xf7 excluded. */
248 if ((0x60 < x && x < 0x7b) || (0xe0 <= x && x < 0xff && x != 0xf7))
/* Selects 0x100..0x12f. */
250 else if (0x100 <= x && x < 0x130)
/*
 * towlower_ucs: lower-case mapping for UCS code points.
 * NOTE(review): only the range tests are visible in this excerpt; the
 * values returned for each range are not shown.
 */
260 static wint_t towlower_ucs (wint_t x)
262 /* Only works for x < 0x130 */
/* Selects 0x41..0x5a and 0xc0..0xde, with 0xd7 excluded. */
263 if ((0x40 < x && x < 0x5b) || (0xc0 <= x && x < 0xdf && x != 0xd7))
/* Selects 0x100..0x12f. */
265 else if (0x100 <= x && x < 0x130)
/*
 * iswalnum_ucs: alphanumeric test for UCS code points.
 * NOTE(review): the conditions that choose between the return statements
 * below are not visible in this excerpt.
 */
271 static int iswalnum_ucs (wint_t wc)
273 /* Only works for x < 0x220 */
/* True for 0x41..0x5a once bit 0x20 is cleared, i.e. 0x41..0x5a and
 * 0x61..0x7a. */
281 return (0x40 < (wc & ~0x20) && (wc & ~0x20) < 0x5b);
/* True only for the three code points 0xaa, 0xb5 and 0xba. */
283 return (wc == 0xaa || wc == 0xb5 || wc == 0xba);
/* True for everything except 0xd7 and 0xf7. */
285 return !(wc == 0xd7 || wc == 0xf7);
/*
 * towupper: replacement for the C library function.
 * With Charset_is_utf8 or charset_is_ja set, delegates to towupper_ucs.
 * Otherwise values 0..255 go through toupper and anything else is
 * returned unchanged.
 */
288 wint_t towupper (wint_t wc)
290 if (Charset_is_utf8 || charset_is_ja)
291 return towupper_ucs (wc);
293 return (0 <= wc && wc < 256) ? toupper (wc) : wc;
/*
 * towlower: replacement for the C library function.
 * With Charset_is_utf8 or charset_is_ja set, delegates to towlower_ucs.
 * Otherwise values 0..255 go through tolower and anything else is
 * returned unchanged.
 */
296 wint_t towlower (wint_t wc)
298 if (Charset_is_utf8 || charset_is_ja)
299 return towlower_ucs (wc);
301 return (0 <= wc && wc < 256) ? tolower (wc) : wc;
/*
 * iswalnum: replacement for the C library function.
 * With Charset_is_utf8 or charset_is_ja set, delegates to iswalnum_ucs.
 * Otherwise values 0..255 go through isalnum and anything else yields 0.
 */
304 int iswalnum (wint_t wc)
306 if (Charset_is_utf8 || charset_is_ja)
307 return iswalnum_ucs (wc);
309 return (0 <= wc && wc < 256) ? isalnum (wc) : 0;
314 * Symbols, Greek and Cyrillic in JIS X 0208, Japanese Kanji
315 * Character Set, have a column width of 2.
/*
 * wcwidth_ja: column-width override for Japanese charsets.
 * A return of -1 tells the caller to continue with its normal width check.
 * NOTE(review): the condition guarding the -1 return and the result of
 * the range check below are not visible in this excerpt; presumably the
 * ranges yield a width of 2 -- TODO confirm.
 */
317 int wcwidth_ja (wchar_t ucs)
320 return -1; /* continue with the normal check */
321 /* a rough range for quick check */
322 if ((ucs >= 0x00a1 && ucs <= 0x00fe) || /* Latin-1 Supplement */
323 (ucs >= 0x0391 && ucs <= 0x0451) || /* Greek and Cyrillic */
324 (ucs >= 0x2010 && ucs <= 0x266f) || /* Symbols */
325 (ucs >= 0x3000 && ucs <= 0x3020)) /* CJK Symbols and Punctuation */
/* Declared here and called by wcwidth below; no definition is visible in
 * this excerpt. */
331 int wcwidth_ucs (wchar_t ucs);
/*
 * wcwidth: replacement for the C library function.
 * When Charset_is_utf8 is not set, the visible lines distinguish the
 * non-Japanese case (which tests IsPrint for values 0..255) from the
 * Japanese case (which consults wcwidth_ja first).  A call to wcwidth_ucs
 * is the last visible return.
 * NOTE(review): the widths returned by the individual branches and the
 * way `k` is used are not visible in this excerpt.
 */
333 int wcwidth (wchar_t wc)
335 if (!Charset_is_utf8) {
336 if (!charset_is_ja) {
340 else if ((0 <= wc && wc < 256) && IsPrint (wc))
347 int k = wcwidth_ja (wc);
353 return wcwidth_ucs (wc);
/*
 * utf8rtowc: decode UTF-8 from `s` (at most `n` bytes) into a wide
 * character.  The conversion state *_ps is accessed as a wchar_t.
 * NOTE(review): large parts of this function are not visible in this
 * excerpt (NULL handling, the lead-byte tests, the return statements);
 * the comments below describe the visible lines only.
 */
356 size_t utf8rtowc (wchar_t * pwc, const char *s, size_t n, mbstate_t * _ps)
358 static wchar_t mbstate;
359 wchar_t *ps = (wchar_t *) _ps;
376 c = (unsigned char) *s;
/* Lead byte: put its payload bits at their final position in wc and set
 * `count`, which the loop below uses as the shift index (6 * count) for
 * the next continuation byte.  `count` is also added into wc; at these
 * values (0..4) it fits in the low three bits. */
387 wc = ((c & 0x1f) << 6) + (count = 0);
389 wc = ((c & 0x0f) << 12) + (count = 1);
391 wc = ((c & 0x07) << 18) + (count = 2);
393 wc = ((c & 0x03) << 24) + (count = 3);
395 wc = ((c & 0x01) << 30) + (count = 4);
/* Resume from saved state: the low three bits of the saved value give
 * `count`. */
403 wc = *ps & 0x7fffffff;
404 count = wc & 7; /* if count > 4 it will be caught below */
407 for (; n; ++s, --n, ++k) {
408 c = (unsigned char) *s;
/* Continuation byte (0x80..0xbf): merge its six payload bits. */
409 if (0x80 <= c && c < 0xc0) {
410 wc |= (c & 0x3f) << (6 * count);
/* NOTE(review): looks like a rejection of values too small for the
 * sequence length (overlong forms) -- confirm against the full source. */
418 if (!(wc >> (11 + count * 5))) {
419 errno = count < 4 ? EILSEQ : EINVAL;
432 #endif /* !HAVE_WC_FUNCS */
/*
 * replacement_char: returns 0xfffd when Charset_is_utf8 is set and '?'
 * otherwise.
 */
434 wchar_t replacement_char ()
436 return Charset_is_utf8 ? 0xfffd : '?';