X-Git-Url: http://git.madism.org/?p=apps%2Fmadmutt.git;a=blobdiff_plain;f=charset.c;h=9a80d319366dacc7f3a3795063a9449366096760;hp=3fd46286b3541cffb5a07bd9e8b5b6f445bd29fd;hb=fdb93a08e305b8755260144807e4d45106a9cb9f;hpb=df70e07e24add1869bcc9b7af2277d9d0c09a281 diff --git a/charset.c b/charset.c index 3fd4628..9a80d31 100644 --- a/charset.c +++ b/charset.c @@ -1,22 +1,28 @@ /* - * Copyright (C) 1999-2000 Thomas Roessler + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. * - * This program is free software; you can redistribute it - * and/or modify it under the terms of the GNU General Public - * License as published by the Free Software Foundation; either - * version 2 of the License, or (at your option) any later - * version. + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. * - * This program is distributed in the hope that it will be - * useful, but WITHOUT ANY WARRANTY; without even the implied - * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR - * PURPOSE. See the GNU General Public License for more - * details. + * Copyright © 2006 Pierre Habouzit + */ +/* + * Copyright notice from original mutt: + * Copyright (C) 1999-2000 Thomas Roessler * - * You should have received a copy of the GNU General Public - * License along with this program; if not, write to the Free - * Software Foundation, Inc., 59 Temple Place - Suite 330, - * Boston, MA 02111, USA. + * This file is part of mutt-ng, see http://www.muttng.org/. + * It's licensed under the GNU General Public License, + * please see the file GPL in the top level source directory. */ #if HAVE_CONFIG_H @@ -33,253 +39,98 @@ #include #include #include +#ifdef HAVE_LANGINFO_CODESET +# include +#endif + +#include +#include +#include +#include #include "mutt.h" #include "charset.h" #ifndef EILSEQ -# define EILSEQ EINVAL +# define EILSEQ EINVAL #endif -/* - * The following list has been created manually from the data under: - * http://www.isi.edu/in-notes/iana/assignments/character-sets - * Last update: 2000-09-07 - * - * Note that it includes only the subset of character sets for which - * a preferred MIME name is given. - */ - -static struct { - char *key; - char *pref; -} PreferredMIMENames[] = { - { - "ansi_x3.4-1968", "us-ascii"}, { - "iso-ir-6", "us-ascii"}, { - "iso_646.irv:1991", "us-ascii"}, { - "ascii", "us-ascii"}, { - "iso646-us", "us-ascii"}, { - "us", "us-ascii"}, { - "ibm367", "us-ascii"}, { - "cp367", "us-ascii"}, { - "csASCII", "us-ascii"}, { - "csISO2022KR", "iso-2022-kr"}, { - "csEUCKR", "euc-kr"}, { - "csISO2022JP", "iso-2022-jp"}, { - "csISO2022JP2", "iso-2022-jp-2"}, { - "ISO_8859-1:1987", "iso-8859-1"}, { - "iso-ir-100", "iso-8859-1"}, { - "iso_8859-1", "iso-8859-1"}, { - "latin1", "iso-8859-1"}, { - "l1", "iso-8859-1"}, { - "IBM819", "iso-8859-1"}, { - "CP819", "iso-8859-1"}, { - "csISOLatin1", "iso-8859-1"}, { - "ISO_8859-2:1987", "iso-8859-2"}, { - "iso-ir-101", "iso-8859-2"}, { - "iso_8859-2", "iso-8859-2"}, { - "latin2", "iso-8859-2"}, { - "l2", "iso-8859-2"}, { - "csISOLatin2", "iso-8859-2"}, { - "ISO_8859-3:1988", "iso-8859-3"}, { - "iso-ir-109", "iso-8859-3"}, { - "ISO_8859-3", "iso-8859-3"}, { - "latin3", "iso-8859-3"}, { - "l3", "iso-8859-3"}, { - "csISOLatin3", "iso-8859-3"}, { - "ISO_8859-4:1988", "iso-8859-4"}, { - "iso-ir-110", "iso-8859-4"}, { - "ISO_8859-4", "iso-8859-4"}, { - "latin4", "iso-8859-4"}, { - "l4", "iso-8859-4"}, { - "csISOLatin4", "iso-8859-4"}, { - "ISO_8859-6:1987", "iso-8859-6"}, { - "iso-ir-127", "iso-8859-6"}, { - "iso_8859-6", "iso-8859-6"}, { - "ECMA-114", "iso-8859-6"}, { - "ASMO-708", "iso-8859-6"}, { - "arabic", "iso-8859-6"}, { - "csISOLatinArabic", "iso-8859-6"}, { - "ISO_8859-7:1987", "iso-8859-7"}, { - "iso-ir-126", "iso-8859-7"}, { - "ISO_8859-7", "iso-8859-7"}, { - "ELOT_928", "iso-8859-7"}, { - "ECMA-118", "iso-8859-7"}, { - "greek", "iso-8859-7"}, { - "greek8", "iso-8859-7"}, { - "csISOLatinGreek", "iso-8859-7"}, { - "ISO_8859-8:1988", "iso-8859-8"}, { - "iso-ir-138", "iso-8859-8"}, { - "ISO_8859-8", "iso-8859-8"}, { - "hebrew", "iso-8859-8"}, { - "csISOLatinHebrew", "iso-8859-8"}, { - "ISO_8859-5:1988", "iso-8859-5"}, { - "iso-ir-144", "iso-8859-5"}, { - "ISO_8859-5", "iso-8859-5"}, { - "cyrillic", "iso-8859-5"}, { - "csISOLatinCyrillic", "iso8859-5"}, { - "ISO_8859-9:1989", "iso-8859-9"}, { - "iso-ir-148", "iso-8859-9"}, { - "ISO_8859-9", "iso-8859-9"}, { - "latin5", "iso-8859-9"}, /* this is not a bug */ - { - "l5", "iso-8859-9"}, { - "csISOLatin5", "iso-8859-9"}, { - "ISO_8859-10:1992", "iso-8859-10"}, { - "iso-ir-157", "iso-8859-10"}, { - "latin6", "iso-8859-10"}, /* this is not a bug */ - { - "l6", "iso-8859-10"}, { - "csISOLatin6" "iso-8859-10"}, { - "csKOI8r", "koi8-r"}, { - "MS_Kanji", "Shift_JIS"}, /* Note the underscore! */ - { - "csShiftJis", "Shift_JIS"}, { - "Extended_UNIX_Code_Packed_Format_for_Japanese", "EUC-JP"}, { - "csEUCPkdFmtJapanese", "EUC-JP"}, { - "csGB2312", "gb2312"}, { - "csbig5", "big5"}, - /* - * End of official brain damage. What follows has been taken - * from glibc's localedata files. - */ - { - "iso_8859-13", "iso-8859-13"}, { - "iso-ir-179", "iso-8859-13"}, { - "latin7", "iso-8859-13"}, /* this is not a bug */ - { - "l7", "iso-8859-13"}, { - "iso_8859-14", "iso-8859-14"}, { - "latin8", "iso-8859-14"}, /* this is not a bug */ - { - "l8", "iso-8859-14"}, { - "iso_8859-15", "iso-8859-15"}, { - "latin9", "iso-8859-15"}, /* this is not a bug */ - /* Suggested by Ionel Mugurel Ciobica */ - { - "latin0", "iso-8859-15"}, /* this is not a bug */ - { - "iso_8859-16", "iso-8859-16"}, { - "latin10", "iso-8859-16"}, /* this is not a bug */ - /* - * David Champion has observed this with - * nl_langinfo under SunOS 5.8. - */ - { - "646", "us-ascii"}, - /* - * http://www.sun.com/software/white-papers/wp-unicode/ - */ - { - "eucJP", "euc-jp"}, { - "PCK", "Shift_JIS"}, { - "ko_KR-euc", "euc-kr"}, { - "zh_TW-big5", "big5"}, - /* seems to be common on some systems */ - { - "sjis", "Shift_JIS"}, { - "euc-jp-ms", "eucJP-ms"}, - /* - * If you happen to encounter system-specific brain-damage with - * respect to character set naming, please add it above this - * comment, and submit a patch to . - */ - /* End of aliases. Please keep this line last. */ - { - NULL, NULL} -}; - -#ifdef HAVE_LANGINFO_CODESET -# include - +char *Charset; +int Charset_is_utf8 = 0; +wchar_t CharsetReplacement = '?'; -void mutt_set_langinfo_charset (void) +void charset_initialize(void) { - char buff[LONG_STRING]; - char buff2[LONG_STRING]; - - strfcpy (buff, nl_langinfo (CODESET), sizeof (buff)); - mutt_canonical_charset (buff2, sizeof (buff2), buff); - - /* finally, set $charset */ - if (!(Charset = safe_strdup (buff2))) - Charset = safe_strdup ("iso-8859-1"); -} - -#else +#ifdef HAVE_LANGINFO_CODESET + char buff[LONG_STRING]; + char buff2[LONG_STRING]; -void mutt_set_langinfo_charset (void) -{ - Charset = safe_strdup ("iso-8859-1"); -} + m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET)); + charset_canonicalize(buff2, sizeof(buff2), buff); + /* finally, set $charset */ + if (!m_strisempty(buff2)) { + m_strreplace(&Charset, buff2); + } else #endif - -void mutt_canonical_charset (char *dest, size_t dlen, const char *name) -{ - size_t i; - char *p; - char scratch[LONG_STRING]; - - /* catch some common iso-8859-something misspellings */ - if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-') - snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 4); - else if (!ascii_strncasecmp (name, "8859-", 5)) - snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5); - else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-') - snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7); - else if (!ascii_strncasecmp (name, "iso8859-", 8)) - snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8); - else - strfcpy (scratch, NONULL (name), sizeof (scratch)); - - for (i = 0; PreferredMIMENames[i].key; i++) - if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) || - !mutt_strcasecmp (scratch, PreferredMIMENames[i].key)) { - strfcpy (dest, PreferredMIMENames[i].pref, dlen); - return; + { + m_strreplace(&Charset, "iso-8859-1"); } - strfcpy (dest, scratch, dlen); + Charset_is_utf8 = !strcmp(Charset, "utf-8"); + CharsetReplacement = Charset_is_utf8 ? 0xfffd : '?'; - /* for cosmetics' sake, transform to lowercase. */ - for (p = dest; *p; p++) - *p = ascii_tolower (*p); +#ifdef HAVE_BIND_TEXTDOMAIN_CODESET + bind_textdomain_codeset(PACKAGE, Charset); +#endif } -int mutt_chscmp (const char *s, const char *chs) +#include "charset.gperf" +void charset_canonicalize(char *dest, ssize_t dlen, const char *name) { - char buffer[STRING]; - - if (!s) - return 0; - - mutt_canonical_charset (buffer, sizeof (buffer), s); - return !ascii_strcasecmp (buffer, chs); + const struct cset_pair *cp; + char scratch[LONG_STRING]; + const char *p; + int i = 0; + + // canonize name: only keep a-z0-9 and dots, put into lowercase + for (p = name; *p && *p != ':' && i < ssizeof(scratch) - 1; p++) { + if (isalnum(*p) || *p== '.') { + scratch[i++] = tolower((unsigned char)*p); + } + } + scratch[i] = '\0'; + + cp = charset_canonicalize_aux(scratch, strlen(scratch)); + if (cp) { + m_strcpy(dest, dlen, cp->pref); + } else { + m_strcpy(dest, dlen, name); + m_strtolower(dest); + } } +static int mutt_chscmp(const char *s, const char *chs) +{ + char buffer[STRING]; -#ifndef HAVE_ICONV + if (!s) + return 0; -iconv_t iconv_open (const char *tocode, const char *fromcode) -{ - return (iconv_t) (-1); + charset_canonicalize(buffer, sizeof(buffer), s); + return !strcmp(buffer, chs); } -size_t iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft, - char **outbuf, size_t * outbytesleft) +int charset_is_utf8(const char *s) { - return 0; + return mutt_chscmp(s, "utf-8"); } -int iconv_close (iconv_t cd) +int charset_is_us_ascii(const char *s) { - return 0; + return mutt_chscmp(s, "us-ascii"); } -#endif /* !HAVE_ICONV */ - /* * Like iconv_open, but canonicalises the charsets @@ -294,17 +145,17 @@ iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags) iconv_t cd; - mutt_canonical_charset (tocode1, sizeof (tocode1), tocode); + charset_canonicalize (tocode1, sizeof (tocode1), tocode); #ifdef M_ICONV_HOOK_TO /* Not used. */ if ((flags & M_ICONV_HOOK_TO) && (tmp = mutt_charset_hook (tocode1))) - mutt_canonical_charset (tocode1, sizeof (tocode1), tmp); + charset_canonicalize (tocode1, sizeof (tocode1), tmp); #endif - mutt_canonical_charset (fromcode1, sizeof (fromcode1), fromcode); + charset_canonicalize (fromcode1, sizeof (fromcode1), fromcode); if ((flags & M_ICONV_HOOK_FROM) && (tmp = mutt_charset_hook (fromcode1))) - mutt_canonical_charset (fromcode1, sizeof (fromcode1), tmp); + charset_canonicalize (fromcode1, sizeof (fromcode1), tmp); if ((cd = iconv_open (tocode1, fromcode1)) != (iconv_t) - 1) return cd; @@ -322,32 +173,32 @@ iconv_t mutt_iconv_open (const char *tocode, const char *fromcode, int flags) * if you're supplying an outrepl, the target charset should be. */ -size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft, - char **outbuf, size_t * outbytesleft, - ICONV_CONST char **inrepls, const char *outrepl) +ssize_t mutt_iconv(iconv_t cd, const char **inbuf, ssize_t *inbytesleft, + char **outbuf, ssize_t *outbytesleft, + const char **inrepls, const char *outrepl) { - size_t ret = 0, ret1; - ICONV_CONST char *ib = *inbuf; - size_t ibl = *inbytesleft; + ssize_t ret = 0, ret1; + const char *ib = *inbuf; + ssize_t ibl = *inbytesleft; char *ob = *outbuf; - size_t obl = *outbytesleft; + ssize_t obl = *outbytesleft; for (;;) { - ret1 = iconv (cd, &ib, &ibl, &ob, &obl); - if (ret1 != (size_t) - 1) + ret1 = my_iconv(cd, &ib, &ibl, &ob, &obl); + if (ret1 != -1) ret += ret1; if (ibl && obl && errno == EILSEQ) { if (inrepls) { /* Try replacing the input */ - ICONV_CONST char **t; + const char **t; for (t = inrepls; *t; t++) { - ICONV_CONST char *ib1 = *t; - size_t ibl1 = strlen (*t); + const char *ib1 = *t; + ssize_t ibl1 = m_strlen(*t); char *ob1 = ob; - size_t obl1 = obl; + ssize_t obl1 = obl; - iconv (cd, &ib1, &ibl1, &ob1, &obl1); + my_iconv(cd, &ib1, &ibl1, &ob1, &obl1); if (!ibl1) { ++ib, --ibl; ob = ob1, obl = obl1; @@ -361,9 +212,9 @@ size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft, /* Replace the output */ if (!outrepl) outrepl = "?"; - iconv (cd, 0, 0, &ob, &obl); + my_iconv(cd, 0, 0, &ob, &obl); if (obl) { - int n = strlen (outrepl); + ssize_t n = m_strlen(outrepl); if (n > obl) { outrepl = "?"; @@ -373,7 +224,7 @@ size_t mutt_iconv (iconv_t cd, ICONV_CONST char **inbuf, size_t * inbytesleft, ++ib, --ibl; ob += n, obl -= n; ++ret; - iconv (cd, 0, 0, 0, 0); /* for good measure */ + my_iconv(cd, 0, 0, 0, 0); /* for good measure */ continue; } } @@ -393,7 +244,7 @@ int mutt_convert_string (char **ps, const char *from, const char *to, int flags) { iconv_t cd; - ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 }; + const char *repls[] = { "\357\277\275", "?", 0 }; char *s = *ps; if (!s || !*s) @@ -401,33 +252,31 @@ int mutt_convert_string (char **ps, const char *from, const char *to, if (to && from && (cd = mutt_iconv_open (to, from, flags)) != (iconv_t) - 1) { int len; - ICONV_CONST char *ib; + const char *ib; char *buf, *ob; - size_t ibl, obl; - ICONV_CONST char **inrepls = 0; - char *outrepl = 0; + ssize_t ibl, obl; + const char **inrepls = NULL; + const char *outrepl = NULL; - if (mutt_is_utf8 (to)) + if (charset_is_utf8 (to)) outrepl = "\357\277\275"; - else if (mutt_is_utf8 (from)) + else if (charset_is_utf8 (from)) inrepls = repls; else outrepl = "?"; - len = strlen (s); + len = m_strlen(s); ib = s, ibl = len + 1; obl = MB_LEN_MAX * ibl; - ob = buf = safe_malloc (obl + 1); + ob = buf = xmalloc(obl + 1); mutt_iconv (cd, &ib, &ibl, &ob, &obl, inrepls, outrepl); iconv_close (cd); *ob = '\0'; - FREE (ps); + p_delete(ps); *ps = buf; - - mutt_str_adjust (ps); return 0; } else @@ -448,8 +297,8 @@ struct fgetconv_s { char *p; char *ob; char *ib; - size_t ibl; - ICONV_CONST char **inrepls; + ssize_t ibl; + const char **inrepls; }; struct fgetconv_not { @@ -462,29 +311,29 @@ FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to, { struct fgetconv_s *fc; iconv_t cd = (iconv_t) - 1; - static ICONV_CONST char *repls[] = { "\357\277\275", "?", 0 }; + static const char *repls[] = { "\357\277\275", "?", 0 }; if (from && to) cd = mutt_iconv_open (to, from, flags); if (cd != (iconv_t) - 1) { - fc = safe_malloc (sizeof (struct fgetconv_s)); + fc = p_new(struct fgetconv_s, 1); fc->p = fc->ob = fc->bufo; fc->ib = fc->bufi; fc->ibl = 0; - fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1; + fc->inrepls = charset_is_utf8 (to) ? repls : repls + 1; } else - fc = safe_malloc (sizeof (struct fgetconv_not)); + fc = p_new(struct fgetconv_s, 1); fc->file = file; fc->cd = cd; return (FGETCONV *) fc; } -char *fgetconvs (char *buf, size_t l, FGETCONV * _fc) +char *fgetconvs (char *buf, ssize_t l, FGETCONV * _fc) { int c; - size_t r; + ssize_t r; for (r = 0; r + 1 < l;) { if ((c = fgetconv (_fc)) == EOF) @@ -517,9 +366,9 @@ int fgetconv (FGETCONV * _fc) /* Try to convert some more */ fc->p = fc->ob = fc->bufo; if (fc->ibl) { - size_t obl = sizeof (fc->bufo); + ssize_t obl = ssizeof(fc->bufo); - iconv (fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, &obl); + my_iconv(fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl); if (fc->p < fc->ob) return (unsigned char) *(fc->p)++; } @@ -541,9 +390,9 @@ int fgetconv (FGETCONV * _fc) /* Try harder this time to convert some */ if (fc->ibl) { - size_t obl = sizeof (fc->bufo); + ssize_t obl = ssizeof(fc->bufo); - mutt_iconv (fc->cd, (ICONV_CONST char **) &fc->ib, &fc->ibl, &fc->ob, + mutt_iconv (fc->cd, (const char **) &fc->ib, &fc->ibl, &fc->ob, &obl, fc->inrepls, 0); if (fc->p < fc->ob) return (unsigned char) *(fc->p)++; @@ -560,50 +409,51 @@ void fgetconv_close (FGETCONV ** _fc) if (fc->cd != (iconv_t) - 1) iconv_close (fc->cd); - FREE (_fc); + p_delete(_fc); } -char *mutt_get_first_charset (const char *charset) +const char *mutt_get_first_charset (const char *charset) { static char fcharset[SHORT_STRING]; const char *c, *c1; c = charset; - if (!mutt_strlen (c)) + if (!m_strlen(c)) return "us-ascii"; if (!(c1 = strchr (c, ':'))) - return charset; - strfcpy (fcharset, c, c1 - c + 1); + return ((char*) charset); + m_strcpy(fcharset, c1 - c + 1, c); return fcharset; } -static size_t convert_string (ICONV_CONST char *f, size_t flen, +static ssize_t convert_string (const char *f, ssize_t flen, const char *from, const char *to, - char **t, size_t * tlen) + char **t, ssize_t * tlen) { iconv_t cd; char *buf, *ob; - size_t obl, n; + ssize_t obl; + ssize_t n; int e; cd = mutt_iconv_open (to, from, 0); if (cd == (iconv_t) (-1)) - return (size_t) (-1); + return -1; obl = 4 * flen + 1; - ob = buf = safe_malloc (obl); - n = iconv (cd, &f, &flen, &ob, &obl); - if (n == (size_t) (-1) || iconv (cd, 0, 0, &ob, &obl) == (size_t) (-1)) { + ob = buf = xmalloc(obl); + n = my_iconv(cd, &f, &flen, &ob, &obl); + if (n < 0 || my_iconv(cd, 0, 0, &ob, &obl) < 0) { e = errno; - FREE (&buf); + p_delete(&buf); iconv_close (cd); errno = e; - return (size_t) (-1); + return -1; } *ob = '\0'; *tlen = ob - buf; - safe_realloc (&buf, ob - buf + 1); + p_realloc(&buf, ob - buf + 1); *t = buf; iconv_close (cd); @@ -616,25 +466,24 @@ int mutt_convert_nonmime_string (char **ps) for (c = AssumedCharset; c; c = c1 ? c1 + 1 : 0) { char *u = *ps; - char *s; + char *s = NULL; char *fromcode; - size_t m, n; - size_t ulen = mutt_strlen (*ps); - size_t slen; + ssize_t m, n; + ssize_t ulen = m_strlen(*ps); + ssize_t slen; if (!u || !*u) return 0; c1 = strchr (c, ':'); - n = c1 ? c1 - c : mutt_strlen (c); + n = c1 ? c1 - c : m_strlen(c); if (!n) continue; - fromcode = safe_malloc (n + 1); - strfcpy (fromcode, c, n + 1); + fromcode = p_dupstr(c, n); m = convert_string (u, ulen, fromcode, Charset, &s, &slen); - FREE (&fromcode); - if (m != (size_t) (-1)) { - FREE (ps); + p_delete(&fromcode); + if (m != -1) { + p_delete(ps); *ps = s; return 0; }