X-Git-Url: http://git.madism.org/?p=apps%2Fmadmutt.git;a=blobdiff_plain;f=charset.c;h=8cea280dc205444206bdd3a9e972555393cfc653;hp=bf5660735a61b6b9820d7ba4cda37a7ce6401add;hb=d6e1782b1e788bd1c4767443712bf1713cc013ef;hpb=711f787502b6a1a1c150b948a5ed9156c8ef9ba1 diff --git a/charset.c b/charset.c index bf56607..8cea280 100644 --- a/charset.c +++ b/charset.c @@ -21,6 +21,9 @@ #include #include #include +#ifdef HAVE_LANGINFO_CODESET +# include +#endif #include #include @@ -31,192 +34,80 @@ #include "charset.h" #ifndef EILSEQ -# define EILSEQ EINVAL +# define EILSEQ EINVAL #endif -char *Charset; +char *Charset = NULL; int Charset_is_utf8 = 0; -/* - * The following list has been created manually from the data under: - * http://www.isi.edu/in-notes/iana/assignments/character-sets - * Last update: 2000-09-07 - * - * Note that it includes only the subset of character sets for which - * a preferred MIME name is given. - */ - -static struct { - const char *key; - const char *pref; -} PreferredMIMENames[] = { - {"ansi_x3.4-1968", "us-ascii"}, - {"iso-ir-6", "us-ascii"}, - {"iso_646.irv:1991", "us-ascii"}, - {"ascii", "us-ascii"}, - {"iso646-us", "us-ascii"}, - {"us", "us-ascii"}, - {"ibm367", "us-ascii"}, - {"cp367", "us-ascii"}, - {"csASCII", "us-ascii"}, - {"csISO2022KR", "iso-2022-kr"}, - {"csEUCKR", "euc-kr"}, - {"csISO2022JP", "iso-2022-jp"}, - {"csISO2022JP2", "iso-2022-jp-2"}, - {"ISO_8859-1:1987", "iso-8859-1"}, - {"iso-ir-100", "iso-8859-1"}, - {"iso_8859-1", "iso-8859-1"}, - {"latin1", "iso-8859-1"}, - {"l1", "iso-8859-1"}, - {"IBM819", "iso-8859-1"}, - {"CP819", "iso-8859-1"}, - {"csISOLatin1", "iso-8859-1"}, - {"ISO_8859-2:1987", "iso-8859-2"}, - {"iso-ir-101", "iso-8859-2"}, - {"iso_8859-2", "iso-8859-2"}, - {"latin2", "iso-8859-2"}, - {"l2", "iso-8859-2"}, - {"csISOLatin2", "iso-8859-2"}, - {"ISO_8859-3:1988", "iso-8859-3"}, - {"iso-ir-109", "iso-8859-3"}, - {"ISO_8859-3", "iso-8859-3"}, - {"latin3", "iso-8859-3"}, - {"l3", "iso-8859-3"}, - {"csISOLatin3", "iso-8859-3"}, - {"ISO_8859-4:1988", "iso-8859-4"}, - {"iso-ir-110", "iso-8859-4"}, - {"ISO_8859-4", "iso-8859-4"}, - {"latin4", "iso-8859-4"}, - {"l4", "iso-8859-4"}, - {"csISOLatin4", "iso-8859-4"}, - {"ISO_8859-6:1987", "iso-8859-6"}, - {"iso-ir-127", "iso-8859-6"}, - {"iso_8859-6", "iso-8859-6"}, - {"ECMA-114", "iso-8859-6"}, - {"ASMO-708", "iso-8859-6"}, - {"arabic", "iso-8859-6"}, - {"csISOLatinArabic", "iso-8859-6"}, - {"ISO_8859-7:1987", "iso-8859-7"}, - {"iso-ir-126", "iso-8859-7"}, - {"ISO_8859-7", "iso-8859-7"}, - {"ELOT_928", "iso-8859-7"}, - {"ECMA-118", "iso-8859-7"}, - {"greek", "iso-8859-7"}, - {"greek8", "iso-8859-7"}, - {"csISOLatinGreek", "iso-8859-7"}, - {"ISO_8859-8:1988", "iso-8859-8"}, - {"iso-ir-138", "iso-8859-8"}, - {"ISO_8859-8", "iso-8859-8"}, - {"hebrew", "iso-8859-8"}, - {"csISOLatinHebrew", "iso-8859-8"}, - {"ISO_8859-5:1988", "iso-8859-5"}, - {"iso-ir-144", "iso-8859-5"}, - {"ISO_8859-5", "iso-8859-5"}, - {"cyrillic", "iso-8859-5"}, - {"csISOLatinCyrillic", "iso8859-5"}, - {"ISO_8859-9:1989", "iso-8859-9"}, - {"iso-ir-148", "iso-8859-9"}, - {"ISO_8859-9", "iso-8859-9"}, - {"latin5", "iso-8859-9"}, - {"l5", "iso-8859-9"}, - {"csISOLatin5", "iso-8859-9"}, - {"ISO_8859-10:1992", "iso-8859-10"}, - {"iso-ir-157", "iso-8859-10"}, - {"latin6", "iso-8859-10"}, - {"l6", "iso-8859-10"}, - {"csISOLatin6", "iso-8859-10"}, - {"csKOI8r", "koi8-r"}, - {"MS_Kanji", "Shift_JIS"}, - {"csShiftJis", "Shift_JIS"}, - {"Extended_UNIX_Code_Packed_Format_for_Japanese", "EUC-JP"}, - {"csEUCPkdFmtJapanese", "EUC-JP"}, - {"csGB2312", "gb2312"}, - {"csbig5", "big5"}, - /* End of official brain damage. - What follows has been taken * from glibc's localedata files. */ - {"iso_8859-13", "iso-8859-13"}, - {"iso-ir-179", "iso-8859-13"}, - {"latin7", "iso-8859-13"}, - {"l7", "iso-8859-13"}, - {"iso_8859-14", "iso-8859-14"}, - {"latin8", "iso-8859-14"}, - {"l8", "iso-8859-14"}, - {"iso_8859-15", "iso-8859-15"}, - {"latin9", "iso-8859-15"}, - {"latin0", "iso-8859-15"}, - {"iso_8859-16", "iso-8859-16"}, - {"latin10", "iso-8859-16"}, - {"646", "us-ascii"}, - {"eucJP", "euc-jp"}, - {"PCK", "Shift_JIS"}, - {"ko_KR-euc", "euc-kr"}, - {"zh_TW-big5", "big5"}, - {"sjis", "Shift_JIS"}, - {"euc-jp-ms", "eucJP-ms"}, - {NULL, NULL} -}; - -#ifdef HAVE_LANGINFO_CODESET -# include -#endif - -void mutt_set_langinfo_charset (void) +void charset_initialize(void) { #ifdef HAVE_LANGINFO_CODESET char buff[LONG_STRING]; char buff2[LONG_STRING]; m_strcpy(buff, sizeof(buff), nl_langinfo(CODESET)); - mutt_canonical_charset (buff2, sizeof (buff2), buff); + mutt_canonical_charset(buff2, sizeof(buff2), buff); /* finally, set $charset */ - if (!(Charset = m_strdup(buff2))) + if (!m_strisempty(buff2)) { + m_strreplace(&Charset, buff2); + } else +#endif + m_strreplace(&Charset, "iso-8859-1"); + + Charset_is_utf8 = !strcmp(Charset, "utf-8"); +#ifdef HAVE_BIND_TEXTDOMAIN_CODESET + bind_textdomain_codeset(PACKAGE, Charset); #endif - Charset = m_strdup("iso-8859-1"); } +#include "charset.gperf" + void mutt_canonical_charset(char *dest, ssize_t dlen, const char *name) { - ssize_t i; - char *p; - char scratch[LONG_STRING]; - - /* catch some common iso-8859-something misspellings */ - if (!ascii_strncasecmp (name, "8859", 4) && name[4] != '-') - snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 4); - else if (!ascii_strncasecmp (name, "8859-", 5)) - snprintf (scratch, sizeof (scratch), "iso-8859-%s", name + 5); - else if (!ascii_strncasecmp (name, "iso8859", 7) && name[7] != '-') - snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 7); - else if (!ascii_strncasecmp (name, "iso8859-", 8)) - snprintf (scratch, sizeof (scratch), "iso_8859-%s", name + 8); - else - m_strcpy(scratch, sizeof(scratch), NONULL(name)); - - for (i = 0; PreferredMIMENames[i].key; i++) - if (!ascii_strcasecmp (scratch, PreferredMIMENames[i].key) || - !m_strcasecmp(scratch, PreferredMIMENames[i].key)) { - m_strcpy(dest, dlen, PreferredMIMENames[i].pref); - return; + const struct cset_pair *cp; + char scratch[LONG_STRING]; + const char *p; + int i = 0; + + // canonize name: only keep a-z0-9 and dots, put into lowercase + for (p = name; *p && *p != ':' && i < ssizeof(scratch) - 1; p++) { + if (isalnum(*p) || *p== '.') { + scratch[i++] = tolower((unsigned char)*p); + } + } + scratch[i] = '\0'; + + cp = mutt_canonical_charset_aux(scratch, strlen(scratch)); + if (cp) { + m_strcpy(dest, dlen, cp->pref); + } else { + m_strcpy(dest, dlen, name); + m_strtolower(dest); } +} + +static int mutt_chscmp(const char *s, const char *chs) +{ + char buffer[STRING]; - m_strcpy(dest, dlen, scratch); + if (!s) + return 0; - /* for cosmetics' sake, transform to lowercase. */ - for (p = dest; *p; p++) - *p = ascii_tolower (*p); + mutt_canonical_charset(buffer, sizeof(buffer), s); + return !strcmp(buffer, chs); } -int mutt_chscmp (const char *s, const char *chs) +int charset_is_utf8(const char *s) { - char buffer[STRING]; - - if (!s) - return 0; + return mutt_chscmp(s, "utf-8"); +} - mutt_canonical_charset (buffer, sizeof (buffer), s); - return !ascii_strcasecmp (buffer, chs); +int charset_is_us_ascii(const char *s) +{ + return mutt_chscmp(s, "us-ascii"); } @@ -346,9 +237,9 @@ int mutt_convert_string (char **ps, const char *from, const char *to, const char **inrepls = NULL; const char *outrepl = NULL; - if (mutt_is_utf8 (to)) + if (charset_is_utf8 (to)) outrepl = "\357\277\275"; - else if (mutt_is_utf8 (from)) + else if (charset_is_utf8 (from)) inrepls = repls; else outrepl = "?"; @@ -409,7 +300,7 @@ FGETCONV *fgetconv_open (FILE * file, const char *from, const char *to, fc->p = fc->ob = fc->bufo; fc->ib = fc->bufi; fc->ibl = 0; - fc->inrepls = mutt_is_utf8 (to) ? repls : repls + 1; + fc->inrepls = charset_is_utf8 (to) ? repls : repls + 1; } else fc = p_new(struct fgetconv_s, 1); @@ -579,21 +470,6 @@ int mutt_convert_nonmime_string (char **ps) return -1; } -void mutt_set_charset (char *charset) -{ - char buffer[STRING]; - - mutt_canonical_charset (buffer, sizeof (buffer), charset); - - Charset_is_utf8 = 0; - if (!strcmp (buffer, "utf-8")) - Charset_is_utf8 = 1; - -#ifdef HAVE_BIND_TEXTDOMAIN_CODESET - bind_textdomain_codeset (PACKAGE, buffer); -#endif -} - wchar_t replacement_char(void) { return Charset_is_utf8 ? 0xfffd : '?';