/*
 * Provenance: madmutt commit c2ac9f23deba034e50cd3babc843d589d0b62d1b
 *   From:          Pierre Habouzit
 *   Date:          Wed, 27 Dec 2006 08:21:20 +0000 (+0100)
 *   Subject:       add some very efficient utf8 routines. more to come.
 *   Signed-off-by: Pierre Habouzit
 *   X-Git-Url:     http://git.madism.org/?p=apps%2Fmadmutt.git;a=commitdiff_plain;h=c2ac9f23deba034e50cd3babc843d589d0b62d1b
 *
 * Besides adding lib-lib/utf8.c and lib-lib/utf8.h (both reproduced below),
 * the commit makes these build/header changes:
 *   - lib-lib/Makefile.am: "utf8.h" is inserted after "str.h" in
 *     liblib_a_SOURCES and noinst_HEADERS, and "utf8.c" after "str.c" in
 *     liblib_a_SOURCES.
 *   - lib-lib/lib-lib.h: adds  #include "utf8.h"  right after
 *     #include "str.h".
 */

/* ------------------------------------------------------------------------
 * lib-lib/utf8.c
 * ------------------------------------------------------------------------ */

/*
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or (at
 *  your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
 *  MA 02110-1301, USA.
 *
 *  Copyright © 2006 Pierre Habouzit
 */

#include <stdint.h>     /* uint8_t, uint32_t */
#include <sys/types.h>  /* ssize_t */

/*
 * The project umbrella header is pulled in whenever it is reachable; the two
 * system headers above already provide every type this file uses, so the
 * translation unit also builds on its own.
 */
#if defined(__has_include)
#  if __has_include("lib-lib.h")
#    include "lib-lib.h"
#  endif
#else
#  include "lib-lib.h"
#endif

/** \addtogroup mutt_strings */
/*@{*/

/* Number of bytes that follow a given lead byte (0 for ASCII, for
 * continuation bytes 0x80..0xbf, and for anything else not a lead byte). */
static const unsigned char utf8_trail[256] = {
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
};

/* Sum of the marker bits (lead-byte prefix and the 0x80 of every
 * continuation byte) accumulated while decoding a sequence with N trailing
 * bytes, modulo 2^32; subtracting it leaves the bare code point. */
static const uint32_t utf8_offs[6] = {
    0x00000000UL, 0x00003080UL, 0x000e2080UL,
    0x03c82080UL, 0xfa082080UL, 0x82082080UL
};

/* Lead-byte prefix bits, indexed by the total length of the sequence. */
static const uint8_t utf8_mark[7] = {
    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
};

/**
 * Count the characters of a NUL-terminated UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) counts as one
 * character; no validation is performed.
 *
 * \param[in] s  the string to measure (must not be NULL).
 * \return the number of non-continuation bytes before the terminator.
 */
static inline int ustrlen(const char *s)
{
    int len = 0;

    /* The original loop never advanced s and therefore never terminated. */
    for (; *s; s++) {
        len += ((unsigned char)*s & 0xc0) != 0x80;
    }

    return len;
}

/**
 * Decode the UTF-8 sequence starting at \c s.
 *
 * The lead byte selects how many trailing bytes are consumed. Trailing
 * bytes are not checked for the 10xxxxxx pattern; the only check is that
 * the string does not end inside the sequence. A byte that is not a
 * multi-byte lead (ASCII, or a stray continuation byte) is returned as is.
 * Decoding the terminating NUL yields 0 and still advances by one byte.
 *
 * \param[in]  s     position to decode from (must not be NULL).
 * \param[out] outp  if not NULL, receives the position just after the
 *                   decoded sequence; left untouched when -1 is returned.
 * \return the decoded value, or -1 if the string ends inside the sequence.
 */
static inline int ustrgetc(const char *s, const char **outp)
{
    uint32_t ret = 0;
    int trail = utf8_trail[(unsigned char)*s];

    for (int i = 0; i < trail; i++) {
        ret += (unsigned char)*s++;
        ret <<= 6;
        if (!*s)
            return -1;
    }
    ret += (unsigned char)*s++;

    /* Test the out-parameter itself, not the pointer it currently holds:
     * the original `if (*outp)` crashed on NULL and skipped the store when
     * the caller's variable happened to be NULL. */
    if (outp)
        *outp = s;
    return (int)(ret - utf8_offs[trail]);
}

/**
 * Encode \c c as UTF-8 into \c dst and NUL-terminate it.
 *
 * Sequences of 1 to 4 bytes are produced, chosen from the thresholds 0x80,
 * 0x800 and 0x10000. Values that are negative or above 0x1fffff are not
 * rejected and produce malformed output.
 *
 * \param[out] dst  destination buffer.
 * \param[in]  n    size of \c dst in bytes; the encoded bytes plus the
 *                  terminating NUL must fit.
 * \param[in]  c    value to encode.
 * \return the number of bytes the encoding takes (terminator excluded).
 *         If that number is >= \c n nothing is written to \c dst.
 */
static inline int ustrputc(char *dst, ssize_t n, int c)
{
    int bytes = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000);

    if (bytes >= n)
        return bytes;

    dst[bytes] = '\0';

    /* Emit continuation bytes last-to-first, 6 payload bits each. */
    for (int i = bytes - 1; i > 0; i--) {
        dst[i] = (char)((c | 0x80) & 0xbf);
        c >>= 6;
    }
    dst[0] = (char)(c | utf8_mark[bytes]);

    return bytes;
}

/*@}*/

/* ------------------------------------------------------------------------
 * lib-lib/utf8.h (added by the same commit under the same license notice
 * as above; it only opens and closes the documentation group for now)
 * ------------------------------------------------------------------------ */

#ifndef MUTT_LIB_LIB_UTF8_H
#define MUTT_LIB_LIB_UTF8_H

/** \addtogroup mutt_strings */
/*@{*/


/*@}*/
#endif /* MUTT_LIB_LIB_UTF8_H */