add some very efficient utf8 routines. more to come.
[apps/madmutt.git] / lib-lib / utf8.c
diff --git a/lib-lib/utf8.c b/lib-lib/utf8.c
new file mode 100644 (file)
index 0000000..7c928a6
--- /dev/null
@@ -0,0 +1,95 @@
+/*
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or (at
+ *  your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful, but
+ *  WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+ *  MA 02110-1301, USA.
+ *
+ *  Copyright © 2006 Pierre Habouzit
+ */
+
+#include "lib-lib.h"
+
+/** \addtogroup mutt_strings */
+/*@{*/
+/*
+ * Number of bytes that follow a given first byte in a UTF-8 sequence,
+ * indexed by the value of that first byte (ustrgetc() indexes it with the
+ * byte cast to unsigned char).  Each row below covers 32 byte values:
+ *   0x00-0xbf -> 0 (this includes the continuation range 0x80-0xbf),
+ *   0xc0-0xdf -> 1, 0xe0-0xef -> 2, 0xf0-0xf7 -> 3,
+ *   0xf8-0xfb -> 4, 0xfc-0xff -> 5.
+ */
+static char const __utf8_trail[256] = {
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5,
+};
+/*
+ * Constant that ustrgetc() subtracts from the raw shifted sum of the bytes
+ * of a sequence, indexed by the number of trailing bytes (0 to 5).  It
+ * cancels the marker bits of the lead byte and the 0x80 carried by every
+ * continuation byte; e.g. entry 1 is (0xc0 << 6) + 0x80 and entry 2 is
+ * (0xe0 << 12) + (0x80 << 6) + 0x80.
+ */
+static uint32_t const __utf8_offs[6] = {
+    0x00000000UL, 0x00003080UL, 0x000e2080UL,
+    0x03c82080UL, 0xfa082080UL, 0x82082080UL
+};
+/*
+ * Marker bits OR-ed into the first byte of an encoded sequence, indexed by
+ * the total length of the sequence in bytes (ustrputc() indexes it that
+ * way).  Index 0 is a placeholder and a 1-byte sequence carries no marker.
+ */
+static const uint8_t __utf8_mark[7] = {
+    0x00, 0x00, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc
+};
+
+/**
+ * Count the characters of a NUL-terminated UTF-8 string.
+ *
+ * Every byte that is not a continuation byte (bit pattern 10xxxxxx) is
+ * counted as the start of one character.  The input is not validated.
+ *
+ * \param[in]  s  the NUL-terminated string to measure.
+ *
+ * \return the number of non-continuation bytes before the terminating NUL.
+ */
+static inline int ustrlen(const char *s)
+{
+    int len = 0;
+
+    /* walk every byte; the cursor must move on for lead and
+     * continuation bytes alike or the loop never terminates */
+    for (; *s; s++) {
+        len += (*s & 0xc0) != 0x80;
+    }
+
+    return len;
+}
+
+/**
+ * Decode the UTF-8 sequence starting at \p s.
+ *
+ * The first byte selects, through __utf8_trail, how many further bytes are
+ * consumed.  All bytes are accumulated with 6-bit shifts, then the lead
+ * and continuation marker bits are removed with a single subtraction of
+ * the matching __utf8_offs entry.  Continuation bytes are only checked for
+ * a premature NUL: their 10xxxxxx pattern is not verified, and a stray
+ * continuation byte in first position is returned as its own value.
+ *
+ * \param[in]  s     position to decode from.
+ * \param[out] outp  if not NULL, receives the position just after the
+ *                   decoded sequence.  It is left untouched when -1 is
+ *                   returned.
+ *
+ * \return the decoded value, or -1 if a NUL byte is found where a
+ *         continuation byte is expected.  Decoding at the terminating NUL
+ *         returns 0 and still steps one byte forward.
+ */
+static inline int ustrgetc(const char *s, const char **outp)
+{
+    uint32_t ret = 0;
+    int trail = __utf8_trail[(unsigned char)*s];
+    int i;
+
+    for (i = 0; i < trail; i++) {
+        ret += (unsigned char)*s++;
+        ret <<= 6;
+        /* the string ends in the middle of a multi-byte sequence */
+        if (!*s)
+            return -1;
+    }
+    ret += (unsigned char)*s++;
+
+    /* outp itself is optional: test the pointer, not what it points to */
+    if (outp)
+        *outp = s;
+    return ret - __utf8_offs[trail];
+}
+
+/**
+ * Encode \p c as a NUL-terminated UTF-8 sequence into \p dst.
+ *
+ * Sequences of 1 to 6 bytes are produced, the same range ustrgetc()
+ * decodes and __utf8_mark describes.  Values below 0x200000 take at most
+ * 4 bytes.  \p c is not validated: a negative value is not rejected and is
+ * stored as a single byte.
+ *
+ * \param[out] dst  destination buffer.
+ * \param[in]  n    size of \p dst in bytes, terminating NUL included.
+ * \param[in]  c    value to encode.
+ *
+ * \return the length of the encoded sequence, NUL excluded.  When that
+ *         length is not strictly less than \p n nothing is written, so a
+ *         return value >= \p n means \p dst was too small.
+ */
+static inline int ustrputc(char *dst, ssize_t n, int c)
+{
+    int bytes = 1 + (c >= 0x80) + (c >= 0x800) + (c >= 0x10000)
+              + (c >= 0x200000) + (c >= 0x4000000);
+    int i;
+
+    /* room is needed for the sequence plus its terminating NUL */
+    if (bytes >= n)
+        return bytes;
+
+    dst[bytes] = '\0';
+
+    /* continuation bytes, last one first: 10xxxxxx with the low 6 bits */
+    for (i = bytes - 1; i > 0; i--) {
+        dst[i] = (c | 0x80) & 0xbf;
+        c >>= 6;
+    }
+    /* what is left of c sits next to the length marker in the lead byte */
+    dst[0] = (c | __utf8_mark[bytes]);
+
+    return bytes;
+}
+
+
+/*@}*/