* Copyright © 2008 Florent Bruneau
*/
+#include <sys/mman.h>
+
#include "str.h"
#include "trie.h"
typedef struct trie_entry_t trie_entry_t;
struct trie_entry_t {
- int c_offset;
- int c_len;
-
- int* children;
- int children_len;
- int children_size;
+ int32_t c_offset;
+ int32_t c_len;
- bool locked;
+ int32_t children_offset;
+ int32_t children_len;
};
struct trie_t {
trie_entry_t *entries;
- int entries_len;
- int entries_size;
+ int32_t entries_len;
+ int32_t entries_size;
char *c;
- int c_len;
- int c_size;
+ int32_t c_len;
+ int32_t c_size;
+
+ char **keys;
+ int32_t keys_len;
+ int32_t keys_size;
bool locked;
};
return p_new(trie_t, 1);
}
+static inline void trie_cleanup_build_data(trie_t *trie)
+{
+ for (int i = 0 ; i < trie->keys_len ; ++i) {
+ p_delete(&trie->keys[i]);
+ }
+ p_delete(&trie->keys);
+ trie->keys_len = trie->keys_size = 0;
+}
+
void trie_delete(trie_t **trie)
{
if (*trie) {
- for (int i = 0 ; i < (*trie)->entries_len ; ++i) {
- trie_entry_t *entry = &(*trie)->entries[i];
- p_delete(&(entry->children));
- }
+ trie_cleanup_build_data(*trie);
p_delete(&(*trie)->entries);
p_delete(&(*trie)->c);
p_delete(trie);
const trie_entry_t* entry,
const char *key)
{
- int start = 0;
- int end = entry->children_len;
+ int start = entry->children_offset;
+ int end = start + entry->children_len;
const char c = *key;
while (start < end) {
int mid = (start + end) / 2;
- trie_entry_t* child = &trie->entries[entry->children[mid]];
+ trie_entry_t* child = &trie->entries[mid];
const char c2 = trie->c[child->c_offset];
if (child->c_len) {
static inline void trie_entry_insert_child(trie_t *trie, trie_entry_t *entry,
int pchild)
{
- const char c = trie->c[trie->entries[pchild].c_offset];
- int start = 0;
- int end = entry->children_len;
-
- p_allocgrow(&entry->children, entry->children_len + 1, &entry->children_size);
- while (start < end) {
- int mid = (start + end) / 2;
- const trie_entry_t* child = &trie->entries[entry->children[mid]];
- const char c2 = trie->c[child->c_offset];
-
- if (child->c_len) {
- if (c2 == c) {
- abort();
- }
- if (c < c2) {
- end = mid;
- } else {
- start = mid + 1;
- }
- } else {
+ if (entry->children_len == 0) {
+ entry->children_offset = pchild;
+ entry->children_len = 1;
+ } else {
+ if (entry->children_offset + entry->children_len != pchild) {
+ trie_inspect(trie);
+ printf("Inserting child %d while offset is %d[%d]\n",
+ pchild, entry->children_offset, entry->children_len);
abort();
}
+ ++entry->children_len;
}
- memmove(entry->children + start + 1,
- entry->children + start,
- sizeof(int) * (entry->children_len - start));
- entry->children[start] = pchild;
- ++entry->children_len;
}
static inline void trie_entry_split(trie_t *trie, trie_entry_t *entry, int pos)
child->c_len = entry->c_len - pos;
entry->c_len = pos;
}
- child->children = entry->children;
- child->children_len = entry->children_len;
- child->children_size = entry->children_size;
- entry->children = NULL;
- entry->children_len = 0;
- entry->children_size = 0;
- trie_entry_insert_child(trie, entry, trie->entries_len - 1);
+ child->children_offset = entry->children_offset;
+ child->children_len = entry->children_len;
+ entry->children_offset = trie->entries_len - 1;
+ entry->children_len = 1;
}
void trie_insert(trie_t *trie, const char* key)
{
- trie_grow(trie, 2);
- if (trie->entries_len == 0) {
- (void)trie_add_leaf(trie, key);
- } else {
- trie_entry_t *current = trie->entries;
- while (true) {
- int pos = 0;
- if (trie_entry_c_match(trie, current, key, &pos)) {
- if (trie_entry_is_leaf(current)) {
- return;
- }
- trie_entry_t *next = NULL;
- key += pos;
- next = trie_entry_child(trie, current, key);
- if (next == NULL) {
- trie_entry_insert_child(trie, current,
- trie_add_leaf(trie, key));
- return;
- } else {
- current = next;
+ GROW(trie->keys, 1, trie->keys_len, trie->keys_size);
+ trie->keys[trie->keys_len++] = strdup(key);
+}
+
+
+static inline void trie_compile_aux(trie_t *trie, int id,
+ int first_key, int last_key, int offset,
+ int initial_diff)
+{
+ int forks[256];
+ int fork_pos = 0;
+ char current = '\0';
+
+ for (int off_diff = initial_diff ; fork_pos == 0 ; ++off_diff, ++offset) {
+ current = trie->keys[first_key][offset];
+ for (int i = first_key + 1 ; i < last_key ; ++i) {
+ if (trie->keys[i][offset] != current) {
+ trie_grow(trie, 2);
+ if (fork_pos == 0) {
+ trie_entry_split(trie, &trie->entries[id], off_diff);
}
- } else {
- trie_entry_split(trie, current, pos);
- trie_entry_insert_child(trie, current,
- trie_add_leaf(trie, key + pos));
- return;
+ trie_entry_insert_child(trie, &trie->entries[id],
+ trie_add_leaf(trie, trie->keys[i] +
+ offset));
+ forks[fork_pos++] = i;
+ current = trie->keys[i][offset];
}
}
+ if (fork_pos == 0 && current == '\0') {
+ return;
+ }
+ }
+ forks[fork_pos] = last_key;
+
+ const int children_len = trie->entries[id].children_len;
+ for (int i = 0 ; i < children_len ; ++i) {
+ int child = trie->entries[id].children_offset + i;
+ if (forks[i] - 1 > first_key) {
+ trie_compile_aux(trie, child, first_key, forks[i], offset, 1);
+ }
+ first_key = forks[i];
+ }
+}
+
+
+static inline void trie_shrink(trie_t *trie)
+{
+ p_shrink(&trie->entries, trie->entries_len, &trie->entries_size);
+ p_shrink(&trie->c, trie->c_len, &trie->c_size);
+}
+
+static inline void trie_lock(trie_t *trie)
+{
+ if (mlock(trie->entries, sizeof(trie_entry_t) * trie->entries_len) != 0) {
+ UNIXERR("mlock");
+ return;
+ }
+ if (mlock(trie->c, trie->c_len) != 0) {
+ UNIXERR("mlock");
+ munlock(trie->entries, sizeof(trie_entry_t) * trie->entries_len);
+ return;
+ }
+ trie->locked = true;
+}
+
+typedef char *str_t;
+
+void trie_compile(trie_t *trie, bool memlock)
+{
+ {
+# define QSORT_TYPE str_t
+# define QSORT_BASE trie->keys
+# define QSORT_NELT trie->keys_len
+# define QSORT_LT(a,b) strcmp(*a, *b) < 0
+# include "qsort.c"
+ }
+
+ trie_grow(trie, trie->keys_len);
+ trie_compile_aux(trie, trie_add_leaf(trie, trie->keys[0]),
+ 0, trie->keys_len, 0, 0);
+ trie_cleanup_build_data(trie);
+ trie_shrink(trie);
+ if (memlock) {
+ trie_lock(trie);
}
}
static inline void trie_entry_inspect(const trie_t *trie,
const trie_entry_t *entry, int level)
{
- static int c_sum = 0;
- static int nodes = 0;
+ static int max_depth = 0;
+ static int leaves = 0;
+ static int depth_sum = 0;
- ++nodes;
- c_sum += entry->c_len;
+ if (trie_entry_is_leaf(entry)) {
+ if (level > max_depth) {
+ max_depth = level;
+ }
+ ++leaves;
+ depth_sum += level;
+ }
for (int i = 0 ; i < level ; ++i) {
fputs(" ", stdout);
}
fputs("(nil)", stdout);
} else {
const char *c = trie->c + entry->c_offset;
+ printf("(%d) ", entry->c_len);
for (int i = 0 ; i < entry->c_len ; ++i) {
if (c[i]) {
printf("%c ", c[i]);
}
}
fputs("\n", stdout);
- for (int i = 0 ; i < entry->children_len ; ++i) {
- trie_entry_inspect(trie, &trie->entries[entry->children[i]], level + 1);
+ for (int i = entry->children_offset ;
+ i < entry->children_offset + entry->children_len ; ++i) {
+ trie_entry_inspect(trie, &trie->entries[i], level + 1);
}
if (level == 0) {
- printf("Mean char per node: %d\n", c_sum / nodes);
+ printf("Average char per node: %d\n", trie->c_len / trie->entries_len);
+ printf("Number of nodes: %d\n", trie->entries_len);
+ printf("Number of leaves: %d\n", leaves);
+ printf("Max depth: %d\n", max_depth);
+ printf("Average leaf depth: %d\n", depth_sum / leaves);
+ printf("Memory used: %d\n", (trie->entries_size * sizeof(trie_entry_t))
+ + (trie->c_size) + sizeof(trie_t));
}
}