From 3b74718def0fd873118d80332cfd1c7e83642394 Mon Sep 17 00:00:00 2001 From: Jeffrey Stedfast Date: Mon, 9 Dec 2002 23:16:38 +0000 Subject: New implementation of an Aho-Corasick trie. 2002-12-09 Jeffrey Stedfast * e-trie.c: New implementation of an Aho-Corasick trie. svn path=/trunk/; revision=19074 --- e-util/ChangeLog | 4 + e-util/Makefile.am | 2 + e-util/e-trie.c | 344 +++++++++++++++++++++++++++++++++++++++++++++++++++++ e-util/e-trie.h | 47 ++++++++ 4 files changed, 397 insertions(+) create mode 100644 e-util/e-trie.c create mode 100644 e-util/e-trie.h (limited to 'e-util') diff --git a/e-util/ChangeLog b/e-util/ChangeLog index b922da3953..3cb13b194c 100644 --- a/e-util/ChangeLog +++ b/e-util/ChangeLog @@ -1,3 +1,7 @@ +2002-12-09 Jeffrey Stedfast + + * e-trie.c: New implementation of an Aho-Corasick trie. + 2002-12-09 Chris Toshok * e-passwords.h: track changes to api (e_passwords_init is gone, diff --git a/e-util/Makefile.am b/e-util/Makefile.am index 4a1b42d9dc..2c54e5cad4 100644 --- a/e-util/Makefile.am +++ b/e-util/Makefile.am @@ -42,6 +42,7 @@ eutilinclude_HEADERS = \ e-request.h \ e-sexp.h \ e-time-utils.h \ + e-trie.h \ e-url.h \ md5-utils.h @@ -71,6 +72,7 @@ libeutil_la_SOURCES = \ e-request.c \ e-sexp.c \ e-time-utils.c \ + e-trie.c \ e-url.c \ e-util-marshal.c \ md5-utils.c diff --git a/e-util/e-trie.c b/e-util/e-trie.c new file mode 100644 index 0000000000..6a94be1d6f --- /dev/null +++ b/e-util/e-trie.c @@ -0,0 +1,344 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * Authors: Jeffrey Stedfast + * + * Copyright 2002 Ximian, Inc. (www.ximian.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA. + * + */ + + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include +#include + +#include "e-trie.h" +#include "e-memory.h" + +#define d(x) + +struct _trie_state { + struct _trie_state *next; + struct _trie_state *fail; + struct _trie_match *match; + unsigned int final; + int id; +}; + +struct _trie_match { + struct _trie_match *next; + struct _trie_state *state; + gunichar c; +}; + +struct _ETrie { + struct _trie_state root; + GPtrArray *fail_states; + gboolean icase; + + E_Memchunk *match_chunks; + E_Memchunk *state_chunks; +}; + + +static __inline__ gunichar +trie_utf8_getc (const unsigned char **in, size_t inlen) +{ + register const unsigned char *inptr = *in; + const unsigned char *inend = inptr + inlen; + register unsigned char c, r; + register gunichar u, m; + + if (inlen == 0) + return 0; + +again: + r = *inptr++; +loop: + if (r < 0x80) { + *in = inptr; + u = r; + } else if (r < 0xfe) { /* valid start char? */ + u = r; + m = 0x7f80; /* used to mask out the length bits */ + do { + if (inptr >= inend) + return 0; + + c = *inptr++; + if ((c & 0xc0) != 0x80) { + r = c; + goto loop; + } + + u = (u << 6) | (c & 0x3f); + r <<= 1; + m <<= 5; + } while (r & 0x40); + + *in = inptr; + + u &= ~m; + } else { + goto again; + } + + return u; +} + + +ETrie * +e_trie_new (gboolean icase) +{ + ETrie *trie; + + trie = g_new (ETrie, 1); + trie->root.next = NULL; + trie->root.fail = NULL; + trie->root.match = NULL; + trie->root.final = 0; + + trie->fail_states = g_ptr_array_sized_new (8); + trie->icase = icase; + + trie->match_chunks = e_memchunk_new (8, sizeof (struct _trie_match)); + trie->state_chunks = e_memchunk_new (8, sizeof (struct _trie_state)); + + return trie; +} + +void +e_trie_free (ETrie *trie) +{ + e_memchunk_destroy (trie->match_chunks); + e_memchunk_destroy (trie->state_chunks); + g_free (trie); +} + + + +static struct _trie_match * +g (struct _trie_state *s, gunichar c) +{ + struct _trie_match *m = s->match; + + while (m && m->c != c) + m = m->next; + + return m; +} + +static struct _trie_state * +trie_insert (ETrie *trie, int depth, struct _trie_state *q, gunichar c) +{ + struct _trie_match *m; + + m = e_memchunk_alloc (trie->match_chunks); + m->next = q->match; + m->c = c; + + q->match = m; + q = m->state = e_memchunk_alloc (trie->state_chunks); + q->match = NULL; + q->fail = &trie->root; + q->final = 0; + q->id = -1; + + if (trie->fail_states->len < depth + 1) { + unsigned int size = trie->fail_states->len; + + size = MAX (size + 64, depth + 1); + g_ptr_array_set_size (trie->fail_states, size); + } + + q->next = trie->fail_states->pdata[depth]; + trie->fail_states->pdata[depth] = q; + + return q; +} + + +#if 1 +static void +dump_trie (struct _trie_state *s, int depth) +{ + char *p = g_alloca ((depth * 2) + 1); + struct _trie_match *m; + + memset (p, ' ', depth * 2); + p[depth * 2] = '\0'; + + fprintf (stderr, "%s[state] %p: final=%d; pattern-id=%d; fail=%p\n", + p, s, s->final, s->id, s->fail); + m = s->match; + while (m) { + fprintf (stderr, " %s'%c' -> %p\n", p, m->c, m->state); + if (m->state) + dump_trie (m->state, depth + 1); + + m = m->next; + } +} +#endif + + +/* + * final = empty set + * FOR p = 1 TO #pat + * q = root + * FOR j = 1 TO m[p] + * IF g(q, pat[p][j]) == null + * insert(q, pat[p][j]) + * ENDIF + * q = g(q, pat[p][j]) + * ENDFOR + * final = union(final, q) + * ENDFOR +*/ + +void +e_trie_add (ETrie *trie, const char *pattern) +{ + const unsigned char *inptr = (const unsigned char *) pattern; + struct _trie_state *q, *q1, *r; + struct _trie_match *m, *n; + int i, depth = 0; + gunichar c; + + /* Step 1: add the pattern to the trie */ + + q = &trie->root; + + while ((c = trie_utf8_getc (&inptr, -1))) { + if (trie->icase) + c = g_unichar_tolower (c); + + m = g (q, c); + if (m == NULL) { + q = trie_insert (trie, depth, q, c); + } else { + q = m->state; + } + + depth++; + } + + q->final = depth; + q->id = pattern_id; + + /* Step 2: compute failure graph */ + + for (i = 0; i < trie->fail_states->len; i++) { + q = trie->fail_states->pdata[i]; + while (q) { + m = q->match; + while (m) { + c = m->c; + q1 = m->state; + r = q->fail; + while (r && (n = g (r, c)) == NULL) + r = r->fail; + + if (r != NULL) { + q1->fail = n->state; + if (q1->fail->final > q1->final) + q1->final = q1->fail->final; + } else { + if ((n = g (&trie->root, c))) + q1->fail = n->state; + else + q1->fail = &trie->root; + } + + m = m->next; + } + + q = q->next; + } + } + + d(fprintf (stderr, "\nafter adding pattern '%s' to trie %p:\n", pattern, trie)); + d(dump_trie (&trie->root, 0)); +} + +/* + * Aho-Corasick + * + * q = root + * FOR i = 1 TO n + * WHILE q != fail AND g(q, text[i]) == fail + * q = h(q) + * ENDWHILE + * IF q == fail + * q = root + * ELSE + * q = g(q, text[i]) + * ENDIF + * IF isElement(q, final) + * RETURN TRUE + * ENDIF + * ENDFOR + * RETURN FALSE + */ + +const char * +e_trie_search (ETrie *trie, const char *buffer, size_t buflen, const char **pattern) +{ + const unsigned char *inptr, *inend, *prev, *pat; + register size_t inlen = buflen; + struct _trie_state *q; + struct _trie_match *m; + gunichar c, u; + + inptr = (const unsigned char *) buffer; + inend = inptr + buflen; + + q = &trie->root; + pat = prev = inptr; + while ((c = trie_utf8_getc (&inptr, inlen))) { + inlen = (inend - inptr); + + if (trie->icase) + c = g_unichar_tolower (c); + + while (q != NULL && (m = g (q, c)) == NULL) + q = q->fail; + + if (q == &trie->root) + pat = prev; + + if (q == NULL) { + q = &trie->root; + pat = inptr; + } else if (m != NULL) { + q = m->state; + + if (q->final) { + if (matched_id) + *matched_id = q->id; + + return (const char *) pat; + } + } + + prev = inptr; + } + + return NULL; +} diff --git a/e-util/e-trie.h b/e-util/e-trie.h new file mode 100644 index 0000000000..cabf5fc627 --- /dev/null +++ b/e-util/e-trie.h @@ -0,0 +1,47 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * Authors: Jeffrey Stedfast + * + * Copyright 2002 Ximian, Inc. (www.ximian.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA. + * + */ + + +#ifndef __E_TRIE_H__ +#define __E_TRIE_H__ + +#ifdef __cplusplus +extern "C" { +#pragma } +#endif /* __cplusplus */ + +#include + +typedef struct _ETrie ETrie; + +ETrie *e_trie_new (gboolean icase); +void e_trie_free (ETrie *trie); + +void e_trie_add (ETrie *trie, const char *pattern, int pattern_id); + +const char *e_trie_search (ETrie *trie, const char *buffer, size_t buflen, int *matched_id); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ + +#endif /* __E_TRIE_H__ */ -- cgit v1.2.3