From d2971bf6453efbd6b0d63ac9d4a06727062c2955 Mon Sep 17 00:00:00 2001 From: Jeffrey Stedfast Date: Mon, 9 Dec 2002 23:18:31 +0000 Subject: New code to scan for patterns (used only for url pattern matching atm, but 2002-12-09 Jeffrey Stedfast * camel-url-scanner.c: New code to scan for patterns (used only for url pattern matching atm, but we may find other uses for this and thus rename it? I dunno). Uses ETrie. svn path=/trunk/; revision=19075 --- camel/camel-url-scanner.c | 396 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 396 insertions(+) create mode 100644 camel/camel-url-scanner.c (limited to 'camel/camel-url-scanner.c') diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c new file mode 100644 index 0000000000..8ff6a14156 --- /dev/null +++ b/camel/camel-url-scanner.c @@ -0,0 +1,396 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * Authors: Jeffrey Stedfast + * + * Copyright 2002 Ximian, Inc. (www.ximian.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA. + * + */ + + +#ifdef HAVE_CONFIG_H +#include +#endif + +#include + +#include "e-util/e-trie.h" +#include "camel-url-scanner.h" + + +struct _CamelUrlScanner { + GPtrArray *patterns; + ETrie *trie; +}; + + +CamelUrlScanner * +camel_url_scanner_new (void) +{ + CamelUrlScanner *scanner; + + scanner = g_new (CamelUrlScanner, 1); + scanner->patterns = g_ptr_array_new (); + scanner->trie = e_trie_new (TRUE); + + return scanner; +} + + +void +camel_url_scanner_free (CamelUrlScanner *scanner) +{ + g_return_if_fail (scanner != NULL); + + g_ptr_array_free (scanner->patterns, TRUE); + e_trie_free (scanner->trie); + g_free (scanner); +} + + +void +camel_url_scanner_add (CamelUrlScanner *scanner, urlpattern_t *pattern) +{ + g_return_if_fail (scanner != NULL); + + e_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len); + g_ptr_array_add (scanner->patterns, pattern); +} + + +gboolean +camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, urlmatch_t *match) +{ + const char *pos, *inend; + urlpattern_t *pat; + int pattern; + + g_return_val_if_fail (scanner != NULL, FALSE); + g_return_val_if_fail (in != NULL, FALSE); + + if (!(pos = e_trie_search (scanner->trie, in, inlen, &pattern))) + return FALSE; + + pat = g_ptr_array_index (scanner->patterns, pattern); + + inend = in + inlen; + if (!pat->start (in, pos, inend, match)) + return FALSE; + + if (!pat->end (in, pos, inend, match)) + return FALSE; + + match->pattern = pat->pattern; + match->prefix = pat->prefix; + + return TRUE; +} + + +static unsigned char url_scanner_table[256] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 24, 64, 96, 64, 64, 64, 64, 64, 96, 96, 64, 64, 96, 64, 96, 96, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 96, 96, 32, 64, 32, 64, + 96, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 96, 96, 96, 64, 64, + 64, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 64, 64, 64, 64, 1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +enum { + IS_CTRL = (1 << 0), + IS_ALPHA = (1 << 1), + IS_DIGIT = (1 << 2), + IS_LWSP = (1 << 3), + IS_SPACE = (1 << 4), + IS_SPECIAL = (1 << 5), + IS_URLSAFE = (1 << 6), +}; + +#define is_ctrl(x) ((url_scanner_table[(unsigned char)(x)] & IS_CTRL) != 0) +#define is_lwsp(x) ((url_scanner_table[(unsigned char)(x)] & IS_LWSP) != 0) +#define is_atom(x) ((url_scanner_table[(unsigned char)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0) +#define is_alpha(x) ((url_scanner_table[(unsigned char)(x)] & IS_ALPHA) != 0) +#define is_digit(x) ((url_scanner_table[(unsigned char)(x)] & IS_DIGIT) != 0) +#define is_domain(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT)) != 0 || (x) == '-') +#define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0) + + +gboolean +camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) +{ + register const char *inptr = pos; + + g_assert (*inptr == '@'); + + inptr--; + + while (inptr > in) { + if (is_atom (*inptr)) + inptr--; + else + break; + + while (inptr > in && is_atom (*inptr)) + inptr--; + + if (inptr > in && *inptr == '.') + inptr--; + } + + if (!is_atom (*inptr)) + inptr++; + + if (inptr == pos) + return FALSE; + + match->um_so = (inptr - in); + + return TRUE; +} + +gboolean +camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlmatch_t *match) +{ + const char *inptr = pos; + int parts = 0, digits; + + g_assert (*inptr == '@'); + + inptr++; + + if (*inptr == '[') { + /* domain literal */ + do { + inptr++; + + digits = 0; + while (inptr < inend && is_digit (*inptr) && digits < 3) { + inptr++; + digits++; + } + + parts++; + + if (*inptr != '.' && parts != 4) + return FALSE; + } while (parts < 4); + + if (*inptr == ']') + inptr++; + else + return FALSE; + } else { + while (inptr < inend) { + if (is_domain (*inptr)) + inptr++; + else + break; + + while (inptr < inend && is_domain (*inptr)) + inptr++; + + if (inptr < inend && *inptr == '.') + inptr++; + } + } + + if (inptr == pos) + return FALSE; + + match->um_eo = (inptr - in); + + return TRUE; +} + +gboolean +camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) +{ + match->um_so = (pos - in); + + return TRUE; +} + +gboolean +camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch_t *match) +{ + register const char *inptr = pos; + + inptr += strlen (match->pattern); + + if (*inptr == '/') + inptr++; + + while (inptr < inend && is_urlsafe (*inptr)) + inptr++; + + if (inptr == pos) + return FALSE; + + match->um_eo = (inptr - in); + + return TRUE; +} + +gboolean +camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) +{ + match->um_so = (pos - in); + + return TRUE; +} + +gboolean +camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_t *match) +{ + register const char *inptr = pos; + int parts = 0, digits, port; + + inptr += strlen (match->pattern); + + /* find the end of the domain */ + if (is_digit (*inptr)) { + /* domain-literal */ + do { + digits = 0; + while (inptr < inend && is_digit (*inptr) && digits < 3) { + inptr++; + digits++; + } + + parts++; + + if (*inptr != '.' && parts != 4) + return FALSE; + else if (*inptr == '.') + inptr++; + + } while (parts < 4); + } else if (is_domain (*inptr)) { + do { + while (inptr < inend && is_domain (*inptr)) + inptr++; + + if (inptr < inend && *inptr == '.') + inptr++; + else + break; + + } while (inptr < inend); + } else { + return FALSE; + } + + if (inptr < inend && *inptr == ':') { + /* skip past the port */ + inptr++; + port = 0; + + while (inptr < inend && is_digit (*inptr) && port < 65536) + port = (port * 10) + (*inptr++ - '0'); + } + + if (inptr < inend && *inptr == '/') { + /* skip past our url path */ + inptr++; + + while (inptr < inend && is_urlsafe (*inptr)) + inptr++; + } + + match->um_eo = (inptr - in); + + return TRUE; +} + + + +#ifdef BUILD_TABLE + +#include + +/* got these from rfc1738 */ +#define CHARS_LWSP " \t\n\r" /* linear whitespace chars */ +#define CHARS_SPECIAL "()<>@,;:\\\".[]" + +/* got these from rfc1738 */ +#define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&=" + + +static void +table_init_bits (unsigned int mask, const unsigned char *vals) +{ + int i; + + for (i = 0; vals[i] != '\0'; i++) + url_scanner_table[vals[i]] |= mask; +} + +static void +url_scanner_table_init (void) +{ + int i; + + for (i = 0; i < 256; i++) { + url_scanner_table[i] = 0; + if (i < 32) + url_scanner_table[i] |= IS_CTRL; + if ((i >= '0' && i <= '9')) + url_scanner_table[i] |= IS_DIGIT; + if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z')) + url_scanner_table[i] |= IS_ALPHA; + } + + url_scanner_table[127] |= IS_CTRL; + url_scanner_table[' '] |= IS_SPACE; + + /* not defined to be special in rfc0822, but when scanning + backwards to find the beginning of the email address we do + not want to incldue this char if we come accross it - so + this is kind of a hack */ + url_scanner_table['/'] |= IS_SPECIAL; + + table_init_bits (IS_LWSP, CHARS_LWSP); + table_init_bits (IS_SPECIAL, CHARS_SPECIAL); + table_init_bits (IS_URLSAFE, CHARS_URLSAFE); +} + +int main (int argc, char **argv) +{ + int i; + + url_scanner_table_init (); + + printf ("static unsigned char url_scanner_table[256] = {"); + for (i = 0; i < 256; i++) { + printf ("%s%3d%s", (i % 16) ? "" : "\n\t", + url_scanner_table[i], i != 255 ? "," : "\n"); + } + printf ("};\n\n"); + + return 0; +} + +#endif /* BUILD_TABLE */ -- cgit v1.2.3