diff options
-rw-r--r-- | camel/ChangeLog | 10 | ||||
-rw-r--r-- | camel/camel-url-scanner.c | 85 |
2 files changed, 60 insertions, 35 deletions
diff --git a/camel/ChangeLog b/camel/ChangeLog index 1d81bead50..592dd9c74b 100644 --- a/camel/ChangeLog +++ b/camel/ChangeLog @@ -1,5 +1,15 @@ 2002-12-09 Jeffrey Stedfast <fejj@ximian.com> + * camel-url-scanner.c (camel_url_addrspec_end): Fixed to not be + fooled in the case where the address is followed immediately by a + period. + (camel_url_web_end): Made more robust. + (camel_url_scanner_scan): Oops. We need to set the match->pattern + string pointer to the correct pattern before executing the + start/end methods (as some of them rely on this info). + +2002-12-09 Jeffrey Stedfast <fejj@ximian.com> + * camel-url-scanner.c: New code to scan for patterns (used only for url pattern matching atm, but we may find other uses for this and thus rename it? I dunno). Uses ETrie. diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c index 8ff6a14156..32ff7064f3 100644 --- a/camel/camel-url-scanner.c +++ b/camel/camel-url-scanner.c @@ -86,6 +86,9 @@ camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, pat = g_ptr_array_index (scanner->patterns, pattern); + match->pattern = pat->pattern; + match->prefix = pat->prefix; + inend = in + inlen; if (!pat->start (in, pos, inend, match)) return FALSE; @@ -93,9 +96,6 @@ camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, if (!pat->end (in, pos, inend, match)) return FALSE; - match->pattern = pat->pattern; - match->prefix = pat->prefix; - return TRUE; } @@ -103,12 +103,12 @@ camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, static unsigned char url_scanner_table[256] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, - 24, 64, 96, 64, 64, 64, 64, 64, 96, 96, 64, 64, 96, 64, 96, 96, - 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 96, 96, 32, 64, 32, 64, - 96, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 96, 96, 96, 64, 64, - 64, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, - 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 64, 64, 64, 64, 1, + 24,128,160,128,128,128,128,128,160,160,128,128,160,192,160,160, + 68, 68, 68, 68, 68, 68, 68, 68, 68, 68,160,160, 32,128, 32,128, + 160, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,160,160,160,128,128, + 128, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, 66, 66, 66,128,128,128,128, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, @@ -126,7 +126,8 @@ enum { IS_LWSP = (1 << 3), IS_SPACE = (1 << 4), IS_SPECIAL = (1 << 5), - IS_URLSAFE = (1 << 6), + IS_DOMAIN = (1 << 6), + IS_URLSAFE = (1 << 7), }; #define is_ctrl(x) ((url_scanner_table[(unsigned char)(x)] & IS_CTRL) != 0) @@ -134,7 +135,7 @@ enum { #define is_atom(x) ((url_scanner_table[(unsigned char)(x)] & (IS_SPECIAL|IS_SPACE|IS_CTRL)) == 0) #define is_alpha(x) ((url_scanner_table[(unsigned char)(x)] & IS_ALPHA) != 0) #define is_digit(x) ((url_scanner_table[(unsigned char)(x)] & IS_DIGIT) != 0) -#define is_domain(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT)) != 0 || (x) == '-') +#define is_domain(x) ((url_scanner_table[(unsigned char)(x)] & IS_DOMAIN) != 0) #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0) @@ -212,7 +213,7 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm while (inptr < inend && is_domain (*inptr)) inptr++; - if (inptr < inend && *inptr == '.') + if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) inptr++; } } @@ -289,35 +290,48 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_ } while (parts < 4); } else if (is_domain (*inptr)) { - do { - while (inptr < inend && is_domain (*inptr)) - inptr++; - - if (inptr < inend && *inptr == '.') + while (inptr < inend) { + if (is_domain (*inptr)) inptr++; else break; - } while (inptr < inend); + while (inptr < inend && is_domain (*inptr)) + inptr++; + + if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) + inptr++; + } } else { return FALSE; } - if (inptr < inend && *inptr == ':') { - /* skip past the port */ - inptr++; - port = 0; - - while (inptr < inend && is_digit (*inptr) && port < 65536) - port = (port * 10) + (*inptr++ - '0'); - } - - if (inptr < inend && *inptr == '/') { - /* skip past our url path */ - inptr++; - - while (inptr < inend && is_urlsafe (*inptr)) + if (inptr < inend) { + switch (*inptr) { + case ':': /* port notation */ inptr++; + port = 0; + + while (inptr < inend && is_digit (*inptr) && port < 65536) + port = (port * 10) + (*inptr++ - '0'); + + if (port >= 65536) + inptr--; + + if (inptr >= inend || *inptr != '/') + break; + + /* we have a '/' so there could be a path - fall through */ + case '/': /* we've detected a path component to our url */ + inptr++; + + while (inptr < inend && is_urlsafe (*inptr)) + inptr++; + + break; + default: + break; + } } match->um_eo = (inptr - in); @@ -358,17 +372,18 @@ url_scanner_table_init (void) if (i < 32) url_scanner_table[i] |= IS_CTRL; if ((i >= '0' && i <= '9')) - url_scanner_table[i] |= IS_DIGIT; + url_scanner_table[i] |= IS_DIGIT | IS_DOMAIN; if ((i >= 'a' && i <= 'z') || (i >= 'A' && i <= 'Z')) - url_scanner_table[i] |= IS_ALPHA; + url_scanner_table[i] |= IS_ALPHA | IS_DOMAIN; } url_scanner_table[127] |= IS_CTRL; url_scanner_table[' '] |= IS_SPACE; + url_scanner_table['-'] |= IS_DOMAIN; /* not defined to be special in rfc0822, but when scanning backwards to find the beginning of the email address we do - not want to incldue this char if we come accross it - so + not want to include this char if we come accross it - so this is kind of a hack */ url_scanner_table['/'] |= IS_SPECIAL; |