From 3bed590653f5e6d72ffecb606f2569c1d1057326 Mon Sep 17 00:00:00 2001 From: Jeffrey Stedfast Date: Tue, 3 Feb 2004 18:52:53 +0000 Subject: New test suite for url scanning. 2004-02-03 Jeffrey Stedfast * tests/misc/url-scan.c: New test suite for url scanning. * camel-url-scanner.c: Added single/double quotes to url_braces[] in case the user is quoting the url. (camel_url_web_end): Add "-;:" to list of punctuation to strip off the end of urls. Also fixed to handle user@domain addresses. (camel_url_addrspec_start): Strip open brace characters from the beginning of the addr. (camel_url_web_start): Make sure "www" wasn't part of something not a url (like "Ewww.Gross") by checking that pos[-1] is either an open brace or whitespace. (camel_url_addrspec_end): Don't allow toplevel domain addr-specs (if we encounter something that looks like it is a toplevel domain addr, it is more likely to be bogus than correct). svn path=/trunk/; revision=24592 --- camel/camel-url-scanner.c | 128 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 89 insertions(+), 39 deletions(-) (limited to 'camel/camel-url-scanner.c') diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c index 8a43b05aee..3d7c0b6053 100644 --- a/camel/camel-url-scanner.c +++ b/camel/camel-url-scanner.c @@ -139,6 +139,46 @@ enum { #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0) +static struct { + char open; + char close; +} url_braces[] = { + { '(', ')' }, + { '{', '}' }, + { '[', ']' }, + { '<', '>' }, + { '|', '|' }, +}; + +static gboolean +is_open_brace (char c) +{ + int i; + + for (i = 0; i < G_N_ELEMENTS (url_braces); i++) { + if (c == url_braces[i].open) + return TRUE; + } + + return FALSE; +} + +static char +url_stop_at_brace (const char *in, size_t so) +{ + int i; + + if (so > 0) { + for (i = 0; i < G_N_ELEMENTS (url_braces); i++) { + if (in[so - 1] == url_braces[i].open) + return url_braces[i].close; + } + } + + return '\0'; +} + + gboolean 
camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) { @@ -161,7 +201,7 @@ camel_url_addrspec_start (const char *in, const char *pos, const char *inend, ur inptr--; } - if (!is_atom (*inptr)) + if (!is_atom (*inptr) || is_open_brace (*inptr)) inptr++; if (inptr == pos) @@ -177,6 +217,7 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm { const char *inptr = pos; int parts = 0, digits; + gboolean got_dot = FALSE; g_assert (*inptr == '@'); @@ -213,12 +254,16 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm while (inptr < inend && is_domain (*inptr)) inptr++; - if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) + if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) { + if (*inptr == '.') + got_dot = TRUE; inptr++; + } } } - if (inptr == pos + 1) + /* don't allow toplevel domains */ + if (inptr == pos + 1 || !got_dot) return FALSE; match->um_eo = (inptr - in); @@ -226,31 +271,6 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm return TRUE; } -static struct { - char open; - char close; -} url_braces[] = { - { '(', ')' }, - { '{', '}' }, - { '[', ']' }, - { '<', '>' }, -}; - -static char -url_stop_at_brace (const char *in, size_t so) -{ - int i; - - if (so > 0) { - for (i = 0; i < 4; i++) { - if (in[so - 1] == url_braces[i].open) - return url_braces[i].close; - } - } - - return '\0'; -} - gboolean camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) { @@ -286,6 +306,12 @@ camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch gboolean camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) { + if (pos > in && !strncmp (pos, "www", 3)) { + /* make sure we aren't actually part of another word */ + if (!is_open_brace (pos[-1]) && !isspace (pos[-1])) + return FALSE; + } + match->um_so = (pos - in); return TRUE; @@ 
-320,13 +346,37 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_ inptr++; } while (parts < 4); - } else if (is_domain (*inptr)) { - while (inptr < inend) { - if (is_domain (*inptr)) + } else if (is_atom (*inptr)) { + /* might be a domain or user@domain */ + const char *save = inptr; + + while (inptr < inend) { + if (!is_atom (*inptr)) + break; + + inptr++; + + while (inptr < inend && is_atom (*inptr)) inptr++; - else + + if (inptr < inend && *inptr == '.' && is_atom (inptr[1])) + inptr++; + } + + if (*inptr != '@') + inptr = save; + else + inptr++; + + goto domain; + } else if (is_domain (*inptr)) { + domain: + while (inptr < inend) { + if (!is_domain (*inptr)) break; + inptr++; + while (inptr < inend && is_domain (*inptr)) inptr++; @@ -359,19 +409,19 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_ while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace) inptr++; - /* urls are extremely unlikely to end with any - * punctuation, so strip any trailing - * punctuation off. Also strip off any closing - * braces. */ - while (inptr > pos && strchr (",.?!)}]", inptr[-1])) - inptr--; - break; default: break; } } + /* urls are extremely unlikely to end with any + * punctuation, so strip any trailing + * punctuation off. Also strip off any closing + * braces or quotes. */ + while (inptr > pos && strchr (",.:;?!-|)}]'\"", inptr[-1])) + inptr--; + match->um_eo = (inptr - in); return TRUE; -- cgit v1.2.3