From 3bed590653f5e6d72ffecb606f2569c1d1057326 Mon Sep 17 00:00:00 2001 From: Jeffrey Stedfast Date: Tue, 3 Feb 2004 18:52:53 +0000 Subject: New test suite for url scanning. 2004-02-03 Jeffrey Stedfast * tests/misc/url-scan.c: New test suite for url scanning. * camel-url-scanner.c: Added single/double quotes to url_braces[] in case the user is quoting the url. (camel_url_web_end): Add "-;:" to list of punctuation to strip off the end of urls. Also fixed to handle user@domain's (camel_url_addrspec_start): Strip open brace characters from the beginning of the addr. (camel_url_web_start): Make sure "www" wasn't part of something not a url (like "Ewww.Gross") by checking that pos[-1] is either an open brace or whitespace. (camel_url_addrspec_end): Don't allow toplevel domain addr-specs (if we encounter something that looks like it is a toplevel domain addr, it is more likely to be bogus than correct). svn path=/trunk/; revision=24592 --- camel/ChangeLog | 17 ++++++ camel/camel-url-scanner.c | 128 ++++++++++++++++++++++++++++------------- camel/tests/misc/Makefile.am | 3 +- camel/tests/misc/url-scan.c | 132 +++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 240 insertions(+), 40 deletions(-) create mode 100644 camel/tests/misc/url-scan.c diff --git a/camel/ChangeLog b/camel/ChangeLog index b222c86fa0..0a90be539a 100644 --- a/camel/ChangeLog +++ b/camel/ChangeLog @@ -1,3 +1,20 @@ +2004-02-03 Jeffrey Stedfast + + * tests/misc/url-scan.c: New test suite for url scanning. + + * camel-url-scanner.c: Added single/double quotes to url_braces[] + in case the user is quoting the url. + (camel_url_web_end): Add "-;:" to list of punctuation to strip off + the end of urls. Also fixed to handle user@domain's + (camel_url_addrspec_start): Strip open brace characters from the + beginning of the addr. + (camel_url_web_start): Make sure "www" wasn't part of something + not a url (like "Ewww.Gross") by checking that pos[-1] is either an + open brace or whitespace. 
+ (camel_url_addrspec_end): Don't allow toplevel domain addr-specs + (if we encounter something that looks like it is a toplevel domain + addr, it is more likely to be bogus than correct). + 2004-02-02 Jeffrey Stedfast Fixes for bug #53091. diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c index 8a43b05aee..3d7c0b6053 100644 --- a/camel/camel-url-scanner.c +++ b/camel/camel-url-scanner.c @@ -139,6 +139,46 @@ enum { #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0) +static struct { + char open; + char close; +} url_braces[] = { + { '(', ')' }, + { '{', '}' }, + { '[', ']' }, + { '<', '>' }, + { '|', '|' }, +}; + +static gboolean +is_open_brace (char c) +{ + int i; + + for (i = 0; i < G_N_ELEMENTS (url_braces); i++) { + if (c == url_braces[i].open) + return TRUE; + } + + return FALSE; +} + +static char +url_stop_at_brace (const char *in, size_t so) +{ + int i; + + if (so > 0) { + for (i = 0; i < G_N_ELEMENTS (url_braces); i++) { + if (in[so - 1] == url_braces[i].open) + return url_braces[i].close; + } + } + + return '\0'; +} + + gboolean camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) { @@ -161,7 +201,7 @@ camel_url_addrspec_start (const char *in, const char *pos, const char *inend, ur inptr--; } - if (!is_atom (*inptr)) + if (!is_atom (*inptr) || is_open_brace (*inptr)) inptr++; if (inptr == pos) @@ -177,6 +217,7 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm { const char *inptr = pos; int parts = 0, digits; + gboolean got_dot = FALSE; g_assert (*inptr == '@'); @@ -213,12 +254,16 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm while (inptr < inend && is_domain (*inptr)) inptr++; - if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) + if (inptr < inend && *inptr == '.' 
&& is_domain (inptr[1])) { + if (*inptr == '.') + got_dot = TRUE; inptr++; + } } } - if (inptr == pos + 1) + /* don't allow toplevel domains */ + if (inptr == pos + 1 || !got_dot) return FALSE; match->um_eo = (inptr - in); @@ -226,31 +271,6 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm return TRUE; } -static struct { - char open; - char close; -} url_braces[] = { - { '(', ')' }, - { '{', '}' }, - { '[', ']' }, - { '<', '>' }, -}; - -static char -url_stop_at_brace (const char *in, size_t so) -{ - int i; - - if (so > 0) { - for (i = 0; i < 4; i++) { - if (in[so - 1] == url_braces[i].open) - return url_braces[i].close; - } - } - - return '\0'; -} - gboolean camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) { @@ -286,6 +306,12 @@ camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch gboolean camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match) { + if (pos > in && !strncmp (pos, "www", 3)) { + /* make sure we aren't actually part of another word */ + if (!is_open_brace (pos[-1]) && !isspace (pos[-1])) + return FALSE; + } + match->um_so = (pos - in); return TRUE; @@ -320,13 +346,37 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_ inptr++; } while (parts < 4); - } else if (is_domain (*inptr)) { - while (inptr < inend) { - if (is_domain (*inptr)) + } else if (is_atom (*inptr)) { + /* might be a domain or user@domain */ + const char *save = inptr; + + while (inptr < inend) { + if (!is_atom (*inptr)) + break; + + inptr++; + + while (inptr < inend && is_atom (*inptr)) inptr++; - else + + if (inptr < inend && *inptr == '.' 
&& is_atom (inptr[1])) + inptr++; + } + + if (*inptr != '@') + inptr = save; + else + inptr++; + + goto domain; + } else if (is_domain (*inptr)) { + domain: + while (inptr < inend) { + if (!is_domain (*inptr)) break; + inptr++; + while (inptr < inend && is_domain (*inptr)) inptr++; @@ -359,19 +409,19 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_ while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace) inptr++; - /* urls are extremely unlikely to end with any - * punctuation, so strip any trailing - * punctuation off. Also strip off any closing - * braces. */ - while (inptr > pos && strchr (",.?!)}]", inptr[-1])) - inptr--; - break; default: break; } } + /* urls are extremely unlikely to end with any + * punctuation, so strip any trailing + * punctuation off. Also strip off any closing + * braces or quotes. */ + while (inptr > pos && strchr (",.:;?!-|)}]'\"", inptr[-1])) + inptr--; + match->um_eo = (inptr - in); return TRUE; diff --git a/camel/tests/misc/Makefile.am b/camel/tests/misc/Makefile.am index d3ed29f2e6..cc119e9390 100644 --- a/camel/tests/misc/Makefile.am +++ b/camel/tests/misc/Makefile.am @@ -18,10 +18,11 @@ LDADD = \ check_PROGRAMS = \ url \ + url-scan \ utf7 \ split -TESTS = url utf7 split +TESTS = url utf7 split url-scan diff --git a/camel/tests/misc/url-scan.c b/camel/tests/misc/url-scan.c new file mode 100644 index 0000000000..a7bbf51cf5 --- /dev/null +++ b/camel/tests/misc/url-scan.c @@ -0,0 +1,132 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * Authors: Jeffrey Stedfast + * + * Copyright 2004 Ximian, Inc. (www.ximian.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA. + * + */ + + +#ifdef HAVE_CONFIG_H +#include <config.h> +#endif + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include <camel/camel-mime-filter-tohtml.h> + +#include "camel-test.h" + +struct { + char *text, *url; +} url_tests[] = { + { "bob@foo.com", "mailto:bob@foo.com" }, + { "Ends with bob@foo.com", "mailto:bob@foo.com" }, + { "bob@foo.com at start", "mailto:bob@foo.com" }, + { "bob@foo.com.", "mailto:bob@foo.com" }, + { "\"bob@foo.com\"", "mailto:bob@foo.com" }, + { "<bob@foo.com>", "mailto:bob@foo.com" }, + { "(bob@foo.com)", "mailto:bob@foo.com" }, + { "bob@foo.com, 555-9999", "mailto:bob@foo.com" }, + { "|bob@foo.com|555-9999|", "mailto:bob@foo.com" }, + { "bob@ no match bob@", NULL }, + { "@foo.com no match @foo.com", NULL }, + { "\"bob\"@foo.com", NULL }, + { "M@ke money fast!", NULL }, + { "ASCII art @_@ @>->-", NULL }, + + { "http://www.foo.com", "http://www.foo.com" }, + { "Ends with http://www.foo.com", "http://www.foo.com" }, + { "http://www.foo.com at start", "http://www.foo.com" }, + { "http://www.foo.com.", "http://www.foo.com" }, + { "http://www.foo.com/.", "http://www.foo.com/" }, + { "<http://www.foo.com>", "http://www.foo.com" }, + { "(http://www.foo.com)", "http://www.foo.com" }, + { "http://www.foo.com, 555-9999", "http://www.foo.com" }, + { "|http://www.foo.com|555-9999|", "http://www.foo.com" }, + { "foo http://www.foo.com/ bar", "http://www.foo.com/" }, + { "foo http://www.foo.com/index.html bar", "http://www.foo.com/index.html" }, + { "foo http://www.foo.com/q?99 bar", "http://www.foo.com/q?99" }, + { "foo http://www.foo.com/;foo=bar&baz=quux bar", 
"http://www.foo.com/;foo=bar&baz=quux" }, + { "foo http://www.foo.com/index.html#anchor bar", "http://www.foo.com/index.html#anchor" }, + { "http://www.foo.com/index.html; foo", "http://www.foo.com/index.html" }, + { "http://www.foo.com/index.html: foo", "http://www.foo.com/index.html" }, + { "http://www.foo.com/index.html-- foo", "http://www.foo.com/index.html" }, + { "http://www.foo.com/index.html?", "http://www.foo.com/index.html" }, + { "http://www.foo.com/index.html!", "http://www.foo.com/index.html" }, + { "\"http://www.foo.com/index.html\"", "http://www.foo.com/index.html" }, + { "'http://www.foo.com/index.html'", "http://www.foo.com/index.html" }, + { "http://bob@www.foo.com/bar/baz/", "http://bob@www.foo.com/bar/baz/" }, + { "http no match http", NULL }, + { "http: no match http:", NULL }, + { "http:// no match http://", NULL }, + { "unrecognized://bob@foo.com/path", "mailto:bob@foo.com" }, + + { "src/www.c", NULL }, + { "Ewwwwww.Gross.", NULL }, + +}; + +static int num_url_tests = G_N_ELEMENTS (url_tests); + +int main (int argc, char **argv) +{ + char *html, *url, *p; + int i, errors = 0; + guint32 flags; + + camel_test_init (argc, argv); + + camel_test_start ("URL scanning"); + + flags = CAMEL_MIME_FILTER_TOHTML_CONVERT_URLS | CAMEL_MIME_FILTER_TOHTML_CONVERT_ADDRESSES; + for (i = 0; i < num_url_tests; i++) { + camel_test_push ("'%s' => '%s'", url_tests[i].text, url_tests[i].url ? url_tests[i].url : "None"); + + html = camel_text_to_html (url_tests[i].text, flags, 0); + + url = strstr (html, "href=\""); + if (url) { + url += 6; + p = strchr (url, '"'); + if (p) + *p = '\0'; + + while ((p = strstr (url, "&amp;"))) + memmove (p + 1, p + 5, strlen (p + 5) + 1); + } + + if ((url && (!url_tests[i].url || strcmp (url, url_tests[i].url) != 0)) || + (!url && url_tests[i].url)) { + printf ("FAILED on \"%s\" -> %s\n (got %s)\n\n", + url_tests[i].text, + url_tests[i].url ? url_tests[i].url : "(nothing)", + url ? 
url : "(nothing)"); + errors++; + } + + g_free (html); + } + + printf ("\n%d errors\n", errors); + + camel_test_end (); + + return errors; +} -- cgit v1.2.3