From 3bed590653f5e6d72ffecb606f2569c1d1057326 Mon Sep 17 00:00:00 2001
From: Jeffrey Stedfast <fejj@ximian.com>
Date: Tue, 3 Feb 2004 18:52:53 +0000
Subject: New test suite for url scanning.

2004-02-03  Jeffrey Stedfast  <fejj@ximian.com>

	* tests/misc/url-scan.c: New test suite for url scanning.

	* camel-url-scanner.c: Added single/double quotes to url_braces[]
	in case the user is quoting the url.
	(camel_url_web_end): Add "-;:" to list of punctuation to strip off
	the end of urls. Also fixed to handle user@domain's
	(camel_url_addrspec_start): Strip open brace characters from the
	beginning of the addr.
	(camel_url_web_start): Make sure "www" wasn't part of something
	not a url (like "Ewww.Gross") by checking that pos[-1] is either an
	open brace or whitespace.
	(camel_url_addrspec_end): Don't allow toplevel domain addr-specs
	(if we encounter something that looks like it is a toplevel domain
	addr, it is more likely to be bogus than correct).

svn path=/trunk/; revision=24592
---
 camel/ChangeLog              |  17 ++++++
 camel/camel-url-scanner.c    | 128 ++++++++++++++++++++++++++++-------------
 camel/tests/misc/Makefile.am |   3 +-
 camel/tests/misc/url-scan.c  | 132 +++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 240 insertions(+), 40 deletions(-)
 create mode 100644 camel/tests/misc/url-scan.c

diff --git a/camel/ChangeLog b/camel/ChangeLog
index b222c86fa0..0a90be539a 100644
--- a/camel/ChangeLog
+++ b/camel/ChangeLog
@@ -1,3 +1,20 @@
+2004-02-03  Jeffrey Stedfast  <fejj@ximian.com>
+
+	* tests/misc/url-scan.c: New test suite for url scanning.
+
+	* camel-url-scanner.c: Added single/double quotes to url_braces[]
+	in case the user is quoting the url.
+	(camel_url_web_end): Add "-;:" to list of punctuation to strip off
+	the end of urls. Also fixed to handle user@domain's
+	(camel_url_addrspec_start): Strip open brace characters from the
+	beginning of the addr.
+	(camel_url_web_start): Make sure "www" wasn't part of something
+	not a url (like "Ewww.Gross") by checking that pos[-1] is either an
+	open brace or whitespace.
+	(camel_url_addrspec_end): Don't allow toplevel domain addr-specs
+	(if we encounter something that looks like it is a toplevel domain
+	addr, it is more likely to be bogus than correct).
+
 2004-02-02  Jeffrey Stedfast  <fejj@ximian.com>
 
 	Fixes for bug #53091.
diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c
index 8a43b05aee..3d7c0b6053 100644
--- a/camel/camel-url-scanner.c
+++ b/camel/camel-url-scanner.c
@@ -139,6 +139,46 @@ enum {
 #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
 
 
+static struct {
+	char open;	/* character is_open_brace() and url_stop_at_brace() look for */
+	char close;	/* counterpart that url_stop_at_brace() returns for `open` */
+} url_braces[] = {
+	{ '(', ')' },
+	{ '{', '}' },
+	{ '[', ']' },
+	{ '<', '>' },
+	{ '|', '|' },
+};
+/* Returns TRUE if c equals the open member of any url_braces[] entry, FALSE otherwise. */
+static gboolean
+is_open_brace (char c)
+{
+	int i;
+	
+	for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+		if (c == url_braces[i].open)
+			return TRUE;
+	}
+	
+	return FALSE;
+}
+/* Returns the url_braces[] close char paired with in[so - 1], or '\0' if so == 0 or nothing matches. */
+static char
+url_stop_at_brace (const char *in, size_t so)
+{
+	int i;
+	
+	if (so > 0) {
+		for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+			if (in[so - 1] == url_braces[i].open)
+				return url_braces[i].close;
+		}
+	}
+	
+	return '\0';
+}
+
+
 gboolean
 camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 {
@@ -161,7 +201,7 @@ camel_url_addrspec_start (const char *in, const char *pos, const char *inend, ur
 			inptr--;
 	}
 	
-	if (!is_atom (*inptr))
+	if (!is_atom (*inptr) || is_open_brace (*inptr))
 		inptr++;
 	
 	if (inptr == pos)
@@ -177,6 +217,7 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
 {
 	const char *inptr = pos;
 	int parts = 0, digits;
+	gboolean got_dot = FALSE;
 	
 	g_assert (*inptr == '@');
 	
@@ -213,12 +254,16 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
 			while (inptr < inend && is_domain (*inptr))
 				inptr++;
 			
-			if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
+			if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) {
+				if (*inptr == '.')
+					got_dot = TRUE;
 				inptr++;
+			}
 		}
 	}
 	
-	if (inptr == pos + 1)
+	/* don't allow toplevel domains */
+	if (inptr == pos + 1 || !got_dot)
 		return FALSE;
 	
 	match->um_eo = (inptr - in);
@@ -226,31 +271,6 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
 	return TRUE;
 }
 
-static struct {
-	char open;
-	char close;
-} url_braces[] = {
-	{ '(', ')' },
-	{ '{', '}' },
-	{ '[', ']' },
-	{ '<', '>' },
-};
-
-static char
-url_stop_at_brace (const char *in, size_t so)
-{
-	int i;
-	
-	if (so > 0) {
-		for (i = 0; i < 4; i++) {
-			if (in[so - 1] == url_braces[i].open)
-				return url_braces[i].close;
-		}
-	}
-	
-	return '\0';
-}
-
 gboolean
 camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 {
@@ -286,6 +306,12 @@ camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch
 gboolean
 camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 {
+	if (pos > in && !strncmp (pos, "www", 3)) {
+		/* reject "www" glued to a preceding word; cast keeps isspace() defined for 8-bit chars */
+		if (!is_open_brace (pos[-1]) && !isspace ((unsigned char) pos[-1]))
+			return FALSE;
+	}
+	
 	match->um_so = (pos - in);
 	
 	return TRUE;
@@ -320,13 +346,37 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
 				inptr++;
 			
 		} while (parts < 4);
-	} else if (is_domain (*inptr)) {
-				while (inptr < inend) {
-			if (is_domain (*inptr))
+	} else if (is_atom (*inptr)) {
+		/* might be a domain or user@domain */
+		const char *save = inptr;
+		
+		while (inptr < inend) {
+			if (!is_atom (*inptr))
+				break;
+			
+			inptr++;
+			
+			while (inptr < inend && is_atom (*inptr))
 				inptr++;
-			else
+			
+			if (inptr < inend && *inptr == '.' && is_atom (inptr[1]))
+				inptr++;
+		}
+		
+		if (*inptr != '@')
+			inptr = save;
+		else
+			inptr++;
+		
+		goto domain;
+	} else if (is_domain (*inptr)) {
+	domain:
+		while (inptr < inend) {
+			if (!is_domain (*inptr))
 				break;
 			
+			inptr++;
+			
 			while (inptr < inend && is_domain (*inptr))
 				inptr++;
 			
@@ -359,19 +409,19 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
 			while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
 				inptr++;
 			
-			/* urls are extremely unlikely to end with any
-			 * punctuation, so strip any trailing
-			 * punctuation off. Also strip off any closing
-			 * braces. */
-			while (inptr > pos && strchr (",.?!)}]", inptr[-1]))
-				inptr--;
-			
 			break;
 		default:
 			break;
 		}
 	}
 	
+	/* urls are extremely unlikely to end with any
+	 * punctuation, so strip any trailing
+	 * punctuation off. Also strip off any closing
+	 * braces or quotes. */
+	while (inptr > pos && strchr (",.:;?!-|)}]'\"", inptr[-1]))
+		inptr--;
+	
 	match->um_eo = (inptr - in);
 	
 	return TRUE;
diff --git a/camel/tests/misc/Makefile.am b/camel/tests/misc/Makefile.am
index d3ed29f2e6..cc119e9390 100644
--- a/camel/tests/misc/Makefile.am
+++ b/camel/tests/misc/Makefile.am
@@ -18,10 +18,11 @@ LDADD = \
 
 check_PROGRAMS =  	\
 	url		\
+	url-scan	\
 	utf7		\
 	split
 
-TESTS = url utf7 split
+TESTS = url utf7 split url-scan
 
 
 
diff --git a/camel/tests/misc/url-scan.c b/camel/tests/misc/url-scan.c
new file mode 100644
index 0000000000..a7bbf51cf5
--- /dev/null
+++ b/camel/tests/misc/url-scan.c
@@ -0,0 +1,132 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ *  Authors: Jeffrey Stedfast <fejj@ximian.com>
+ *
+ *  Copyright 2004 Ximian, Inc. (www.ximian.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <camel/camel-mime-filter-tohtml.h>
+
+#include "camel-test.h"
+
+static const struct {
+	const char *text, *url;	/* input text; href expected in the html, NULL if none */
+} url_tests[] = {
+	{ "bob@foo.com", "mailto:bob@foo.com" },
+	{ "Ends with bob@foo.com", "mailto:bob@foo.com" },
+	{ "bob@foo.com at start", "mailto:bob@foo.com" },
+	{ "bob@foo.com.", "mailto:bob@foo.com" },
+	{ "\"bob@foo.com\"", "mailto:bob@foo.com" },
+	{ "<bob@foo.com>", "mailto:bob@foo.com" },
+	{ "(bob@foo.com)", "mailto:bob@foo.com" },
+	{ "bob@foo.com, 555-9999", "mailto:bob@foo.com" },
+	{ "|bob@foo.com|555-9999|", "mailto:bob@foo.com" },
+	{ "bob@ no match bob@", NULL },
+	{ "@foo.com no match @foo.com", NULL },
+	{ "\"bob\"@foo.com", NULL },
+	{ "M@ke money fast!", NULL },
+	{ "ASCII art @_@ @>->-", NULL },
+
+	{ "http://www.foo.com", "http://www.foo.com" },
+	{ "Ends with http://www.foo.com", "http://www.foo.com" },
+	{ "http://www.foo.com at start", "http://www.foo.com" },
+	{ "http://www.foo.com.", "http://www.foo.com" },
+	{ "http://www.foo.com/.", "http://www.foo.com/" },
+	{ "<http://www.foo.com>", "http://www.foo.com" },
+	{ "(http://www.foo.com)", "http://www.foo.com" },
+	{ "http://www.foo.com, 555-9999", "http://www.foo.com" },
+	{ "|http://www.foo.com|555-9999|", "http://www.foo.com" },
+	{ "foo http://www.foo.com/ bar", "http://www.foo.com/" },
+	{ "foo http://www.foo.com/index.html bar", "http://www.foo.com/index.html" },
+	{ "foo http://www.foo.com/q?99 bar", "http://www.foo.com/q?99" },
+	{ "foo http://www.foo.com/;foo=bar&baz=quux bar", "http://www.foo.com/;foo=bar&baz=quux" },
+	{ "foo http://www.foo.com/index.html#anchor bar", "http://www.foo.com/index.html#anchor" },
+	{ "http://www.foo.com/index.html; foo", "http://www.foo.com/index.html" },
+	{ "http://www.foo.com/index.html: foo", "http://www.foo.com/index.html" },
+	{ "http://www.foo.com/index.html-- foo", "http://www.foo.com/index.html" },
+	{ "http://www.foo.com/index.html?", "http://www.foo.com/index.html" },
+	{ "http://www.foo.com/index.html!", "http://www.foo.com/index.html" },
+	{ "\"http://www.foo.com/index.html\"", "http://www.foo.com/index.html" },
+	{ "'http://www.foo.com/index.html'", "http://www.foo.com/index.html" },
+	{ "http://bob@www.foo.com/bar/baz/", "http://bob@www.foo.com/bar/baz/" },
+	{ "http no match http", NULL },
+	{ "http: no match http:", NULL },
+	{ "http:// no match http://", NULL },
+	{ "unrecognized://bob@foo.com/path", "mailto:bob@foo.com" },
+
+	{ "src/www.c", NULL },
+	{ "Ewwwwww.Gross.", NULL },
+
+};
+
+static int num_url_tests = G_N_ELEMENTS (url_tests);
+/* Runs each url_tests[] entry through camel_text_to_html () and checks the first href emitted; returns the error count. */
+int main (int argc, char **argv)
+{
+	char *html, *url, *p;
+	int i, errors = 0;
+	guint32 flags;
+	
+	camel_test_init (argc, argv);
+	
+	camel_test_start ("URL scanning");
+	
+	flags = CAMEL_MIME_FILTER_TOHTML_CONVERT_URLS | CAMEL_MIME_FILTER_TOHTML_CONVERT_ADDRESSES;
+	for (i = 0; i < num_url_tests; i++) {
+		camel_test_push ("'%s' => '%s'", url_tests[i].text, url_tests[i].url ? url_tests[i].url : "None");
+		/* NOTE(review): no camel_test_pull () is visible for this push -- confirm whether one is needed */
+		html = camel_text_to_html (url_tests[i].text, flags, 0);
+		
+		url = strstr (html, "href=\"");
+		if (url) {
+			url += 6;
+			p = strchr (url, '"');
+			if (p)
+				*p = '\0';
+			/* collapse each "&amp;" back to '&' so the url compares against the plain expected string */
+			while ((p = strstr (url, "&amp;")))
+				memmove (p + 1, p + 5, strlen (p + 5) + 1);
+		}
+		
+		if ((url && (!url_tests[i].url || strcmp (url, url_tests[i].url) != 0)) ||
+		    (!url && url_tests[i].url)) {
+			printf ("FAILED on \"%s\" -> %s\n  (got %s)\n\n",
+				url_tests[i].text,
+				url_tests[i].url ? url_tests[i].url : "(nothing)",
+				url ? url : "(nothing)");
+			errors++;
+		}
+		
+		g_free (html);
+	}
+	
+	printf ("\n%d errors\n", errors);
+	
+	camel_test_end ();
+	
+	return errors;
+}
-- 
cgit v1.2.3