From 3bed590653f5e6d72ffecb606f2569c1d1057326 Mon Sep 17 00:00:00 2001
From: Jeffrey Stedfast <fejj@ximian.com>
Date: Tue, 3 Feb 2004 18:52:53 +0000
Subject: New test suite for url scanning.

2004-02-03  Jeffrey Stedfast  <fejj@ximian.com>

	* tests/misc/url-scan.c: New test suite for url scanning.

	* camel-url-scanner.c: Added single/double quotes to url_braces[]
	in case the user is quoting the url.
	(camel_url_web_end): Add "-;:" to list of punctuation to strip off
	the end of urls. Also fixed to handle user@domain's
	(camel_url_addrspec_start): Strip open brace characters from the
	beginning of the addr.
	(camel_url_web_start): Make sure "www" wasn't part of something
	not a url (like "Ewww.Gross") by check that pos[-1] is either an
	open brace or whitespace.
	(camel_url_addrspec_end): Don't allow toplevel domain addr-specs
	(if we encounter something that looks like it is a toplevel domain
	addr, it is more likely to be bogus than correct).

svn path=/trunk/; revision=24592
---
 camel/camel-url-scanner.c | 128 ++++++++++++++++++++++++++++++++--------------
 1 file changed, 89 insertions(+), 39 deletions(-)

(limited to 'camel/camel-url-scanner.c')

diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c
index 8a43b05aee..3d7c0b6053 100644
--- a/camel/camel-url-scanner.c
+++ b/camel/camel-url-scanner.c
@@ -139,6 +139,46 @@ enum {
 #define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
 
 
+static struct {
+	char open;
+	char close;
+} url_braces[] = {
+	{ '(', ')' },
+	{ '{', '}' },
+	{ '[', ']' },
+	{ '<', '>' },
+	{ '|', '|' },
+};
+
+static gboolean
+is_open_brace (char c)
+{
+	int i;
+	
+	for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+		if (c == url_braces[i].open)
+			return TRUE;
+	}
+	
+	return FALSE;
+}
+
+static char
+url_stop_at_brace (const char *in, size_t so)
+{
+	int i;
+	
+	if (so > 0) {
+		for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+			if (in[so - 1] == url_braces[i].open)
+				return url_braces[i].close;
+		}
+	}
+	
+	return '\0';
+}
+
+
 gboolean
 camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 {
@@ -161,7 +201,7 @@ camel_url_addrspec_start (const char *in, const char *pos, const char *inend, ur
 			inptr--;
 	}
 	
-	if (!is_atom (*inptr))
+	if (!is_atom (*inptr) || is_open_brace (*inptr))
 		inptr++;
 	
 	if (inptr == pos)
@@ -177,6 +217,7 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
 {
 	const char *inptr = pos;
 	int parts = 0, digits;
+	gboolean got_dot = FALSE;
 	
 	g_assert (*inptr == '@');
 	
@@ -213,12 +254,16 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
 			while (inptr < inend && is_domain (*inptr))
 				inptr++;
 			
-			if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
+			if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) {
+				if (*inptr == '.')
+					got_dot = TRUE;
 				inptr++;
+			}
 		}
 	}
 	
-	if (inptr == pos + 1)
+	/* don't allow toplevel domains */
+	if (inptr == pos + 1 || !got_dot)
 		return FALSE;
 	
 	match->um_eo = (inptr - in);
@@ -226,31 +271,6 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
 	return TRUE;
 }
 
-static struct {
-	char open;
-	char close;
-} url_braces[] = {
-	{ '(', ')' },
-	{ '{', '}' },
-	{ '[', ']' },
-	{ '<', '>' },
-};
-
-static char
-url_stop_at_brace (const char *in, size_t so)
-{
-	int i;
-	
-	if (so > 0) {
-		for (i = 0; i < 4; i++) {
-			if (in[so - 1] == url_braces[i].open)
-				return url_braces[i].close;
-		}
-	}
-	
-	return '\0';
-}
-
 gboolean
 camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 {
@@ -286,6 +306,12 @@ camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch
 gboolean
 camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
 {
+	if (pos > in && !strncmp (pos, "www", 3)) {
+		/* make sure we aren't actually part of another word */
+		if (!is_open_brace (pos[-1]) && !isspace (pos[-1]))
+			return FALSE;
+	}
+	
 	match->um_so = (pos - in);
 	
 	return TRUE;
@@ -320,13 +346,37 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
 				inptr++;
 			
 		} while (parts < 4);
-	} else if (is_domain (*inptr)) {
-				while (inptr < inend) {
-			if (is_domain (*inptr))
+	} else if (is_atom (*inptr)) {
+		/* might be a domain or user@domain */
+		const char *save = inptr;
+		
+		while (inptr < inend) {
+			if (!is_atom (*inptr))
+				break;
+			
+			inptr++;
+			
+			while (inptr < inend && is_atom (*inptr))
 				inptr++;
-			else
+			
+			if (inptr < inend && *inptr == '.' && is_atom (inptr[1]))
+				inptr++;
+		}
+		
+		if (*inptr != '@')
+			inptr = save;
+		else
+			inptr++;
+		
+		goto domain;
+	} else if (is_domain (*inptr)) {
+	domain:
+		while (inptr < inend) {
+			if (!is_domain (*inptr))
 				break;
 			
+			inptr++;
+			
 			while (inptr < inend && is_domain (*inptr))
 				inptr++;
 			
@@ -359,19 +409,19 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
 			while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
 				inptr++;
 			
-			/* urls are extremely unlikely to end with any
-			 * punctuation, so strip any trailing
-			 * punctuation off. Also strip off any closing
-			 * braces. */
-			while (inptr > pos && strchr (",.?!)}]", inptr[-1]))
-				inptr--;
-			
 			break;
 		default:
 			break;
 		}
 	}
 	
+	/* urls are extremely unlikely to end with any
+	 * punctuation, so strip any trailing
+	 * punctuation off. Also strip off any closing
+	 * braces or quotes. */
+	while (inptr > pos && strchr (",.:;?!-|)}]'\"", inptr[-1]))
+		inptr--;
+	
 	match->um_eo = (inptr - in);
 	
 	return TRUE;
-- 
cgit v1.2.3