2 files changed, 61 insertions, 62 deletions
diff --git a/e-util/ChangeLog b/e-util/ChangeLog
index b282021766..bbc5ceeafc 100644
--- a/e-util/ChangeLog
+++ b/e-util/ChangeLog
@@ -1,3 +1,12 @@
+2001-04-29  Dan Winship  <danw@ximian.com>
+
+	* e-html-utils.c (email_address_extract): Make this smarter. Now
+	e_text_to_html_full calls it when it sees a '@', and
+	email_address_extract deals with scanning both forward and
+	backward to find the bounds of the email address. It's still
+	fooled by Message-IDs, but there's not a whole lot we can do
+	there...
+
 2001-04-26  Dan Winship  <danw@ximian.com>
 
 	* e-host-utils.c (e_gethostbyname_r): Make the Solaris and
diff --git a/e-util/e-html-utils.c b/e-util/e-html-utils.c
index 8f15e4a39d..3b40c6ae87 100644
--- a/e-util/e-html-utils.c
+++ b/e-util/e-html-utils.c
@@ -40,6 +40,22 @@ check_size (char **buffer, int *buffer_size, char *out, int len)
 	return out;
 }
 
+/* 1 = non-email-address chars: space ()<>@,;:\"[] */
+/* 2 = trailing garbage:        ,.!?;:>)]}`'-_  (values are bit flags; 3 = both) */
+static int special_chars[] = {
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /*  nul - 0x0f */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /* 0x10 - 0x1f */
+	1, 2, 1, 0, 0, 0, 0, 2, 1, 3, 0, 0, 3, 2, 2, 0,    /*   sp - /    */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 1, 0, 3, 2,    /*    0 - ?    */
+	1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /*    @ - O    */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 3, 0, 2,    /*    P - _    */
+	2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    /*    ` - o    */
+	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0     /*    p - del  */
+};
+
+#define is_addr_char(c) (isprint (c) && !(special_chars[c] & 1))  /* printable and flag 1 clear. NOTE(review): table has 128 entries (0x00-0x7f); the lookup is guarded only by isprint(), so c >= 0x80 would index past the end if the locale treats it as printable -- confirm */
+#define is_trailing_garbage(c) (!isprint(c) || (special_chars[c] & 2))  /* non-printable, or flag 2 set; the table is consulted only when isprint(c) is true, so the same bounds caveat applies */
+
 static char *
 url_extract (const unsigned char **text, gboolean check)
 {
@@ -50,7 +66,7 @@ url_extract (const unsigned char **text, gboolean check)
 		end++;
 
 	/* Back up if we probably went too far. */
-	while (end > *text && strchr (",.!?;:>)]}", *(end - 1)))
+	while (end > *text && is_trailing_garbage (*(end - 1)))
 		end--;
 
 	if (check) {
@@ -65,58 +81,37 @@ url_extract (const unsigned char **text, gboolean check)
 	return out;
 }
 
-/* FIXME -- this should be smarter */
-static gboolean
-is_email_address (const unsigned char *c)
+static char *
+email_address_extract (const unsigned char **cur, char **out, const unsigned char *linestart)
 {
-	gboolean seen_at = FALSE, seen_postat = FALSE;
-
-	if (c == NULL)
-		return FALSE;
-
-	if (*c == '<')
-		++c;
-
-	while (*c && (isalnum ((gint) *c)
-		      || *c == '-'
-		      || *c == '_'
-		      || *c == (seen_at ? '.' : '@'))) {
-		
-		if (seen_at && !seen_postat) {
-			if (*c == '.')
-				return FALSE;
-			seen_postat = TRUE;
-		}
+	const unsigned char *start, *end, *dot;
+	char *addr;
 
-		if (*c == '@')
-			seen_at = TRUE;
+	/* *cur points to the '@'. Look backward for a valid local-part */
+	for (start = *cur; start - 1 >= linestart && is_addr_char (*(start - 1)); start--)
+		;
+	if (start == *cur)
+		return NULL;
 
-		++c;
+	/* Now look forward for a valid domain part */
+	for (end = *cur + 1, dot = NULL; is_addr_char (*end); end++) {
+		if (*end == '.' && !dot)
+			dot = end;
 	}
-
-	return seen_at && seen_postat && (isspace ((gint) *c) || *c == '>' || !*c);
-}
-
-static gchar *
-email_address_extract (const unsigned char **text)
-{
-	const unsigned char *end = *text;
-	char *out;
-
-	if (end == NULL)
+	if (!dot)
 		return NULL;
 
-	while (*end && !isspace (*end) && (*end != '>') && (*end < 0x80))
-		++end;
-
-	out = g_strndup (*text, end - *text);
-	if (!is_email_address (out)) {
-		g_free (out);
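+	/* Remove trailing garbage. NOTE(review): for input like "a@b." this leaves end == dot, which the dot > end test below does not reject, so "a@b" is accepted with no dot inside the domain -- confirm intended */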
+	while (is_trailing_garbage (*(end - 1)))
+		end--;
+	if (dot > end)
 		return NULL;
-	}
 
-	*text = end;
-	return out;
+	addr = g_strndup (start, end - start);
+	*out -= *cur - start;
+	*cur = end;
+
+	return addr;
 }
 
 static gboolean
@@ -203,7 +198,7 @@ is_citation (const unsigned char *c, gboolean saw_citation)
 char *
 e_text_to_html_full (const char *input, unsigned int flags, guint32 color)
 {
-	const unsigned char *cur = input;
+	const unsigned char *cur, *linestart;
 	char *buffer = NULL;
 	char *out = NULL;
 	int buffer_size = 0, col;
@@ -219,7 +214,7 @@ e_text_to_html_full (const char *input, unsigned int flags, guint32 color)
 
 	col = 0;
 
-	for (cur = input; cur && *cur; cur = g_utf8_next_char (cur)) {
+	for (cur = linestart = input; cur && *cur; cur = g_utf8_next_char (cur)) {
 		gunichar u;
 
 		if (flags & E_TEXT_TO_HTML_MARK_CITATION && col == 0) {
@@ -290,29 +285,23 @@ e_text_to_html_full (const char *input, unsigned int flags, guint32 color)
 			u = g_utf8_get_char (cur);
 		}
 
-		if (g_unichar_isalpha (u)
-		    && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)
-		    && is_email_address (cur)) {
-			gchar *addr = NULL, *dispaddr = NULL;
+		if (u == '@' && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)) {
+			char *addr, *dispaddr, *outaddr;
 
-			addr = email_address_extract (&cur);
-			dispaddr = e_text_to_html (addr, 0);
-			
+			addr = email_address_extract (&cur, &out, linestart);
 			if (addr) {
-				gchar *outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>",
-								  addr, dispaddr);
-				out = check_size (&buffer, &buffer_size, out, strlen(outaddr));
+				dispaddr = e_text_to_html (addr, 0);
+				outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>",
+							   addr, dispaddr);
+				out = check_size (&buffer, &buffer_size, out, strlen (outaddr));
 				out += sprintf (out, "%s", outaddr);
 				col += strlen (addr);
 				g_free (addr);
 				g_free (dispaddr);
 				g_free (outaddr);
-			}
 
-			if (!*cur)
-				break;
-			u = g_utf8_get_char (cur);
-			
+				u = g_utf8_get_char (cur);
+			}
 		}
 
 		if (u == (gunichar)-1) {
@@ -355,6 +344,7 @@ e_text_to_html_full (const char *input, unsigned int flags, guint32 color)
 				out += 4;
 			}
 			*out++ = *cur;
+			linestart = cur;
 			col = 0;
 			break;