diff options
-rw-r--r-- | e-util/ChangeLog | 9 | ||||
-rw-r--r-- | e-util/e-html-utils.c | 114 |
2 files changed, 61 insertions, 62 deletions
diff --git a/e-util/ChangeLog b/e-util/ChangeLog index b282021766..bbc5ceeafc 100644 --- a/e-util/ChangeLog +++ b/e-util/ChangeLog @@ -1,3 +1,12 @@ +2001-04-29 Dan Winship <danw@ximian.com> + + * e-html-utils.c (email_address_extract): Make this smarter. Now + e_text_to_html_full calls it when it sees a '@', and + email_address_extract deals with scanning both forward and + backward to find the bounds of the email address. It's still + fooled by Message-IDs, but there's not a whole lot we can do + there... + 2001-04-26 Dan Winship <danw@ximian.com> * e-host-utils.c (e_gethostbyname_r): Make the Solaris and diff --git a/e-util/e-html-utils.c b/e-util/e-html-utils.c index 8f15e4a39d..3b40c6ae87 100644 --- a/e-util/e-html-utils.c +++ b/e-util/e-html-utils.c @@ -40,6 +40,22 @@ check_size (char **buffer, int *buffer_size, char *out, int len) return out; } +/* 1 = non-email-address chars: ()<>@,;:\"[] */ +/* 2 = trailing garbage: ,.!?;:>)]}`'-_ */ +static int special_chars[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* nul - 0x0f */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x10 - 0x1f */ + 1, 2, 1, 0, 0, 0, 0, 2, 1, 3, 0, 0, 3, 2, 2, 0, /* sp - / */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 3, 1, 0, 3, 2, /* 0 - ? */ + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* @ - O */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 3, 0, 2, /* P - _ */ + 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* ` - o */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0 /* p - del */ +}; + +#define is_addr_char(c) (isprint (c) && !(special_chars[c] & 1)) +#define is_trailing_garbage(c) (!isprint(c) || (special_chars[c] & 2)) + static char * url_extract (const unsigned char **text, gboolean check) { @@ -50,7 +66,7 @@ url_extract (const unsigned char **text, gboolean check) end++; /* Back up if we probably went too far. */ - while (end > *text && strchr (",.!?;:>)]}", *(end - 1))) + while (end > *text && is_trailing_garbage (*(end - 1))) end--; if (check) { @@ -65,58 +81,37 @@ url_extract (const unsigned char **text, gboolean check) return out; } -/* FIXME -- this should be smarter */ -static gboolean -is_email_address (const unsigned char *c) +static char * +email_address_extract (const unsigned char **cur, char **out, const unsigned char *linestart) { - gboolean seen_at = FALSE, seen_postat = FALSE; - - if (c == NULL) - return FALSE; - - if (*c == '<') - ++c; - - while (*c && (isalnum ((gint) *c) - || *c == '-' - || *c == '_' - || *c == (seen_at ? '.' : '@'))) { - - if (seen_at && !seen_postat) { - if (*c == '.') - return FALSE; - seen_postat = TRUE; - } + const unsigned char *start, *end, *dot; + char *addr; - if (*c == '@') - seen_at = TRUE; + /* *cur points to the '@'. Look backward for a valid local-part */ + for (start = *cur; start - 1 >= linestart && is_addr_char (*(start - 1)); start--) + ; + if (start == *cur) + return NULL; - ++c; + /* Now look forward for a valid domain part */ + for (end = *cur + 1, dot = NULL; is_addr_char (*end); end++) { + if (*end == '.' && !dot) + dot = end; } - - return seen_at && seen_postat && (isspace ((gint) *c) || *c == '>' || !*c); -} - -static gchar * -email_address_extract (const unsigned char **text) -{ - const unsigned char *end = *text; - char *out; - - if (end == NULL) + if (!dot) return NULL; - while (*end && !isspace (*end) && (*end != '>') && (*end < 0x80)) - ++end; - - out = g_strndup (*text, end - *text); - if (!is_email_address (out)) { - g_free (out); + /* Remove trailing garbage */ + while (is_trailing_garbage (*(end - 1))) + end--; + if (dot > end) return NULL; - } - *text = end; - return out; + addr = g_strndup (start, end - start); + *out -= *cur - start; + *cur = end; + + return addr; } static gboolean @@ -203,7 +198,7 @@ is_citation (const unsigned char *c, gboolean saw_citation) char * e_text_to_html_full (const char *input, unsigned int flags, guint32 color) { - const unsigned char *cur = input; + const unsigned char *cur, *linestart; char *buffer = NULL; char *out = NULL; int buffer_size = 0, col; @@ -219,7 +214,7 @@ e_text_to_html_full (const char *input, unsigned int flags, guint32 color) col = 0; - for (cur = input; cur && *cur; cur = g_utf8_next_char (cur)) { + for (cur = linestart = input; cur && *cur; cur = g_utf8_next_char (cur)) { gunichar u; if (flags & E_TEXT_TO_HTML_MARK_CITATION && col == 0) { @@ -290,29 +285,23 @@ e_text_to_html_full (const char *input, unsigned int flags, guint32 color) u = g_utf8_get_char (cur); } - if (g_unichar_isalpha (u) - && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES) - && is_email_address (cur)) { - gchar *addr = NULL, *dispaddr = NULL; + if (u == '@' && (flags & E_TEXT_TO_HTML_CONVERT_ADDRESSES)) { + char *addr, *dispaddr, *outaddr; - addr = email_address_extract (&cur); - dispaddr = e_text_to_html (addr, 0); - + addr = email_address_extract (&cur, &out, linestart); if (addr) { - gchar *outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>", - addr, dispaddr); - out = check_size (&buffer, &buffer_size, out, strlen(outaddr)); + dispaddr = e_text_to_html (addr, 0); + outaddr = g_strdup_printf ("<a href=\"mailto:%s\">%s</a>", + addr, dispaddr); + out = check_size (&buffer, &buffer_size, out, strlen (outaddr)); out += sprintf (out, "%s", outaddr); col += strlen (addr); g_free (addr); g_free (dispaddr); g_free (outaddr); - } - if (!*cur) - break; - u = g_utf8_get_char (cur); - + u = g_utf8_get_char (cur); + } } if (u == (gunichar)-1) { @@ -355,6 +344,7 @@ e_text_to_html_full (const char *input, unsigned int flags, guint32 color) out += 4; } *out++ = *cur; + linestart = cur; col = 0; break; |