aboutsummaryrefslogtreecommitdiffstats
path: root/camel/camel-url-scanner.c
diff options
context:
space:
mode:
author: Jeffrey Stedfast <fejj@ximian.com> 2004-02-04 02:52:53 +0800
committer: Jeffrey Stedfast <fejj@src.gnome.org> 2004-02-04 02:52:53 +0800
commit: 3bed590653f5e6d72ffecb606f2569c1d1057326 (patch)
tree: c9d614774900392368137e9903ad86b88594c356 /camel/camel-url-scanner.c
parent: 229c627ee11c7b5f4b7df9355b42d3bf0d35fbaa (diff)
downloadgsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar.gz
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar.bz2
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar.lz
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar.xz
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.tar.zst
gsoc2013-evolution-3bed590653f5e6d72ffecb606f2569c1d1057326.zip
New test suite for url scanning.
2004-02-03 Jeffrey Stedfast <fejj@ximian.com> * tests/misc/url-scan.c: New test suite for url scanning. * camel-url-scanner.c: Added single/double quotes to url_braces[] in case the user is quoting the url. (camel_url_web_end): Add "-;:" to list of punctuation to strip off the end of urls. Also fixed to handle user@domain's. (camel_url_addrspec_start): Strip open brace characters from the beginning of the addr. (camel_url_web_start): Make sure "www" wasn't part of something not a url (like "Ewww.Gross") by checking that pos[-1] is either an open brace or whitespace. (camel_url_addrspec_end): Don't allow toplevel domain addr-specs (if we encounter something that looks like it is a toplevel domain addr, it is more likely to be bogus than correct). svn path=/trunk/; revision=24592
Diffstat (limited to 'camel/camel-url-scanner.c')
-rw-r--r-- camel/camel-url-scanner.c 128
1 files changed, 89 insertions, 39 deletions
diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c
index 8a43b05aee..3d7c0b6053 100644
--- a/camel/camel-url-scanner.c
+++ b/camel/camel-url-scanner.c
@@ -139,6 +139,46 @@ enum {
#define is_urlsafe(x) ((url_scanner_table[(unsigned char)(x)] & (IS_ALPHA|IS_DIGIT|IS_URLSAFE)) != 0)
+static struct {
+ char open;
+ char close;
+} url_braces[] = {
+ { '(', ')' },
+ { '{', '}' },
+ { '[', ']' },
+ { '<', '>' },
+ { '|', '|' },
+};
+
+static gboolean
+is_open_brace (char c)
+{
+ int i;
+
+ for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+ if (c == url_braces[i].open)
+ return TRUE;
+ }
+
+ return FALSE;
+}
+
+static char
+url_stop_at_brace (const char *in, size_t so)
+{
+ int i;
+
+ if (so > 0) {
+ for (i = 0; i < G_N_ELEMENTS (url_braces); i++) {
+ if (in[so - 1] == url_braces[i].open)
+ return url_braces[i].close;
+ }
+ }
+
+ return '\0';
+}
+
+
gboolean
camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
{
@@ -161,7 +201,7 @@ camel_url_addrspec_start (const char *in, const char *pos, const char *inend, ur
inptr--;
}
- if (!is_atom (*inptr))
+ if (!is_atom (*inptr) || is_open_brace (*inptr))
inptr++;
if (inptr == pos)
@@ -177,6 +217,7 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
{
const char *inptr = pos;
int parts = 0, digits;
+ gboolean got_dot = FALSE;
g_assert (*inptr == '@');
@@ -213,12 +254,16 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
while (inptr < inend && is_domain (*inptr))
inptr++;
- if (inptr < inend && *inptr == '.' && is_domain (inptr[1]))
+ if (inptr < inend && *inptr == '.' && is_domain (inptr[1])) {
+ if (*inptr == '.')
+ got_dot = TRUE;
inptr++;
+ }
}
}
- if (inptr == pos + 1)
+ /* don't allow toplevel domains */
+ if (inptr == pos + 1 || !got_dot)
return FALSE;
match->um_eo = (inptr - in);
@@ -226,31 +271,6 @@ camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlm
return TRUE;
}
-static struct {
- char open;
- char close;
-} url_braces[] = {
- { '(', ')' },
- { '{', '}' },
- { '[', ']' },
- { '<', '>' },
-};
-
-static char
-url_stop_at_brace (const char *in, size_t so)
-{
- int i;
-
- if (so > 0) {
- for (i = 0; i < 4; i++) {
- if (in[so - 1] == url_braces[i].open)
- return url_braces[i].close;
- }
- }
-
- return '\0';
-}
-
gboolean
camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
{
@@ -286,6 +306,12 @@ camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch
gboolean
camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
{
+ if (pos > in && !strncmp (pos, "www", 3)) {
+ /* make sure we aren't actually part of another word */
+ if (!is_open_brace (pos[-1]) && !isspace (pos[-1]))
+ return FALSE;
+ }
+
match->um_so = (pos - in);
return TRUE;
@@ -320,13 +346,37 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
inptr++;
} while (parts < 4);
- } else if (is_domain (*inptr)) {
- while (inptr < inend) {
- if (is_domain (*inptr))
+ } else if (is_atom (*inptr)) {
+ /* might be a domain or user@domain */
+ const char *save = inptr;
+
+ while (inptr < inend) {
+ if (!is_atom (*inptr))
+ break;
+
+ inptr++;
+
+ while (inptr < inend && is_atom (*inptr))
inptr++;
- else
+
+ if (inptr < inend && *inptr == '.' && is_atom (inptr[1]))
+ inptr++;
+ }
+
+ if (*inptr != '@')
+ inptr = save;
+ else
+ inptr++;
+
+ goto domain;
+ } else if (is_domain (*inptr)) {
+ domain:
+ while (inptr < inend) {
+ if (!is_domain (*inptr))
break;
+ inptr++;
+
while (inptr < inend && is_domain (*inptr))
inptr++;
@@ -359,19 +409,19 @@ camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_
while (inptr < inend && is_urlsafe (*inptr) && *inptr != close_brace)
inptr++;
- /* urls are extremely unlikely to end with any
- * punctuation, so strip any trailing
- * punctuation off. Also strip off any closing
- * braces. */
- while (inptr > pos && strchr (",.?!)}]", inptr[-1]))
- inptr--;
-
break;
default:
break;
}
}
+ /* urls are extremely unlikely to end with any
+ * punctuation, so strip any trailing
+ * punctuation off. Also strip off any closing
+ * braces or quotes. */
+ while (inptr > pos && strchr (",.:;?!-|)}]'\"", inptr[-1]))
+ inptr--;
+
match->um_eo = (inptr - in);
return TRUE;