aboutsummaryrefslogtreecommitdiffstats
path: root/camel/camel-url-scanner.c
diff options
context:
space:
mode:
Diffstat (limited to 'camel/camel-url-scanner.c')
-rw-r--r--camel/camel-url-scanner.c396
1 files changed, 396 insertions, 0 deletions
diff --git a/camel/camel-url-scanner.c b/camel/camel-url-scanner.c
new file mode 100644
index 0000000000..8ff6a14156
--- /dev/null
+++ b/camel/camel-url-scanner.c
@@ -0,0 +1,396 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ * Authors: Jeffrey Stedfast <fejj@ximian.com>
+ *
+ * Copyright 2002 Ximian, Inc. (www.ximian.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <string.h>
+
+#include "e-util/e-trie.h"
+#include "camel-url-scanner.h"
+
+
+struct _CamelUrlScanner { /* scanner state: the registered patterns plus the trie used to find them in text */
+ GPtrArray *patterns; /* urlpattern_t pointers in registration order; the array index is the id stored in the trie */
+ ETrie *trie; /* holds each pattern's trigger string, tagged with its index in `patterns` */
+};
+
+
+/*
+ * Allocate an empty scanner: no patterns registered yet and a fresh trie.
+ * Release it with camel_url_scanner_free().
+ */
+CamelUrlScanner *
+camel_url_scanner_new (void)
+{
+	CamelUrlScanner *self = g_new (CamelUrlScanner, 1);
+
+	/* NOTE(review): the TRUE flag is presumably "ignore case" - confirm in e-trie.h */
+	self->trie = e_trie_new (TRUE);
+	self->patterns = g_ptr_array_new ();
+
+	return self;
+}
+
+
+/*
+ * Destroy a scanner created by camel_url_scanner_new().
+ *
+ * Only the scanner's own bookkeeping (the trie and the pointer array) is
+ * released; the urlpattern_t objects that were registered are left alone.
+ */
+void
+camel_url_scanner_free (CamelUrlScanner *scanner)
+{
+	g_return_if_fail (scanner != NULL);
+
+	e_trie_free (scanner->trie);
+	g_ptr_array_free (scanner->patterns, TRUE);
+	g_free (scanner);
+}
+
+
+/*
+ * Register @pattern with @scanner.
+ *
+ * The pattern's trigger string is inserted into the trie tagged with the
+ * index the pattern is about to occupy in the patterns array, so a trie hit
+ * can be mapped straight back to its urlpattern_t.
+ *
+ * The scanner stores the pointer, not a copy: @pattern must stay valid for
+ * as long as the scanner is in use.
+ */
+void
+camel_url_scanner_add (CamelUrlScanner *scanner, urlpattern_t *pattern)
+{
+	g_return_if_fail (scanner != NULL);
+	/* reject a missing pattern up front instead of crashing on the dereference below */
+	g_return_if_fail (pattern != NULL);
+	g_return_if_fail (pattern->pattern != NULL);
+
+	e_trie_add (scanner->trie, pattern->pattern, scanner->patterns->len);
+	g_ptr_array_add (scanner->patterns, pattern);
+}
+
+
+/*
+ * Look for the first registered pattern in the @inlen bytes starting at @in
+ * and let that pattern's start/end callbacks work out the extent of the URL.
+ *
+ * On success returns TRUE with @match filled in: um_so/um_eo are set by the
+ * callbacks (offsets relative to @in) and pattern/prefix are copied from the
+ * pattern that fired.  Returns FALSE when nothing is found or when either
+ * callback rejects the candidate; @match may have been partially written in
+ * that case and must not be used.
+ */
+gboolean
+camel_url_scanner_scan (CamelUrlScanner *scanner, const char *in, size_t inlen, urlmatch_t *match)
+{
+	const char *pos, *inend;
+	urlpattern_t *pat;
+	int pattern;
+
+	g_return_val_if_fail (scanner != NULL, FALSE);
+	g_return_val_if_fail (in != NULL, FALSE);
+	g_return_val_if_fail (match != NULL, FALSE);
+
+	if (!(pos = e_trie_search (scanner->trie, in, inlen, &pattern)))
+		return FALSE;
+
+	pat = g_ptr_array_index (scanner->patterns, pattern);
+
+	/* The end callbacks skip over the trigger text using
+	 * strlen (match->pattern), so these fields have to be valid
+	 * before any callback runs, not after. */
+	match->pattern = pat->pattern;
+	match->prefix = pat->prefix;
+
+	inend = in + inlen;
+	if (!pat->start (in, pos, inend, match))
+		return FALSE;
+
+	if (!pat->end (in, pos, inend, match))
+		return FALSE;
+
+	return TRUE;
+}
+
+
+static unsigned char url_scanner_table[256] = { /* IS_* flag bits for every byte value; this is the output of the BUILD_TABLE generator at the end of this file */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 9, 9, 1, 1, 9, 1, 1, /* 0x00-0x0f: control chars; tab, LF and CR are also LWSP */
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 0x10-0x1f: control chars */
+ 24, 64, 96, 64, 64, 64, 64, 64, 96, 96, 64, 64, 96, 64, 96, 96, /* 0x20-0x2f: space (LWSP|SPACE), then punctuation */
+ 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 96, 96, 32, 64, 32, 64, /* 0x30-0x3f: digits, then : ; < = > ? */
+ 96, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x40-0x4f: '@', then 'A'-'O' */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 96, 96, 96, 64, 64, /* 0x50-0x5f: 'P'-'Z', then brackets, backslash, caret, underscore */
+ 64, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* 0x60-0x6f: backquote, then 'a'-'o' */
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 64, 64, 64, 64, 1, /* 0x70-0x7f: 'p'-'z', then { | } ~ and DEL (control) */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 0x80-0xff: no flags set for any high byte */
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* Flag bits stored per byte value in url_scanner_table[]. */
+enum {
+	IS_CTRL    = 0x01,	/* control character (bytes below 32, and 127) */
+	IS_ALPHA   = 0x02,	/* ASCII letter */
+	IS_DIGIT   = 0x04,	/* ASCII digit */
+	IS_LWSP    = 0x08,	/* linear whitespace: space, tab, LF, CR */
+	IS_SPACE   = 0x10,	/* the space character itself */
+	IS_SPECIAL = 0x20,	/* address "special" that cannot be part of an atom */
+	IS_URLSAFE = 0x40	/* punctuation allowed inside a URL */
+};
+
+/* Flags for one character; the cast keeps bytes >= 0x80 from indexing
+ * the table negatively where plain char is signed. */
+#define url_char_flags(x) (url_scanner_table[(unsigned char) (x)])
+
+#define is_ctrl(x) ((url_char_flags (x) & IS_CTRL) != 0)
+#define is_lwsp(x) ((url_char_flags (x) & IS_LWSP) != 0)
+/* an atom character is anything that is not special, a space or a control */
+#define is_atom(x) ((url_char_flags (x) & (IS_SPECIAL | IS_SPACE | IS_CTRL)) == 0)
+#define is_alpha(x) ((url_char_flags (x) & IS_ALPHA) != 0)
+#define is_digit(x) ((url_char_flags (x) & IS_DIGIT) != 0)
+/* note: evaluates its argument twice - do not pass an expression with side effects */
+#define is_domain(x) ((url_char_flags (x) & (IS_ALPHA | IS_DIGIT)) != 0 || (x) == '-')
+#define is_urlsafe(x) ((url_char_flags (x) & (IS_ALPHA | IS_DIGIT | IS_URLSAFE)) != 0)
+
+
+/*
+ * Start callback for e-mail addresses.  @pos points at the '@' the trie
+ * found; walk backwards over the local part (runs of atom characters
+ * separated by single dots) to find where the address begins.
+ *
+ * On success stores the start offset (relative to @in) in match->um_so and
+ * returns TRUE.  Returns FALSE when there is no local part in front of
+ * the '@'.  @inend is unused.
+ */
+gboolean
+camel_url_addrspec_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
+{
+	const char *inptr = pos;
+
+	g_assert (*inptr == '@');
+
+	/* An '@' that is the very first byte has no local part.  Bail out
+	 * here: stepping back would leave the buffer and the checks below
+	 * would read in[-1]. */
+	if (pos == in)
+		return FALSE;
+
+	inptr--;
+
+	while (inptr > in) {
+		if (!is_atom (*inptr))
+			break;
+
+		inptr--;
+
+		while (inptr > in && is_atom (*inptr))
+			inptr--;
+
+		if (inptr > in && *inptr == '.')
+			inptr--;
+	}
+
+	/* we stopped on the character in front of the address (unless we
+	 * ran into the start of the buffer) - step back onto the address */
+	if (!is_atom (*inptr))
+		inptr++;
+
+	if (inptr == pos)
+		return FALSE;
+
+	match->um_so = (inptr - in);
+
+	return TRUE;
+}
+
+/*
+ * End callback for e-mail addresses.  @pos points at the '@'; scan forward
+ * over the domain, which is either a bracketed dotted quad ("[1.2.3.4]")
+ * or a sequence of labels separated by single dots.
+ *
+ * On success stores the end offset (relative to @in, one past the last
+ * character) in match->um_eo and returns TRUE.  Returns FALSE for a
+ * malformed domain literal or when nothing usable follows the '@'.
+ * Never reads at or beyond @inend.
+ */
+gboolean
+camel_url_addrspec_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
+{
+	const char *inptr = pos;
+	int parts = 0, digits;
+
+	g_assert (*inptr == '@');
+
+	inptr++;
+
+	if (inptr < inend && *inptr == '[') {
+		/* domain literal */
+		do {
+			/* skips the '[' on the first pass, the separating '.' afterwards */
+			inptr++;
+
+			digits = 0;
+			while (inptr < inend && is_digit (*inptr) && digits < 3) {
+				inptr++;
+				digits++;
+			}
+
+			/* an octet needs at least one digit ("[...]" is not an address) */
+			if (digits == 0)
+				return FALSE;
+
+			parts++;
+
+			if (parts < 4 && (inptr >= inend || *inptr != '.'))
+				return FALSE;
+		} while (parts < 4);
+
+		if (inptr >= inend || *inptr != ']')
+			return FALSE;
+
+		inptr++;
+	} else {
+		while (inptr < inend && is_domain (*inptr)) {
+			do {
+				inptr++;
+			} while (inptr < inend && is_domain (*inptr));
+
+			/* Only take a '.' that is followed by another label, so
+			 * the full stop ending a sentence ("mail me@foo.com.")
+			 * does not become part of the address. */
+			if (inptr < inend && *inptr == '.' &&
+			    inptr + 1 < inend && is_domain (inptr[1]))
+				inptr++;
+		}
+	}
+
+	/* inptr started one past the '@', so that is the "empty domain" position */
+	if (inptr == pos + 1)
+		return FALSE;
+
+	match->um_eo = (inptr - in);
+
+	return TRUE;
+}
+
+/*
+ * Start callback for the file pattern(s): the match begins exactly where
+ * the trie found the trigger text, so just record that offset.
+ * Always returns TRUE; @inend is unused.
+ */
+gboolean
+camel_url_file_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
+{
+	/* offset of the hit relative to the start of the scanned buffer */
+	match->um_so = pos - in;
+	return TRUE;
+}
+
+/*
+ * End callback for the file pattern(s).  Skips the trigger text
+ * (match->pattern, which must already be set by the caller), an optional
+ * extra '/', and then every following URL-safe character.
+ *
+ * On success stores the end offset (relative to @in, one past the last
+ * character) in match->um_eo and returns TRUE.  Returns FALSE when nothing
+ * follows the trigger text.  Never reads at or beyond @inend.
+ */
+gboolean
+camel_url_file_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
+{
+	size_t prefix_len = strlen (match->pattern);
+	const char *path, *inptr;
+
+	/* the trigger text has to fit inside the buffer */
+	if ((size_t) (inend - pos) < prefix_len)
+		return FALSE;
+
+	path = pos + prefix_len;
+	inptr = path;
+
+	if (inptr < inend && *inptr == '/')
+		inptr++;
+
+	while (inptr < inend && is_urlsafe (*inptr))
+		inptr++;
+
+	/* Compare against the end of the trigger text, not against pos:
+	 * inptr is always past pos here, so the old test could never
+	 * reject a bare prefix with no path behind it. */
+	if (inptr == path)
+		return FALSE;
+
+	match->um_eo = (inptr - in);
+
+	return TRUE;
+}
+
+/*
+ * Start callback for the web pattern(s): the match begins exactly where
+ * the trie found the trigger text, so just record that offset.
+ * Always returns TRUE; @inend is unused.
+ */
+gboolean
+camel_url_web_start (const char *in, const char *pos, const char *inend, urlmatch_t *match)
+{
+	/* offset of the hit relative to the start of the scanned buffer */
+	match->um_so = pos - in;
+	return TRUE;
+}
+
+/*
+ * Try to read a bare dotted quad (four groups of 1-3 digits separated by
+ * single dots) at @inptr.  Returns the position just past it, or NULL when
+ * the text is not a dotted quad or runs on into further host-name
+ * characters (e.g. "1.2.3.4.example.com"), in which case the caller should
+ * treat it as an ordinary host name.
+ */
+static const char *
+url_web_scan_ipv4 (const char *inptr, const char *inend)
+{
+	int parts, digits;
+
+	for (parts = 0; parts < 4; parts++) {
+		if (parts > 0) {
+			if (inptr >= inend || *inptr != '.')
+				return NULL;
+			inptr++;
+		}
+
+		digits = 0;
+		while (inptr < inend && is_digit (*inptr) && digits < 3) {
+			inptr++;
+			digits++;
+		}
+
+		if (digits == 0)
+			return NULL;
+	}
+
+	if (inptr < inend && is_domain (*inptr))
+		return NULL;
+
+	if (inptr < inend && *inptr == '.' &&
+	    inptr + 1 < inend && is_domain (inptr[1]))
+		return NULL;
+
+	return inptr;
+}
+
+/*
+ * Skip a host name: labels of domain characters joined by single dots.
+ * A '.' is only consumed when another label follows it, so the full stop
+ * that ends a sentence is left out.  Returns the position just past the
+ * host name (equal to @inptr when there is none).
+ */
+static const char *
+url_web_scan_hostname (const char *inptr, const char *inend)
+{
+	while (inptr < inend && is_domain (*inptr)) {
+		do {
+			inptr++;
+		} while (inptr < inend && is_domain (*inptr));
+
+		if (inptr < inend && *inptr == '.' &&
+		    inptr + 1 < inend && is_domain (inptr[1]))
+			inptr++;
+	}
+
+	return inptr;
+}
+
+/*
+ * End callback for the web pattern(s).  Skips the trigger text
+ * (match->pattern, which must already be set by the caller), then the
+ * host (dotted quad or host name), an optional ":port" and an optional
+ * "/path" made of URL-safe characters.
+ *
+ * On success stores the end offset (relative to @in, one past the last
+ * character) in match->um_eo and returns TRUE.  Returns FALSE when no host
+ * follows the trigger text.  Never reads at or beyond @inend.
+ */
+gboolean
+camel_url_web_end (const char *in, const char *pos, const char *inend, urlmatch_t *match)
+{
+	size_t prefix_len = strlen (match->pattern);
+	const char *inptr, *host_end = NULL;
+	int port;
+
+	/* the trigger text has to fit inside the buffer */
+	if ((size_t) (inend - pos) < prefix_len)
+		return FALSE;
+
+	inptr = pos + prefix_len;
+
+	/* find the end of the domain */
+	if (inptr >= inend || !is_domain (*inptr))
+		return FALSE;
+
+	/* A leading digit may start a dotted quad, but host names such as
+	 * "3com.com" begin with a digit too - fall back to the host-name
+	 * scan rather than rejecting the URL. */
+	if (is_digit (*inptr))
+		host_end = url_web_scan_ipv4 (inptr, inend);
+
+	if (host_end == NULL)
+		host_end = url_web_scan_hostname (inptr, inend);
+
+	inptr = host_end;
+
+	if (inptr < inend && *inptr == ':') {
+		/* skip past the port; the running value only serves to stop
+		 * consuming digits once the number is out of port range */
+		inptr++;
+		port = 0;
+
+		while (inptr < inend && is_digit (*inptr) && port < 65536)
+			port = (port * 10) + (*inptr++ - '0');
+	}
+
+	if (inptr < inend && *inptr == '/') {
+		/* skip past our url path */
+		inptr++;
+
+		while (inptr < inend && is_urlsafe (*inptr))
+			inptr++;
+	}
+
+	match->um_eo = (inptr - in);
+
+	return TRUE;
+}
+
+
+
+#ifdef BUILD_TABLE
+
+#include <stdio.h>
+
+/* got these from rfc0822 (the mail address grammar): linear whitespace and the "specials" */
+#define CHARS_LWSP " \t\n\r" /* linear whitespace chars */
+#define CHARS_SPECIAL "()<>@,;:\\\".[]"
+
+/* got these from rfc1738: punctuation that may appear inside a URL */
+#define CHARS_URLSAFE "$-_.+!*'(),{}|\\^~[]`#%\";/?:@&="
+
+
+/*
+ * OR @mask into the url_scanner_table entry of every byte that appears in
+ * the NUL-terminated list @vals.
+ */
+static void
+table_init_bits (unsigned int mask, const unsigned char *vals)
+{
+	const unsigned char *ch;
+
+	for (ch = vals; *ch != '\0'; ch++)
+		url_scanner_table[*ch] |= mask;
+}
+
+/*
+ * Rebuild url_scanner_table from scratch: clear it, then set the IS_* flag
+ * bits for each character class.  Only used by the BUILD_TABLE generator.
+ */
+static void
+url_scanner_table_init (void)
+{
+	int c;
+
+	for (c = 0; c < 256; c++)
+		url_scanner_table[c] = 0;
+
+	/* control characters: everything below space, plus DEL */
+	for (c = 0; c < 32; c++)
+		url_scanner_table[c] |= IS_CTRL;
+	url_scanner_table[127] |= IS_CTRL;
+
+	for (c = '0'; c <= '9'; c++)
+		url_scanner_table[c] |= IS_DIGIT;
+
+	for (c = 'a'; c <= 'z'; c++)
+		url_scanner_table[c] |= IS_ALPHA;
+	for (c = 'A'; c <= 'Z'; c++)
+		url_scanner_table[c] |= IS_ALPHA;
+
+	url_scanner_table[' '] |= IS_SPACE;
+
+	/* not defined to be special in rfc0822, but when scanning
+	   backwards to find the beginning of the email address we do
+	   not want to include this char if we come across it - so
+	   this is kind of a hack */
+	url_scanner_table['/'] |= IS_SPECIAL;
+
+	table_init_bits (IS_LWSP, CHARS_LWSP);
+	table_init_bits (IS_SPECIAL, CHARS_SPECIAL);
+	table_init_bits (IS_URLSAFE, CHARS_URLSAFE);
+}
+
+/*
+ * Table generator (BUILD_TABLE builds only): recompute the lookup table and
+ * print it to stdout as a C array definition, 16 entries per line, ready to
+ * paste over the static url_scanner_table initializer.
+ */
+int main (int argc, char **argv)
+{
+	int idx;
+
+	url_scanner_table_init ();
+
+	printf ("static unsigned char url_scanner_table[256] = {");
+	for (idx = 0; idx < 256; idx++) {
+		/* start a fresh, tab-indented row every 16 entries */
+		const char *lead = (idx % 16) ? "" : "\n\t";
+		/* comma after every entry except the last, which ends the row */
+		const char *tail = (idx != 255) ? "," : "\n";
+
+		printf ("%s%3d%s", lead, url_scanner_table[idx], tail);
+	}
+	printf ("};\n\n");
+
+	return 0;
+}
+
+#endif /* BUILD_TABLE */