2 files changed, 321 insertions, 0 deletions
diff --git a/camel/camel-search-private.c b/camel/camel-search-private.c
new file mode 100644
index 0000000000..94c6c1a355
--- /dev/null
+++ b/camel/camel-search-private.c
@@ -0,0 +1,275 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */
+/*
+ *  Authors: Jeffrey Stedfast <fejj@helixcode.com>
+ *	     Michael Zucchi <NotZed@Ximian.com>
+ *
+ *  Copyright 2000 Helix Code, Inc. (www.helixcode.com)
+ *  Copyright 2001 Ximian Inc. (www.ximian.com)
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License, or
+ *  (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA.
+ *
+ */
+
+/* (from glibc headers:
+   POSIX says that <sys/types.h> must be included (by the caller) before <regex.h>.  */
+#include <sys/types.h>
+#include <regex.h>
+#include <string.h>
+#include <ctype.h>
+#include <stdio.h>
+
+#include "camel-exception.h"
+#include "camel-mime-message.h"
+#include "camel-multipart.h"
+#include "camel-stream-mem.h"
+#include "e-util/e-sexp.h"
+
+#include "camel-search-private.h"
+
+#define d(x)
+
+/* builds the regex into pattern */
+/* taken from camel-folder-search, with added isregex & exception parameter */
+/* Basically, we build a new regex, either based on subset regex's, or substrings,
+   that can be executed once over the whoel body, to match anything suitable.
+   This is more efficient than multiple searches, and probably most (naive) strstr
+   implementations, over long content.
+
+   A small issue is that case-insenstivity wont work entirely correct for utf8 strings. */
+int
+camel_search_build_match_regex(regex_t *pattern, camel_search_flags_t type, int argc, struct _ESExpResult **argv, CamelException *ex)
+{
+	GString *match = g_string_new("");
+	int c, i, count=0, err;
+	char *word;
+	int flags;
+
+	/* build a regex pattern we can use to match the words, we OR them together */
+	if (argc>1)
+		g_string_append_c(match, '(');
+	for (i=0;i<argc;i++) {
+		if (argv[i]->type == ESEXP_RES_STRING) {
+			if (count > 0)
+				g_string_append_c(match, '|');
+			/* escape any special chars (not sure if this list is complete) */
+			word = argv[i]->value.string;
+			if (type & CAMEL_SEARCH_MATCH_REGEX) {
+				g_string_append(match, word);
+			} else {
+				if (type & CAMEL_SEARCH_MATCH_START)
+					g_string_append_c(match, '^');
+				while ((c = *word++)) {
+					if (strchr("*\\.()[]^$+", c) != NULL) {
+						g_string_append_c(match, '\\');
+					}
+					g_string_append_c(match, c);
+				}
+				if (type & CAMEL_SEARCH_MATCH_END)
+					g_string_append_c(match, '^');
+			}
+			count++;
+		} else {
+			g_warning("Invalid type passed to body-contains match function");
+		}
+	}
+	if (argc>1)
+		g_string_append_c(match, ')');
+	flags = REG_EXTENDED|REG_NOSUB;
+	if (type & CAMEL_SEARCH_MATCH_ICASE)
+		flags |= REG_ICASE;
+	err = regcomp(pattern, match->str, flags);
+	if (err != 0) {
+		/* regerror gets called twice to get the full error string 
+		   length to do proper posix error reporting */
+		int len = regerror(err, pattern, 0, 0);
+		char *buffer = g_malloc0(len + 1);
+
+		regerror(err, pattern, buffer, len);
+		camel_exception_setv(ex, CAMEL_EXCEPTION_SYSTEM,
+				     _("Regular expression compilation failed: %s: %s"),
+				     match->str, buffer);
+
+		regfree(pattern);
+	}
+	d(printf("Built regex: '%s'\n", match->str));
+	g_string_free(match, TRUE);
+	return err;
+}
+
+static unsigned char soundex_table[256] = {
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0, 49, 50, 51,  0, 49, 50,  0,  0, 50, 50, 52, 53, 53,  0,
+	 49, 50, 54, 50, 51,  0, 49,  0, 50,  0, 50,  0,  0,  0,  0,  0,
+	  0,  0, 49, 50, 51,  0, 49, 50,  0,  0, 50, 50, 52, 53, 53,  0,
+	 49, 50, 54, 50, 51,  0, 49,  0, 50,  0, 50,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+	  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+};
+
+static void
+soundexify (const gchar *sound, gchar code[5])
+{
+	guchar *c, last = '\0';
+	gint n;
+	
+	for (c = (guchar *) sound; *c && !isalpha (*c); c++);
+	code[0] = toupper (*c);
+	memset (code + 1, '0', 3);
+	for (n = 1; *c && n < 5; c++) {
+		guchar ch = soundex_table[*c];
+		
+		if (ch && ch != last) {
+			code[n++] = ch;
+			last = ch;
+		}
+	}
+	code[4] = '\0';
+}
+
+static gboolean
+header_soundex(const char *header, const char *match)
+{
+	char mcode[5], hcode[5];
+	const char *p;
+	char c;
+	GString *word;
+	int truth = FALSE;
+
+	soundexify(match, mcode);
+
+	/* split the header into words, and soundexify and compare each one */
+	/* FIXME: Should this convert to utf8, and split based on that, and what not?
+	   soundex only makes sense for us-ascii though ... */
+
+	word = g_string_new("");
+	p = header;
+	do {
+		c = *p++;
+		if (c == 0 || isspace(c)) {
+			if (word->len > 0) {
+				soundexify(word->str, hcode);
+				if (strcmp(hcode, mcode) == 0)
+					truth = TRUE;
+			}
+			g_string_truncate(word, 0);
+		} else if (isalpha(c))
+			g_string_append_c(word, c);
+	} while (c && !truth);
+	g_string_free(word, TRUE);
+
+	return truth;
+}
+
+/* searhces for match inside value, if match is mixed case, hten use case-sensitive,
+   else insensitive */
+gboolean camel_search_header_match(const char *value, const char *match, camel_search_match_t how)
+{
+	const char *p;
+
+	if (how == CAMEL_SEARCH_MATCH_SOUNDEX)
+		return header_soundex(value, match);
+
+	while (*value && isspace(*value))
+		value++;
+
+	if (strlen(value) < strlen(match))
+		return FALSE;
+
+	/* from dan the man, if we have mixed case, perform a case-sensitive match,
+	   otherwise not */
+	p = match;
+	while (*p) {
+		if (isupper(*p)) {
+			switch(how) {
+			case CAMEL_SEARCH_MATCH_EXACT:
+				return strcmp(value, match) == 0;
+			case CAMEL_SEARCH_MATCH_CONTAINS:
+				return strstr(value, match) != NULL;
+			case CAMEL_SEARCH_MATCH_STARTS:
+				return strncmp(value, match, strlen(match)) == 0;
+			case CAMEL_SEARCH_MATCH_ENDS:
+				return strcmp(value+strlen(value)-strlen(match), match) == 0;
+			default:
+				break;
+			}
+			return FALSE;
+		}
+		p++;
+	}
+	switch(how) {
+	case CAMEL_SEARCH_MATCH_EXACT:
+		return strcasecmp(value, match) == 0;
+	case CAMEL_SEARCH_MATCH_CONTAINS:
+		return e_utf8_strstrcase(value, match) != NULL;
+	case CAMEL_SEARCH_MATCH_STARTS:
+		return strncasecmp(value, match, strlen(match)) == 0;
+	case CAMEL_SEARCH_MATCH_ENDS:
+		return strcasecmp(value+strlen(value)-strlen(match), match) == 0;
+	default:
+		break;
+	}
+
+	return FALSE;
+}
+
+/* performs a 'slow' content-based match */
+/* there is also an identical copy of this in camel-filter-search.c */
+gboolean
+camel_search_message_body_contains(CamelDataWrapper *object, regex_t *pattern)
+{
+	CamelDataWrapper *containee;
+	int truth = FALSE;
+	int parts, i;
+
+	containee = camel_medium_get_content_object(CAMEL_MEDIUM(object));
+
+	if (containee == NULL)
+		return FALSE;
+
+	/* TODO: I find it odd that get_part and get_content_object do not
+	   add a reference, probably need fixing for multithreading */
+
+	/* using the object types is more accurate than using the mime/types */
+	if (CAMEL_IS_MULTIPART(containee)) {
+		parts = camel_multipart_get_number(CAMEL_MULTIPART(containee));
+		for (i=0;i<parts && truth==FALSE;i++) {
+			CamelDataWrapper *part = (CamelDataWrapper *)camel_multipart_get_part(CAMEL_MULTIPART(containee), i);
+			if (part)
+				truth = camel_search_message_body_contains(part, pattern);
+		}
+	} else if (CAMEL_IS_MIME_MESSAGE(containee)) {
+		/* for messages we only look at its contents */
+		truth = camel_search_message_body_contains((CamelDataWrapper *)containee, pattern);
+	} else if (header_content_type_is(CAMEL_DATA_WRAPPER(containee)->mime_type, "text", "*")) {
+		/* for all other text parts, we look inside, otherwise we dont care */
+		CamelStreamMem *mem = (CamelStreamMem *)camel_stream_mem_new();
+
+		camel_data_wrapper_write_to_stream(containee, (CamelStream *)mem);
+		camel_stream_write((CamelStream *)mem, "", 1);
+		truth = regexec(pattern, mem->buffer->data, 0, NULL, 0) == 0;
+		camel_object_unref((CamelObject *)mem);
+	}
+	return truth;
+}
+
diff --git a/camel/camel-search-private.h b/camel/camel-search-private.h
new file mode 100644
index 0000000000..aff881f32f
--- /dev/null
+++ b/camel/camel-search-private.h
@@ -0,0 +1,46 @@
+/*
+ *  Copyright (C) 2001 Ximian Inc.
+ *
+ *  Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU Library General Public License
+ *  as published by the Free Software Foundation; either version 2 of
+ *  the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU Library General Public License for more details.
+ *
+ *  You should have received a copy of the GNU Library General Public
+ *  License along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#ifndef _CAMEL_SEARCH_PRIVATE_H
+#define _CAMEL_SEARCH_PRIVATE_H
+
+typedef enum {
+	CAMEL_SEARCH_MATCH_START = 1<<0,
+	CAMEL_SEARCH_MATCH_END = 1<<1,
+	CAMEL_SEARCH_MATCH_REGEX = 1<<2, /* disables the first 2 */
+	CAMEL_SEARCH_MATCH_ICASE = 1<<3,
+} camel_search_flags_t;
+
+typedef enum {
+	CAMEL_SEARCH_MATCH_EXACT,
+	CAMEL_SEARCH_MATCH_CONTAINS,
+	CAMEL_SEARCH_MATCH_STARTS,
+	CAMEL_SEARCH_MATCH_ENDS,
+	CAMEL_SEARCH_MATCH_SOUNDEX,
+} camel_search_match_t;
+
+/* builds a regex that represents a string search */
+int camel_search_build_match_regex(regex_t *pattern, camel_search_flags_t type, int argc, struct _ESExpResult **argv, CamelException *ex);
+gboolean camel_search_message_body_contains(CamelDataWrapper *object, regex_t *pattern);
+
+gboolean camel_search_header_match(const char *value, const char *match, camel_search_match_t how);
+gboolean camel_search_header_soundex(const char *header, const char *match);
+
+#endif /* ! _CAMEL_SEARCH_PRIVATE_H */