diff options
Diffstat (limited to 'camel')
-rw-r--r-- | camel/camel-search-private.c | 275 | ||||
-rw-r--r-- | camel/camel-search-private.h | 46 |
2 files changed, 321 insertions, 0 deletions
diff --git a/camel/camel-search-private.c b/camel/camel-search-private.c new file mode 100644 index 0000000000..94c6c1a355 --- /dev/null +++ b/camel/camel-search-private.c @@ -0,0 +1,275 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ +/* + * Authors: Jeffrey Stedfast <fejj@helixcode.com> + * Michael Zucchi <NotZed@Ximian.com> + * + * Copyright 2000 Helix Code, Inc. (www.helixcode.com) + * Copyright 2001 Ximian Inc. (www.ximian.com) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Street #330, Boston, MA 02111-1307, USA. + * + */ + +/* (from glibc headers: + POSIX says that <sys/types.h> must be included (by the caller) before <regex.h>. */ +#include <sys/types.h> +#include <regex.h> +#include <string.h> +#include <ctype.h> +#include <stdio.h> + +#include "camel-exception.h" +#include "camel-mime-message.h" +#include "camel-multipart.h" +#include "camel-stream-mem.h" +#include "e-util/e-sexp.h" + +#include "camel-search-private.h" + +#define d(x) + +/* builds the regex into pattern */ +/* taken from camel-folder-search, with added isregex & exception parameter */ +/* Basically, we build a new regex, either based on subset regex's, or substrings, + that can be executed once over the whoel body, to match anything suitable. + This is more efficient than multiple searches, and probably most (naive) strstr + implementations, over long content. + + A small issue is that case-insenstivity wont work entirely correct for utf8 strings. */ +int +camel_search_build_match_regex(regex_t *pattern, camel_search_flags_t type, int argc, struct _ESExpResult **argv, CamelException *ex) +{ + GString *match = g_string_new(""); + int c, i, count=0, err; + char *word; + int flags; + + /* build a regex pattern we can use to match the words, we OR them together */ + if (argc>1) + g_string_append_c(match, '('); + for (i=0;i<argc;i++) { + if (argv[i]->type == ESEXP_RES_STRING) { + if (count > 0) + g_string_append_c(match, '|'); + /* escape any special chars (not sure if this list is complete) */ + word = argv[i]->value.string; + if (type & CAMEL_SEARCH_MATCH_REGEX) { + g_string_append(match, word); + } else { + if (type & CAMEL_SEARCH_MATCH_START) + g_string_append_c(match, '^'); + while ((c = *word++)) { + if (strchr("*\\.()[]^$+", c) != NULL) { + g_string_append_c(match, '\\'); + } + g_string_append_c(match, c); + } + if (type & CAMEL_SEARCH_MATCH_END) + g_string_append_c(match, '^'); + } + count++; + } else { + g_warning("Invalid type passed to body-contains match function"); + } + } + if (argc>1) + g_string_append_c(match, ')'); + flags = REG_EXTENDED|REG_NOSUB; + if (type & CAMEL_SEARCH_MATCH_ICASE) + flags |= REG_ICASE; + err = regcomp(pattern, match->str, flags); + if (err != 0) { + /* regerror gets called twice to get the full error string + length to do proper posix error reporting */ + int len = regerror(err, pattern, 0, 0); + char *buffer = g_malloc0(len + 1); + + regerror(err, pattern, buffer, len); + camel_exception_setv(ex, CAMEL_EXCEPTION_SYSTEM, + _("Regular expression compilation failed: %s: %s"), + match->str, buffer); + + regfree(pattern); + } + d(printf("Built regex: '%s'\n", match->str)); + g_string_free(match, TRUE); + return err; +} + +static unsigned char soundex_table[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0, + 49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0, + 0, 0, 49, 50, 51, 0, 49, 50, 0, 0, 50, 50, 52, 53, 53, 0, + 49, 50, 54, 50, 51, 0, 49, 0, 50, 0, 50, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +static void +soundexify (const gchar *sound, gchar code[5]) +{ + guchar *c, last = '\0'; + gint n; + + for (c = (guchar *) sound; *c && !isalpha (*c); c++); + code[0] = toupper (*c); + memset (code + 1, '0', 3); + for (n = 1; *c && n < 5; c++) { + guchar ch = soundex_table[*c]; + + if (ch && ch != last) { + code[n++] = ch; + last = ch; + } + } + code[4] = '\0'; +} + +static gboolean +header_soundex(const char *header, const char *match) +{ + char mcode[5], hcode[5]; + const char *p; + char c; + GString *word; + int truth = FALSE; + + soundexify(match, mcode); + + /* split the header into words, and soundexify and compare each one */ + /* FIXME: Should this convert to utf8, and split based on that, and what not? + soundex only makes sense for us-ascii though ... */ + + word = g_string_new(""); + p = header; + do { + c = *p++; + if (c == 0 || isspace(c)) { + if (word->len > 0) { + soundexify(word->str, hcode); + if (strcmp(hcode, mcode) == 0) + truth = TRUE; + } + g_string_truncate(word, 0); + } else if (isalpha(c)) + g_string_append_c(word, c); + } while (c && !truth); + g_string_free(word, TRUE); + + return truth; +} + +/* searhces for match inside value, if match is mixed case, hten use case-sensitive, + else insensitive */ +gboolean camel_search_header_match(const char *value, const char *match, camel_search_match_t how) +{ + const char *p; + + if (how == CAMEL_SEARCH_MATCH_SOUNDEX) + return header_soundex(value, match); + + while (*value && isspace(*value)) + value++; + + if (strlen(value) < strlen(match)) + return FALSE; + + /* from dan the man, if we have mixed case, perform a case-sensitive match, + otherwise not */ + p = match; + while (*p) { + if (isupper(*p)) { + switch(how) { + case CAMEL_SEARCH_MATCH_EXACT: + return strcmp(value, match) == 0; + case CAMEL_SEARCH_MATCH_CONTAINS: + return strstr(value, match) != NULL; + case CAMEL_SEARCH_MATCH_STARTS: + return strncmp(value, match, strlen(match)) == 0; + case CAMEL_SEARCH_MATCH_ENDS: + return strcmp(value+strlen(value)-strlen(match), match) == 0; + default: + break; + } + return FALSE; + } + p++; + } + switch(how) { + case CAMEL_SEARCH_MATCH_EXACT: + return strcasecmp(value, match) == 0; + case CAMEL_SEARCH_MATCH_CONTAINS: + return e_utf8_strstrcase(value, match) != NULL; + case CAMEL_SEARCH_MATCH_STARTS: + return strncasecmp(value, match, strlen(match)) == 0; + case CAMEL_SEARCH_MATCH_ENDS: + return strcasecmp(value+strlen(value)-strlen(match), match) == 0; + default: + break; + } + + return FALSE; +} + +/* performs a 'slow' content-based match */ +/* there is also an identical copy of this in camel-filter-search.c */ +gboolean +camel_search_message_body_contains(CamelDataWrapper *object, regex_t *pattern) +{ + CamelDataWrapper *containee; + int truth = FALSE; + int parts, i; + + containee = camel_medium_get_content_object(CAMEL_MEDIUM(object)); + + if (containee == NULL) + return FALSE; + + /* TODO: I find it odd that get_part and get_content_object do not + add a reference, probably need fixing for multithreading */ + + /* using the object types is more accurate than using the mime/types */ + if (CAMEL_IS_MULTIPART(containee)) { + parts = camel_multipart_get_number(CAMEL_MULTIPART(containee)); + for (i=0;i<parts && truth==FALSE;i++) { + CamelDataWrapper *part = (CamelDataWrapper *)camel_multipart_get_part(CAMEL_MULTIPART(containee), i); + if (part) + truth = camel_search_message_body_contains(part, pattern); + } + } else if (CAMEL_IS_MIME_MESSAGE(containee)) { + /* for messages we only look at its contents */ + truth = camel_search_message_body_contains((CamelDataWrapper *)containee, pattern); + } else if (header_content_type_is(CAMEL_DATA_WRAPPER(containee)->mime_type, "text", "*")) { + /* for all other text parts, we look inside, otherwise we dont care */ + CamelStreamMem *mem = (CamelStreamMem *)camel_stream_mem_new(); + + camel_data_wrapper_write_to_stream(containee, (CamelStream *)mem); + camel_stream_write((CamelStream *)mem, "", 1); + truth = regexec(pattern, mem->buffer->data, 0, NULL, 0) == 0; + camel_object_unref((CamelObject *)mem); + } + return truth; +} + diff --git a/camel/camel-search-private.h b/camel/camel-search-private.h new file mode 100644 index 0000000000..aff881f32f --- /dev/null +++ b/camel/camel-search-private.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2001 Ximian Inc. + * + * Authors: Michael Zucchi <notzed@helixcode.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#ifndef _CAMEL_SEARCH_PRIVATE_H +#define _CAMEL_SEARCH_PRIVATE_H + +typedef enum { + CAMEL_SEARCH_MATCH_START = 1<<0, + CAMEL_SEARCH_MATCH_END = 1<<1, + CAMEL_SEARCH_MATCH_REGEX = 1<<2, /* disables the first 2 */ + CAMEL_SEARCH_MATCH_ICASE = 1<<3, +} camel_search_flags_t; + +typedef enum { + CAMEL_SEARCH_MATCH_EXACT, + CAMEL_SEARCH_MATCH_CONTAINS, + CAMEL_SEARCH_MATCH_STARTS, + CAMEL_SEARCH_MATCH_ENDS, + CAMEL_SEARCH_MATCH_SOUNDEX, +} camel_search_match_t; + +/* builds a regex that represents a string search */ +int camel_search_build_match_regex(regex_t *pattern, camel_search_flags_t type, int argc, struct _ESExpResult **argv, CamelException *ex); +gboolean camel_search_message_body_contains(CamelDataWrapper *object, regex_t *pattern); + +gboolean camel_search_header_match(const char *value, const char *match, camel_search_match_t how); +gboolean camel_search_header_soundex(const char *header, const char *match); + +#endif /* ! _CAMEL_SEARCH_PRIVATE_H */ |