aboutsummaryrefslogtreecommitdiffstats
path: root/widgets/misc/e-searching-tokenizer.c
diff options
context:
space:
mode:
Diffstat (limited to 'widgets/misc/e-searching-tokenizer.c')
-rw-r--r--widgets/misc/e-searching-tokenizer.c1195
1 files changed, 1195 insertions, 0 deletions
diff --git a/widgets/misc/e-searching-tokenizer.c b/widgets/misc/e-searching-tokenizer.c
new file mode 100644
index 0000000000..8920a7dd21
--- /dev/null
+++ b/widgets/misc/e-searching-tokenizer.c
@@ -0,0 +1,1195 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) version 3.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with the program; if not, see <http://www.gnu.org/licenses/>
+ *
+ *
+ * Authors:
+ * Developed by Jon Trowbridge <trow@ximian.com>
+ * Rewritten significantly to handle multiple strings and improve performance
+ * by Michael Zucchi <notzed@ximian.com>
+ *
+ * Copyright (C) 1999-2008 Novell, Inc. (www.novell.com)
+ *
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "e-searching-tokenizer.h"
+
+#include "libedataserver/e-memory.h"
+
+#define d(x)
+
+#define E_SEARCHING_TOKENIZER_GET_PRIVATE(obj) \
+ (G_TYPE_INSTANCE_GET_PRIVATE \
+ ((obj), E_TYPE_SEARCHING_TOKENIZER, ESearchingTokenizerPrivate))
+
+enum {
+ MATCH_SIGNAL,
+ LAST_SIGNAL
+};
+
+static gpointer parent_class;
+static guint signals[LAST_SIGNAL];
+
+/* Utility functions */
+
+/* This is faster and safer than glib2's utf8 abomination, but isn't exported from camel as yet */
+static inline guint32
+camel_utf8_getc(const guchar **ptr)
+{
+ register guchar *p = (guchar *)*ptr;
+ register guchar c, r;
+ register guint32 v, m;
+
+again:
+ r = *p++;
+loop:
+ if (r < 0x80) {
+ *ptr = p;
+ v = r;
+ } else if (r < 0xfe) { /* valid start char? */
+ v = r;
+ m = 0x7f80; /* used to mask out the length bits */
+ do {
+ c = *p++;
+ if ((c & 0xc0) != 0x80) {
+ r = c;
+ goto loop;
+ }
+ v = (v<<6) | (c & 0x3f);
+ r<<=1;
+ m<<=5;
+ } while (r & 0x40);
+
+ *ptr = p;
+
+ v &= ~m;
+ } else {
+ goto again;
+ }
+
+ return v;
+}
+
+/* note: our tags of interest are 7 bit ascii, only, no need to do any fancy utf8 stuff */
+/* tags should be upper case
+ if this list gets longer than 10 entries, consider binary search */
+static const gchar *ignored_tags[] = {
+ "B", "I", "FONT", "TT", "EM", /* and more? */};
+
+static gint
+ignore_tag (const gchar *tag)
+{
+ gchar *t = g_alloca(strlen(tag)+1), c, *out;
+ const gchar *in;
+ gint i;
+
+ /* we could use a aho-corasick matcher here too ... but we wont */
+
+ /* normalise tag into 't'.
+ Note we use the property that the only tags we're interested in
+ are 7 bit ascii to shortcut and simplify case insensitivity */
+ in = tag+2; /* skip: TAG_ESCAPE '<' */
+ if (*in == '/')
+ in++;
+ out = t;
+ while ((c = *in++)) {
+ if (c >= 'A' && c <= 'Z')
+ *out++ = c;
+ else if (c >= 'a' && c <= 'z')
+ *out++ = c & 0xdf; /* convert ASCII to upper case */
+ else
+ /* maybe should check for > or ' ' etc? */
+ break;
+ }
+ *out = 0;
+
+ for (i = 0; i < G_N_ELEMENTS (ignored_tags); i++) {
+ if (strcmp (t, ignored_tags[i]) == 0)
+ return 1;
+ }
+
+ return 0;
+}
+
+/* ********************************************************************** */
+
+/* Aho-Corasick search tree implmeentation */
+
+/* next state if we match a character */
+struct _match {
+ struct _match *next;
+ guint32 ch;
+ struct _state *match;
+};
+
+/* tree state node */
+struct _state {
+ struct _match *matches;
+ guint final; /* max no of chars we just matched */
+ struct _state *fail; /* where to try next if we fail */
+ struct _state *next; /* next on this level? */
+};
+
+/* base tree structure */
+struct _trie {
+ struct _state root;
+ gint max_depth;
+
+ EMemChunk *state_chunks;
+ EMemChunk *match_chunks;
+};
+
+/* will be enabled only if debug is enabled */
+#if d(1) -1 != -1
+static void
+dump_trie (struct _state *s, gint d)
+{
+ gchar *p = g_alloca(d*2+1);
+ struct _match *m;
+
+ memset(p, ' ', d*2);
+ p[d*2]=0;
+
+ printf("%s[state] %p: %d fail->%p\n", p, s, s->final, s->fail);
+ m = s->matches;
+ while (m) {
+ printf(" %s'%c' -> %p\n", p, m->ch, m->match);
+ if (m->match)
+ dump_trie (m->match, d+1);
+ m = m->next;
+ }
+}
+#endif
+
+/* This builds an Aho-Corasick search trie for a set of utf8 words */
+/* See
+ http://www-sr.informatik.uni-tuebingen.de/~buehler/AC/AC.html
+ for a neat demo */
+
+static inline struct _match *
+g (struct _state *q, guint32 c)
+{
+ struct _match *m = q->matches;
+
+ while (m && m->ch != c)
+ m = m->next;
+
+ return m;
+}
+
+static struct _trie *
+build_trie (gint nocase, gint len, guchar **words)
+{
+ struct _state *q, *qt, *r;
+ const guchar *word;
+ struct _match *m, *n = NULL;
+ gint i, depth;
+ guint32 c;
+ struct _trie *trie;
+ gint state_depth_max, state_depth_size;
+ struct _state **state_depth;
+
+ trie = g_malloc(sizeof(*trie));
+ trie->root.matches = NULL;
+ trie->root.final = 0;
+ trie->root.fail = NULL;
+ trie->root.next = NULL;
+
+ trie->state_chunks = e_memchunk_new (8, sizeof(struct _state));
+ trie->match_chunks = e_memchunk_new (8, sizeof(struct _match));
+
+ /* This will correspond to the length of the longest pattern */
+ state_depth_size = 0;
+ state_depth_max = 64;
+ state_depth = g_malloc(sizeof(*state_depth[0])*64);
+ state_depth[0] = NULL;
+
+ /* Step 1: Build trie */
+
+ /* This just builds a tree that merges all common prefixes into the same branch */
+
+ for (i=0;i<len;i++) {
+ word = words[i];
+ q = &trie->root;
+ depth = 0;
+ while ((c = camel_utf8_getc (&word))) {
+ if (nocase)
+ c = g_unichar_tolower (c);
+ m = g (q, c);
+ if (m == NULL) {
+ m = e_memchunk_alloc(trie->match_chunks);
+ m->ch = c;
+ m->next = q->matches;
+ q->matches = m;
+ q = m->match = e_memchunk_alloc(trie->state_chunks);
+ q->matches = NULL;
+ q->fail = &trie->root;
+ q->final = 0;
+ if (state_depth_max < depth) {
+ state_depth_max += 64;
+ state_depth = g_realloc(state_depth, sizeof(*state_depth[0])*state_depth_max);
+ }
+ if (state_depth_size < depth) {
+ state_depth[depth] = NULL;
+ state_depth_size = depth;
+ }
+ q->next = state_depth[depth];
+ state_depth[depth] = q;
+ } else {
+ q = m->match;
+ }
+ depth++;
+ }
+ q->final = depth;
+ }
+
+ d(printf("Dumping trie:\n"));
+ d(dump_trie (&trie->root, 0));
+
+ /* Step 2: Build failure graph */
+
+ /* This searches for the longest substring which is a prefix of another string and
+ builds a graph of failure links so you can find multiple substrings concurrently,
+ using aho-corasick's algorithm */
+
+ for (i=0;i<state_depth_size;i++) {
+ q = state_depth[i];
+ while (q) {
+ m = q->matches;
+ while (m) {
+ c = m->ch;
+ qt = m->match;
+ r = q->fail;
+ while (r && (n = g (r, c)) == NULL)
+ r = r->fail;
+ if (r != NULL) {
+ qt->fail = n->match;
+ if (qt->fail->final > qt->final)
+ qt->final = qt->fail->final;
+ } else {
+ if ((n = g (&trie->root, c)))
+ qt->fail = n->match;
+ else
+ qt->fail = &trie->root;
+ }
+ m = m->next;
+ }
+ q = q->next;
+ }
+ }
+
+ d (printf("After failure analysis\n"));
+ d (dump_trie (&trie->root, 0));
+
+ g_free (state_depth);
+
+ trie->max_depth = state_depth_size;
+
+ return trie;
+}
+
+static void
+free_trie (struct _trie *t)
+{
+ e_memchunk_destroy(t->match_chunks);
+ e_memchunk_destroy(t->state_chunks);
+
+ g_free (t);
+}
+
+/* ********************************************************************** */
+
+/* html token searcher */
+
+struct _token {
+ struct _token *next;
+ struct _token *prev;
+ guint offset;
+ /* we need to copy the token for memory management, so why not copy it whole */
+ gchar tok[1];
+};
+
+/* stack of submatches currently being scanned, used for merging */
+struct _submatch {
+ guint offstart, offend; /* in bytes */
+};
+
+/* flags for new func */
+#define SEARCH_CASE (1)
+#define SEARCH_BOLD (2)
+
+struct _searcher {
+ struct _trie *t;
+
+ gchar *(*next_token)(); /* callbacks for more tokens */
+ gpointer next_data;
+
+ gint words; /* how many words */
+ gchar *tags, *tage; /* the tag we used to highlight */
+
+ gint flags; /* case sensitive or not */
+
+ struct _state *state; /* state is the current trie state */
+
+ gint matchcount;
+
+ GQueue input; /* pending 'input' tokens, processed but might match */
+ GQueue output; /* output tokens ready for source */
+
+ struct _token *current; /* for token output memory management */
+
+ guint32 offset; /* current offset through searchable stream? */
+ guint32 offout; /* last output position */
+
+ guint lastp; /* current position in rotating last buffer */
+ guint32 *last; /* buffer that goes back last 'n' positions */
+ guint32 last_mask; /* bitmask for efficient rotation calculation */
+
+ guint submatchp; /* submatch stack */
+ struct _submatch *submatches;
+};
+
+static void
+searcher_set_tokenfunc(struct _searcher *s, gchar *(*next)(), gpointer data)
+{
+ s->next_token = next;
+ s->next_data = data;
+}
+
+static struct _searcher *
+searcher_new (gint flags, gint argc, guchar **argv, const gchar *tags, const gchar *tage)
+{
+ gint i, m;
+ struct _searcher *s;
+
+ s = g_malloc(sizeof(*s));
+
+ s->t = build_trie ((flags&SEARCH_CASE) == 0, argc, argv);
+ s->words = argc;
+ s->tags = g_strdup (tags);
+ s->tage = g_strdup (tage);
+ s->flags = flags;
+ s->state = &s->t->root;
+ s->matchcount = 0;
+
+ g_queue_init (&s->input);
+ g_queue_init (&s->output);
+ s->current = NULL;
+
+ s->offset = 0;
+ s->offout = 0;
+
+ /* rotating queue of previous character positions */
+ m = s->t->max_depth+1;
+ i = 2;
+ while (i<m)
+ i<<=2;
+ s->last = g_malloc(sizeof(s->last[0])*i);
+ s->last_mask = i-1;
+ s->lastp = 0;
+
+ /* a stack of possible submatches */
+ s->submatchp = 0;
+ s->submatches = g_malloc(sizeof(s->submatches[0])*argc+1);
+
+ return s;
+}
+
+static void
+searcher_free (struct _searcher *s)
+{
+ struct _token *t;
+
+ while ((t = g_queue_pop_head (&s->input)) != NULL)
+ g_free (t);
+ while ((t = g_queue_pop_head (&s->output)) != NULL)
+ g_free (t);
+ g_free (s->tags);
+ g_free (s->tage);
+ g_free (s->last);
+ g_free (s->submatches);
+ free_trie (s->t);
+ g_free (s);
+}
+
+static struct _token *
+append_token (GQueue *queue, const gchar *tok, gint len)
+{
+ struct _token *token;
+
+ if (len == -1)
+ len = strlen(tok);
+ token = g_malloc(sizeof(*token) + len+1);
+ token->offset = 0; /* set by caller when required */
+ memcpy(token->tok, tok, len);
+ token->tok[len] = 0;
+ g_queue_push_tail (queue, token);
+
+ return token;
+}
+
+#define free_token(x) (g_free (x))
+
+static void
+output_token(struct _searcher *s, struct _token *token)
+{
+ gint offend;
+ gint left, pre;
+
+ if (token->tok[0] == TAG_ESCAPE) {
+ if (token->offset >= s->offout) {
+ d (printf("moving tag token '%s' from input to output\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ g_queue_push_tail (&s->output, token);
+ } else {
+ d (printf("discarding tag token '%s' from input\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ free_token(token);
+ }
+ } else {
+ offend = token->offset + strlen(token->tok);
+ left = offend-s->offout;
+ if (left > 0) {
+ pre = s->offout - token->offset;
+ if (pre>0)
+ memmove (token->tok, token->tok+pre, left+1);
+ d (printf("adding partial remaining/failed '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ s->offout = offend;
+ g_queue_push_tail (&s->output, token);
+ } else {
+ d (printf("discarding whole token '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ free_token(token);
+ }
+ }
+}
+
+static struct _token *
+find_token(struct _searcher *s, gint start)
+{
+ GList *link;
+
+ /* find token which is start token, from end of list back */
+ link = g_queue_peek_tail_link (&s->input);
+ while (link != NULL) {
+ struct _token *token = link->data;
+
+ if (token->offset <= start)
+ return token;
+
+ link = g_list_previous (link);
+ }
+
+ return NULL;
+}
+
+static void
+output_match(struct _searcher *s, guint start, guint end)
+{
+ register struct _token *token;
+ struct _token *starttoken, *endtoken;
+ gchar b[8];
+
+ d (printf("output match: %d-%d at %d\n", start, end, s->offout));
+
+ starttoken = find_token(s, start);
+ endtoken = find_token(s, end);
+
+ if (starttoken == NULL || endtoken == NULL) {
+ d (printf("Cannot find match history for match %d-%d\n", start, end));
+ return;
+ }
+
+ d (printf("start in token '%s'\n", starttoken->tok[0]==TAG_ESCAPE?starttoken->tok+1:starttoken->tok));
+ d (printf("end in token '%s'\n", endtoken->tok[0]==TAG_ESCAPE?endtoken->tok+1:endtoken->tok));
+
+ /* output pending stuff that didn't match afterall */
+ while ((struct _token *) g_queue_peek_head (&s->input) != starttoken) {
+ token = g_queue_pop_head (&s->input);
+ d (printf("appending failed match '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ output_token(s, token);
+ }
+
+ /* output any pre-match text */
+ if (s->offout < start) {
+ token = append_token(&s->output, starttoken->tok + (s->offout-starttoken->offset), start-s->offout);
+ d (printf("adding pre-match text '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ s->offout = start;
+ }
+
+ /* output highlight/bold */
+ if (s->flags & SEARCH_BOLD) {
+ sprintf(b, "%c<b>", (gchar)TAG_ESCAPE);
+ append_token(&s->output, b, -1);
+ }
+ if (s->tags)
+ append_token(&s->output, s->tags, -1);
+
+ /* output match node (s) */
+ if (starttoken != endtoken) {
+ while ((struct _token *) g_queue_peek_head (&s->input) != endtoken) {
+ token = g_queue_pop_head (&s->input);
+ d (printf("appending (partial) match node (head) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ output_token(s, token);
+ }
+ }
+
+ /* any remaining partial content */
+ if (s->offout < end) {
+ token = append_token(&s->output, endtoken->tok+(s->offout-endtoken->offset), end-s->offout);
+ d (printf("appending (partial) match node (tail) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+ s->offout = end;
+ }
+
+ /* end highlight */
+ if (s->tage)
+ append_token(&s->output, s->tage, -1);
+
+ /* and close bold if we need to */
+ if (s->flags & SEARCH_BOLD) {
+ sprintf(b, "%c</b>", (gchar)TAG_ESCAPE);
+ append_token(&s->output, b, -1);
+ }
+}
+
+/* output any sub-pending blocks */
+static void
+output_subpending (struct _searcher *s)
+{
+ gint i;
+
+ for (i=s->submatchp-1;i>=0;i--)
+ output_match(s, s->submatches[i].offstart, s->submatches[i].offend);
+ s->submatchp = 0;
+}
+
+/* returns true if a merge took place */
+static gint
+merge_subpending (struct _searcher *s, gint offstart, gint offend)
+{
+ gint i;
+
+ /* merges overlapping or abutting match strings */
+ if (s->submatchp &&
+ s->submatches[s->submatchp-1].offend >= offstart) {
+
+ /* go from end, any that match 'invalidate' follow-on ones too */
+ for (i=s->submatchp-1;i>=0;i--) {
+ if (s->submatches[i].offend >= offstart) {
+ if (offstart < s->submatches[i].offstart)
+ s->submatches[i].offstart = offstart;
+ s->submatches[i].offend = offend;
+ if (s->submatchp > i)
+ s->submatchp = i+1;
+ }
+ }
+ return 1;
+ }
+
+ return 0;
+}
+
+static void
+push_subpending (struct _searcher *s, gint offstart, gint offend)
+{
+ /* This is really an assertion, we just ignore the last pending match instead of crashing though */
+ if (s->submatchp >= s->words) {
+ d (printf("ERROR: submatch pending stack overflow\n"));
+ s->submatchp = s->words-1;
+ }
+
+ s->submatches[s->submatchp].offstart = offstart;
+ s->submatches[s->submatchp].offend = offend;
+ s->submatchp++;
+}
+
+/* move any (partial) tokens from input to output if they are beyond the current output position */
+static void
+output_pending (struct _searcher *s)
+{
+ struct _token *token;
+
+ while ((token = g_queue_pop_head (&s->input)) != NULL)
+ output_token(s, token);
+}
+
+/* flushes any nodes we cannot possibly match anymore */
+static void
+flush_extra(struct _searcher *s)
+{
+ guint start;
+ gint i;
+ struct _token *starttoken, *token;
+
+ /* find earliest gchar that can be in contention */
+ start = s->offset - s->t->max_depth;
+ for (i=0;i<s->submatchp;i++)
+ if (s->submatches[i].offstart < start)
+ start = s->submatches[i].offstart;
+
+ /* now, flush out any tokens which are before this point */
+ starttoken = find_token(s, start);
+ if (starttoken == NULL)
+ return;
+
+ while ((struct _token *) g_queue_peek_head (&s->input) != starttoken) {
+ token = g_queue_pop_head (&s->input);
+ output_token(s, token);
+ }
+}
+
+static gchar *
+searcher_next_token(struct _searcher *s)
+{
+ struct _token *token;
+ const guchar *tok, *stok, *pre_tok;
+ struct _trie *t = s->t;
+ struct _state *q = s->state;
+ struct _match *m = NULL;
+ gint offstart, offend;
+ guint32 c;
+
+ while (g_queue_is_empty (&s->output)) {
+ /* get next token */
+ tok = (guchar *)s->next_token(s->next_data);
+ if (tok == NULL) {
+ output_subpending (s);
+ output_pending (s);
+ break;
+ }
+
+ /* we dont always have to copy each token, e.g. if we dont match anything */
+ token = append_token(&s->input, (gchar *)tok, -1);
+ token->offset = s->offset;
+ tok = (guchar *)token->tok;
+
+ d (printf("new token %d '%s'\n", token->offset, token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok));
+
+ /* tag test, reset state on unknown tags */
+ if (tok[0] == TAG_ESCAPE) {
+ if (!ignore_tag ((gchar *)tok)) {
+ /* force reset */
+ output_subpending (s);
+ output_pending (s);
+ q = &t->root;
+ }
+
+ continue;
+ }
+
+ /* process whole token */
+ pre_tok = stok = tok;
+ while ((c = camel_utf8_getc (&tok))) {
+ if ((s->flags & SEARCH_CASE) == 0)
+ c = g_unichar_tolower (c);
+ while (q && (m = g (q, c)) == NULL)
+ q = q->fail;
+ if (q == NULL) {
+ /* mismatch ... reset state */
+ output_subpending (s);
+ q = &t->root;
+ } else if (m != NULL) {
+ /* keep track of previous offsets of utf8 chars, rotating buffer */
+ s->last[s->lastp] = s->offset + (pre_tok-stok);
+ s->lastp = (s->lastp+1)&s->last_mask;
+
+ q = m->match;
+ /* we have a match of q->final characters for a matching word */
+ if (q->final) {
+ s->matchcount++;
+
+ /* use the last buffer to find the real offset of this gchar */
+ offstart = s->last[(s->lastp - q->final)&s->last_mask];
+ offend = s->offset + (tok - stok);
+
+ if (q->matches == NULL) {
+ if (s->submatchp == 0) {
+ /* nothing pending, always put something in so we can try merge */
+ push_subpending (s, offstart, offend);
+ } else if (!merge_subpending (s, offstart, offend)) {
+ /* can't merge, output what we have, and start againt */
+ output_subpending (s);
+ push_subpending (s, offstart, offend);
+ /*output_match(s, offstart, offend);*/
+ } else if (g_queue_get_length (&s->input) > 8) {
+ /* we're continuing to match and merge, but we have a lot of stuff
+ waiting, so flush it out now since this is a safe point to do it */
+ output_subpending (s);
+ }
+ } else {
+ /* merge/add subpending */
+ if (!merge_subpending (s, offstart, offend))
+ push_subpending (s, offstart, offend);
+ }
+ }
+ }
+ pre_tok = tok;
+ }
+
+ s->offset += (pre_tok-stok);
+
+ flush_extra(s);
+ }
+
+ s->state = q;
+
+ if (s->current)
+ free_token(s->current);
+
+ s->current = token = g_queue_pop_head (&s->output);
+
+ return token ? g_strdup (token->tok) : NULL;
+}
+
+static gchar *
+searcher_peek_token(struct _searcher *s)
+{
+ gchar *tok;
+
+ /* we just get it and then put it back, it's fast enuf */
+ tok = searcher_next_token(s);
+ if (tok) {
+ /* need to clear this so we dont free it while its still active */
+ g_queue_push_head (&s->output, s->current);
+ s->current = NULL;
+ }
+
+ return tok;
+}
+
+static gint
+searcher_pending (struct _searcher *s)
+{
+ return !(g_queue_is_empty (&s->input) && g_queue_is_empty (&s->output));
+}
+
+/* ********************************************************************** */
+
+struct _search_info {
+ GPtrArray *strv;
+ gchar *color;
+ guint size:8;
+ guint flags:8;
+};
+
+/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
+
+static struct _search_info *
+search_info_new (void)
+{
+ struct _search_info *s;
+
+ s = g_malloc0(sizeof(struct _search_info));
+ s->strv = g_ptr_array_new ();
+
+ return s;
+}
+
+static void
+search_info_set_flags(struct _search_info *si, guint flags, guint mask)
+{
+ si->flags = (si->flags & ~mask) | (flags & mask);
+}
+
+static void
+search_info_set_color (struct _search_info *si, const gchar *color)
+{
+ g_free (si->color);
+ si->color = g_strdup (color);
+}
+
+static void
+search_info_add_string (struct _search_info *si, const gchar *s)
+{
+ const guchar *start;
+ guint32 c;
+
+ if (s && s[0]) {
+ const guchar *us = (guchar *) s;
+ /* strip leading whitespace */
+ start = us;
+ while ((c = camel_utf8_getc (&us))) {
+ if (!g_unichar_isspace (c)) {
+ break;
+ }
+ start = us;
+ }
+ /* should probably also strip trailing, but i'm lazy today */
+ if (start[0])
+ g_ptr_array_add (si->strv, g_strdup ((gchar *)start));
+ }
+}
+
+static void
+search_info_clear (struct _search_info *si)
+{
+ gint i;
+
+ for (i=0;i<si->strv->len;i++)
+ g_free (si->strv->pdata[i]);
+
+ g_ptr_array_set_size (si->strv, 0);
+}
+
+static void
+search_info_free (struct _search_info *si)
+{
+ gint i;
+
+ for (i=0;i<si->strv->len;i++)
+ g_free (si->strv->pdata[i]);
+
+ g_ptr_array_free (si->strv, TRUE);
+ g_free (si->color);
+ g_free (si);
+}
+
+static struct _search_info *
+search_info_clone (struct _search_info *si)
+{
+ struct _search_info *out;
+ gint i;
+
+ out = search_info_new ();
+ for (i=0;i<si->strv->len;i++)
+ g_ptr_array_add (out->strv, g_strdup (si->strv->pdata[i]));
+ out->color = g_strdup (si->color);
+ out->flags = si->flags;
+ out->size = si->size;
+
+ return out;
+}
+
+static struct _searcher *
+search_info_to_searcher (struct _search_info *si)
+{
+ gchar *tags, *tage;
+ const gchar *col;
+
+ if (si->strv->len == 0)
+ return NULL;
+
+ if (si->color == NULL)
+ col = "red";
+ else
+ col = si->color;
+
+ tags = g_alloca(20+strlen(col));
+ sprintf(tags, "%c<font color=\"%s\">", TAG_ESCAPE, col);
+ tage = g_alloca(20);
+ sprintf(tage, "%c</font>", TAG_ESCAPE);
+
+ return searcher_new (si->flags, si->strv->len, (guchar **)si->strv->pdata, tags, tage);
+}
+
+/* ********************************************************************** */
+
+struct _ESearchingTokenizerPrivate {
+ struct _search_info *primary, *secondary;
+ struct _searcher *engine;
+};
+
+/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
+
+/** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/
+
+/* blah blah the htmltokeniser doesn't like being asked
+ for a token if it doens't hvae any! */
+static gchar *
+get_token (HTMLTokenizer *tokenizer)
+{
+ HTMLTokenizerClass *class = HTML_TOKENIZER_CLASS (parent_class);
+
+ if (class->has_more (tokenizer))
+ return class->next_token (tokenizer);
+
+ return NULL;
+}
+
+/* proxy matched event, not sure what its for otherwise */
+static void
+matched (ESearchingTokenizer *tokenizer)
+{
+ /*++tokenizer->priv->match_count;*/
+ g_signal_emit (tokenizer, signals[MATCH_SIGNAL], 0);
+}
+
+/* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */
+
+static void
+searching_tokenizer_finalize (GObject *object)
+{
+ ESearchingTokenizerPrivate *priv;
+
+ priv = E_SEARCHING_TOKENIZER_GET_PRIVATE (object);
+
+ search_info_free (priv->primary);
+ search_info_free (priv->secondary);
+
+ if (priv->engine != NULL)
+ searcher_free (priv->engine);
+
+ /* Chain up to parent's finalize () method. */
+ G_OBJECT_CLASS (parent_class)->finalize (object);
+}
+
+static void
+searching_tokenizer_begin (HTMLTokenizer *tokenizer,
+ const gchar *content_type)
+{
+ ESearchingTokenizerPrivate *priv;
+
+ priv = E_SEARCHING_TOKENIZER_GET_PRIVATE (tokenizer);
+
+ /* reset search */
+ if (priv->engine != NULL) {
+ searcher_free (priv->engine);
+ priv->engine = NULL;
+ }
+
+ if ((priv->engine = search_info_to_searcher (priv->primary))
+ || (priv->engine = search_info_to_searcher (priv->secondary))) {
+ searcher_set_tokenfunc(priv->engine, get_token, tokenizer);
+ }
+ /* else - no engine, no search active */
+
+ /* Chain up to parent's begin() method. */
+ HTML_TOKENIZER_CLASS (parent_class)->begin (tokenizer, content_type);
+}
+
+static gchar *
+searching_tokenizer_peek_token (HTMLTokenizer *tokenizer)
+{
+ ESearchingTokenizerPrivate *priv;
+
+ priv = E_SEARCHING_TOKENIZER_GET_PRIVATE (tokenizer);
+
+ if (priv->engine != NULL)
+ return searcher_peek_token (priv->engine);
+
+ /* Chain up to parent's peek_token() method. */
+ return HTML_TOKENIZER_CLASS (parent_class)->peek_token (tokenizer);
+}
+
+static gchar *
+searching_tokenizer_next_token (HTMLTokenizer *tokenizer)
+{
+ ESearchingTokenizerPrivate *priv;
+ gint oldmatched;
+ gchar *token;
+
+ priv = E_SEARCHING_TOKENIZER_GET_PRIVATE (tokenizer);
+
+ /* If no search is active, just use the default method. */
+ if (priv->engine == NULL)
+ return HTML_TOKENIZER_CLASS (parent_class)->next_token (tokenizer);
+
+ oldmatched = priv->engine->matchcount;
+
+ token = searcher_next_token (priv->engine);
+
+ /* not sure if this has to be accurate or just say we had some matches */
+ if (oldmatched != priv->engine->matchcount)
+ g_signal_emit (tokenizer, signals[MATCH_SIGNAL], 0);
+
+ return token;
+}
+
+static gboolean
+searching_tokenizer_has_more (HTMLTokenizer *tokenizer)
+{
+ ESearchingTokenizerPrivate *priv;
+
+ priv = E_SEARCHING_TOKENIZER_GET_PRIVATE (tokenizer);
+
+ return (priv->engine != NULL && searcher_pending (priv->engine)) ||
+ HTML_TOKENIZER_CLASS (parent_class)->has_more (tokenizer);
+}
+
+static HTMLTokenizer *
+searching_tokenizer_clone (HTMLTokenizer *tokenizer)
+{
+ ESearchingTokenizer *orig_st;
+ ESearchingTokenizer *new_st;
+
+ orig_st = E_SEARCHING_TOKENIZER (tokenizer);
+ new_st = e_searching_tokenizer_new ();
+
+ search_info_free (new_st->priv->primary);
+ search_info_free (new_st->priv->secondary);
+
+ new_st->priv->primary = search_info_clone (orig_st->priv->primary);
+ new_st->priv->secondary = search_info_clone (orig_st->priv->secondary);
+
+ g_signal_connect_swapped (
+ new_st, "match", G_CALLBACK (matched), orig_st);
+
+ return HTML_TOKENIZER (new_st);
+}
+static void
+searching_tokenizer_class_init (ESearchingTokenizerClass *class)
+{
+ GObjectClass *object_class;
+ HTMLTokenizerClass *tokenizer_class;
+
+ parent_class = g_type_class_peek_parent (class);
+ g_type_class_add_private (class, sizeof (ESearchingTokenizerPrivate));
+
+ object_class = G_OBJECT_CLASS (class);
+ object_class->finalize = searching_tokenizer_finalize;
+
+ tokenizer_class = HTML_TOKENIZER_CLASS (class);
+ tokenizer_class->begin = searching_tokenizer_begin;
+ tokenizer_class->peek_token = searching_tokenizer_peek_token;
+ tokenizer_class->next_token = searching_tokenizer_next_token;
+ tokenizer_class->has_more = searching_tokenizer_has_more;
+ tokenizer_class->clone = searching_tokenizer_clone;
+
+ signals[MATCH_SIGNAL] = g_signal_new (
+ "match",
+ G_TYPE_FROM_CLASS (class),
+ G_SIGNAL_RUN_LAST,
+ G_STRUCT_OFFSET (ESearchingTokenizerClass, match),
+ NULL, NULL,
+ g_cclosure_marshal_VOID__VOID,
+ G_TYPE_NONE, 0);
+}
+
+static void
+searching_tokenizer_init (ESearchingTokenizer *tokenizer)
+{
+ tokenizer->priv = E_SEARCHING_TOKENIZER_GET_PRIVATE (tokenizer);
+
+ tokenizer->priv->primary = search_info_new ();
+ search_info_set_flags (
+ tokenizer->priv->primary,
+ SEARCH_BOLD, SEARCH_CASE | SEARCH_BOLD);
+ search_info_set_color (tokenizer->priv->primary, "red");
+
+ tokenizer->priv->secondary = search_info_new ();
+ search_info_set_flags(
+ tokenizer->priv->secondary,
+ SEARCH_BOLD, SEARCH_CASE | SEARCH_BOLD);
+ search_info_set_color (tokenizer->priv->secondary, "purple");
+}
+
+GType
+e_searching_tokenizer_get_type (void)
+{
+ static GType type = 0;
+
+ if (G_UNLIKELY (type == 0)) {
+ static const GTypeInfo type_info = {
+ sizeof (ESearchingTokenizerClass),
+ (GBaseInitFunc) NULL,
+ (GBaseFinalizeFunc) NULL,
+ (GClassInitFunc) searching_tokenizer_class_init,
+ (GClassFinalizeFunc) NULL,
+ NULL, /* class_data */
+ sizeof (ESearchingTokenizer),
+ 0, /* n_preallocs */
+ (GInstanceInitFunc) searching_tokenizer_init,
+ NULL /* value_table */
+ };
+
+ type = g_type_register_static (
+ HTML_TYPE_TOKENIZER, "ESearchingTokenizer",
+ &type_info, 0);
+ }
+
+ return type;
+}
+
+ESearchingTokenizer *
+e_searching_tokenizer_new (void)
+{
+ return g_object_new (E_TYPE_SEARCHING_TOKENIZER, NULL);
+}
+
+void
+e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *tokenizer,
+ const gchar *primary_string)
+{
+ g_return_if_fail (E_IS_SEARCHING_TOKENIZER (tokenizer));
+
+ search_info_clear (tokenizer->priv->primary);
+ search_info_add_string (tokenizer->priv->primary, primary_string);
+}
+
+void
+e_searching_tokenizer_add_primary_search_string (ESearchingTokenizer *tokenizer,
+ const gchar *primary_string)
+{
+ g_return_if_fail (E_IS_SEARCHING_TOKENIZER (tokenizer));
+
+ search_info_add_string (tokenizer->priv->primary, primary_string);
+}
+
+void
+e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *tokenizer,
+ gboolean case_sensitive)
+{
+ g_return_if_fail (E_IS_SEARCHING_TOKENIZER (tokenizer));
+
+ search_info_set_flags (
+ tokenizer->priv->primary,
+ case_sensitive ? SEARCH_CASE : 0, SEARCH_CASE);
+}
+
+void
+e_searching_tokenizer_set_secondary_search_string (ESearchingTokenizer *tokenizer,
+ const gchar *secondary_string)
+{
+ g_return_if_fail (E_IS_SEARCHING_TOKENIZER (tokenizer));
+
+ search_info_clear (tokenizer->priv->secondary);
+ search_info_add_string (tokenizer->priv->secondary, secondary_string);
+}
+
+void
+e_searching_tokenizer_add_secondary_search_string (ESearchingTokenizer *tokenizer,
+ const gchar *secondary_string)
+{
+ g_return_if_fail (E_IS_SEARCHING_TOKENIZER (tokenizer));
+
+ search_info_add_string (tokenizer->priv->secondary, secondary_string);
+}
+
+void
+e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *tokenizer,
+ gboolean case_sensitive)
+{
+ g_return_if_fail (E_IS_SEARCHING_TOKENIZER (tokenizer));
+
+ search_info_set_flags (
+ tokenizer->priv->secondary,
+ case_sensitive ? SEARCH_CASE : 0, SEARCH_CASE);
+}
+
+/* Note: only returns the primary search string count */
+gint
+e_searching_tokenizer_match_count (ESearchingTokenizer *tokenizer)
+{
+ g_return_val_if_fail (E_IS_SEARCHING_TOKENIZER (tokenizer), -1);
+
+ if (tokenizer->priv->engine && tokenizer->priv->primary->strv->len)
+ return tokenizer->priv->engine->matchcount;
+
+ return 0;
+}