/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- */ /* * e-searching-tokenizer.c * * Copyright (C) 2002 Ximian, Inc. * * Developed by Jon Trowbridge * Rewritten significantly to handle multiple strings and improve performance * by Michael Zucchi */ /* * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public * License as published by the Free Software Foundation. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 * USA. */ #include #include #include #include #include "e-searching-tokenizer.h" #include "e-util/e-memory.h" #include "e-util/e-msgport.h" #define d(x) enum { EST_MATCH_SIGNAL, EST_LAST_SIGNAL }; guint e_searching_tokenizer_signals[EST_LAST_SIGNAL] = { 0 }; static void e_searching_tokenizer_begin (HTMLTokenizer *, gchar *); static void e_searching_tokenizer_end (HTMLTokenizer *); static gchar *e_searching_tokenizer_peek_token (HTMLTokenizer *); static gchar *e_searching_tokenizer_next_token (HTMLTokenizer *); static gboolean e_searching_tokenizer_has_more (HTMLTokenizer *); static HTMLTokenizer *e_searching_tokenizer_clone (HTMLTokenizer *); /* static const gchar *space_tags[] = { "br", NULL };*/ GtkObjectClass *parent_class = NULL; /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ /* ??? typedef struct _SharedState SharedState; struct _SharedState { gint refs; gchar *str_primary; gchar *str_secondary; gboolean case_sensitive_primary; gboolean case_sensitive_secondary; }; */ /* ********************************************************************** */ #if 0 static SharedState * shared_state_new (void) { SharedState *shared = g_new0 (SharedState, 1); shared->refs = 1; return shared; } static void shared_state_ref (SharedState *shared) { g_return_if_fail (shared != NULL); g_return_if_fail (shared->refs > 0); ++shared->refs; } static void shared_state_unref (SharedState *shared) { if (shared) { g_return_if_fail (shared->refs > 0); --shared->refs; if (shared->refs == 0) { g_free (shared->str_primary); g_free (shared->str_secondary); g_free (shared); } } } #endif /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ /* ********************************************************************** */ /* Utility functions */ /* This is faster and safer than glib2's utf8 abomination, but isn't exported from camel as yet */ static __inline__ guint32 camel_utf8_getc(const unsigned char **ptr) { register unsigned char *p = (unsigned char *)*ptr; register unsigned char c, r; register guint32 v, m; again: r = *p++; loop: if (r < 0x80) { *ptr = p; v = r; } else if (r < 0xfe) { /* valid start char? */ v = r; m = 0x7f80; /* used to mask out the length bits */ do { c = *p++; if ((c & 0xc0) != 0x80) { r = c; goto loop; } v = (v<<6) | (c & 0x3f); r<<=1; m<<=5; } while (r & 0x40); *ptr = p; v &= ~m; } else { goto again; } return v; } /* note: our tags of interest are 7 bit ascii, only, no need to do any fancy utf8 stuff */ /* tags should be upper case if this list gets longer than 10 entries, consider binary search */ static char *ignored_tags[] = { "B", "I", "FONT", "TT", "EM", /* and more? */}; static int ignore_tag(const char *tag) { char *t = alloca(strlen(tag)+1), c, *out; const char *in; int i; /* we could use a aho-corasick matcher here too ... but we wont */ /* normalise tag into 't'. Note we use the property that the only tags we're interested in are 7 bit ascii to shortcut and simplify case insensitivity */ in = tag+2; /* skip: TAG_ESCAPE '<' */ if (*in == '/') in++; out = t; while ((c = *in++)) { if (c >= 'A' && c <= 'Z') *out++ = c; else if (c >= 'a' && c <= 'z') *out++ = c & 0xdf; /* convert ASCII to upper case */ else /* maybe should check for > or ' ' etc? */ break; } *out = 0; for (i=0;i%p\n", p, s, s->final, s->fail); m = s->matches; while (m) { printf(" %s'%c' -> %p\n", p, m->ch, m->match); if (m->match) dump_trie(m->match, d+1); m = m->next; } } /* This builds an Aho-Corasick search trie for a set of utf8 words */ /* See http://www-sr.informatik.uni-tuebingen.de/~buehler/AC/AC.html for a neat demo */ static __inline__ struct _match * g(struct _state *q, guint32 c) { struct _match *m = q->matches; while (m && m->ch != c) m = m->next; return m; } static struct _trie * build_trie(int nocase, int len, char **words) { struct _state *q, *qt, *r; char *word; struct _match *m, *n; int i, depth; guint32 c; struct _trie *trie; int state_depth_max, state_depth_size; struct _state **state_depth; trie = g_malloc(sizeof(*trie)); trie->root.matches = 0; trie->root.final = 0; trie->root.fail = 0; trie->root.next = 0; trie->state_chunks = e_memchunk_new(8, sizeof(struct _state)); trie->match_chunks = e_memchunk_new(8, sizeof(struct _match)); /* This will correspond to the length of the longest pattern */ state_depth_size = 0; state_depth_max = 64; state_depth = g_malloc(sizeof(*state_depth[0])*64); state_depth[0] = NULL; /* Step 1: Build trie */ /* This just builds a tree that merges all common prefixes into the same branch */ for (i=0;iroot; depth = 0; while ((c = camel_utf8_getc((const unsigned char **)&word))) { if (nocase) c = g_unichar_tolower(c); m = g(q, c); if (m == 0) { m = e_memchunk_alloc(trie->match_chunks); m->ch = c; m->next = q->matches; q->matches = m; q = m->match = e_memchunk_alloc(trie->state_chunks); q->matches = 0; q->fail = &trie->root; q->final = 0; if (state_depth_max < depth) { state_depth_max += 64; state_depth = g_realloc(state_depth, sizeof(*state_depth[0])*state_depth_max); } if (state_depth_size < depth) { state_depth[depth] = 0; state_depth_size = depth; } q->next = state_depth[depth]; state_depth[depth] = q; } else { q = m->match; } depth++; } q->final = depth; } d(printf("Dumping trie:\n")); d(dump_trie(&trie->root, 0)); /* Step 2: Build failure graph */ /* This searches for the longest substring which is a prefix of another string and builds a graph of failure links so you can find multiple substrings concurrently, using aho-corasick's algorithm */ for (i=0;imatches; while (m) { c = m->ch; qt = m->match; r = q->fail; while (r != 0 && (n = g(r, c)) == NULL) r = r->fail; if (r != 0) { qt->fail = n->match; if (qt->fail->final > qt->final) qt->final = qt->fail->final; } else { if ((n = g(&trie->root, c))) qt->fail = n->match; else qt->fail = &trie->root; } m = m->next; } q = q->next; } } d(printf("After failure analysis\n")); d(dump_trie(&trie->root, 0)); g_free(state_depth); trie->max_depth = state_depth_size; return trie; } static void free_trie(struct _trie *t) { e_memchunk_destroy(t->match_chunks); e_memchunk_destroy(t->state_chunks); g_free(t); } /* ********************************************************************** */ /* html token searcher */ struct _token { struct _token *next; struct _token *prev; unsigned int offset; /* we need to copy the token for memory management, so why not copy it whole */ char tok[1]; }; /* stack of submatches currently being scanned, used for merging */ struct _submatch { unsigned int offstart, offend; /* in bytes */ }; /* flags for new func */ #define SEARCH_CASE (1) #define SEARCH_BOLD (2) struct _searcher { struct _trie *t; char *(*next_token)(); /* callbacks for more tokens */ void *next_data; int words; /* how many words */ char *tags, *tage; /* the tag we used to highlight */ int flags; /* case sensitive or not */ struct _state *state; /* state is the current trie state */ int matchcount; EDList input; /* pending 'input' tokens, processed but might match */ EDList output; /* output tokens ready for source */ struct _token *current; /* for token output memory management */ guint32 offset; /* current offset through searchable stream? */ guint32 offout; /* last output position */ unsigned int lastp; /* current position in rotating last buffer */ guint32 *last; /* buffer that goes back last 'n' positions */ guint32 last_mask; /* bitmask for efficient rotation calculation */ unsigned int submatchp; /* submatch stack */ struct _submatch *submatches; }; static void searcher_set_tokenfunc(struct _searcher *s, char *(*next)(), void *data) { s->next_token = next; s->next_data = data; } static struct _searcher * searcher_new(int flags, int argc, char **argv, const char *tags, const char *tage) { int i, m; struct _searcher *s; s = g_malloc(sizeof(*s)); s->t = build_trie((flags&SEARCH_CASE) == 0, argc, argv); s->words = argc; s->tags = g_strdup(tags); s->tage = g_strdup(tage); s->flags = flags; s->state = &s->t->root; s->matchcount = 0; e_dlist_init(&s->input); e_dlist_init(&s->output); s->current = 0; s->offset = 0; s->offout = 0; /* rotating queue of previous character positions */ m = s->t->max_depth+1; i = 2; while (ilast = g_malloc(sizeof(s->last[0])*i); s->last_mask = i-1; s->lastp = 0; /* a stack of possible submatches */ s->submatchp = 0; s->submatches = g_malloc(sizeof(s->submatches[0])*argc+1); return s; } static void searcher_free(struct _searcher *s) { struct _token *t; while ((t = (struct _token *)e_dlist_remhead(&s->input))) g_free(t); while ((t = (struct _token *)e_dlist_remhead(&s->output))) g_free(t); g_free(s->tags); g_free(s->tage); g_free(s->last); g_free(s->submatches); free_trie(s->t); g_free(s); } static struct _token * append_token(EDList *list, const char *tok, int len) { struct _token *token; if (len == -1) len = strlen(tok); token = g_malloc(sizeof(*token) + len+1); token->offset = 0; /* set by caller when required */ memcpy(token->tok, tok, len); token->tok[len] = 0; e_dlist_addtail(list, (EDListNode *)token); return token; } #define free_token(x) (g_free(x)) static void output_token(struct _searcher *s, struct _token *token) { int offend; int left, pre; if (token->tok[0] == TAG_ESCAPE) { if (token->offset >= s->offout) { d(printf("moving tag token '%s' from input to output\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); e_dlist_addtail(&s->output, (EDListNode *)token); } else { d(printf("discarding tag token '%s' from input\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); free_token(token); } } else { offend = token->offset + strlen(token->tok); left = offend-s->offout; if (left > 0) { pre = s->offout - token->offset; if (pre>0) memmove(token->tok, token->tok+pre, left+1); d(printf("adding partial remaining/failed '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); s->offout = offend; e_dlist_addtail(&s->output, (EDListNode *)token); } else { d(printf("discarding whole token '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); free_token(token); } } } static struct _token * find_token(struct _searcher *s, int start) { register struct _token *token; /* find token which is start token, from end of list back */ token = (struct _token *)s->input.tailpred; while (token->prev) { if (token->offset <= start) return token; token = token->prev; } return NULL; } static void output_match(struct _searcher *s, unsigned int start, unsigned int end) { register struct _token *token; struct _token *starttoken, *endtoken; char b[8]; d(printf("output match: %d-%d at %d\n", start, end, s->offout)); starttoken = find_token(s, start); endtoken = find_token(s, end); if (starttoken == NULL || endtoken == NULL) { printf("Cannot find match history for match %d-%d\n", start, end); return; } d(printf("start in token '%s'\n", starttoken->tok[0]==TAG_ESCAPE?starttoken->tok+1:starttoken->tok)); d(printf("end in token '%s'\n", endtoken->tok[0]==TAG_ESCAPE?endtoken->tok+1:endtoken->tok)); /* output pending stuff that didn't match afterall */ while ((struct _token *)s->input.head != starttoken) { token = (struct _token *)e_dlist_remhead(&s->input); d(printf("appending failed match '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); output_token(s, token); } /* output any pre-match text */ if (s->offout < start) { token = append_token(&s->output, starttoken->tok + (s->offout-starttoken->offset), start-s->offout); d(printf("adding pre-match text '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); s->offout = start; } /* output highlight/bold */ if (s->flags & SEARCH_BOLD) { sprintf(b, "%c", (char)TAG_ESCAPE); append_token(&s->output, b, -1); } if (s->tags) append_token(&s->output, s->tags, -1); /* output match node(s) */ if (starttoken != endtoken) { while ((struct _token *)s->input.head != endtoken) { token = (struct _token *)e_dlist_remhead(&s->input); d(printf("appending (partial) match node (head) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); output_token(s, token); } } /* any remaining partial content */ if (s->offout < end) { token = append_token(&s->output, endtoken->tok+(s->offout-endtoken->offset), end-s->offout); d(printf("appending (partial) match node (tail) '%s'\n", token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); s->offout = end; } /* end highlight */ if (s->tage) append_token(&s->output, s->tage, -1); /* and close bold if we need to */ if (s->flags & SEARCH_BOLD) { sprintf(b, "%c", (char)TAG_ESCAPE); append_token(&s->output, b, -1); } } /* output any sub-pending blocks */ static void output_subpending(struct _searcher *s) { int i; for (i=s->submatchp-1;i>=0;i--) output_match(s, s->submatches[i].offstart, s->submatches[i].offend); s->submatchp = 0; } /* returns true if a merge took place */ static int merge_subpending(struct _searcher *s, int offstart, int offend) { int i; /* merges overlapping or abutting match strings */ if (s->submatchp && s->submatches[s->submatchp-1].offend >= offstart) { /* go from end, any that match 'invalidate' follow-on ones too */ for (i=s->submatchp-1;i>=0;i--) { if (s->submatches[i].offend >= offstart) { if (offstart < s->submatches[i].offstart) s->submatches[i].offstart = offstart; s->submatches[i].offend = offend; if (s->submatchp > i) s->submatchp = i+1; } } return 1; } return 0; } static void push_subpending(struct _searcher *s, int offstart, int offend) { /* This is really an assertion, we just ignore the last pending match instead of crashing though */ if (s->submatchp >= s->words) { printf("ERROR: submatch pending stack overflow\n"); s->submatchp = s->words-1; } s->submatches[s->submatchp].offstart = offstart; s->submatches[s->submatchp].offend = offend; s->submatchp++; } /* move any (partial) tokens from input to output if they are beyond the current output position */ static void output_pending(struct _searcher *s) { struct _token *token; while ( (token = (struct _token *)e_dlist_remhead(&s->input)) ) output_token(s, token); } /* flushes any nodes we cannot possibly match anymore */ static void flush_extra(struct _searcher *s) { unsigned int start; int i; struct _token *starttoken, *token; /* find earliest char that can be in contention */ start = s->offset - s->t->max_depth; for (i=0;isubmatchp;i++) if (s->submatches[i].offstart < start) start = s->submatches[i].offstart; /* now, flush out any tokens which are before this point */ starttoken = find_token(s, start); if (starttoken == NULL) return; while ((struct _token *)s->input.head != starttoken) { token = (struct _token *)e_dlist_remhead(&s->input); output_token(s, token); } } static char * searcher_next_token(struct _searcher *s) { struct _token *token; char *tok, *stok; struct _trie *t = s->t; struct _state *q = s->state; struct _match *m; int offstart, offend; guint32 c; while (e_dlist_empty(&s->output)) { /* get next token */ tok = s->next_token(s->next_data); if (tok == NULL) { output_subpending(s); output_pending(s); break; } /* we dont always have to copy each token, e.g. if we dont match anything */ token = append_token(&s->input, tok, -1); token->offset = s->offset; tok = token->tok; d(printf("new token %d '%s'\n", token->offset, token->tok[0]==TAG_ESCAPE?token->tok+1:token->tok)); /* tag test, reset state on unknown tags */ if (tok[0] == TAG_ESCAPE) { if (!ignore_tag(tok)) { /* force reset */ output_subpending(s); output_pending(s); q = &t->root; } continue; } /* process whole token */ stok = tok; while ((c = camel_utf8_getc((const unsigned char **)&tok))) { if ((s->flags & SEARCH_CASE) == 0) c = g_unichar_tolower(c); while (q && (m = g(q, c)) == NULL) q = q->fail; if (q == 0) { /* mismatch ... reset state */ output_subpending(s); q = &t->root; } else { /* keep track of previous offsets of utf8 chars, rotating buffer */ s->last[s->lastp] = s->offset + (tok-stok)-1; s->lastp = (s->lastp+1)&s->last_mask; q = m->match; /* we have a match of q->final characters for a matching word */ if (q->final) { s->matchcount++; /* use the last buffer to find the real offset of this char */ offstart = s->last[(s->lastp - q->final)&s->last_mask]; offend = s->offset + (tok - stok); if (q->matches == NULL) { if (s->submatchp == 0) { /* nothing pending, always put something in so we can try merge */ push_subpending(s, offstart, offend); } else if (!merge_subpending(s, offstart, offend)) { /* can't merge, output what we have, and start againt */ output_subpending(s); push_subpending(s, offstart, offend); /*output_match(s, offstart, offend);*/ } else if (e_dlist_length(&s->input) > 8) { /* we're continuing to match and merge, but we have a lot of stuff waiting, so flush it out now since this is a safe point to do it */ output_subpending(s); } } else { /* merge/add subpending */ if (!merge_subpending(s, offstart, offend)) push_subpending(s, offstart, offend); } } } } s->offset += (tok-stok)-1; flush_extra(s); } s->state = q; if (s->current) free_token(s->current); s->current = token = (struct _token *)e_dlist_remhead(&s->output); return token?token->tok:NULL; } static char * searcher_peek_token(struct _searcher *s) { char *tok; /* we just get it and then put it back, it's fast enuf */ tok = searcher_next_token(s); if (tok) { /* need to clear this so we dont free it while its still active */ e_dlist_addhead(&s->output, (EDListNode *)s->current); s->current = NULL; } return tok; } static int searcher_pending(struct _searcher *s) { return !(e_dlist_empty(&s->input) && e_dlist_empty(&s->output)); } /* ********************************************************************** */ struct _search_info { GPtrArray *strv; char *colour; unsigned int size:8; unsigned int flags:8; }; /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ static struct _search_info * search_info_new(void) { struct _search_info *s; s = g_malloc0(sizeof(struct _search_info)); s->strv = g_ptr_array_new(); return s; } static void search_info_set_flags(struct _search_info *si, unsigned int flags, unsigned int mask) { si->flags = (si->flags & ~mask) | (flags & mask); } static void search_info_set_colour(struct _search_info *si, const char *colour) { g_free(si->colour); si->colour = g_strdup(colour); } static void search_info_add_string(struct _search_info *si, const char *s) { const char *start; guint32 c; if (s && s[0]) { /* strip leading whitespace */ start = s; while ((c = camel_utf8_getc((const unsigned char **)&s))) { if (!g_unichar_isspace(c)) { break; } start = s; } /* should probably also strip trailing, but i'm lazy today */ if (start[0]) g_ptr_array_add(si->strv, g_strdup(start)); } } static void search_info_clear(struct _search_info *si) { int i; for (i=0;istrv->len;i++) g_free(si->strv->pdata[i]); g_ptr_array_set_size(si->strv, 0); } static void search_info_free(struct _search_info *si) { int i; for (i=0;istrv->len;i++) g_free(si->strv->pdata[i]); g_ptr_array_free(si->strv, TRUE); g_free(si->colour); g_free(si); } static struct _search_info * search_info_clone(struct _search_info *si) { struct _search_info *out; int i; out = search_info_new(); for (i=0;istrv->len;i++) g_ptr_array_add(out->strv, g_strdup(si->strv->pdata[i])); out->colour = g_strdup(si->colour); out->flags = si->flags; out->size = si->size; return out; } static struct _searcher * search_info_to_searcher(struct _search_info *si) { char *tags, *tage; char *col; if (si->strv->len == 0) return NULL; if (si->colour == NULL) col = "red"; else col = si->colour; tags = alloca(20+strlen(col)); sprintf(tags, "%c", TAG_ESCAPE, col); tage = alloca(20); sprintf(tage, "%c", TAG_ESCAPE); return searcher_new(si->flags, si->strv->len, (char **)si->strv->pdata, tags, tage); } /* ********************************************************************** */ struct _ESearchingTokenizerPrivate { struct _search_info *primary, *secondary; struct _searcher *engine; }; /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ /* shoudlnt' this be finalise? */ static void e_searching_tokenizer_destroy (GtkObject *obj) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (obj); struct _ESearchingTokenizerPrivate *p = st->priv; search_info_free (p->primary); search_info_free (p->secondary); if (p->engine) searcher_free(p->engine); /* again wtf? shared_state_unref (st->priv->shared); */ g_free(p); if (parent_class->destroy) parent_class->destroy (obj); } static void e_searching_tokenizer_class_init (ESearchingTokenizerClass *klass) { GtkObjectClass *obj_class = (GtkObjectClass *) klass; HTMLTokenizerClass *tok_class = HTML_TOKENIZER_CLASS (klass); e_searching_tokenizer_signals[EST_MATCH_SIGNAL] = gtk_signal_new ("match", GTK_RUN_LAST, obj_class->type, GTK_SIGNAL_OFFSET (ESearchingTokenizerClass, match), gtk_marshal_NONE__NONE, GTK_TYPE_NONE, 0); gtk_object_class_add_signals (obj_class, e_searching_tokenizer_signals, EST_LAST_SIGNAL); obj_class->destroy = e_searching_tokenizer_destroy; tok_class->begin = e_searching_tokenizer_begin; tok_class->end = e_searching_tokenizer_end; tok_class->peek_token = e_searching_tokenizer_peek_token; tok_class->next_token = e_searching_tokenizer_next_token; tok_class->has_more = e_searching_tokenizer_has_more; tok_class->clone = e_searching_tokenizer_clone; parent_class = gtk_type_class (HTML_TYPE_TOKENIZER); } static void e_searching_tokenizer_init (ESearchingTokenizer *st) { struct _ESearchingTokenizerPrivate *p; p = st->priv = g_new0 (struct _ESearchingTokenizerPrivate, 1); p->primary = search_info_new(); search_info_set_flags(p->primary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD); search_info_set_colour(p->primary, "red"); p->secondary = search_info_new(); search_info_set_flags(p->secondary, SEARCH_BOLD, SEARCH_CASE|SEARCH_BOLD); search_info_set_colour(p->secondary, "purple"); } GtkType e_searching_tokenizer_get_type (void) { static GtkType e_searching_tokenizer_type = 0; if (! e_searching_tokenizer_type) { static GtkTypeInfo e_searching_tokenizer_info = { "ESearchingTokenizer", sizeof (ESearchingTokenizer), sizeof (ESearchingTokenizerClass), (GtkClassInitFunc) e_searching_tokenizer_class_init, (GtkObjectInitFunc) e_searching_tokenizer_init, NULL, NULL, (GtkClassInitFunc) NULL }; e_searching_tokenizer_type = gtk_type_unique (HTML_TYPE_TOKENIZER, &e_searching_tokenizer_info); } return e_searching_tokenizer_type; } HTMLTokenizer * e_searching_tokenizer_new (void) { return (HTMLTokenizer *) gtk_type_new (E_TYPE_SEARCHING_TOKENIZER); } /** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** **/ /* blah blah the htmltokeniser doesn't like being asked for a token if it doens't hvae any! */ static char *get_token(HTMLTokenizer *t) { HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class); return klass->has_more(t) ? klass->next_token(t) : NULL; } static void e_searching_tokenizer_begin (HTMLTokenizer *t, gchar *content_type) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t); struct _ESearchingTokenizerPrivate *p = st->priv; /* reset search */ if (p->engine) { searcher_free(p->engine); p->engine = NULL; } if ((p->engine = search_info_to_searcher(p->primary)) || (p->engine = search_info_to_searcher(p->secondary))) { /*HTMLTokenizerClass *klass = HTML_TOKENIZER_CLASS (parent_class);*/ /*searcher_set_tokenfunc(p->engine, klass->next_token, st);*/ searcher_set_tokenfunc(p->engine, get_token, st); } /* else - no engine, no search active */ HTML_TOKENIZER_CLASS (parent_class)->begin (t, content_type); } static void e_searching_tokenizer_end (HTMLTokenizer *t) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (t); struct _ESearchingTokenizerPrivate *p = st->priv; /* so end gets called before any get/next tokens. I dont get it. */ #if 0 /* not sure if we should reset search every time ... *shrug* */ if (p->engine) { searcher_free(p->engine); p->engine = NULL; } #endif HTML_TOKENIZER_CLASS (parent_class)->end (t); } static gchar * e_searching_tokenizer_peek_token (HTMLTokenizer *tok) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); /* If no search is active, just use the default method. */ if (st->priv->engine == NULL) return HTML_TOKENIZER_CLASS (parent_class)->peek_token (tok); return searcher_peek_token(st->priv->engine); } static gchar * e_searching_tokenizer_next_token (HTMLTokenizer *tok) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); int oldmatched; char *token; /* If no search is active, just use the default method. */ if (st->priv->engine == NULL) return HTML_TOKENIZER_CLASS (parent_class)->next_token (tok); oldmatched = st->priv->engine->matchcount; token = searcher_next_token(st->priv->engine); /* not sure if this has to be accurate or just say we had some matches */ if (oldmatched != st->priv->engine->matchcount) gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); return token; } static gboolean e_searching_tokenizer_has_more (HTMLTokenizer *tok) { ESearchingTokenizer *st = E_SEARCHING_TOKENIZER (tok); return (st->priv->engine != NULL && searcher_pending(st->priv->engine)) || HTML_TOKENIZER_CLASS (parent_class)->has_more (tok); } /* proxy matched event, not sure what its for otherwise */ static void matched (ESearchingTokenizer *st) { /*++st->priv->match_count;*/ gtk_signal_emit (GTK_OBJECT (st), e_searching_tokenizer_signals[EST_MATCH_SIGNAL]); } static HTMLTokenizer * e_searching_tokenizer_clone (HTMLTokenizer *tok) { ESearchingTokenizer *orig_st = E_SEARCHING_TOKENIZER (tok); ESearchingTokenizer *new_st = E_SEARCHING_TOKENIZER (e_searching_tokenizer_new ()); search_info_free(new_st->priv->primary); search_info_free(new_st->priv->secondary); new_st->priv->primary = search_info_clone(orig_st->priv->primary); new_st->priv->secondary = search_info_clone(orig_st->priv->secondary); /* what the fucking what???? */ #if 0 shared_state_ref (orig_st->priv->shared); shared_state_unref (new_st->priv->shared); new_st->priv->shared = orig_st->priv->shared; #endif gtk_signal_connect_object (GTK_OBJECT (new_st), "match", GTK_SIGNAL_FUNC (matched), GTK_OBJECT (orig_st)); return HTML_TOKENIZER (new_st); } /* ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** ** */ void e_searching_tokenizer_set_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); search_info_clear(st->priv->primary); search_info_add_string(st->priv->primary, search_str); } void e_searching_tokenizer_add_primary_search_string (ESearchingTokenizer *st, const gchar *search_str) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); search_info_add_string(st->priv->primary, search_str); } void e_searching_tokenizer_set_primary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); search_info_set_flags(st->priv->primary, iscase?SEARCH_CASE:0, SEARCH_CASE); } void e_searching_tokenizer_set_secondary_search_string (ESearchingTokenizer *st, const gchar *search_str) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); search_info_clear(st->priv->secondary); search_info_add_string(st->priv->secondary, search_str); } void e_searching_tokenizer_add_secondary_search_string (ESearchingTokenizer *st, const gchar *search_str) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); search_info_add_string(st->priv->secondary, search_str); } void e_searching_tokenizer_set_secondary_case_sensitivity (ESearchingTokenizer *st, gboolean iscase) { g_return_if_fail (st && E_IS_SEARCHING_TOKENIZER (st)); search_info_set_flags(st->priv->secondary, iscase?SEARCH_CASE:0, SEARCH_CASE); } gint e_searching_tokenizer_match_count (ESearchingTokenizer *st) { g_return_val_if_fail (st && E_IS_SEARCHING_TOKENIZER (st), -1); if (st->priv->engine) return st->priv->engine->matchcount; return 0; }