aboutsummaryrefslogtreecommitdiffstats
path: root/libibex/wordindex.c
diff options
context:
space:
mode:
Diffstat (limited to 'libibex/wordindex.c')
-rw-r--r--libibex/wordindex.c645
1 files changed, 0 insertions, 645 deletions
diff --git a/libibex/wordindex.c b/libibex/wordindex.c
deleted file mode 100644
index 6ddbcb599a..0000000000
--- a/libibex/wordindex.c
+++ /dev/null
@@ -1,645 +0,0 @@
-/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
- *
- * Copyright (C) 2000 Ximian, Inc.
- *
- * Authors: Michael Zucchi <notzed@ximian.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of version 2 of the GNU General Public
- * License as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public
- * License along with this program; if not, write to the
- * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
- * Boston, MA 02111-1307, USA.
- */
-
-/* code to manage a word index */
-/* includes a cache for word index writes,
- but not for name index writes (currently), or any reads.
-
-Note the word cache is only needed during indexing of lots
-of words, and could then be discarded (:flush()).
-
-*/
-
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <unistd.h>
-
-#include <glib.h>
-
-#include "block.h"
-#include "index.h"
-#include "wordindex.h"
-
-#define d(x)
-
-/*#define WORDCACHE_SIZE (256)*/
-#define WORDCACHE_SIZE (4096)
-
-extern struct _IBEXStoreClass ibex_diskarray_class;
-extern struct _IBEXIndexClass ibex_hash_class;
-
-/* need 2 types of hash key?
- one that just stores the wordid / wordblock
- and one that stores the filecount/files?
-*/
-
-
-#define CACHE_FILE_COUNT (62)
-
-struct _wordcache {
- struct _wordcache *next;
- struct _wordcache *prev;
- nameid_t wordid; /* disk wordid */
- blockid_t wordblock; /* head of disk list */
- blockid_t wordtail; /* and the tail data */
- short filecount; /* how many valid items in files[] */
- short filealloc; /* how much allocated space in files[] */
- union {
- nameid_t *files; /* memory cache of files */
- nameid_t file0; /* if filecount == 1 && filealloc == 0, store directly */
- } file;
- char word[1]; /* actual word follows */
-};
-
-static void unindex_name(struct _IBEXWord *, const char *name); /* unindex all entries for name */
-static gboolean contains_name(struct _IBEXWord *, const char *name); /* index contains data for name */
-static GPtrArray *find(struct _IBEXWord *, const char *word); /* returns all matches for word */
-static gboolean find_name(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */
-static void add(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name (slow) */
-static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
-static int word_sync(struct _IBEXWord *idx);
-static int word_flush(struct _IBEXWord *idx);
-static int word_close(struct _IBEXWord *idx);
-static void word_index_pre(struct _IBEXWord *idx);
-static void word_index_post(struct _IBEXWord *idx);
-
-struct _IBEXWordClass ibex_word_index_class = {
- word_sync, word_flush, word_close,
- word_index_pre, word_index_post,
- unindex_name, contains_name,
- find, find_name,
- add, add_list
-};
-
-/* this interface isn't the best, but it'll do for now */
-struct _IBEXWord *
-ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot)
-{
- struct _IBEXWord *idx;
-
- idx = g_malloc0(sizeof(*idx));
- idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal);
- ibex_list_new(&idx->wordnodes);
- idx->wordcount = 0;
- idx->klass = &ibex_word_index_class;
-
- /* we use the same block array storage for both indexes at the moment */
- idx->wordstore = ibex_diskarray_class.create(bc);
- idx->namestore = idx->wordstore;
-
- /* but not the same indexes! */
- if (*wordroot) {
- printf("opening wordindex root = %d\n", *wordroot);
- idx->wordindex = ibex_hash_class.open(bc, *wordroot);
- } else {
- idx->wordindex = ibex_hash_class.create(bc, 2048);
- *wordroot = idx->wordindex->root;
- printf("creating wordindex root = %d\n", *wordroot);
- }
- if (*nameroot) {
- printf("opening nameindex root = %d\n", *nameroot);
- idx->nameindex = ibex_hash_class.open(bc, *nameroot);
- } else {
- idx->nameindex = ibex_hash_class.create(bc, 2048);
- *nameroot = idx->nameindex->root;
- printf("creating nameindex root = %d\n", *nameroot);
- }
- return idx;
-}
-
-#if d(!)0
-static void
-cache_sanity(struct _wordcache *head)
-{
- while (head->next) {
- g_assert(head->filecount <= head->filealloc);
- g_assert(strlen(head->word) != 0);
- head = head->next;
- }
-}
-#endif
-
-static void word_index_pre(struct _IBEXWord *idx)
-{
-}
-
-static void word_index_post(struct _IBEXWord *idx)
-{
-}
-
-/* unindex all entries for name */
-static void unindex_name(struct _IBEXWord *idx, const char *name)
-{
- GArray *words;
- int i;
- nameid_t nameid, wordid;
- blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail;
- char *word;
- struct _wordcache *cache;
-
- d(printf("unindexing %s\n", name));
-
- /* lookup the hash key */
- nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
- /* get the block for this key */
- nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
- /* and the data for this block */
- words = idx->namestore->klass->get(idx->namestore, nameblock, nametail);
- /* walk it ... */
- for (i=0;i<words->len;i++) {
- /* get the word */
- wordid = g_array_index(words, nameid_t, i);
- d(printf(" word %d\n", wordid));
- /* get the data block */
- wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
- /* clear this name from it */
- newblock = wordblock;
- newtail = wordtail;
- idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid);
- if (newblock != wordblock || newtail != wordtail)
- idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail);
-
- /* now check the cache as well */
- word = idx->nameindex->klass->get_key(idx->wordindex, wordid, NULL);
- if (word) {
- cache = g_hash_table_lookup(idx->wordcache, word);
- if (cache) {
- /* its there, update our head/tail pointers */
- cache->wordblock = newblock;
- cache->wordtail = newtail;
-
- /* now check that we have a data entry in it */
- if (cache->filealloc == 0 && cache->filecount == 1) {
- if (cache->file.file0 == nameid) {
- cache->filecount = 0;
- }
- } else {
- int j;
-
- for (j=0;j<cache->filecount;j++) {
- if (cache->file.files[j] == nameid) {
- cache->file.files[j] = cache->file.files[cache->filecount-1];
- cache->filecount--;
- break;
- }
- }
- }
- }
- g_free(word);
- }
- }
- g_array_free(words, TRUE);
-
- /* and remove name data and itself */
- idx->namestore->klass->free(idx->namestore, nameblock, nametail);
- idx->nameindex->klass->remove(idx->nameindex, name, strlen(name));
-}
-
-/* index contains (any) data for name */
-static gboolean contains_name(struct _IBEXWord *idx, const char *name)
-{
- return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0;
-}
-
-/* returns all matches for word */
-static GPtrArray *find(struct _IBEXWord *idx, const char *word)
-{
- nameid_t wordid, nameid;
- GPtrArray *res;
- GArray *names;
- int i;
- char *new;
- struct _wordcache *cache;
- blockid_t wordblock, wordtail;
-
- res = g_ptr_array_new();
-
- cache = g_hash_table_lookup(idx->wordcache, word);
- if (cache) {
- /* freshen cache entry if we touch it */
- ibex_list_remove((struct _listnode *)cache);
- ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
- wordid = cache->wordid;
- wordblock = cache->wordblock;
- wordtail = cache->wordtail;
- } else {
- /* lookup the hash key */
- wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
- /* get the block for this key */
- wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
- }
- /* and the data for this block */
- names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail);
- /* .. including any memory-only data */
- if (cache) {
- if (cache->filealloc == 0 && cache->filecount == 1)
- g_array_append_val(names, cache->file.file0);
- else
- g_array_append_vals(names, cache->file.files, cache->filecount);
- }
-
- /* walk it ... converting id's back to strings */
- g_ptr_array_set_size(res, names->len);
- for (i=0;i<names->len;i++) {
- nameid = g_array_index(names, nameid_t, i);
- new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL);
- res->pdata[i] = new;
- }
- g_array_free(names, TRUE);
- return res;
-}
-
-/* find if name contains word */
-static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word)
-{
- nameid_t wordid, nameid;
- blockid_t nameblock, nametail;
- struct _wordcache *cache;
- int i;
-
- /* lookup the hash key for the name */
- nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
- /* get the block for this name */
- nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
-
- /* check if there is an in-memory cache for this word, check its file there first */
- cache = g_hash_table_lookup(idx->wordcache, word);
- if (cache) {
- /* freshen cache entry if we touch it */
- ibex_list_remove((struct _listnode *)cache);
- ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
- if (cache->filecount == 1 && cache->filealloc == 0) {
- if (cache->file.file0 == nameid)
- return TRUE;
- } else {
- for (i=0;i<cache->filecount;i++) {
- if (cache->file.files[i] == nameid)
- return TRUE;
- }
- }
- /* not there? well we can use the wordid anyway */
- wordid = cache->wordid;
- } else {
- /* lookup the hash key for word */
- wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
- }
-
- /* see if wordid is in nameblock */
- return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid);
-}
-
-/* cache helper functions */
-/* flush a cache entry to disk, and empty it out */
-static void
-sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache)
-{
- GArray array; /* just use this as a header */
- blockid_t oldblock, oldtail;
-
- d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount));
- if (cache->filecount == 1 && cache->filealloc == 0)
- array.data = (char *)&cache->file.file0;
- else
- array.data = (char *)cache->file.files;
- array.len = cache->filecount;
- oldblock = cache->wordblock;
- oldtail = cache->wordtail;
- idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array);
- if (oldblock != cache->wordblock || oldtail != cache->wordtail) {
- idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail);
- }
- cache->filecount = 0;
-}
-
-/* create a new key in an index, returning its id and head block */
-static void
-add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail)
-{
- /* initialise cache entry - id of word entry and head block */
- *wordid = wordindex->klass->find(wordindex, word, strlen(word));
- if (*wordid == 0) {
- *wordid = wordindex->klass->insert(wordindex, word, strlen(word));
- *wordblock = 0;
- *wordtail = 0;
- } else {
- *wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail);
- }
-}
-
-/* create a new key in a cached index (only word cache so far), flushing old keys
- if too much space is being used */
-static struct _wordcache *
-add_index_cache(struct _IBEXWord *idx, const char *word)
-{
- struct _wordcache *cache;
-
- d(printf("adding %s to cache\n", word));
-
- cache = g_hash_table_lookup(idx->wordcache, word);
- if (cache == 0) {
- /* see if we have to flush off the last entry */
- if (idx->wordcount >= WORDCACHE_SIZE) {
- struct _wordcache *mincache;
- int min, count=0;
- /* remove last entry, and flush it */
- cache = (struct _wordcache *)idx->wordnodes.tailpred;
- mincache = cache;
- min = mincache->filecount;
-
- d(printf("flushing word from cache %s\n", cache->word));
- /* instead of just using the last entry, we try and find an entry with
- with only 1 item (failing that, the smallest in the range we look at) */
- /* this could probably benefit greatly from a more sophisticated aging algorithm */
- while (cache->next && count < 100) {
- if (cache->filecount == 1) {
- mincache = cache;
- break;
- }
- if (cache->filecount > 0 && cache->filecount < min) {
- mincache = cache;
- min = cache->filecount;
- }
- cache = cache->next;
- count++;
- }
- ibex_list_remove((struct _listnode *)mincache);
- g_hash_table_remove(idx->wordcache, mincache->word);
- sync_cache_entry(idx, mincache);
- if (mincache->filealloc)
- g_free(mincache->file.files);
- g_free(mincache);
- idx->wordcount--;
- }
- cache = g_malloc0(sizeof(*cache)+strlen(word));
- /* initialise cache entry - id of word entry and head block */
- add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail);
- /* other fields */
- strcpy(cache->word, word);
- cache->filecount = 0;
- g_hash_table_insert(idx->wordcache, cache->word, cache);
- ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
- idx->wordcount++;
- } else {
- /* move cache bucket ot the head of the cache list */
- ibex_list_remove((struct _listnode *)cache);
- ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
- }
- return cache;
-}
-
-/* adds a single word to name (slow) */
-static void add(struct _IBEXWord *idx, const char *name, const char *word)
-{
- nameid_t nameid;
- blockid_t nameblock, newblock, nametail, newtail;
- struct _wordcache *cache;
-
- g_error("Dont use wordindex::add()");
- abort();
-
- cache = add_index_cache(idx, word);
-
- /* get the nameid and block start for this name */
- add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
-
- /* check for repeats of the last name - dont add anything */
- if (cache->filecount == 1 && cache->filealloc == 0) {
- if (cache->file.file0 == nameid)
- return;
- } else {
- if (cache->file.files[cache->filecount] == nameid)
- return;
- }
-
- /* see if we are setting the first, drop it in the union */
- if (cache->filecount == 0 && cache->filealloc == 0) {
- cache->file.file0 = nameid;
- } else if (cache->filecount == 1 && cache->filealloc == 0) {
- nameid_t saveid;
- /* we need to allocate space for words */
- saveid = cache->file.file0;
- cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
- /* this could possibly grow as needed, but i wont for now */
- cache->filealloc = CACHE_FILE_COUNT;
- cache->file.files[0] = saveid;
- cache->file.files[1] = nameid;
- } else {
- cache->file.files[cache->filecount] = nameid;
- }
-
- cache->filecount++;
-
- /* if we are full, force a flush now */
- if (cache->filealloc && cache->filecount >= cache->filealloc) {
- sync_cache_entry(idx, cache);
- }
-
- newtail = nametail;
- newblock = nameblock;
- idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid);
- if (newblock != nameblock || newtail != nametail) {
- idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
- }
-}
-
-/* adds a bunch of words to a given name */
-static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
-{
- int i;
- GArray *data = g_array_new(0, 0, sizeof(nameid_t));
- blockid_t nameblock, newblock, nametail, newtail;
- nameid_t nameid;
- struct _wordcache *cache;
-
- d(printf("Adding words to name %s\n", name));
-
- d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
-
- /* get the nameid and block start for this name */
- add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
-
- d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
-
- for (i=0;i<words->len;i++) {
- char *word = words->pdata[i];
-
- cache = add_index_cache(idx, word);
-
- /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
-
- /* check for duplicates; doesn't catch duplicates over an overflow boundary. Watch me care. */
- if (cache->filecount == 0
- /* the 1 item case */
- || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid)
- /* the normal case */
- || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) {
-
- /* see if we are setting the first, drop it in the union */
- if (cache->filecount == 0 && cache->filealloc == 0) {
- cache->file.file0 = nameid;
- } else if (cache->filecount == 1 && cache->filealloc == 0) {
- nameid_t saveid;
- /* we need to allocate space for words */
- saveid = cache->file.file0;
- cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
- /* this could possibly grow as needed, but i wont for now */
- cache->filealloc = CACHE_FILE_COUNT;
- cache->file.files[0] = saveid;
- cache->file.files[1] = nameid;
- } else {
- cache->file.files[cache->filecount] = nameid;
- }
-
- cache->filecount++;
-
- /* if we are full, force a flush now */
- if (cache->filealloc && cache->filecount >= cache->filealloc) {
- sync_cache_entry(idx, cache);
- }
-
- /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
-
- /* and append this wordid for this name in memory */
- g_array_append_val(data, cache->wordid);
- }
-
- /*d(cache_sanity((struct _wordcache *)idx->wordnodes.head));*/
- }
-
- d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
-
- /* and append these word id's in one go */
- newblock = nameblock;
- newtail = nametail;
- idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data);
- if (newblock != nameblock || newtail != nametail) {
- idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
- }
-
- d(cache_sanity((struct _wordcache *)idx->wordnodes.head));
-
- g_array_free(data, TRUE);
-}
-
-/* sync any in-memory data to disk */
-static int
-word_sync(struct _IBEXWord *idx)
-{
- /* we just flush also, save memory */
- word_flush(idx);
-
-#if 0
- struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head;
-
- while (cache->next) {
- sync_cache_entry(idx, cache);
- cache = cache->next;
- }
-
- /*ibex_hash_dump(idx->wordindex);*/
- /*ibex_hash_dump(idx->nameindex);*/
-#endif
- return 0;
-}
-
-/* sync and flush any in-memory data to disk and free it */
-static int
-word_flush(struct _IBEXWord *idx)
-{
- struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
- extern int block_log;
- int count= 0;
- int used=0, wasted=0;
-
- block_log = 0;
-
- next = cache->next;
- while (next) {
- count++;
- used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t));
- if (cache->filealloc)
- wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t);
- else
- wasted += (1-cache->filecount) * sizeof(nameid_t);
-
- /*printf("syncing word %s\n", cache->word);*/
- sync_cache_entry(idx, cache);
- g_hash_table_remove(idx->wordcache, cache->word);
- if (cache->filealloc)
- g_free(cache->file.files);
- g_free(cache);
- cache = next;
- next = cache->next;
- }
-
- printf("sync cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted);
-
- block_log = 0;
- ibex_list_new(&idx->wordnodes);
- idx->wordcount = 0;
- return 0;
-}
-
-static int word_close(struct _IBEXWord *idx)
-{
- struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head, *next;
- extern int block_log;
- int count= 0;
- int used=0, wasted=0;
-
- block_log = 0;
-
- next = cache->next;
- while (next) {
- count++;
- used += sizeof(struct _wordcache) + (cache->filealloc * sizeof(nameid_t));
- if (cache->filealloc)
- wasted += (cache->filealloc-cache->filecount)*sizeof(nameid_t);
- else
- wasted += (1-cache->filecount) * sizeof(nameid_t);
-
- /*printf("closing word %s\n", cache->word);*/
- sync_cache_entry(idx, cache);
- if (cache->filealloc)
- g_free(cache->file.files);
- g_free(cache);
- cache = next;
- next = cache->next;
- }
- block_log = 0;
-
- printf("cache entries = %d\n used memory = %d\n wasted memory = %d\n", count, used, wasted);
-
- idx->namestore->klass->close(idx->namestore);
- idx->nameindex->klass->close(idx->nameindex);
- /*same as namestore:
- idx->wordstore->klass->close(idx->wordstore);*/
- idx->wordindex->klass->close(idx->wordindex);
- g_hash_table_destroy(idx->wordcache);
- g_free(idx);
-
- return 0;
-}