13 files changed, 1121 insertions, 86 deletions
diff --git a/libibex/ChangeLog b/libibex/ChangeLog
index 53187c03c6..5318e935c4 100644
--- a/libibex/ChangeLog
+++ b/libibex/ChangeLog
@@ -1,14 +1,83 @@
+2000-10-25  Not Zed  <NotZed@HelixCode.com>
+
+	* ibex_internal.h (IBEX_VERSION): Bumped to another version.  The
+	file format hasn't changed, but earlier bugs may create invalid
+	files.
+
+	* block.c (ibex_block_read): Use the root data directly.
+	(ibex_block_cache_open): As well.
+	(ibex_block_get): And here too.
+	(ibex_block_cache_sync): Sync the root block directly here.
+
+	* block.h: Pad root block out to 1024 bytes.
+	Added root block to struct _memcache.
+
+	* disktail.c (tail_get): Dirty the root block.
+	(tail_get): Fix for changes to root access.
+	(disk_remove): And here too.
+
+	* wordindexmem.c (sync_cache_entry): Handle the case of not having
+	any files in the list, which can happen now.
+	(word_index_pre): Make sure we set the wordid on the new cache
+	entry.
+
+	* ibex_block.c (ibex_save): Sigh.  Pass the right argument to
+	index_post.
+
 2000-10-24  JP Rosevear  <jpr@helixcode.com>
 
 	* .cvsignore: Shush
 
 2000-10-24  Not Zed  <NotZed@HelixCode.com>
 
+	* block.c (ibex_block_cache_open): Create a word_index_mem for
+	indexing the words, rather than a word_index.
+
+	* ibex_block.c (ibex_index_buffer): If we haven't called index_pre
+	yet, do it before indexing anything.
+	(ibex_save): If wehave called index_pre previously, call
+	index_post.
+	(ibex_close): And same for here.
+
+	* index.h: Added a cursor class, and cursor retrieval function for
+	iterating through an index's keys.
+
+	* wordindexmem.c (ibex_create_word_index_mem): New word class,
+	similar to wordindex, but meant to be faster for updates.
+	(word_index_pre): Implement.  We load all keys into memory.
+	(word_index_post): Implement.  We sync and free all keys.
+	(find): Remove lru code, its no longer a cache, but a lookup
+	table.
+	(add_index_cache): Remove lru code here too.
+	(find_name): And here.
+	(word_flush): Flush the hashtable direct.
+	(word_close): Call flush to flush, rather than doing it ourselves.
+	(add_index_cache): If we are in an index state, we can assume a
+	cache miss == a new word.
+	(word_index_post): Maintain whether or not we are in an index
+	state, and the depth of the state.
+	(word_index_pre): Likewise.  Dont reread the index if we have
+	already.
+	(cache_sanity): Fixed for struct changes.
+
+	* wordindex.h (IBEXWordClass): Added functions to prepare/cleanup
+	for lots of indexing.  i.e. can be used to optimise indexing speed
+	at the cost of extra memory usage during the indexing process.
+
 	* dumpindex.c: Dumps the contents of indexs.
 
 	* hash.c (ibex_hash_dump_rec): Also print the word count.
+	(hash_cursor_create): Create a new cursor for iterating through a
+	hashtable.
+	(hash_cursor_close): 'close' the cursor. It is upto the
+	application to close any cursors it creates.
+	(hash_cursor_next): Goto the next key id.
+	(hash_cursor_next_key): Goto the next key, reutrn the key.
+	(hash_get_cursor): Return a cursor object.
 
 	* wordindex.c (unindex_name): Cross-check the cache as well.
+	(word_index_post): 
+	(word_index_pre): Added (empty) callbacks for pre/post functions.
 
 2000-10-12  Not Zed  <NotZed@HelixCode.com>
 
diff --git a/libibex/Makefile.am b/libibex/Makefile.am
index fdde10aca3..6cc88186d6 100644
--- a/libibex/Makefile.am
+++ b/libibex/Makefile.am
@@ -3,7 +3,7 @@
 noinst_LTLIBRARIES = libibex.la
 
 libibex_la_SOURCES = \
-	wordindex.c \
+	wordindex.c wordindexmem.c \
 	block.c ibex.h \
 	hash.c \
 	disktail.c \
diff --git a/libibex/block.c b/libibex/block.c
index 8e10a116cb..52a79d27ae 100644
--- a/libibex/block.c
+++ b/libibex/block.c
@@ -30,11 +30,18 @@
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
-
+#include <errno.h>
+#include <string.h>
 #include <glib.h>
 
 #include "block.h"
 
+/*#define MALLOC_CHECK*/
+
+#ifdef MALLOC_CHECK
+#include <mcheck.h>
+#endif
+
 #define d(x)
 /*#define DEBUG*/
 
@@ -113,6 +120,27 @@ add_miss(struct _memcache *index, int id)
 }
 #endif /* IBEX_STATS */
 
+#ifdef MALLOC_CHECK
+static void
+checkmem(void *p)
+{
+	if (p) {
+		int status = mprobe(p);
+
+		switch (status) {
+		case MCHECK_HEAD:
+			printf("Memory underrun at %p\n", p);
+			abort();
+		case MCHECK_TAIL:
+			printf("Memory overrun at %p\n", p);
+			abort();
+		case MCHECK_FREE:
+			printf("Double free %p\n", p);
+			abort();
+		}
+	}
+}
+#endif
 
 /* simple list routines (for simplified memory management of cache/lists) */
 
@@ -187,6 +215,29 @@ memblock_addr(struct _block *block)
 	return (struct _memblock *)(((char *)block) - G_STRUCT_OFFSET(struct _memblock, data));
 }
 
+/* read/sync the rootblock into the block_cache structure */
+static int
+ibex_block_read_root(struct _memcache *block_cache)
+{
+	lseek(block_cache->fd, 0, SEEK_SET);
+	if (read(block_cache->fd, &block_cache->root, sizeof(block_cache->root)) != sizeof(block_cache->root)) {
+		
+		return -1;
+	}
+	return 0;
+}
+
+static int
+ibex_block_sync_root(struct _memcache *block_cache)
+{
+	lseek(block_cache->fd, 0, SEEK_SET);
+	if (write(block_cache->fd, &block_cache->root, sizeof(block_cache->root)) != sizeof(block_cache->root)) {
+		return -1;
+	}
+	return fsync(block_cache->fd);
+}
+
+
 /**
  * ibex_block_dirty:
  * @block: 
@@ -224,27 +275,22 @@ sync_block(struct _memcache *block_cache, struct _memblock *memblock)
 void
 ibex_block_cache_sync(struct _memcache *block_cache)
 {
-	struct _memblock *memblock, *rootblock = 0;
+	struct _memblock *memblock;
 
 	memblock = (struct _memblock *)block_cache->nodes.head;
 	while (memblock->next) {
-		if (memblock->block == 0) {
-			rootblock = memblock;
-		} else {
-			if (memblock->flags & BLOCK_DIRTY) {
-				sync_block(block_cache, memblock);
-			}
+#ifdef MALLOC_CHECK
+		checkmem(memblock);
+#endif
+		if (memblock->flags & BLOCK_DIRTY) {
+			sync_block(block_cache, memblock);
 		}
 		memblock = memblock->next;
 	}
 
-	if (rootblock) {
-		struct _root *root = (struct _root *)&rootblock->data;
-		root->flags |= IBEX_ROOT_SYNCF;
-		sync_block(block_cache, rootblock);
-	}
-	if (fsync(block_cache->fd) == 0) {
-		block_cache->flags |= IBEX_ROOT_SYNCF;
+	block_cache->root.flags |= IBEX_ROOT_SYNCF;
+	if (ibex_block_sync_root(block_cache) != 0) {
+		block_cache->root.flags &= ~IBEX_ROOT_SYNCF;
 	}
 
 #ifdef IBEX_STATS
@@ -253,6 +299,25 @@ ibex_block_cache_sync(struct _memcache *block_cache)
 
 }
 
+#ifdef MALLOC_CHECK
+static void
+check_cache(struct _memcache *block_cache)
+{
+	struct _memblock *mw, *mn;
+
+	checkmem(block_cache);
+	checkmem(block_cache->index);
+
+	mw = (struct _memblock *)block_cache->nodes.head;
+	mn = mw->next;
+	while (mn) {
+		checkmem(mw);
+		mw = mn;
+		mn = mn->next;
+	}
+}
+#endif
+
 /**
  * ibex_block_cache_flush:
  * @block_cache: 
@@ -278,7 +343,6 @@ ibex_block_cache_flush(struct _memcache *block_cache)
 	ibex_list_new(&block_cache->nodes);
 }
 
-
 /**
  * ibex_block_read:
  * @block_cache: 
@@ -296,32 +360,39 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
 {
 	struct _memblock *memblock;
 
-	{
-		/* assert blockid<roof */
-		if (blockid > 0) {
-			struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
-			g_assert(blockid < root->roof);
-		}
-	}
+#ifdef MALLOC_CHECK
+	check_cache(block_cache);
+#endif
+
+	/* nothing can read the root block directly */
+	g_assert(blockid != 0);
+	g_assert(blockid < block_cache->root.roof);
 
 	memblock = g_hash_table_lookup(block_cache->index, (void *)blockid);
+
+#ifdef MALLOC_CHECK
+	check_cache(block_cache);
+#endif
+
 	if (memblock) {
 		d(printf("foudn blockid in cache %d = %p\n", blockid, &memblock->data));
-#if 0
-		if (blockid == 0) {
-			struct _root *root = &memblock->data;
-			d(printf("superblock state:\n"
-				 " roof = %d\n free = %d\n words = %d\n names = %d\n tail = %d",
-				 root->roof, root->free, root->words, root->names, root->tail));
-			
-		}
-#endif
+
 		/* 'access' page */
 		ibex_list_remove((struct _listnode *)memblock);
 		ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
+
+#ifdef MALLOC_CHECK
+		check_cache(block_cache);
+#endif
+
 #ifdef IBEX_STATS
 		add_hit(block_cache, memblock->block);
 #endif
+
+#ifdef MALLOC_CHECK
+		check_cache(block_cache);
+#endif
+
 		return &memblock->data;
 	}
 #ifdef IBEX_STATS
@@ -338,6 +409,7 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
 	lseek(block_cache->fd, blockid, SEEK_SET);
 	memset(&memblock->data, 0, sizeof(memblock->data));
 	read(block_cache->fd, &memblock->data, sizeof(memblock->data));
+
 	ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock);
 	g_hash_table_insert(block_cache->index, (void *)blockid, memblock);
 	if (block_cache->count >= CACHE_SIZE) {
@@ -347,18 +419,14 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
 		ibex_list_remove((struct _listnode *)old);
 		if (old->flags & BLOCK_DIRTY) {
 			/* are we about to un-sync the file?  update root and sync it */
-			if (block_cache->flags & IBEX_ROOT_SYNCF) {
-				/* TODO: put the rootblock in the block_cache struct, not in the cache */
-				struct _memblock *rootblock = g_hash_table_lookup(block_cache->index, (void *)0);
-				struct _root *root = (struct _root *)&rootblock->data;
-
+			if (block_cache->root.flags & IBEX_ROOT_SYNCF) {
 				printf("Unsyncing root block\n");
 
-				g_assert(rootblock != NULL);
-				root->flags &= ~IBEX_ROOT_SYNCF;
-				sync_block(block_cache, rootblock);
-				if (fsync(block_cache->fd) == 0)
-					block_cache->flags &= ~IBEX_ROOT_SYNCF;
+				block_cache->root.flags &= ~IBEX_ROOT_SYNCF;
+				if (ibex_block_sync_root(block_cache) != 0) {
+					/* what do we do?  i dont know! */
+					g_warning("Could not sync root block of index: %s", strerror(errno));
+				}
 			}
 			sync_block(block_cache, old);
 		}
@@ -369,6 +437,9 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
 
 	d(printf("  --- cached blocks : %d\n", block_cache->count));
 
+#ifdef MALLOC_CHECK
+	check_cache(block_cache);
+#endif
 	return &memblock->data;
 }
 
@@ -389,7 +460,6 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid)
 struct _memcache *
 ibex_block_cache_open(const char *name, int flags, int mode)
 {
-	struct _root *root;
 	struct _memcache *block_cache = g_malloc0(sizeof(*block_cache));
 
 	d(printf("opening ibex file: %s", name));
@@ -406,33 +476,33 @@ ibex_block_cache_open(const char *name, int flags, int mode)
 		return NULL;
 	}
 
-	root = (struct _root *)ibex_block_read(block_cache, 0);
-	if (root->roof == 0
-	    || memcmp(root->version, "ibx4", 4)
-	    || ((root->flags & IBEX_ROOT_SYNCF) == 0)) {
+	ibex_block_read_root(block_cache);
+	if (block_cache->root.roof == 0
+	    || memcmp(block_cache->root.version, "ibx5", 4)
+	    || ((block_cache->root.flags & IBEX_ROOT_SYNCF) == 0)) {
 		(printf("Initialising superblock\n"));
 		/* reset root data */
-		memcpy(root->version, "ibx4", 4);
-		root->roof = 1024;
-		root->free = 0;
-		root->words = 0;
-		root->names = 0;
-		root->tail = 0;	/* list of tail blocks */
-		root->flags = 0;
-		ibex_block_dirty((struct _block *)root);
+		memcpy(block_cache->root.version, "ibx5", 4);
+		block_cache->root.roof = 1024;
+		block_cache->root.free = 0;
+		block_cache->root.words = 0;
+		block_cache->root.names = 0;
+		block_cache->root.tail = 0;	/* list of tail blocks */
+		block_cache->root.flags = 0;
+		ibex_block_sync_root(block_cache);
 		/* reset the file contents */
 		ftruncate(block_cache->fd, 1024);
 	} else {
 		(printf("superblock already initialised:\n"
-			" roof = %d\n free = %d\n words = %d\n names = %d\n tail = %d",
-			root->roof, root->free, root->words, root->names, root->tail));
+			" roof = %d\n free = %d\n words = %d\n names = %d\n tail = %d\n",
+			block_cache->root.roof, block_cache->root.free,
+			block_cache->root.words, block_cache->root.names, block_cache->root.tail));
 	}
-	block_cache->flags = root->flags;
-	/* this should be moved higher up */
+	/* FIXME: this should be moved higher up in the object tree */
 	{
-		struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
+		struct _IBEXWord *ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
 
-		block_cache->words = ibex_create_word_index(block_cache, &root->words, &root->names);
+		block_cache->words = ibex_create_word_index_mem(block_cache, &block_cache->root.words,&block_cache->root.names);
 	}
 
 #ifdef IBEX_STATS
@@ -480,12 +550,10 @@ ibex_block_cache_close(struct _memcache *block_cache)
 void
 ibex_block_free(struct _memcache *block_cache, blockid_t blockid)
 {
-	struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
 	struct _block *block = ibex_block_read(block_cache, blockid);
 
-	block->next = block_number(root->free);
-	root->free = blockid;
-	ibex_block_dirty((struct _block *)root);
+	block->next = block_number(block_cache->root.free);
+	block_cache->root.free = blockid;
 	ibex_block_dirty((struct _block *)block);
 }
 
@@ -502,19 +570,18 @@ ibex_block_free(struct _memcache *block_cache, blockid_t blockid)
 blockid_t
 ibex_block_get(struct _memcache *block_cache)
 {
-	struct _root *root = (struct _root *)ibex_block_read(block_cache, 0);
 	struct _block *block;
 	blockid_t head;
 
-	if (root->free) {
-		head = root->free;
+	if (block_cache->root.free) {
+		head = block_cache->root.free;
 		block = ibex_block_read(block_cache, head);
-		root->free = block_location(block->next);
+		block_cache->root.free = block_location(block->next);
 	} else {
 		/* TODO: check the block will fit first */
 		/* TODO: no need to read this block, can allocate it manually (saves a syscall/read) */
-		head = root->roof;
-		root->roof += BLOCK_SIZE;
+		head = block_cache->root.roof;
+		block_cache->root.roof += BLOCK_SIZE;
 		block = ibex_block_read(block_cache, head);
 	}
 
@@ -524,6 +591,5 @@ ibex_block_get(struct _memcache *block_cache)
 	block->next = 0;
 	block->used = 0;
 	ibex_block_dirty(block);
-	ibex_block_dirty((struct _block *)root);
 	return head;
 }
diff --git a/libibex/block.h b/libibex/block.h
index deb6494231..6de33281b4 100644
--- a/libibex/block.h
+++ b/libibex/block.h
@@ -27,6 +27,9 @@ struct _root {
 	blockid_t names;	/* root of names index */
 
 	char flags;		/* state flags */
+
+	/* makes structure fill up to 1024 bytes */
+	char dummy[1024 - (sizeof(char)*5) - (sizeof(blockid_t)*5)];
 };
 
 #define IBEX_ROOT_SYNCF (1<<0)	/* file is synced */
@@ -74,11 +77,11 @@ struct _memcache {
 	GHashTable *index;	/* blockid->memblock mapping */
 	int fd;			/* file fd */
 
-	int flags;		/* flags (mirror of root->flags) */
-
 #ifdef IBEX_STATS
 	GHashTable *stats;
 #endif
+	struct _root root;	/* root block */
+
 	/* temporary here */
 	struct _IBEXWord *words; /* word index */
 };
diff --git a/libibex/disktail.c b/libibex/disktail.c
index 90562889de..9f5a0dbea2 100644
--- a/libibex/disktail.c
+++ b/libibex/disktail.c
@@ -214,7 +214,6 @@ tail_dump(struct _memcache *blocks, blockid_t tailid)
 static blockid_t
 tail_get(struct _memcache *blocks, int size)
 {
-	struct _root *root = (struct _root *)ibex_block_read(blocks, 0);
 	blockid_t tailid;
 	struct _tailblock *tail;
 	int freeindex;
@@ -225,22 +224,30 @@ tail_get(struct _memcache *blocks, int size)
 
 	/* look for a node with enough space, if we dont find it fairly
 	   quickly, just quit.  needs a better free algorithm i think ... */
-	tailid = root->tail;
+	tailid = blocks->root.tail;
 	while (tailid && count<5) {
 		int space;
 
 		d(printf(" checking tail node %d\n", tailid));
 
 		tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
+
 		if (tail->used == 0) {
 			/* assume its big enough ... */
 			tail->used = 1;
 			tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
 			d(printf("allocated %d (%d), used %d\n", tailid, tailid, tail->used));
 			ibex_block_dirty((struct _block *)tail);
+
+			g_assert(&tail->tb_offset[tail->used-1]
+				 < &tail->tb_data[tail->tb_offset[tail->used-1]]);
+
 			return tailid;
 		}
 
+		g_assert(&tail->tb_offset[tail->used-1]
+			 < &tail->tb_data[tail->tb_offset[tail->used-1]]);
+
 		/* see if we have a free slot first */
 		freeindex = -1;
 		end = &tail->tb_data[sizeof(tail->tb_data)/sizeof(tail->tb_data[0])];
@@ -280,16 +287,18 @@ tail_get(struct _memcache *blocks, int size)
 	}
 
 	d(printf("allocating new data node for tail data\n"));
-	/* didn't find one, setup a new one */
-	root = (struct _root *)ibex_block_read(blocks, 0);
 	tailid = ibex_block_get(blocks);
 	tail = (struct _tailblock *)ibex_block_read(blocks, tailid);
-	tail->next = block_number(root->tail);
-	root->tail = tailid;
+	tail->next = block_number(blocks->root.tail);
+	blocks->root.tail = tailid;
 	tail->used = 1;
 	tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size;
 	ibex_block_dirty((struct _block *)tail);
 	d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, 0), tail->used));
+
+	g_assert(&tail->tb_offset[tail->used-1]
+		 < &tail->tb_data[tail->tb_offset[tail->used-1]]);
+
 	return TAIL_KEY(tailid, 0);
 }
 
@@ -578,15 +587,13 @@ disk_remove(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, na
 				start->used--;
 				block->bl_data[i] = start->bl_data[start->used];
 				if (start->used == 0) {
-					struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0);
 					blockid_t new;
 
 					d(printf("dropping block %d, new head = %d\n", head, start->next));
 					new = block_location(start->next);
-					start->next = block_number(root->free);
-					root->free = head;
+					start->next = block_number(store->blocks->root.free);
+					store->blocks->root.free = head;
 					head = new;
-					ibex_block_dirty((struct _block *)root);
 				}
 				ibex_block_dirty(block);
 				ibex_block_dirty(start);
diff --git a/libibex/dumpindex.c b/libibex/dumpindex.c
index 92f4b08845..cf4f02cc8a 100644
--- a/libibex/dumpindex.c
+++ b/libibex/dumpindex.c
@@ -9,6 +9,30 @@
 
 extern void ibex_hash_dump(struct _IBEXIndex *index);
 
+static void
+index_iterate(struct _IBEXIndex *index)
+{
+	struct _IBEXCursor *idc;
+	int len;
+	char *key;
+	int total = 0, totallen = 0;
+
+	idc = index->klass->get_cursor(index);
+	key = idc->klass->next_key(idc, &len);
+	while (len) {
+		total++;
+		totallen += len;
+		printf("%s\n", key);
+		g_free(key);
+		key = idc->klass->next_key(idc, &len);
+	}
+	g_free(key);
+
+	idc->klass->close(idc);
+
+	printf("Iterate Totals: %d items, total bytes %d\n", total, totallen);
+}
+
 int main(int argc, char **argv)
 {
 	ibex *ib;
@@ -26,6 +50,9 @@ int main(int argc, char **argv)
 	ibex_hash_dump(ib->words->wordindex);
 	ibex_hash_dump(ib->words->nameindex);
 
+	index_iterate(ib->words->wordindex);
+	index_iterate(ib->words->nameindex);
+
 	ibex_close(ib);
 
 	return 0;
diff --git a/libibex/hash.c b/libibex/hash.c
index 9395e1e00b..0581d22fcb 100644
--- a/libibex/hash.c
+++ b/libibex/hash.c
@@ -42,6 +42,14 @@
 
 typedef guint32 hashid_t;
 
+struct _HASHCursor {
+	struct _IBEXCursor cursor;
+
+	hashid_t block;
+	unsigned int index;
+	unsigned int size;
+};
+
 static struct _IBEXIndex *hash_create(struct _memcache *bc, int size);
 static struct _IBEXIndex *hash_open(struct _memcache *bc, blockid_t root);
 static int hash_sync(struct _IBEXIndex *index);
@@ -53,6 +61,12 @@ static hashid_t hash_insert(struct _IBEXIndex *index, const char *key, int keyle
 static char *hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len);
 static void hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail);
 static blockid_t hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail);
+static struct _IBEXCursor *hash_get_cursor(struct _IBEXIndex *index);
+
+static struct _IBEXCursor *hash_cursor_create(struct _IBEXIndex *);
+static void hash_cursor_close(struct _IBEXCursor *);
+static guint32 hash_cursor_next(struct _IBEXCursor *);
+static char *hash_cursor_next_key(struct _IBEXCursor *, int *keylenptr);
 
 struct _IBEXIndexClass ibex_hash_class = {
 	hash_create, hash_open,
@@ -63,6 +77,13 @@ struct _IBEXIndexClass ibex_hash_class = {
 	hash_get_key,
 	hash_set_data_block,
 	hash_get_data_block,
+	hash_get_cursor,
+};
+
+struct _IBEXCursorClass ibex_hash_cursor_class = {
+	hash_cursor_close,
+	hash_cursor_next,
+	hash_cursor_next_key
 };
 
 /* the reason we have the tail here is that otherwise we need to
@@ -197,6 +218,12 @@ static int hash_close(struct _IBEXIndex *index)
 	return 0;
 }
 
+/* get an iterator class */
+static struct _IBEXCursor *hash_get_cursor(struct _IBEXIndex *index)
+{
+	return hash_cursor_create(index);
+}
+
 /* convert a hashbucket id into a name */
 static char *
 hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len)
@@ -620,6 +647,74 @@ hash_insert(struct _IBEXIndex *index, const char *key, int keylen)
 	return HASH_KEY(keybucket, 0);
 }
 
+/* hash cursor functions */
+static struct _IBEXCursor *
+hash_cursor_create(struct _IBEXIndex *idx)
+{
+	struct _HASHCursor *idc;
+	struct _hashroot *hashroot;
+
+	idc = g_malloc(sizeof(*idc));
+	idc->cursor.klass = &ibex_hash_cursor_class;
+	idc->cursor.index = idx;
+	idc->block = 0;
+	idc->index = 0;
+
+	hashroot = (struct _hashroot *)ibex_block_read(idx->blocks, idx->root);
+	idc->size = hashroot->size;
+
+	return &idc->cursor;
+}
+
+static void
+hash_cursor_close(struct _IBEXCursor *idc)
+{
+	g_free(idc);
+}
+
+static guint32
+hash_cursor_next(struct _IBEXCursor *idc)
+{
+	struct _HASHCursor *hc = (struct _HASHCursor *)idc;
+	struct _hashroot *hashroot;
+	struct _hashblock *bucket;
+	struct _hashtableblock *table;
+
+	/* get the next bucket chain */
+	if (hc->block != 0) {
+		int ind;
+
+		bucket = (struct _hashblock *)ibex_block_read(idc->index->blocks, HASH_BLOCK(hc->block));
+		ind = HASH_INDEX(hc->block);
+
+		g_assert(ind < bucket->used);
+
+		hc->block = bucket->hb_keys[ind].next;
+	}
+
+	if (hc->block == 0) {
+		hashroot = (struct _hashroot *)ibex_block_read(idc->index->blocks, idc->index->root);
+		while (hc->block == 0 && hc->index < hc->size) {
+			table = (struct _hashtableblock *)
+				ibex_block_read(idc->index->blocks,
+						hashroot->table[hc->index / (BLOCK_SIZE/sizeof(blockid_t))]);
+			hc->block = table->buckets[hc->index % (BLOCK_SIZE/sizeof(blockid_t))];
+
+			hc->index++;
+		}
+	}
+
+	return hc->block;
+}
+
+static char *
+hash_cursor_next_key(struct _IBEXCursor *idc, int *keylenptr)
+{
+	/* TODO: this could be made slightly mroe efficient going to the structs direct.
+	   but i'm lazy today */
+	return idc->index->klass->get_key(idc->index, idc->klass->next(idc), keylenptr);
+}
+
 /* debug */
 void ibex_hash_dump(struct _IBEXIndex *index);
 static void ibex_hash_dump_rec(struct _IBEXIndex *index, int *words, int *wordslen);
diff --git a/libibex/ibex_block.c b/libibex/ibex_block.c
index 8a042cd2cf..7540d48789 100644
--- a/libibex/ibex_block.c
+++ b/libibex/ibex_block.c
@@ -217,6 +217,10 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unrea
 	}
 done:
 	d(printf("name %s count %d size %d\n", name, wordlist->len, len));
+	if (!ib->predone) {
+		ib->words->klass->index_pre(ib->words);
+		ib->predone = TRUE;
+	}
 	ib->words->klass->add_list(ib->words, name, wordlist);
 	ret = 0;
 error:
@@ -249,6 +253,10 @@ ibex *ibex_open (char *file, int flags, int mode)
 int ibex_save (ibex *ib)
 {
 	printf("syncing database\n");
+	if (ib->predone) {
+		ib->words->klass->index_post(ib->words);
+		ib->predone = FALSE;
+	}
 	ib->words->klass->sync(ib->words);
 	/* FIXME: some return */
 	ibex_block_cache_sync(ib->blocks);
@@ -261,6 +269,11 @@ int ibex_close (ibex *ib)
 
 	printf("closing database\n");
 
+	if (ib->predone) {
+		ib->words->klass->index_post(ib->words);
+		ib->predone = FALSE;
+	}
+
 	ib->words->klass->close(ib->words);
 	ibex_block_cache_close(ib->blocks);
 	g_free(ib);
diff --git a/libibex/ibex_internal.h b/libibex/ibex_internal.h
index bd5d1cbea4..db772e4062 100644
--- a/libibex/ibex_internal.h
+++ b/libibex/ibex_internal.h
@@ -24,10 +24,11 @@
 #include "block.h"
 #include "wordindex.h"
 
-#define IBEX_VERSION "ibx4"
+#define IBEX_VERSION "ibx5"
 
 struct ibex {
 	char *path;
 	struct _memcache *blocks;
 	struct _IBEXWord *words;
+	int predone;
 };
diff --git a/libibex/index.h b/libibex/index.h
index c3c83c1bcf..0cef7948b3 100644
--- a/libibex/index.h
+++ b/libibex/index.h
@@ -27,6 +27,18 @@
 
 #define INDEX_STAT
 
+struct _IBEXCursor {
+	struct _IBEXCursorClass *klass;
+	struct _IBEXIndex *index;
+};
+
+struct _IBEXCursorClass {
+	void (*close)(struct _IBEXCursor *);
+
+	guint32 (*next)(struct _IBEXCursor *);
+	char *(*next_key)(struct _IBEXCursor *, int *keylenptr);
+};
+
 struct _IBEXIndex {
 	struct _IBEXIndexClass *klass;
 	struct _memcache *blocks;
@@ -62,6 +74,9 @@ struct _IBEXIndexClass {
 
 	/* get the key contents based on the keyid */
 	blockid_t (*get_data)(struct _IBEXIndex *, guint32 keyid, blockid_t *tail);
+
+	/* get a cursor for iterating over all contents */
+	struct _IBEXCursor *(*get_cursor)(struct _IBEXIndex *);
 };
 
 /* a storage class, stores lists of lists of id's */
diff --git a/libibex/wordindex.c b/libibex/wordindex.c
index 62e8859e15..43a91f4342 100644
--- a/libibex/wordindex.c
+++ b/libibex/wordindex.c
@@ -82,9 +82,12 @@ static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/*
 static int word_sync(struct _IBEXWord *idx);
 static int word_flush(struct _IBEXWord *idx);
 static int word_close(struct _IBEXWord *idx);
+static void word_index_pre(struct _IBEXWord *idx);
+static void word_index_post(struct _IBEXWord *idx);
 
 struct _IBEXWordClass ibex_word_index_class = {
 	word_sync, word_flush, word_close,
+	word_index_pre, word_index_post,
 	unindex_name, contains_name,
 	find, find_name,
 	add, add_list
@@ -138,6 +141,14 @@ cache_sanity(struct _wordcache *head)
 }
 #endif
 
+static void word_index_pre(struct _IBEXWord *idx)
+{
+}
+
+static void word_index_post(struct _IBEXWord *idx)
+{
+}
+
 /* unindex all entries for name */
 static void unindex_name(struct _IBEXWord *idx, const char *name)
 {
diff --git a/libibex/wordindex.h b/libibex/wordindex.h
index d222ff5a87..353f4dddc6 100644
--- a/libibex/wordindex.h
+++ b/libibex/wordindex.h
@@ -38,6 +38,9 @@ struct _IBEXWordClass {
 	int (*flush)(struct _IBEXWord *);
 	int (*close)(struct _IBEXWord *);
 
+	void (*index_pre)(struct _IBEXWord *); /* get ready for doing a lot of indexing.  may be a nop */
+	void (*index_post)(struct _IBEXWord *);
+
 	void (*unindex_name)(struct _IBEXWord *, const char *name);		/* unindex all entries for name */
 	gboolean (*contains_name)(struct _IBEXWord *, const char *name);	/* index contains data for name */
 	GPtrArray *(*find)(struct _IBEXWord *, const char *word);		/* returns all matches for word */
@@ -57,9 +60,13 @@ struct _IBEXWord {
 	GHashTable *wordcache;	/* word->struct _wordcache mapping */
 	struct _list wordnodes;	/* LRU list of wordcache structures */
 	int wordcount;		/* how much space used in cache */
+	int precount;
 };
 
 
 struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
 
+/* alternate implemenation */
+struct _IBEXWord *ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot);
+
 #endif /* !_WORDINDEX_H */
diff --git a/libibex/wordindexmem.c b/libibex/wordindexmem.c
new file mode 100644
index 0000000000..f0cc7336f8
--- /dev/null
+++ b/libibex/wordindexmem.c
@@ -0,0 +1,721 @@
+/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*-
+ *
+ * Copyright (C) 2000 Helix Code, Inc.
+ *
+ * Authors: Michael Zucchi <notzed@helixcode.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Library General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ * 
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Library General Public License for more details.
+ *
+ * You should have received a copy of the GNU Library General Public
+ * License along with the Gnome Library; see the file COPYING.LIB.  If not,
+ * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+/* this is the same as wordindex.c, but it doesn't have an LRU cache
+   for word names.  it has a lookup tab le that is only loaded if
+   index-pre is called, otherwise it always hits disk */
+
+/* code to manage a word index */
+/* includes a cache for word index writes,
+   but not for name index writes (currently), or any reads.
+
+Note the word cache is only needed during indexing of lots
+of words, and could then be discarded (:flush()).
+
+*/
+
+#include <stdio.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+#include <glib.h>
+
+#include "block.h"
+#include "index.h"
+#include "wordindex.h"
+
+/*#define MALLOC_CHECK*/
+
+#ifdef MALLOC_CHECK
+#include <mcheck.h>
+#endif
+
+#define d(x)
+
+/*#define WORDCACHE_SIZE (256)*/
+#define WORDCACHE_SIZE (4096)
+
+extern struct _IBEXStoreClass ibex_diskarray_class;
+extern struct _IBEXIndexClass ibex_hash_class;
+
+/* need 2 types of hash key?
+   one that just stores the wordid / wordblock
+   and one that stores the filecount/files?
+*/
+
+
+#define CACHE_FILE_COUNT (62)
+
+struct _wordcache {
+	nameid_t wordid;	/* disk wordid */
+	blockid_t wordblock;	/* head of disk list */
+	blockid_t wordtail;	/* and the tail data */
+	short filecount;	/* how many valid items in files[] */
+	short filealloc;	/* how much allocated space in files[] */
+	union {
+		nameid_t *files;	/* memory cache of files */
+		nameid_t file0;	/* if filecount == 1 && filealloc == 0, store directly */
+	} file;
+	char word[1];		/* actual word follows */
+};
+
+static void unindex_name(struct _IBEXWord *, const char *name); 	/* unindex all entries for name */
+static gboolean contains_name(struct _IBEXWord *, const char *name);	/* index contains data for name */
+static GPtrArray *find(struct _IBEXWord *, const char *word);		/* returns all matches for word */
+static gboolean find_name(struct _IBEXWord *, const char *name, const char *word);	/* find if name contains word */
+static void add(struct _IBEXWord *, const char *name, const char *word);	/* adds a single word to name (slow) */
+static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */
+static int word_sync(struct _IBEXWord *idx);
+static int word_flush(struct _IBEXWord *idx);
+static int word_close(struct _IBEXWord *idx);
+static void word_index_pre(struct _IBEXWord *idx);
+static void word_index_post(struct _IBEXWord *idx);
+
+static void sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache);
+
+struct _IBEXWordClass ibex_word_index_mem_class = {
+	word_sync, word_flush, word_close,
+	word_index_pre, word_index_post,
+	unindex_name, contains_name,
+	find, find_name,
+	add, add_list
+};
+
+#ifdef MALLOC_CHECK
+static void
+checkmem(void *p)
+{
+	if (p) {
+		int status = mprobe(p);
+
+		switch (status) {
+		case MCHECK_HEAD:
+			printf("Memory underrun at %p\n", p);
+			abort();
+		case MCHECK_TAIL:
+			printf("Memory overrun at %p\n", p);
+			abort();
+		case MCHECK_FREE:
+			printf("Double free %p\n", p);
+			abort();
+		}
+	}
+}
+#endif
+
+/* this interface isn't the best, but it'll do for now */
+struct _IBEXWord *
+ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot)
+{
+	struct _IBEXWord *idx;
+
+	idx = g_malloc0(sizeof(*idx));
+	idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal);
+	ibex_list_new(&idx->wordnodes);
+	idx->wordcount = 0;
+	idx->precount = 0;
+	idx->klass = &ibex_word_index_mem_class;
+
+	/* we use the same block array storage for both indexes at the moment */
+	idx->wordstore = ibex_diskarray_class.create(bc);
+	idx->namestore = idx->wordstore;
+
+	/* but not the same indexes! */
+	if (*wordroot) {
+		printf("opening wordindex root = %d\n", *wordroot);
+		idx->wordindex = ibex_hash_class.open(bc, *wordroot);
+	} else {
+		idx->wordindex = ibex_hash_class.create(bc, 2048);
+		*wordroot = idx->wordindex->root;
+		printf("creating wordindex root = %d\n", *wordroot);
+	}
+	if (*nameroot) {
+		printf("opening nameindex root = %d\n", *nameroot);
+		idx->nameindex = ibex_hash_class.open(bc, *nameroot);
+	} else {
+		idx->nameindex = ibex_hash_class.create(bc, 2048);
+		*nameroot = idx->nameindex->root;
+		printf("creating nameindex root = %d\n", *nameroot);
+	}
+	return idx;
+}
+
+static void
+node_sanity(char *key, struct _wordcache *node, void *data)
+{
+	g_assert(node->filecount <= node->filealloc || (node->filecount == 1 && node->filealloc == 0));
+	g_assert(strlen(node->word) != 0);
+#ifdef MALLOC_CHECK
+	checkmem(node);
+	if (node->filealloc)
+		checkmem(node->file.files);
+#endif
+}
+
+static void
+cache_sanity(struct _IBEXWord *idx)
+{
+#ifdef MALLOC_CHECK
+	checkmem(idx);
+#endif
+	g_hash_table_foreach(idx->wordcache, (GHFunc)node_sanity, idx);
+}
+
+static void word_index_pre(struct _IBEXWord *idx)
+{
+	struct _IBEXCursor *idc;
+	struct _wordcache *cache;
+	nameid_t wordid;
+	char *key;
+	int len;
+
+	idx->precount ++;
+	if (idx->precount > 1)
+		return;
+
+	/* want to load all words into the cache lookup table */
+	printf("pre-loading all word info into memory\n");
+	idc = idx->wordindex->klass->get_cursor(idx->wordindex);
+	while ( (wordid = idc->klass->next(idc)) ) {
+		key = idc->index->klass->get_key(idc->index, wordid, &len);
+		/*d(printf("Adding word %s\n", key));*/
+		cache = g_malloc0(sizeof(*cache) + strlen(key));
+		strcpy(cache->word, key);
+		g_free(key);
+		cache->wordid = wordid;
+		cache->wordblock = idc->index->klass->get_data(idc->index, wordid, &cache->wordtail);
+		cache->filecount = 0;
+		cache->filealloc = 0;
+		g_hash_table_insert(idx->wordcache, cache->word, cache);
+		idx->wordcount++;
+	}
+
+#ifdef MALLOC_CHECK
+	cache_sanity(idx);
+#endif
+
+	idc->klass->close(idc);
+
+	printf("done\n");
+}
+
+static gboolean
+sync_free_value(void *key, void *value, void *data)
+{
+	struct _wordcache *cache = (struct _wordcache *)value;
+	struct _IBEXWord *idx = (struct _IBEXWord *)data;
+
+	sync_cache_entry(idx, cache);
+	if (cache->filealloc)
+		g_free(cache->file.files);
+	g_free(cache);
+
+	return TRUE;
+}
+
+static void
+sync_value(void *key, void *value, void *data)
+{
+	struct _wordcache *cache = (struct _wordcache *)value;
+	struct _IBEXWord *idx = (struct _IBEXWord *)data;
+
+	sync_cache_entry(idx, cache);
+}
+
+static void word_index_post(struct _IBEXWord *idx)
+{
+	idx->precount--;
+	if (idx->precount > 0)
+		return;
+	idx->precount = 0;
+
+#ifdef MALLOC_CHECK
+	cache_sanity(idx);
+#endif
+
+	g_hash_table_foreach_remove(idx->wordcache, sync_free_value, idx);
+	idx->wordcount = 0;
+}
+
+/* unindex all entries for name */
+static void unindex_name(struct _IBEXWord *idx, const char *name)
+{
+	GArray *words;
+	int i;
+	nameid_t nameid, wordid;
+	blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail;
+	char *word;
+	struct _wordcache *cache;
+
+	d(printf("unindexing %s\n", name));
+
+	/* lookup the hash key */
+	nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
+	/* get the block for this key */
+	nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
+	/* and the data for this block */
+	words = idx->namestore->klass->get(idx->namestore, nameblock, nametail);
+	/* walk it ... */
+	for (i=0;i<words->len;i++) {
+		/* get the word */
+		wordid = g_array_index(words, nameid_t, i);
+		d(printf(" word %d\n", wordid));
+		/* get the data block */
+		wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
+		/* clear this name from it */
+		newblock = wordblock;
+		newtail = wordtail;
+		idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid);
+		if (newblock != wordblock || newtail != wordtail)
+			idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail);
+
+		/* now check the cache as well */
+		word = idx->nameindex->klass->get_key(idx->wordindex, wordid, NULL);
+		if (word) {
+			cache = g_hash_table_lookup(idx->wordcache, word);
+			if (cache) {
+				/* its there, update our head/tail pointers */
+				cache->wordblock = newblock;
+				cache->wordtail = newtail;
+
+				/* now check that we have a data entry in it */
+				if (cache->filealloc == 0 && cache->filecount == 1) {
+					if (cache->file.file0 == nameid) {
+						cache->filecount = 0;
+					}
+				} else {
+					int j;
+
+					for (j=0;j<cache->filecount;j++) {
+						if (cache->file.files[j] == nameid) {
+							cache->file.files[j] = cache->file.files[cache->filecount-1];
+							cache->filecount--;
+							break;
+						}
+					}
+				}
+			}
+			g_free(word);
+		}
+	}
+	g_array_free(words, TRUE);
+
+	/* and remove name data and itself */
+	idx->namestore->klass->free(idx->namestore, nameblock, nametail);
+	idx->nameindex->klass->remove(idx->nameindex, name, strlen(name));
+}
+
+/* index contains (any) data for name */
+static gboolean contains_name(struct _IBEXWord *idx, const char *name)
+{
+	return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0;
+}
+
+/* returns all matches for word */
+static GPtrArray *find(struct _IBEXWord *idx, const char *word)
+{
+	nameid_t wordid, nameid;
+	GPtrArray *res;
+	GArray *names;
+	int i;
+	char *new;
+	struct _wordcache *cache;
+	blockid_t wordblock, wordtail;
+
+	res = g_ptr_array_new();
+
+	cache = g_hash_table_lookup(idx->wordcache, word);
+	if (cache) {
+#if 0
+		/* freshen cache entry if we touch it */
+		ibex_list_remove((struct _listnode *)cache);
+		ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
+#endif
+		wordid = cache->wordid;
+		wordblock = cache->wordblock;
+		wordtail = cache->wordtail;
+	} else {
+		/* lookup the hash key */
+		wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
+		/* get the block for this key */
+		wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail);
+	}
+	/* and the data for this block */
+	names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail);
+	/* .. including any memory-only data */
+	if (cache) {
+		if (cache->filealloc == 0 && cache->filecount == 1)
+			g_array_append_val(names, cache->file.file0);
+		else
+			g_array_append_vals(names, cache->file.files, cache->filecount);
+	}
+
+	/* walk it ... converting id's back to strings */
+	g_ptr_array_set_size(res, names->len);
+	for (i=0;i<names->len;i++) {
+		nameid = g_array_index(names, nameid_t, i);
+		new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL);
+		res->pdata[i] = new;
+	}
+	g_array_free(names, TRUE);
+	return res;
+}
+
+/* find if name contains word */
+static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word)
+{
+	nameid_t wordid, nameid;
+	blockid_t nameblock, nametail;
+	struct _wordcache *cache;
+	int i;
+
+	/* lookup the hash key for the name */
+	nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name));
+	/* get the block for this name */
+	nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail);
+
+	/* check if there is an in-memory cache for this word, check its file there first */
+	cache = g_hash_table_lookup(idx->wordcache, word);
+	if (cache) {
+#if 0
+		/* freshen cache entry if we touch it */
+		ibex_list_remove((struct _listnode *)cache);
+		ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache);
+#endif
+		if (cache->filecount == 1 && cache->filealloc == 0) {
+			if (cache->file.file0 == nameid)
+				return TRUE;
+		} else {
+			for (i=0;i<cache->filecount;i++) {
+				if (cache->file.files[i] == nameid)
+					return TRUE;
+			}
+		}
+		/* not there?  well we can use the wordid anyway */
+		wordid = cache->wordid;
+	} else {
+		/* lookup the hash key for word */
+		wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word));
+	}
+
+	/* see if wordid is in nameblock */
+	return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid);
+}
+
+/* cache helper functions */
+/* flush a cache entry to disk, and empty it out */
+static void
+sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache)
+{
+	GArray array; /* just use this as a header */
+	blockid_t oldblock, oldtail;
+	
+	if (cache->filecount == 0)
+		return;
+
+	d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount));
+	if (cache->filecount == 1 && cache->filealloc == 0)
+		array.data = (char *)&cache->file.file0;
+	else
+		array.data = (char *)cache->file.files;
+	array.len = cache->filecount;
+	oldblock = cache->wordblock;
+	oldtail = cache->wordtail;
+	idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array);
+	if (oldblock != cache->wordblock || oldtail != cache->wordtail) {
+		idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail);
+	}
+	cache->filecount = 0;
+}
+
+/* create a new key in an index, returning its id and head block */
+static void
+add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail)
+{
+	/* initialise cache entry - id of word entry and head block */
+	*wordid = wordindex->klass->find(wordindex, word, strlen(word));
+
+	if (*wordid == 0) {
+		*wordid = wordindex->klass->insert(wordindex, word, strlen(word));
+		*wordblock = 0;
+		*wordtail = 0;
+	} else {
+		*wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail);
+	}
+}
+
+/* create a new key in a cached index (only word cache so far), flushing old keys
+   if too much space is being used */
+static struct _wordcache *
+add_index_cache(struct _IBEXWord *idx, const char *word)
+{
+	struct _wordcache *cache;
+
+	cache = g_hash_table_lookup(idx->wordcache, word);
+	if (cache == 0) {
+		/*d(printf("adding %s to cache\n", word));*/
+
+#if 0
+		/* see if we have to flush off the last entry */
+		if (idx->wordcount >= WORDCACHE_SIZE) {
+			struct _wordcache *mincache;
+			int min, count=0;
+			/* remove last entry, and flush it */
+			cache = (struct _wordcache *)idx->wordnodes.tailpred;
+			mincache = cache;
+			min = mincache->filecount;
+
+			d(printf("flushing word from cache %s\n", cache->word));
+			/* instead of just using the last entry, we try and find an entry with
+			   with only 1 item (failing that, the smallest in the range we look at) */
+			/* this could probably benefit greatly from a more sophisticated aging algorithm */
+			while (cache->next && count < 100) {
+				if (cache->filecount == 1) {
+					mincache = cache;
+					break;
+				}
+				if (cache->filecount > 0 && cache->filecount < min) {
+					mincache = cache;
+					min = cache->filecount;
+				}
+				cache = cache->next;
+				count++;
+			}
+			ibex_list_remove((struct _listnode *)mincache);
+			g_hash_table_remove(idx->wordcache, mincache->word);
+			sync_cache_entry(idx, mincache);
+			if (mincache->filealloc)
+				g_free(mincache->file.files);
+			g_free(mincache);
+			idx->wordcount--;
+		}
+#endif
+		cache = g_malloc0(sizeof(*cache)+strlen(word));
+		/* if we're in an index state, we can assume we dont have it if we dont have it in memory */
+		if (idx->precount == 0) {
+			/* initialise cache entry - id of word entry and head block */
+			add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail);
+		} else {
+			cache->wordid = idx->wordindex->klass->insert(idx->wordindex, word, strlen(word));
+		}
+		/* other fields */
+		strcpy(cache->word, word);
+		cache->filecount = 0;
+		g_hash_table_insert(idx->wordcache, cache->word, cache);
+#if 0
+		ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
+#endif
+		idx->wordcount++;
+	} else {
+		/*d(printf("already have %s in cache\n", word));*/
+#if 0
+		/* move cache bucket ot the head of the cache list */
+		ibex_list_remove((struct _listnode *)cache);
+		ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache);
+#endif
+	}
+	return cache;
+}
+
+/* adds a single word to name (slow) */
+static void add(struct _IBEXWord *idx, const char *name, const char *word)
+{
+	nameid_t nameid;
+	blockid_t nameblock, newblock, nametail, newtail;
+	struct _wordcache *cache;
+
+	g_error("Dont use wordindex::add()");
+	abort();
+
+	cache = add_index_cache(idx, word);
+
+	/* get the nameid and block start for this name */
+	add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+
+	/* check for repeats of the last name - dont add anything */
+	if (cache->filecount == 1 && cache->filealloc == 0) {
+		if (cache->file.file0 == nameid)
+			return;
+	} else {
+		if (cache->file.files[cache->filecount] == nameid)
+			return;
+	}
+
+	/* see if we are setting the first, drop it in the union */
+	if (cache->filecount == 0 && cache->filealloc == 0) {
+		cache->file.file0 = nameid;
+	} else if (cache->filecount == 1 && cache->filealloc == 0) {
+		nameid_t saveid;
+		/* we need to allocate space for words */
+		saveid = cache->file.file0;
+		cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
+		/* this could possibly grow as needed, but i wont for now */
+		cache->filealloc = CACHE_FILE_COUNT;
+		cache->file.files[0] = saveid;
+		cache->file.files[1] = nameid;
+	} else {
+		cache->file.files[cache->filecount] = nameid;
+	}
+
+	cache->filecount++;
+
+	/* if we are full, force a flush now */
+	if (cache->filealloc && cache->filecount >= cache->filealloc) {
+		sync_cache_entry(idx, cache);
+	}
+
+	newtail = nametail;
+	newblock = nameblock;
+	idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid);
+	if (newblock != nameblock || newtail != nametail) {
+		idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
+	}
+}
+
+/* adds a bunch of words to a given name */
+static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
+{
+	int i;
+	GArray *data = g_array_new(0, 0, sizeof(nameid_t));
+	blockid_t nameblock, newblock, nametail, newtail;
+	nameid_t nameid;
+	struct _wordcache *cache;
+
+	d(printf("Adding words to name %s\n", name));
+
+	d(cache_sanity(idx));
+
+	/* get the nameid and block start for this name */
+	add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+
+	d(cache_sanity(idx));
+
+	for (i=0;i<words->len;i++) {
+		char *word = words->pdata[i];
+
+		cache = add_index_cache(idx, word);
+
+		/*d(cache_sanity(idx));*/
+
+		/* check for duplicates; doesn't catch duplicates over an overflow boundary.  Watch me care. */
+		if (cache->filecount == 0
+		    /* the 1 item case */
+		    || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid)
+		    /* the normal case */
+		    || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) {
+
+			/* see if we are setting the first, drop it in the union */
+			if (cache->filecount == 0 && cache->filealloc == 0) {
+				cache->file.file0 = nameid;
+			} else if (cache->filecount == 1 && cache->filealloc == 0) {
+				nameid_t saveid;
+				/* we need to allocate space for words */
+				saveid = cache->file.file0;
+				cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT);
+				/* this could possibly grow as needed, but i wont for now */
+				cache->filealloc = CACHE_FILE_COUNT;
+				cache->file.files[0] = saveid;
+				cache->file.files[1] = nameid;
+			} else {
+				cache->file.files[cache->filecount] = nameid;
+			}
+
+			cache->filecount++;
+
+			/* if we are full, force a flush now */
+			if (cache->filealloc && cache->filecount >= cache->filealloc) {
+				sync_cache_entry(idx, cache);
+			}
+
+			/*d(cache_sanity(idx));*/
+
+			/* and append this wordid for this name in memory */
+			g_array_append_val(data, cache->wordid);
+		}
+
+		/*d(cache_sanity(idx));*/
+	}
+
+	d(cache_sanity(idx));
+
+	/* and append these word id's in one go */
+	newblock = nameblock;
+	newtail = nametail;
+	idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data);
+	if (newblock != nameblock || newtail != nametail) {
+		idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail);
+	}
+
+	d(cache_sanity(idx));
+
+	g_array_free(data, TRUE);
+}
+
+/* sync any in-memory data to disk */
+static int
+word_sync(struct _IBEXWord *idx)
+{
+	/* we just flush also, save memory */
+	word_flush(idx);
+
+#if 0
+	struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head;
+
+	while (cache->next) {
+		sync_cache_entry(idx, cache);
+		cache = cache->next;
+	}
+
+	/*ibex_hash_dump(idx->wordindex);*/
+	/*ibex_hash_dump(idx->nameindex);*/
+#endif
+	return 0;
+}
+
+/* sync and flush any in-memory data to disk and free it */
+static int
+word_flush(struct _IBEXWord *idx)
+{
+	d(cache_sanity(idx));
+
+	g_hash_table_foreach_remove(idx->wordcache, sync_free_value, idx);
+	idx->wordcount = 0;
+	return 0;
+}
+
+static int word_close(struct _IBEXWord *idx)
+{
+	idx->klass->flush(idx);
+
+	idx->namestore->klass->close(idx->namestore);
+	idx->nameindex->klass->close(idx->nameindex);
+	/*same as namestore:
+	  idx->wordstore->klass->close(idx->wordstore);*/
+	idx->wordindex->klass->close(idx->wordindex);
+	g_hash_table_destroy(idx->wordcache);
+	g_free(idx);
+
+	return 0;
+}