From 9aae808cd0b4cf8bd35f6c0205e30c79f62632ef Mon Sep 17 00:00:00 2001 From: Not Zed Date: Wed, 25 Oct 2000 13:59:44 +0000 Subject: Bugfixes, performance improvemnts. Should scale up much better than before, and be more bugfree than ever! 2000-10-25 Not Zed * ibex_internal.h (IBEX_VERSION): Bumped to another version. The file format hasn't changed, but earlier bugs may create invalid files. * block.c (ibex_block_read): Use the root data directly. (ibex_block_cache_open): As well. (ibex_block_get): And here too. (ibex_block_cache_sync): Sync the root block directly here. * block.h: Pad root block out to 1024 bytes. Added root block to struct _memcache. * disktail.c (tail_get): Dirty the root block. (tail_get): Fix for changes to root access. (disk_remove): And here too. * wordindexmem.c (sync_cache_entry): Handle the case of not having any files in the list, which can happen now. (word_index_pre): Make sure we set the wordid on the new cache entry. * ibex_block.c (ibex_save): Sigh. Pass the right argument to index_post. * block.c (ibex_block_cache_open): Create a word_index_mem for indexing the words, rather than a word_index. * ibex_block.c (ibex_index_buffer): If we haven't called index_pre yet, do it before indexing anything. (ibex_save): If wehave called index_pre previously, call index_post. (ibex_close): And same for here. * index.h: Added a cursor class, and cursor retrieval function for iterating through an index's keys. * wordindexmem.c (ibex_create_word_index_mem): New word class, similar to wordindex, but meant to be faster for updates. (word_index_pre): Implement. We load all keys into memory. (word_index_post): Implement. We sync and free all keys. (find): Remove lru code, its no longer a cache, but a lookup table. (add_index_cache): Remove lru code here too. (find_name): And here. (word_flush): Flush the hashtable direct. (word_close): Call flush to flush, rather than doing it ourselves. (add_index_cache): If we are in an index state, we can assume a cache miss == a new word. (word_index_post): Maintain whether or not we are in an index state, and the depth of the state. (word_index_pre): Likewise. Dont reread the index if we have already. (cache_sanity): Fixed for struct changes. * wordindex.h (IBEXWordClass): Added functions to prepare/cleanup for lots of indexing. i.e. can be used to optimise indexing speed at the cost of extra memory usage during the indexing process. * hash.c (hash_cursor_create): Create a new cursor for iterating through a hashtable. (hash_cursor_close): 'close' the cursor. It is upto the application to close any cursors it creates. (hash_cursor_next): Goto the next key id. (hash_cursor_next_key): Goto the next key, reutrn the key. (hash_get_cursor): Return a cursor object. * wordindex.c (word_index_post): (word_index_pre): Added (empty) callbacks for pre/post functions. svn path=/trunk/; revision=6165 --- libibex/ChangeLog | 69 +++++ libibex/Makefile.am | 2 +- libibex/block.c | 210 +++++++++----- libibex/block.h | 7 +- libibex/disktail.c | 27 +- libibex/dumpindex.c | 27 ++ libibex/hash.c | 95 +++++++ libibex/ibex_block.c | 13 + libibex/ibex_internal.h | 3 +- libibex/index.h | 15 + libibex/wordindex.c | 11 + libibex/wordindex.h | 7 + libibex/wordindexmem.c | 721 ++++++++++++++++++++++++++++++++++++++++++++++++ 13 files changed, 1121 insertions(+), 86 deletions(-) create mode 100644 libibex/wordindexmem.c diff --git a/libibex/ChangeLog b/libibex/ChangeLog index 53187c03c6..5318e935c4 100644 --- a/libibex/ChangeLog +++ b/libibex/ChangeLog @@ -1,14 +1,83 @@ +2000-10-25 Not Zed + + * ibex_internal.h (IBEX_VERSION): Bumped to another version. The + file format hasn't changed, but earlier bugs may create invalid + files. + + * block.c (ibex_block_read): Use the root data directly. + (ibex_block_cache_open): As well. + (ibex_block_get): And here too. + (ibex_block_cache_sync): Sync the root block directly here. + + * block.h: Pad root block out to 1024 bytes. + Added root block to struct _memcache. + + * disktail.c (tail_get): Dirty the root block. + (tail_get): Fix for changes to root access. + (disk_remove): And here too. + + * wordindexmem.c (sync_cache_entry): Handle the case of not having + any files in the list, which can happen now. + (word_index_pre): Make sure we set the wordid on the new cache + entry. + + * ibex_block.c (ibex_save): Sigh. Pass the right argument to + index_post. + 2000-10-24 JP Rosevear * .cvsignore: Shush 2000-10-24 Not Zed + * block.c (ibex_block_cache_open): Create a word_index_mem for + indexing the words, rather than a word_index. + + * ibex_block.c (ibex_index_buffer): If we haven't called index_pre + yet, do it before indexing anything. + (ibex_save): If wehave called index_pre previously, call + index_post. + (ibex_close): And same for here. + + * index.h: Added a cursor class, and cursor retrieval function for + iterating through an index's keys. + + * wordindexmem.c (ibex_create_word_index_mem): New word class, + similar to wordindex, but meant to be faster for updates. + (word_index_pre): Implement. We load all keys into memory. + (word_index_post): Implement. We sync and free all keys. + (find): Remove lru code, its no longer a cache, but a lookup + table. + (add_index_cache): Remove lru code here too. + (find_name): And here. + (word_flush): Flush the hashtable direct. + (word_close): Call flush to flush, rather than doing it ourselves. + (add_index_cache): If we are in an index state, we can assume a + cache miss == a new word. + (word_index_post): Maintain whether or not we are in an index + state, and the depth of the state. + (word_index_pre): Likewise. Dont reread the index if we have + already. + (cache_sanity): Fixed for struct changes. + + * wordindex.h (IBEXWordClass): Added functions to prepare/cleanup + for lots of indexing. i.e. can be used to optimise indexing speed + at the cost of extra memory usage during the indexing process. + * dumpindex.c: Dumps the contents of indexs. * hash.c (ibex_hash_dump_rec): Also print the word count. + (hash_cursor_create): Create a new cursor for iterating through a + hashtable. + (hash_cursor_close): 'close' the cursor. It is upto the + application to close any cursors it creates. + (hash_cursor_next): Goto the next key id. + (hash_cursor_next_key): Goto the next key, reutrn the key. + (hash_get_cursor): Return a cursor object. * wordindex.c (unindex_name): Cross-check the cache as well. + (word_index_post): + (word_index_pre): Added (empty) callbacks for pre/post functions. 2000-10-12 Not Zed diff --git a/libibex/Makefile.am b/libibex/Makefile.am index fdde10aca3..6cc88186d6 100644 --- a/libibex/Makefile.am +++ b/libibex/Makefile.am @@ -3,7 +3,7 @@ noinst_LTLIBRARIES = libibex.la libibex_la_SOURCES = \ - wordindex.c \ + wordindex.c wordindexmem.c \ block.c ibex.h \ hash.c \ disktail.c \ diff --git a/libibex/block.c b/libibex/block.c index 8e10a116cb..52a79d27ae 100644 --- a/libibex/block.c +++ b/libibex/block.c @@ -30,11 +30,18 @@ #include #include #include - +#include +#include #include #include "block.h" +/*#define MALLOC_CHECK*/ + +#ifdef MALLOC_CHECK +#include +#endif + #define d(x) /*#define DEBUG*/ @@ -113,6 +120,27 @@ add_miss(struct _memcache *index, int id) } #endif /* IBEX_STATS */ +#ifdef MALLOC_CHECK +static void +checkmem(void *p) +{ + if (p) { + int status = mprobe(p); + + switch (status) { + case MCHECK_HEAD: + printf("Memory underrun at %p\n", p); + abort(); + case MCHECK_TAIL: + printf("Memory overrun at %p\n", p); + abort(); + case MCHECK_FREE: + printf("Double free %p\n", p); + abort(); + } + } +} +#endif /* simple list routines (for simplified memory management of cache/lists) */ @@ -187,6 +215,29 @@ memblock_addr(struct _block *block) return (struct _memblock *)(((char *)block) - G_STRUCT_OFFSET(struct _memblock, data)); } +/* read/sync the rootblock into the block_cache structure */ +static int +ibex_block_read_root(struct _memcache *block_cache) +{ + lseek(block_cache->fd, 0, SEEK_SET); + if (read(block_cache->fd, &block_cache->root, sizeof(block_cache->root)) != sizeof(block_cache->root)) { + + return -1; + } + return 0; +} + +static int +ibex_block_sync_root(struct _memcache *block_cache) +{ + lseek(block_cache->fd, 0, SEEK_SET); + if (write(block_cache->fd, &block_cache->root, sizeof(block_cache->root)) != sizeof(block_cache->root)) { + return -1; + } + return fsync(block_cache->fd); +} + + /** * ibex_block_dirty: * @block: @@ -224,27 +275,22 @@ sync_block(struct _memcache *block_cache, struct _memblock *memblock) void ibex_block_cache_sync(struct _memcache *block_cache) { - struct _memblock *memblock, *rootblock = 0; + struct _memblock *memblock; memblock = (struct _memblock *)block_cache->nodes.head; while (memblock->next) { - if (memblock->block == 0) { - rootblock = memblock; - } else { - if (memblock->flags & BLOCK_DIRTY) { - sync_block(block_cache, memblock); - } +#ifdef MALLOC_CHECK + checkmem(memblock); +#endif + if (memblock->flags & BLOCK_DIRTY) { + sync_block(block_cache, memblock); } memblock = memblock->next; } - if (rootblock) { - struct _root *root = (struct _root *)&rootblock->data; - root->flags |= IBEX_ROOT_SYNCF; - sync_block(block_cache, rootblock); - } - if (fsync(block_cache->fd) == 0) { - block_cache->flags |= IBEX_ROOT_SYNCF; + block_cache->root.flags |= IBEX_ROOT_SYNCF; + if (ibex_block_sync_root(block_cache) != 0) { + block_cache->root.flags &= ~IBEX_ROOT_SYNCF; } #ifdef IBEX_STATS @@ -253,6 +299,25 @@ ibex_block_cache_sync(struct _memcache *block_cache) } +#ifdef MALLOC_CHECK +static void +check_cache(struct _memcache *block_cache) +{ + struct _memblock *mw, *mn; + + checkmem(block_cache); + checkmem(block_cache->index); + + mw = (struct _memblock *)block_cache->nodes.head; + mn = mw->next; + while (mn) { + checkmem(mw); + mw = mn; + mn = mn->next; + } +} +#endif + /** * ibex_block_cache_flush: * @block_cache: @@ -278,7 +343,6 @@ ibex_block_cache_flush(struct _memcache *block_cache) ibex_list_new(&block_cache->nodes); } - /** * ibex_block_read: * @block_cache: @@ -296,32 +360,39 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid) { struct _memblock *memblock; - { - /* assert blockid 0) { - struct _root *root = (struct _root *)ibex_block_read(block_cache, 0); - g_assert(blockid < root->roof); - } - } +#ifdef MALLOC_CHECK + check_cache(block_cache); +#endif + + /* nothing can read the root block directly */ + g_assert(blockid != 0); + g_assert(blockid < block_cache->root.roof); memblock = g_hash_table_lookup(block_cache->index, (void *)blockid); + +#ifdef MALLOC_CHECK + check_cache(block_cache); +#endif + if (memblock) { d(printf("foudn blockid in cache %d = %p\n", blockid, &memblock->data)); -#if 0 - if (blockid == 0) { - struct _root *root = &memblock->data; - d(printf("superblock state:\n" - " roof = %d\n free = %d\n words = %d\n names = %d\n tail = %d", - root->roof, root->free, root->words, root->names, root->tail)); - - } -#endif + /* 'access' page */ ibex_list_remove((struct _listnode *)memblock); ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock); + +#ifdef MALLOC_CHECK + check_cache(block_cache); +#endif + #ifdef IBEX_STATS add_hit(block_cache, memblock->block); #endif + +#ifdef MALLOC_CHECK + check_cache(block_cache); +#endif + return &memblock->data; } #ifdef IBEX_STATS @@ -338,6 +409,7 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid) lseek(block_cache->fd, blockid, SEEK_SET); memset(&memblock->data, 0, sizeof(memblock->data)); read(block_cache->fd, &memblock->data, sizeof(memblock->data)); + ibex_list_addtail(&block_cache->nodes, (struct _listnode *)memblock); g_hash_table_insert(block_cache->index, (void *)blockid, memblock); if (block_cache->count >= CACHE_SIZE) { @@ -347,18 +419,14 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid) ibex_list_remove((struct _listnode *)old); if (old->flags & BLOCK_DIRTY) { /* are we about to un-sync the file? update root and sync it */ - if (block_cache->flags & IBEX_ROOT_SYNCF) { - /* TODO: put the rootblock in the block_cache struct, not in the cache */ - struct _memblock *rootblock = g_hash_table_lookup(block_cache->index, (void *)0); - struct _root *root = (struct _root *)&rootblock->data; - + if (block_cache->root.flags & IBEX_ROOT_SYNCF) { printf("Unsyncing root block\n"); - g_assert(rootblock != NULL); - root->flags &= ~IBEX_ROOT_SYNCF; - sync_block(block_cache, rootblock); - if (fsync(block_cache->fd) == 0) - block_cache->flags &= ~IBEX_ROOT_SYNCF; + block_cache->root.flags &= ~IBEX_ROOT_SYNCF; + if (ibex_block_sync_root(block_cache) != 0) { + /* what do we do? i dont know! */ + g_warning("Could not sync root block of index: %s", strerror(errno)); + } } sync_block(block_cache, old); } @@ -369,6 +437,9 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid) d(printf(" --- cached blocks : %d\n", block_cache->count)); +#ifdef MALLOC_CHECK + check_cache(block_cache); +#endif return &memblock->data; } @@ -389,7 +460,6 @@ ibex_block_read(struct _memcache *block_cache, blockid_t blockid) struct _memcache * ibex_block_cache_open(const char *name, int flags, int mode) { - struct _root *root; struct _memcache *block_cache = g_malloc0(sizeof(*block_cache)); d(printf("opening ibex file: %s", name)); @@ -406,33 +476,33 @@ ibex_block_cache_open(const char *name, int flags, int mode) return NULL; } - root = (struct _root *)ibex_block_read(block_cache, 0); - if (root->roof == 0 - || memcmp(root->version, "ibx4", 4) - || ((root->flags & IBEX_ROOT_SYNCF) == 0)) { + ibex_block_read_root(block_cache); + if (block_cache->root.roof == 0 + || memcmp(block_cache->root.version, "ibx5", 4) + || ((block_cache->root.flags & IBEX_ROOT_SYNCF) == 0)) { (printf("Initialising superblock\n")); /* reset root data */ - memcpy(root->version, "ibx4", 4); - root->roof = 1024; - root->free = 0; - root->words = 0; - root->names = 0; - root->tail = 0; /* list of tail blocks */ - root->flags = 0; - ibex_block_dirty((struct _block *)root); + memcpy(block_cache->root.version, "ibx5", 4); + block_cache->root.roof = 1024; + block_cache->root.free = 0; + block_cache->root.words = 0; + block_cache->root.names = 0; + block_cache->root.tail = 0; /* list of tail blocks */ + block_cache->root.flags = 0; + ibex_block_sync_root(block_cache); /* reset the file contents */ ftruncate(block_cache->fd, 1024); } else { (printf("superblock already initialised:\n" - " roof = %d\n free = %d\n words = %d\n names = %d\n tail = %d", - root->roof, root->free, root->words, root->names, root->tail)); + " roof = %d\n free = %d\n words = %d\n names = %d\n tail = %d\n", + block_cache->root.roof, block_cache->root.free, + block_cache->root.words, block_cache->root.names, block_cache->root.tail)); } - block_cache->flags = root->flags; - /* this should be moved higher up */ + /* FIXME: this should be moved higher up in the object tree */ { - struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); + struct _IBEXWord *ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); - block_cache->words = ibex_create_word_index(block_cache, &root->words, &root->names); + block_cache->words = ibex_create_word_index_mem(block_cache, &block_cache->root.words,&block_cache->root.names); } #ifdef IBEX_STATS @@ -480,12 +550,10 @@ ibex_block_cache_close(struct _memcache *block_cache) void ibex_block_free(struct _memcache *block_cache, blockid_t blockid) { - struct _root *root = (struct _root *)ibex_block_read(block_cache, 0); struct _block *block = ibex_block_read(block_cache, blockid); - block->next = block_number(root->free); - root->free = blockid; - ibex_block_dirty((struct _block *)root); + block->next = block_number(block_cache->root.free); + block_cache->root.free = blockid; ibex_block_dirty((struct _block *)block); } @@ -502,19 +570,18 @@ ibex_block_free(struct _memcache *block_cache, blockid_t blockid) blockid_t ibex_block_get(struct _memcache *block_cache) { - struct _root *root = (struct _root *)ibex_block_read(block_cache, 0); struct _block *block; blockid_t head; - if (root->free) { - head = root->free; + if (block_cache->root.free) { + head = block_cache->root.free; block = ibex_block_read(block_cache, head); - root->free = block_location(block->next); + block_cache->root.free = block_location(block->next); } else { /* TODO: check the block will fit first */ /* TODO: no need to read this block, can allocate it manually (saves a syscall/read) */ - head = root->roof; - root->roof += BLOCK_SIZE; + head = block_cache->root.roof; + block_cache->root.roof += BLOCK_SIZE; block = ibex_block_read(block_cache, head); } @@ -524,6 +591,5 @@ ibex_block_get(struct _memcache *block_cache) block->next = 0; block->used = 0; ibex_block_dirty(block); - ibex_block_dirty((struct _block *)root); return head; } diff --git a/libibex/block.h b/libibex/block.h index deb6494231..6de33281b4 100644 --- a/libibex/block.h +++ b/libibex/block.h @@ -27,6 +27,9 @@ struct _root { blockid_t names; /* root of names index */ char flags; /* state flags */ + + /* makes structure fill up to 1024 bytes */ + char dummy[1024 - (sizeof(char)*5) - (sizeof(blockid_t)*5)]; }; #define IBEX_ROOT_SYNCF (1<<0) /* file is synced */ @@ -74,11 +77,11 @@ struct _memcache { GHashTable *index; /* blockid->memblock mapping */ int fd; /* file fd */ - int flags; /* flags (mirror of root->flags) */ - #ifdef IBEX_STATS GHashTable *stats; #endif + struct _root root; /* root block */ + /* temporary here */ struct _IBEXWord *words; /* word index */ }; diff --git a/libibex/disktail.c b/libibex/disktail.c index 90562889de..9f5a0dbea2 100644 --- a/libibex/disktail.c +++ b/libibex/disktail.c @@ -214,7 +214,6 @@ tail_dump(struct _memcache *blocks, blockid_t tailid) static blockid_t tail_get(struct _memcache *blocks, int size) { - struct _root *root = (struct _root *)ibex_block_read(blocks, 0); blockid_t tailid; struct _tailblock *tail; int freeindex; @@ -225,22 +224,30 @@ tail_get(struct _memcache *blocks, int size) /* look for a node with enough space, if we dont find it fairly quickly, just quit. needs a better free algorithm i think ... */ - tailid = root->tail; + tailid = blocks->root.tail; while (tailid && count<5) { int space; d(printf(" checking tail node %d\n", tailid)); tail = (struct _tailblock *)ibex_block_read(blocks, tailid); + if (tail->used == 0) { /* assume its big enough ... */ tail->used = 1; tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size; d(printf("allocated %d (%d), used %d\n", tailid, tailid, tail->used)); ibex_block_dirty((struct _block *)tail); + + g_assert(&tail->tb_offset[tail->used-1] + < &tail->tb_data[tail->tb_offset[tail->used-1]]); + return tailid; } + g_assert(&tail->tb_offset[tail->used-1] + < &tail->tb_data[tail->tb_offset[tail->used-1]]); + /* see if we have a free slot first */ freeindex = -1; end = &tail->tb_data[sizeof(tail->tb_data)/sizeof(tail->tb_data[0])]; @@ -280,16 +287,18 @@ tail_get(struct _memcache *blocks, int size) } d(printf("allocating new data node for tail data\n")); - /* didn't find one, setup a new one */ - root = (struct _root *)ibex_block_read(blocks, 0); tailid = ibex_block_get(blocks); tail = (struct _tailblock *)ibex_block_read(blocks, tailid); - tail->next = block_number(root->tail); - root->tail = tailid; + tail->next = block_number(blocks->root.tail); + blocks->root.tail = tailid; tail->used = 1; tail->tb_offset[0] = sizeof(tail->tb_data)/sizeof(tail->tb_data[0]) - size; ibex_block_dirty((struct _block *)tail); d(printf("allocated %d (%d), used %d\n", tailid, TAIL_KEY(tailid, 0), tail->used)); + + g_assert(&tail->tb_offset[tail->used-1] + < &tail->tb_data[tail->tb_offset[tail->used-1]]); + return TAIL_KEY(tailid, 0); } @@ -578,15 +587,13 @@ disk_remove(struct _IBEXStore *store, blockid_t *headptr, blockid_t *tailptr, na start->used--; block->bl_data[i] = start->bl_data[start->used]; if (start->used == 0) { - struct _root *root = (struct _root *)ibex_block_read(store->blocks, 0); blockid_t new; d(printf("dropping block %d, new head = %d\n", head, start->next)); new = block_location(start->next); - start->next = block_number(root->free); - root->free = head; + start->next = block_number(store->blocks->root.free); + store->blocks->root.free = head; head = new; - ibex_block_dirty((struct _block *)root); } ibex_block_dirty(block); ibex_block_dirty(start); diff --git a/libibex/dumpindex.c b/libibex/dumpindex.c index 92f4b08845..cf4f02cc8a 100644 --- a/libibex/dumpindex.c +++ b/libibex/dumpindex.c @@ -9,6 +9,30 @@ extern void ibex_hash_dump(struct _IBEXIndex *index); +static void +index_iterate(struct _IBEXIndex *index) +{ + struct _IBEXCursor *idc; + int len; + char *key; + int total = 0, totallen = 0; + + idc = index->klass->get_cursor(index); + key = idc->klass->next_key(idc, &len); + while (len) { + total++; + totallen += len; + printf("%s\n", key); + g_free(key); + key = idc->klass->next_key(idc, &len); + } + g_free(key); + + idc->klass->close(idc); + + printf("Iterate Totals: %d items, total bytes %d\n", total, totallen); +} + int main(int argc, char **argv) { ibex *ib; @@ -26,6 +50,9 @@ int main(int argc, char **argv) ibex_hash_dump(ib->words->wordindex); ibex_hash_dump(ib->words->nameindex); + index_iterate(ib->words->wordindex); + index_iterate(ib->words->nameindex); + ibex_close(ib); return 0; diff --git a/libibex/hash.c b/libibex/hash.c index 9395e1e00b..0581d22fcb 100644 --- a/libibex/hash.c +++ b/libibex/hash.c @@ -42,6 +42,14 @@ typedef guint32 hashid_t; +struct _HASHCursor { + struct _IBEXCursor cursor; + + hashid_t block; + unsigned int index; + unsigned int size; +}; + static struct _IBEXIndex *hash_create(struct _memcache *bc, int size); static struct _IBEXIndex *hash_open(struct _memcache *bc, blockid_t root); static int hash_sync(struct _IBEXIndex *index); @@ -53,6 +61,12 @@ static hashid_t hash_insert(struct _IBEXIndex *index, const char *key, int keyle static char *hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len); static void hash_set_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t blockid, blockid_t tail); static blockid_t hash_get_data_block(struct _IBEXIndex *index, hashid_t keyid, blockid_t *tail); +static struct _IBEXCursor *hash_get_cursor(struct _IBEXIndex *index); + +static struct _IBEXCursor *hash_cursor_create(struct _IBEXIndex *); +static void hash_cursor_close(struct _IBEXCursor *); +static guint32 hash_cursor_next(struct _IBEXCursor *); +static char *hash_cursor_next_key(struct _IBEXCursor *, int *keylenptr); struct _IBEXIndexClass ibex_hash_class = { hash_create, hash_open, @@ -63,6 +77,13 @@ struct _IBEXIndexClass ibex_hash_class = { hash_get_key, hash_set_data_block, hash_get_data_block, + hash_get_cursor, +}; + +struct _IBEXCursorClass ibex_hash_cursor_class = { + hash_cursor_close, + hash_cursor_next, + hash_cursor_next_key }; /* the reason we have the tail here is that otherwise we need to @@ -197,6 +218,12 @@ static int hash_close(struct _IBEXIndex *index) return 0; } +/* get an iterator class */ +static struct _IBEXCursor *hash_get_cursor(struct _IBEXIndex *index) +{ + return hash_cursor_create(index); +} + /* convert a hashbucket id into a name */ static char * hash_get_key(struct _IBEXIndex *index, hashid_t hashbucket, int *len) @@ -620,6 +647,74 @@ hash_insert(struct _IBEXIndex *index, const char *key, int keylen) return HASH_KEY(keybucket, 0); } +/* hash cursor functions */ +static struct _IBEXCursor * +hash_cursor_create(struct _IBEXIndex *idx) +{ + struct _HASHCursor *idc; + struct _hashroot *hashroot; + + idc = g_malloc(sizeof(*idc)); + idc->cursor.klass = &ibex_hash_cursor_class; + idc->cursor.index = idx; + idc->block = 0; + idc->index = 0; + + hashroot = (struct _hashroot *)ibex_block_read(idx->blocks, idx->root); + idc->size = hashroot->size; + + return &idc->cursor; +} + +static void +hash_cursor_close(struct _IBEXCursor *idc) +{ + g_free(idc); +} + +static guint32 +hash_cursor_next(struct _IBEXCursor *idc) +{ + struct _HASHCursor *hc = (struct _HASHCursor *)idc; + struct _hashroot *hashroot; + struct _hashblock *bucket; + struct _hashtableblock *table; + + /* get the next bucket chain */ + if (hc->block != 0) { + int ind; + + bucket = (struct _hashblock *)ibex_block_read(idc->index->blocks, HASH_BLOCK(hc->block)); + ind = HASH_INDEX(hc->block); + + g_assert(ind < bucket->used); + + hc->block = bucket->hb_keys[ind].next; + } + + if (hc->block == 0) { + hashroot = (struct _hashroot *)ibex_block_read(idc->index->blocks, idc->index->root); + while (hc->block == 0 && hc->index < hc->size) { + table = (struct _hashtableblock *) + ibex_block_read(idc->index->blocks, + hashroot->table[hc->index / (BLOCK_SIZE/sizeof(blockid_t))]); + hc->block = table->buckets[hc->index % (BLOCK_SIZE/sizeof(blockid_t))]; + + hc->index++; + } + } + + return hc->block; +} + +static char * +hash_cursor_next_key(struct _IBEXCursor *idc, int *keylenptr) +{ + /* TODO: this could be made slightly mroe efficient going to the structs direct. + but i'm lazy today */ + return idc->index->klass->get_key(idc->index, idc->klass->next(idc), keylenptr); +} + /* debug */ void ibex_hash_dump(struct _IBEXIndex *index); static void ibex_hash_dump_rec(struct _IBEXIndex *index, int *words, int *wordslen); diff --git a/libibex/ibex_block.c b/libibex/ibex_block.c index 8a042cd2cf..7540d48789 100644 --- a/libibex/ibex_block.c +++ b/libibex/ibex_block.c @@ -217,6 +217,10 @@ ibex_index_buffer (ibex *ib, char *name, char *buffer, size_t len, size_t *unrea } done: d(printf("name %s count %d size %d\n", name, wordlist->len, len)); + if (!ib->predone) { + ib->words->klass->index_pre(ib->words); + ib->predone = TRUE; + } ib->words->klass->add_list(ib->words, name, wordlist); ret = 0; error: @@ -249,6 +253,10 @@ ibex *ibex_open (char *file, int flags, int mode) int ibex_save (ibex *ib) { printf("syncing database\n"); + if (ib->predone) { + ib->words->klass->index_post(ib->words); + ib->predone = FALSE; + } ib->words->klass->sync(ib->words); /* FIXME: some return */ ibex_block_cache_sync(ib->blocks); @@ -261,6 +269,11 @@ int ibex_close (ibex *ib) printf("closing database\n"); + if (ib->predone) { + ib->words->klass->index_post(ib->words); + ib->predone = FALSE; + } + ib->words->klass->close(ib->words); ibex_block_cache_close(ib->blocks); g_free(ib); diff --git a/libibex/ibex_internal.h b/libibex/ibex_internal.h index bd5d1cbea4..db772e4062 100644 --- a/libibex/ibex_internal.h +++ b/libibex/ibex_internal.h @@ -24,10 +24,11 @@ #include "block.h" #include "wordindex.h" -#define IBEX_VERSION "ibx4" +#define IBEX_VERSION "ibx5" struct ibex { char *path; struct _memcache *blocks; struct _IBEXWord *words; + int predone; }; diff --git a/libibex/index.h b/libibex/index.h index c3c83c1bcf..0cef7948b3 100644 --- a/libibex/index.h +++ b/libibex/index.h @@ -27,6 +27,18 @@ #define INDEX_STAT +struct _IBEXCursor { + struct _IBEXCursorClass *klass; + struct _IBEXIndex *index; +}; + +struct _IBEXCursorClass { + void (*close)(struct _IBEXCursor *); + + guint32 (*next)(struct _IBEXCursor *); + char *(*next_key)(struct _IBEXCursor *, int *keylenptr); +}; + struct _IBEXIndex { struct _IBEXIndexClass *klass; struct _memcache *blocks; @@ -62,6 +74,9 @@ struct _IBEXIndexClass { /* get the key contents based on the keyid */ blockid_t (*get_data)(struct _IBEXIndex *, guint32 keyid, blockid_t *tail); + + /* get a cursor for iterating over all contents */ + struct _IBEXCursor *(*get_cursor)(struct _IBEXIndex *); }; /* a storage class, stores lists of lists of id's */ diff --git a/libibex/wordindex.c b/libibex/wordindex.c index 62e8859e15..43a91f4342 100644 --- a/libibex/wordindex.c +++ b/libibex/wordindex.c @@ -82,9 +82,12 @@ static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* static int word_sync(struct _IBEXWord *idx); static int word_flush(struct _IBEXWord *idx); static int word_close(struct _IBEXWord *idx); +static void word_index_pre(struct _IBEXWord *idx); +static void word_index_post(struct _IBEXWord *idx); struct _IBEXWordClass ibex_word_index_class = { word_sync, word_flush, word_close, + word_index_pre, word_index_post, unindex_name, contains_name, find, find_name, add, add_list @@ -138,6 +141,14 @@ cache_sanity(struct _wordcache *head) } #endif +static void word_index_pre(struct _IBEXWord *idx) +{ +} + +static void word_index_post(struct _IBEXWord *idx) +{ +} + /* unindex all entries for name */ static void unindex_name(struct _IBEXWord *idx, const char *name) { diff --git a/libibex/wordindex.h b/libibex/wordindex.h index d222ff5a87..353f4dddc6 100644 --- a/libibex/wordindex.h +++ b/libibex/wordindex.h @@ -38,6 +38,9 @@ struct _IBEXWordClass { int (*flush)(struct _IBEXWord *); int (*close)(struct _IBEXWord *); + void (*index_pre)(struct _IBEXWord *); /* get ready for doing a lot of indexing. may be a nop */ + void (*index_post)(struct _IBEXWord *); + void (*unindex_name)(struct _IBEXWord *, const char *name); /* unindex all entries for name */ gboolean (*contains_name)(struct _IBEXWord *, const char *name); /* index contains data for name */ GPtrArray *(*find)(struct _IBEXWord *, const char *word); /* returns all matches for word */ @@ -57,9 +60,13 @@ struct _IBEXWord { GHashTable *wordcache; /* word->struct _wordcache mapping */ struct _list wordnodes; /* LRU list of wordcache structures */ int wordcount; /* how much space used in cache */ + int precount; }; struct _IBEXWord *ibex_create_word_index(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); +/* alternate implemenation */ +struct _IBEXWord *ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot); + #endif /* !_WORDINDEX_H */ diff --git a/libibex/wordindexmem.c b/libibex/wordindexmem.c new file mode 100644 index 0000000000..f0cc7336f8 --- /dev/null +++ b/libibex/wordindexmem.c @@ -0,0 +1,721 @@ +/* -*- Mode: C; tab-width: 8; indent-tabs-mode: t; c-basic-offset: 8 -*- + * + * Copyright (C) 2000 Helix Code, Inc. + * + * Authors: Michael Zucchi + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Library General Public License + * as published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Library General Public License for more details. + * + * You should have received a copy of the GNU Library General Public + * License along with the Gnome Library; see the file COPYING.LIB. If not, + * write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 02111-1307, USA. + */ + +/* this is the same as wordindex.c, but it doesn't have an LRU cache + for word names. it has a lookup tab le that is only loaded if + index-pre is called, otherwise it always hits disk */ + +/* code to manage a word index */ +/* includes a cache for word index writes, + but not for name index writes (currently), or any reads. + +Note the word cache is only needed during indexing of lots +of words, and could then be discarded (:flush()). + +*/ + +#include + +#include +#include +#include +#include + +#include + +#include "block.h" +#include "index.h" +#include "wordindex.h" + +/*#define MALLOC_CHECK*/ + +#ifdef MALLOC_CHECK +#include +#endif + +#define d(x) + +/*#define WORDCACHE_SIZE (256)*/ +#define WORDCACHE_SIZE (4096) + +extern struct _IBEXStoreClass ibex_diskarray_class; +extern struct _IBEXIndexClass ibex_hash_class; + +/* need 2 types of hash key? + one that just stores the wordid / wordblock + and one that stores the filecount/files? +*/ + + +#define CACHE_FILE_COUNT (62) + +struct _wordcache { + nameid_t wordid; /* disk wordid */ + blockid_t wordblock; /* head of disk list */ + blockid_t wordtail; /* and the tail data */ + short filecount; /* how many valid items in files[] */ + short filealloc; /* how much allocated space in files[] */ + union { + nameid_t *files; /* memory cache of files */ + nameid_t file0; /* if filecount == 1 && filealloc == 0, store directly */ + } file; + char word[1]; /* actual word follows */ +}; + +static void unindex_name(struct _IBEXWord *, const char *name); /* unindex all entries for name */ +static gboolean contains_name(struct _IBEXWord *, const char *name); /* index contains data for name */ +static GPtrArray *find(struct _IBEXWord *, const char *word); /* returns all matches for word */ +static gboolean find_name(struct _IBEXWord *, const char *name, const char *word); /* find if name contains word */ +static void add(struct _IBEXWord *, const char *name, const char *word); /* adds a single word to name (slow) */ +static void add_list(struct _IBEXWord *, const char *name, GPtrArray *words);/* adds a bunch of words to a given name */ +static int word_sync(struct _IBEXWord *idx); +static int word_flush(struct _IBEXWord *idx); +static int word_close(struct _IBEXWord *idx); +static void word_index_pre(struct _IBEXWord *idx); +static void word_index_post(struct _IBEXWord *idx); + +static void sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache); + +struct _IBEXWordClass ibex_word_index_mem_class = { + word_sync, word_flush, word_close, + word_index_pre, word_index_post, + unindex_name, contains_name, + find, find_name, + add, add_list +}; + +#ifdef MALLOC_CHECK +static void +checkmem(void *p) +{ + if (p) { + int status = mprobe(p); + + switch (status) { + case MCHECK_HEAD: + printf("Memory underrun at %p\n", p); + abort(); + case MCHECK_TAIL: + printf("Memory overrun at %p\n", p); + abort(); + case MCHECK_FREE: + printf("Double free %p\n", p); + abort(); + } + } +} +#endif + +/* this interface isn't the best, but it'll do for now */ +struct _IBEXWord * +ibex_create_word_index_mem(struct _memcache *bc, blockid_t *wordroot, blockid_t *nameroot) +{ + struct _IBEXWord *idx; + + idx = g_malloc0(sizeof(*idx)); + idx->wordcache = g_hash_table_new(g_str_hash, g_str_equal); + ibex_list_new(&idx->wordnodes); + idx->wordcount = 0; + idx->precount = 0; + idx->klass = &ibex_word_index_mem_class; + + /* we use the same block array storage for both indexes at the moment */ + idx->wordstore = ibex_diskarray_class.create(bc); + idx->namestore = idx->wordstore; + + /* but not the same indexes! */ + if (*wordroot) { + printf("opening wordindex root = %d\n", *wordroot); + idx->wordindex = ibex_hash_class.open(bc, *wordroot); + } else { + idx->wordindex = ibex_hash_class.create(bc, 2048); + *wordroot = idx->wordindex->root; + printf("creating wordindex root = %d\n", *wordroot); + } + if (*nameroot) { + printf("opening nameindex root = %d\n", *nameroot); + idx->nameindex = ibex_hash_class.open(bc, *nameroot); + } else { + idx->nameindex = ibex_hash_class.create(bc, 2048); + *nameroot = idx->nameindex->root; + printf("creating nameindex root = %d\n", *nameroot); + } + return idx; +} + +static void +node_sanity(char *key, struct _wordcache *node, void *data) +{ + g_assert(node->filecount <= node->filealloc || (node->filecount == 1 && node->filealloc == 0)); + g_assert(strlen(node->word) != 0); +#ifdef MALLOC_CHECK + checkmem(node); + if (node->filealloc) + checkmem(node->file.files); +#endif +} + +static void +cache_sanity(struct _IBEXWord *idx) +{ +#ifdef MALLOC_CHECK + checkmem(idx); +#endif + g_hash_table_foreach(idx->wordcache, (GHFunc)node_sanity, idx); +} + +static void word_index_pre(struct _IBEXWord *idx) +{ + struct _IBEXCursor *idc; + struct _wordcache *cache; + nameid_t wordid; + char *key; + int len; + + idx->precount ++; + if (idx->precount > 1) + return; + + /* want to load all words into the cache lookup table */ + printf("pre-loading all word info into memory\n"); + idc = idx->wordindex->klass->get_cursor(idx->wordindex); + while ( (wordid = idc->klass->next(idc)) ) { + key = idc->index->klass->get_key(idc->index, wordid, &len); + /*d(printf("Adding word %s\n", key));*/ + cache = g_malloc0(sizeof(*cache) + strlen(key)); + strcpy(cache->word, key); + g_free(key); + cache->wordid = wordid; + cache->wordblock = idc->index->klass->get_data(idc->index, wordid, &cache->wordtail); + cache->filecount = 0; + cache->filealloc = 0; + g_hash_table_insert(idx->wordcache, cache->word, cache); + idx->wordcount++; + } + +#ifdef MALLOC_CHECK + cache_sanity(idx); +#endif + + idc->klass->close(idc); + + printf("done\n"); +} + +static gboolean +sync_free_value(void *key, void *value, void *data) +{ + struct _wordcache *cache = (struct _wordcache *)value; + struct _IBEXWord *idx = (struct _IBEXWord *)data; + + sync_cache_entry(idx, cache); + if (cache->filealloc) + g_free(cache->file.files); + g_free(cache); + + return TRUE; +} + +static void +sync_value(void *key, void *value, void *data) +{ + struct _wordcache *cache = (struct _wordcache *)value; + struct _IBEXWord *idx = (struct _IBEXWord *)data; + + sync_cache_entry(idx, cache); +} + +static void word_index_post(struct _IBEXWord *idx) +{ + idx->precount--; + if (idx->precount > 0) + return; + idx->precount = 0; + +#ifdef MALLOC_CHECK + cache_sanity(idx); +#endif + + g_hash_table_foreach_remove(idx->wordcache, sync_free_value, idx); + idx->wordcount = 0; +} + +/* unindex all entries for name */ +static void unindex_name(struct _IBEXWord *idx, const char *name) +{ + GArray *words; + int i; + nameid_t nameid, wordid; + blockid_t nameblock, wordblock, newblock, nametail, wordtail, newtail; + char *word; + struct _wordcache *cache; + + d(printf("unindexing %s\n", name)); + + /* lookup the hash key */ + nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); + /* get the block for this key */ + nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); + /* and the data for this block */ + words = idx->namestore->klass->get(idx->namestore, nameblock, nametail); + /* walk it ... */ + for (i=0;ilen;i++) { + /* get the word */ + wordid = g_array_index(words, nameid_t, i); + d(printf(" word %d\n", wordid)); + /* get the data block */ + wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); + /* clear this name from it */ + newblock = wordblock; + newtail = wordtail; + idx->wordstore->klass->remove(idx->wordstore, &newblock, &newtail, nameid); + if (newblock != wordblock || newtail != wordtail) + idx->wordindex->klass->set_data(idx->wordindex, wordid, newblock, newtail); + + /* now check the cache as well */ + word = idx->nameindex->klass->get_key(idx->wordindex, wordid, NULL); + if (word) { + cache = g_hash_table_lookup(idx->wordcache, word); + if (cache) { + /* its there, update our head/tail pointers */ + cache->wordblock = newblock; + cache->wordtail = newtail; + + /* now check that we have a data entry in it */ + if (cache->filealloc == 0 && cache->filecount == 1) { + if (cache->file.file0 == nameid) { + cache->filecount = 0; + } + } else { + int j; + + for (j=0;jfilecount;j++) { + if (cache->file.files[j] == nameid) { + cache->file.files[j] = cache->file.files[cache->filecount-1]; + cache->filecount--; + break; + } + } + } + } + g_free(word); + } + } + g_array_free(words, TRUE); + + /* and remove name data and itself */ + idx->namestore->klass->free(idx->namestore, nameblock, nametail); + idx->nameindex->klass->remove(idx->nameindex, name, strlen(name)); +} + +/* index contains (any) data for name */ +static gboolean contains_name(struct _IBEXWord *idx, const char *name) +{ + return idx->nameindex->klass->find(idx->nameindex, name, strlen(name)) != 0; +} + +/* returns all matches for word */ +static GPtrArray *find(struct _IBEXWord *idx, const char *word) +{ + nameid_t wordid, nameid; + GPtrArray *res; + GArray *names; + int i; + char *new; + struct _wordcache *cache; + blockid_t wordblock, wordtail; + + res = g_ptr_array_new(); + + cache = g_hash_table_lookup(idx->wordcache, word); + if (cache) { +#if 0 + /* freshen cache entry if we touch it */ + ibex_list_remove((struct _listnode *)cache); + ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); +#endif + wordid = cache->wordid; + wordblock = cache->wordblock; + wordtail = cache->wordtail; + } else { + /* lookup the hash key */ + wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); + /* get the block for this key */ + wordblock = idx->wordindex->klass->get_data(idx->wordindex, wordid, &wordtail); + } + /* and the data for this block */ + names = idx->wordstore->klass->get(idx->wordstore, wordblock, wordtail); + /* .. including any memory-only data */ + if (cache) { + if (cache->filealloc == 0 && cache->filecount == 1) + g_array_append_val(names, cache->file.file0); + else + g_array_append_vals(names, cache->file.files, cache->filecount); + } + + /* walk it ... converting id's back to strings */ + g_ptr_array_set_size(res, names->len); + for (i=0;ilen;i++) { + nameid = g_array_index(names, nameid_t, i); + new = idx->nameindex->klass->get_key(idx->nameindex, nameid, NULL); + res->pdata[i] = new; + } + g_array_free(names, TRUE); + return res; +} + +/* find if name contains word */ +static gboolean find_name(struct _IBEXWord *idx, const char *name, const char *word) +{ + nameid_t wordid, nameid; + blockid_t nameblock, nametail; + struct _wordcache *cache; + int i; + + /* lookup the hash key for the name */ + nameid = idx->nameindex->klass->find(idx->nameindex, name, strlen(name)); + /* get the block for this name */ + nameblock = idx->nameindex->klass->get_data(idx->nameindex, nameid, &nametail); + + /* check if there is an in-memory cache for this word, check its file there first */ + cache = g_hash_table_lookup(idx->wordcache, word); + if (cache) { +#if 0 + /* freshen cache entry if we touch it */ + ibex_list_remove((struct _listnode *)cache); + ibex_list_addtail(&idx->wordnodes, (struct _listnode *)cache); +#endif + if (cache->filecount == 1 && cache->filealloc == 0) { + if (cache->file.file0 == nameid) + return TRUE; + } else { + for (i=0;ifilecount;i++) { + if (cache->file.files[i] == nameid) + return TRUE; + } + } + /* not there? well we can use the wordid anyway */ + wordid = cache->wordid; + } else { + /* lookup the hash key for word */ + wordid = idx->wordindex->klass->find(idx->wordindex, word, strlen(word)); + } + + /* see if wordid is in nameblock */ + return idx->namestore->klass->find(idx->namestore, nameblock, nametail, wordid); +} + +/* cache helper functions */ +/* flush a cache entry to disk, and empty it out */ +static void +sync_cache_entry(struct _IBEXWord *idx, struct _wordcache *cache) +{ + GArray array; /* just use this as a header */ + blockid_t oldblock, oldtail; + + if (cache->filecount == 0) + return; + + d(printf("syncing cache entry '%s' used %d\n", cache->word, cache->filecount)); + if (cache->filecount == 1 && cache->filealloc == 0) + array.data = (char *)&cache->file.file0; + else + array.data = (char *)cache->file.files; + array.len = cache->filecount; + oldblock = cache->wordblock; + oldtail = cache->wordtail; + idx->wordstore->klass->add_list(idx->wordstore, &cache->wordblock, &cache->wordtail, &array); + if (oldblock != cache->wordblock || oldtail != cache->wordtail) { + idx->wordindex->klass->set_data(idx->wordindex, cache->wordid, cache->wordblock, cache->wordtail); + } + cache->filecount = 0; +} + +/* create a new key in an index, returning its id and head block */ +static void +add_index_key(struct _IBEXIndex *wordindex, const char *word, nameid_t *wordid, blockid_t *wordblock, blockid_t *wordtail) +{ + /* initialise cache entry - id of word entry and head block */ + *wordid = wordindex->klass->find(wordindex, word, strlen(word)); + + if (*wordid == 0) { + *wordid = wordindex->klass->insert(wordindex, word, strlen(word)); + *wordblock = 0; + *wordtail = 0; + } else { + *wordblock = wordindex->klass->get_data(wordindex, *wordid, wordtail); + } +} + +/* create a new key in a cached index (only word cache so far), flushing old keys + if too much space is being used */ +static struct _wordcache * +add_index_cache(struct _IBEXWord *idx, const char *word) +{ + struct _wordcache *cache; + + cache = g_hash_table_lookup(idx->wordcache, word); + if (cache == 0) { + /*d(printf("adding %s to cache\n", word));*/ + +#if 0 + /* see if we have to flush off the last entry */ + if (idx->wordcount >= WORDCACHE_SIZE) { + struct _wordcache *mincache; + int min, count=0; + /* remove last entry, and flush it */ + cache = (struct _wordcache *)idx->wordnodes.tailpred; + mincache = cache; + min = mincache->filecount; + + d(printf("flushing word from cache %s\n", cache->word)); + /* instead of just using the last entry, we try and find an entry with + with only 1 item (failing that, the smallest in the range we look at) */ + /* this could probably benefit greatly from a more sophisticated aging algorithm */ + while (cache->next && count < 100) { + if (cache->filecount == 1) { + mincache = cache; + break; + } + if (cache->filecount > 0 && cache->filecount < min) { + mincache = cache; + min = cache->filecount; + } + cache = cache->next; + count++; + } + ibex_list_remove((struct _listnode *)mincache); + g_hash_table_remove(idx->wordcache, mincache->word); + sync_cache_entry(idx, mincache); + if (mincache->filealloc) + g_free(mincache->file.files); + g_free(mincache); + idx->wordcount--; + } +#endif + cache = g_malloc0(sizeof(*cache)+strlen(word)); + /* if we're in an index state, we can assume we dont have it if we dont have it in memory */ + if (idx->precount == 0) { + /* initialise cache entry - id of word entry and head block */ + add_index_key(idx->wordindex, word, &cache->wordid, &cache->wordblock, &cache->wordtail); + } else { + cache->wordid = idx->wordindex->klass->insert(idx->wordindex, word, strlen(word)); + } + /* other fields */ + strcpy(cache->word, word); + cache->filecount = 0; + g_hash_table_insert(idx->wordcache, cache->word, cache); +#if 0 + ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); +#endif + idx->wordcount++; + } else { + /*d(printf("already have %s in cache\n", word));*/ +#if 0 + /* move cache bucket ot the head of the cache list */ + ibex_list_remove((struct _listnode *)cache); + ibex_list_addhead(&idx->wordnodes, (struct _listnode *)cache); +#endif + } + return cache; +} + +/* adds a single word to name (slow) */ +static void add(struct _IBEXWord *idx, const char *name, const char *word) +{ + nameid_t nameid; + blockid_t nameblock, newblock, nametail, newtail; + struct _wordcache *cache; + + g_error("Dont use wordindex::add()"); + abort(); + + cache = add_index_cache(idx, word); + + /* get the nameid and block start for this name */ + add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); + + /* check for repeats of the last name - dont add anything */ + if (cache->filecount == 1 && cache->filealloc == 0) { + if (cache->file.file0 == nameid) + return; + } else { + if (cache->file.files[cache->filecount] == nameid) + return; + } + + /* see if we are setting the first, drop it in the union */ + if (cache->filecount == 0 && cache->filealloc == 0) { + cache->file.file0 = nameid; + } else if (cache->filecount == 1 && cache->filealloc == 0) { + nameid_t saveid; + /* we need to allocate space for words */ + saveid = cache->file.file0; + cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); + /* this could possibly grow as needed, but i wont for now */ + cache->filealloc = CACHE_FILE_COUNT; + cache->file.files[0] = saveid; + cache->file.files[1] = nameid; + } else { + cache->file.files[cache->filecount] = nameid; + } + + cache->filecount++; + + /* if we are full, force a flush now */ + if (cache->filealloc && cache->filecount >= cache->filealloc) { + sync_cache_entry(idx, cache); + } + + newtail = nametail; + newblock = nameblock; + idx->namestore->klass->add(idx->namestore, &newblock, &newtail, cache->wordid); + if (newblock != nameblock || newtail != nametail) { + idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); + } +} + +/* adds a bunch of words to a given name */ +static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words) +{ + int i; + GArray *data = g_array_new(0, 0, sizeof(nameid_t)); + blockid_t nameblock, newblock, nametail, newtail; + nameid_t nameid; + struct _wordcache *cache; + + d(printf("Adding words to name %s\n", name)); + + d(cache_sanity(idx)); + + /* get the nameid and block start for this name */ + add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail); + + d(cache_sanity(idx)); + + for (i=0;ilen;i++) { + char *word = words->pdata[i]; + + cache = add_index_cache(idx, word); + + /*d(cache_sanity(idx));*/ + + /* check for duplicates; doesn't catch duplicates over an overflow boundary. Watch me care. */ + if (cache->filecount == 0 + /* the 1 item case */ + || (cache->filecount == 1 && cache->filealloc == 0 && cache->file.file0 != nameid) + /* the normal case */ + || (cache->filealloc > 0 && cache->file.files[cache->filecount-1] != nameid)) { + + /* see if we are setting the first, drop it in the union */ + if (cache->filecount == 0 && cache->filealloc == 0) { + cache->file.file0 = nameid; + } else if (cache->filecount == 1 && cache->filealloc == 0) { + nameid_t saveid; + /* we need to allocate space for words */ + saveid = cache->file.file0; + cache->file.files = g_malloc(sizeof(cache->file.files[0]) * CACHE_FILE_COUNT); + /* this could possibly grow as needed, but i wont for now */ + cache->filealloc = CACHE_FILE_COUNT; + cache->file.files[0] = saveid; + cache->file.files[1] = nameid; + } else { + cache->file.files[cache->filecount] = nameid; + } + + cache->filecount++; + + /* if we are full, force a flush now */ + if (cache->filealloc && cache->filecount >= cache->filealloc) { + sync_cache_entry(idx, cache); + } + + /*d(cache_sanity(idx));*/ + + /* and append this wordid for this name in memory */ + g_array_append_val(data, cache->wordid); + } + + /*d(cache_sanity(idx));*/ + } + + d(cache_sanity(idx)); + + /* and append these word id's in one go */ + newblock = nameblock; + newtail = nametail; + idx->namestore->klass->add_list(idx->namestore, &newblock, &newtail, data); + if (newblock != nameblock || newtail != nametail) { + idx->nameindex->klass->set_data(idx->nameindex, nameid, newblock, newtail); + } + + d(cache_sanity(idx)); + + g_array_free(data, TRUE); +} + +/* sync any in-memory data to disk */ +static int +word_sync(struct _IBEXWord *idx) +{ + /* we just flush also, save memory */ + word_flush(idx); + +#if 0 + struct _wordcache *cache = (struct _wordcache *)idx->wordnodes.head; + + while (cache->next) { + sync_cache_entry(idx, cache); + cache = cache->next; + } + + /*ibex_hash_dump(idx->wordindex);*/ + /*ibex_hash_dump(idx->nameindex);*/ +#endif + return 0; +} + +/* sync and flush any in-memory data to disk and free it */ +static int +word_flush(struct _IBEXWord *idx) +{ + d(cache_sanity(idx)); + + g_hash_table_foreach_remove(idx->wordcache, sync_free_value, idx); + idx->wordcount = 0; + return 0; +} + +static int word_close(struct _IBEXWord *idx) +{ + idx->klass->flush(idx); + + idx->namestore->klass->close(idx->namestore); + idx->nameindex->klass->close(idx->nameindex); + /*same as namestore: + idx->wordstore->klass->close(idx->wordstore);*/ + idx->wordindex->klass->close(idx->wordindex); + g_hash_table_destroy(idx->wordcache); + g_free(idx); + + return 0; +} -- cgit v1.2.3