aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--libibex/ChangeLog8
-rw-r--r--libibex/Makefile.am5
-rw-r--r--libibex/testindex.c158
-rw-r--r--libibex/wordindexmem.c112
4 files changed, 278 insertions, 5 deletions
diff --git a/libibex/ChangeLog b/libibex/ChangeLog
index de9cb8dd40..1d74b73898 100644
--- a/libibex/ChangeLog
+++ b/libibex/ChangeLog
@@ -1,3 +1,11 @@
+2000-11-17 Not Zed <NotZed@HelixCode.com>
+
+ * wordindexmem.c (add_list): If we have the namecache active, and
+ there is no name there, we add it directly and don't look it up
+ first.
+
+ * testindex.c: Some performance testing & stat gathering stuff.
+
2000-11-16 Not Zed <NotZed@HelixCode.com>
* wordindexmem.c (ibex_create_word_index_mem): Initialise nameinit
diff --git a/libibex/Makefile.am b/libibex/Makefile.am
index 6cc88186d6..61f3d72004 100644
--- a/libibex/Makefile.am
+++ b/libibex/Makefile.am
@@ -21,11 +21,14 @@ INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \
-DG_LOG_DOMAIN=\"libibex\"
-noinst_PROGRAMS = dumpindex
+noinst_PROGRAMS = dumpindex testindex
dumpindex_SOURCES = dumpindex.c
dumpindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
+testindex_SOURCES = testindex.c
+testindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) -lm
+
#noinst_PROGRAMS = mkindex lookup
#
#mkindex_SOURCES = mkindex.c
diff --git a/libibex/testindex.c b/libibex/testindex.c
new file mode 100644
index 0000000000..e21d73ff06
--- /dev/null
+++ b/libibex/testindex.c
@@ -0,0 +1,158 @@
+/* Test code for libibex */
+
+#include <stdio.h>
+#include <glib.h>
+#include <errno.h>
+#include <string.h>
+#include "ibex_internal.h"
+
+void word_index_mem_dump_info(struct _IBEXWord *idx);
+
+/*
+ The following is a routine to generate a Gaussian distribution
+ of pseudo random numbers, to make the results a little more
+ meaningful
+*/
+
+/* boxmuller.c Implements the Polar form of the Box-Muller
+ Transformation
+
+ (c) Copyright 1994, Everett F. Carter Jr.
+ Permission is granted by the author to use
+ this software for any application provided this
+ copyright notice is preserved.
+
+*/
+
+#include <stdlib.h>
+#include <math.h>
+
+/* uniform pseudo random value in [0, 1], built on rand() */
+#define ranf() ((float)rand()/(float)RAND_MAX)
+
+/*
+ * Normal random variate generator (polar form of Box-Muller).
+ *
+ * m: mean of the distribution
+ * s: standard deviation of the distribution
+ *
+ * Returns m + y * s, where y is a standard normal deviate.  Each pass of
+ * the rejection loop yields two deviates; the second one is kept in static
+ * storage and handed out by the next call, so the function is not
+ * reentrant.
+ */
+static float box_muller(float m, float s)
+{
+	float x1, x2, w, y1;
+	static float y2;
+	static int use_last = 0;
+
+	if (use_last) {
+		/* use value from previous call */
+		y1 = y2;
+		use_last = 0;
+	} else {
+		/*
+		 * Pick a point inside the unit circle.  The origin must be
+		 * rejected as well: w == 0 would give log(0) and a division
+		 * by zero below, and the result would be NaN.
+		 */
+		do {
+			x1 = 2.0 * ranf() - 1.0;
+			x2 = 2.0 * ranf() - 1.0;
+			w = x1 * x1 + x2 * x2;
+		} while (w >= 1.0 || w == 0.0);
+
+		w = sqrt((-2.0 * log(w)) / w);
+		y1 = x1 * w;
+		y2 = x2 * w;
+		use_last = 1;
+	}
+
+	return m + y1 * s;
+}
+
+/*
+ * Pick one entry of words, with the index drawn from a normal distribution
+ * of mean m and standard deviation s.  Draws that land outside the array
+ * are thrown away and retried.
+ */
+static char *getword(GPtrArray *words, float m, float s)
+{
+	for (;;) {
+		int pick = (int)box_muller(m, s);
+
+		if (pick < 0)
+			continue;
+		if (pick >= words->len)
+			continue;
+
+		return words->pdata[pick];
+	}
+}
+
+
+/*
+ * Performance test driver: reads a word list, then indexes a number of
+ * synthetic "files" made of normally distributed random words into
+ * test.ibex, dumping word index statistics every 1000 files and once more
+ * at the end.
+ *
+ * argc/argv are not used.  Returns 0 on success and 1 if the word list
+ * cannot be opened, contains no words, or the index cannot be created.
+ */
+int main(int argc, char **argv)
+{
+	int i, j;
+	GPtrArray *words = g_ptr_array_new();
+	char line[256];
+	int len;
+	FILE *file;
+	float m, s;
+	ibex *ib;
+	GString *buffer = g_string_new("");
+	int files;
+	char *dict;
+	int ret = 1;
+
+	/* fixed seed so that every run indexes the same data */
+	srand(0xABADF00D);
+
+	files = 80000;
+	dict = "/usr/dict/words";
+
+	/* read words into an array */
+	file = fopen(dict, "r");
+	if (file == NULL) {
+		fprintf(stderr, "Cannot open word file: %s: %s\n", dict, strerror(errno));
+		goto done;
+	}
+	while (fgets(line, sizeof(line), file) != NULL) {
+		len = strlen(line);
+		if (len>0 && line[len-1]=='\n') {
+			line[len-1]=0;
+		}
+		g_ptr_array_add(words, g_strdup(line));
+	}
+	fclose(file);
+
+	fprintf(stderr, "Read %u words\n", words->len);
+
+	/* an empty list would make the name lookup below divide by zero and
+	   getword() retry forever */
+	if (words->len == 0) {
+		fprintf(stderr, "No words found in word file: %s\n", dict);
+		goto done;
+	}
+
+	/* *shrug* arbitrary values really */
+	m = words->len/2;
+	/* well, the average vocabulary of a mailbox is about 10K words */
+	s = 1000.0;
+
+	printf("mean is %f, s is %f\n", m, s);
+
+	/* open ibex file */
+	ib = ibex_open("test.ibex", O_RDWR|O_CREAT, 0600);
+	if (ib == NULL) {
+		/* perror() appends ": <error>\n" itself, so no newline here */
+		perror("Creating ibex file");
+		goto done;
+	}
+
+	printf("Adding %d files\n", files);
+
+	/* simulate adding new words to a bunch of files */
+	for (j=0;j<files;j++) {
+		/* names cycle through the word list, so they repeat once j
+		   reaches the number of words read */
+		char *name = words->pdata[j % words->len];
+		/* something like 60 words in a typical message, say */
+		int count = (int)box_muller(60.0, 20.0);
+
+		if (j%1000 == 0)
+			word_index_mem_dump_info(ib->words);
+
+		/* cache the name info */
+		ibex_contains_name(ib, name);
+
+		/*printf("Adding %d words to '%s'\n", count, name);*/
+
+		g_string_truncate(buffer, 0);
+
+		/* build up the word buffer */
+		for (i=0;i<count;i++) {
+			if (i>0)
+				g_string_append_c(buffer, ' ');
+			g_string_append(buffer, getword(words, m, s));
+		}
+
+		/* and index it */
+		ibex_index_buffer(ib, name, buffer->str, buffer->len, NULL);
+	}
+
+	word_index_mem_dump_info(ib->words);
+
+	ibex_close(ib);
+	ret = 0;
+
+done:
+	/* the array owns the g_strdup()'d words, release them with it */
+	for (i=0;i<(int)words->len;i++)
+		g_free(words->pdata[i]);
+	g_ptr_array_free(words, TRUE);
+	g_string_free(buffer, TRUE);
+
+	return ret;
+}
+
diff --git a/libibex/wordindexmem.c b/libibex/wordindexmem.c
index 4c0bca7cef..9d26bb3697 100644
--- a/libibex/wordindexmem.c
+++ b/libibex/wordindexmem.c
@@ -650,11 +650,16 @@ static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
d(cache_sanity(idx));
/* make sure we keep the namecache in sync, if it is active */
- if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL)
+ if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL) {
g_hash_table_insert(idx->namecache, g_strdup(name), (void *)TRUE);
-
- /* get the nameid and block start for this name */
- add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+ /* we know we dont have it in the disk hash either, so we insert anew (saves a lookup) */
+ nameid = idx->nameindex->klass->insert(idx->nameindex, name, strlen(name));
+ nameblock = 0;
+ nametail = 0;
+ } else {
+ /* get the nameid and block start for this name */
+ add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+ }
d(cache_sanity(idx));
@@ -778,3 +783,102 @@ static int word_close(struct _IBEXWord *idx)
return 0;
}
+
+/* debugging/tuning function */
+
+/* Totals accumulated over every word cache entry by get_info() */
+struct _stats {
+ int memcache; /* estimated memory used by cache entries: rounded-up entry + word size, plus any file array */
+ int memfile; /* bytes allocated for file arrays (sizeof(nameid_t) * filealloc) */
+ int memfileused; /* bytes of the file arrays actually in use (sizeof(nameid_t) * filecount) */
+ int memword; /* sum of strlen() of all cached words */
+ int file1; /* entries with filecount == 1 and no file array allocated */
+ int total; /* number of cache entries visited */
+};
+
+/*
+ * g_hash_table_foreach() callback: folds one word cache entry into the
+ * running totals.
+ *
+ * key:   not used
+ * value: the struct _wordcache entry to account for
+ * data:  the struct _stats being accumulated
+ */
+static void
+get_info(void *key, void *value, void *data)
+{
+	struct _stats *totals = (struct _stats *)data;
+	struct _wordcache *entry = (struct _wordcache *)value;
+	size_t wordlen = strlen(entry->word);
+
+	totals->total++;
+	totals->memword += wordlen;
+
+	/* entry plus its word, with 4 bytes added as a guess at malloc
+	   overhead, rounded up to a multiple of 4 as the probable alignment */
+	totals->memcache += ((sizeof(struct _wordcache) + wordlen + 4 + 3) & ~3);
+
+	if (entry->filealloc > 0) {
+		/* the file array is a separate allocation, again + 4 overhead */
+		totals->memcache += sizeof(nameid_t) * entry->filealloc + 4;
+		/* allocated versus actually occupied array space */
+		totals->memfile += sizeof(nameid_t) * entry->filealloc;
+		totals->memfileused += sizeof(nameid_t) * entry->filecount;
+	}
+
+	/* a single file reference held without any array allocated */
+	if (entry->filecount == 1 && entry->filealloc == 0)
+		totals->file1++;
+}
+
+/*
+ * Format num for the statistics report.
+ *
+ * Values below 1000 (including negative ones) are printed as a plain
+ * decimal.  Larger values are printed with their digits in space-separated
+ * groups of three, followed by the same value in binary units with two
+ * decimals, e.g. "1 536, 1.50K" or "3 145 728, 3.00M".  Values of a
+ * thousand million or more keep the leading groups together
+ * ("2147 483 647, ...").
+ *
+ * Returns a pointer to a static buffer that is overwritten by the next
+ * call: use the result before calling num() again, and do not free it.
+ */
+static char *
+num(int num)
+{
+	enum { KB = 1024, MB = 1024 * 1024 };
+	/* static: the result must outlive this call */
+	static char buf[64];
+
+	if (num >= 1000000) {
+		/* the remainder is at most MB-1, so scaling it by 100 cannot overflow */
+		snprintf(buf, sizeof(buf), "%d %03d %03d, %d.%02dM",
+			 num / 1000000, (num / 1000) % 1000, num % 1000,
+			 num / MB, ((num % MB) * 100) / MB);
+	} else if (num >= 1000) {
+		snprintf(buf, sizeof(buf), "%d %03d, %d.%02dK",
+			 num / 1000, num % 1000,
+			 num / KB, ((num % KB) * 100) / KB);
+	} else {
+		snprintf(buf, sizeof(buf), "%d", num);
+	}
+
+	return buf;
+}
+
+void word_index_mem_dump_info(struct _IBEXWord *idx);
+
+/*
+ * Debugging/tuning aid: walks the word cache of idx and prints a summary of
+ * its estimated memory use to stdout.
+ */
+void word_index_mem_dump_info(struct _IBEXWord *idx)
+{
+	struct _stats totals = { 0 };
+	int payload, slack;
+
+	g_hash_table_foreach(idx->wordcache, get_info, &totals);
+
+	/* bytes that carry data: the entries, their words and their file arrays */
+	payload = totals.total * sizeof(struct _wordcache) + totals.memword + totals.memfile;
+	/* allocated but unoccupied file array space */
+	slack = totals.memfile - totals.memfileused;
+
+	printf("Word Index Stats:\n");
+	printf("Total word count: %d\n", totals.total);
+	printf("Total memory used: %s\n", num(totals.memcache));
+	printf("Total useful memory: %s\n", num(payload));
+	printf("Total malloc/alignment overhead: %s\n", num(totals.memcache - payload));
+	printf("Total buffer overhead: %s\n", num(slack));
+	printf("Space taken by words: %s\n", num(totals.memword + totals.total));
+	printf("Number of 1-word entries: %s\n", num(totals.file1));
+
+	/* nothing cached yet: skip the ratio rather than divide by zero */
+	if (totals.memcache <= 0)
+		return;
+
+	printf("%% unused space: %d %%\n", slack * 100 / totals.memcache);
+}
+