diff options
Diffstat (limited to 'libibex/testindex.c')
-rw-r--r-- | libibex/testindex.c | 158 |
1 files changed, 158 insertions, 0 deletions
diff --git a/libibex/testindex.c b/libibex/testindex.c new file mode 100644 index 0000000000..e21d73ff06 --- /dev/null +++ b/libibex/testindex.c @@ -0,0 +1,158 @@ +/* Test code for libibex */ + +#include <stdio.h> +#include <glib.h> +#include <errno.h> +#include <string.h> +#include "ibex_internal.h" + +void word_index_mem_dump_info(struct _IBEXWord *idx); + +/* + The following is a routine to generate a Gaussian distribution + of pseudo random numbers, to make the results a little more + meaningful +*/ + +/* boxmuller.c Implements the Polar form of the Box-Muller + Transformation + + (c) Copyright 1994, Everett F. Carter Jr. + Permission is granted by the author to use + this software for any application provided this + copyright notice is preserved. + +*/ + +#include <stdlib.h> +#include <math.h> + +#define ranf() ((float)rand()/(float)RAND_MAX) + +static float box_muller(float m, float s) /* normal random variate generator */ +{ /* mean m, standard deviation s */ + float x1, x2, w, y1; + static float y2; + static int use_last = 0; + + if (use_last) /* use value from previous call */ + { + y1 = y2; + use_last = 0; + } + else + { + do { + x1 = 2.0 * ranf() - 1.0; + x2 = 2.0 * ranf() - 1.0; + w = x1 * x1 + x2 * x2; + } while ( w >= 1.0 ); + + w = sqrt( (-2.0 * log( w ) ) / w ); + y1 = x1 * w; + y2 = x2 * w; + use_last = 1; + } + + return( m + y1 * s ); +} + +/* gets a word from words, using m and s as distribution values */ +static char *getword(GPtrArray *words, float m, float s) +{ + int index; + + do { + index = (int)box_muller(m, s); + } while (index<0 || index>=words->len); + + return words->pdata[index]; +} + + +int main(int argc, char **argv) +{ + int i, j; + GPtrArray *words = g_ptr_array_new(); + char line[256]; + int len; + FILE *file; + float m, s; + ibex *ib; + GString *buffer = g_string_new(""); + int files; + char *dict; + + srand(0xABADF00D); + + files = 80000; + dict = "/usr/dict/words"; + + /* read words into an array */ + file = fopen(dict, "r"); + if (file == NULL) { + fprintf(stderr, "Cannot open word file: %s: %s\n", dict, strerror(errno)); + return 1; + } + while (fgets(line, sizeof(line), file) != NULL) { + len = strlen(line); + if (len>0 && line[len-1]=='\n') { + line[len-1]=0; + } + g_ptr_array_add(words, g_strdup(line)); + } + fclose(file); + + fprintf(stderr, "Read %d words\n", words->len); + + /* *shrug* arbitrary values really */ + m = words->len/2; + /* well, the average vocabulary of a mailbox is about 10K words */ + s = 1000.0; + + printf("mean is %f, s is %f\n", m, s); + + /* open ibex file */ + ib = ibex_open("test.ibex", O_RDWR|O_CREAT, 0600); + if (ib == NULL) { + perror("Creating ibex file\n"); + return 1; + } + + printf("Adding %d files\n", files); + + /* simulate adding new words to a bunch of files */ + for (j=0;j<files;j++) { + /* always new name */ + char *name = words->pdata[j % words->len]; + /* something like 60 words in a typical message, say */ + int count = (int)box_muller(60.0, 20.0); + + if (j%1000 == 0) + word_index_mem_dump_info(ib->words); + + /* cache the name info */ + ibex_contains_name(ib, name); + + /*printf("Adding %d words to '%s'\n", count, name);*/ + + g_string_truncate(buffer, 0); + + /* build up the word buffer */ + for (i=0;i<count;i++) { + if (i>0) + g_string_append_c(buffer, ' '); + g_string_append(buffer, getword(words, m, s)); + } + + /* and index it */ + ibex_index_buffer(ib, name, buffer->str, buffer->len, NULL); + } + + word_index_mem_dump_info(ib->words); + + ibex_close(ib); + + return 0; +} + |