From 6c45c449cae597f00cf57d5c9914489871ba916e Mon Sep 17 00:00:00 2001
From: Not Zed <NotZed@HelixCode.com>
Date: Mon, 27 Nov 2000 02:05:14 +0000
Subject: If we have the namecache active, and there is no name there, we add
 it

2000-11-17  Not Zed  <NotZed@HelixCode.com>

        * wordindexmem.c (add_list): If we have the namecache active, and
        there is no name there, we add it directly and don't look it up
        first.

        * testindex.c: Some performance testing & stat gathering stuff.

svn path=/trunk/; revision=6677
---
 libibex/ChangeLog      |   8 +++
 libibex/Makefile.am    |   5 +-
 libibex/testindex.c    | 158 +++++++++++++++++++++++++++++++++++++++++++++++++
 libibex/wordindexmem.c | 112 +++++++++++++++++++++++++++++++++--
 4 files changed, 278 insertions(+), 5 deletions(-)
 create mode 100644 libibex/testindex.c

(limited to 'libibex')

diff --git a/libibex/ChangeLog b/libibex/ChangeLog
index de9cb8dd40..1d74b73898 100644
--- a/libibex/ChangeLog
+++ b/libibex/ChangeLog
@@ -1,3 +1,11 @@
+2000-11-17  Not Zed  <NotZed@HelixCode.com>
+
+	* wordindexmem.c (add_list): If we have the namecache active, and
+	there is no name there, we add it directly and don't look it up
+	first.
+
+	* testindex.c: Some performance testing & stat gathering stuff.
+
 2000-11-16  Not Zed  <NotZed@HelixCode.com>
 
 	* wordindexmem.c (ibex_create_word_index_mem): Initialise nameinit
diff --git a/libibex/Makefile.am b/libibex/Makefile.am
index 6cc88186d6..61f3d72004 100644
--- a/libibex/Makefile.am
+++ b/libibex/Makefile.am
@@ -21,11 +21,14 @@ INCLUDES = -I$(srcdir) $(GLIB_CFLAGS) $(UNICODE_CFLAGS) \
 		-DG_LOG_DOMAIN=\"libibex\"
 
 
-noinst_PROGRAMS = dumpindex
+noinst_PROGRAMS = dumpindex testindex
 
 dumpindex_SOURCES = dumpindex.c
 dumpindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS)
 
+testindex_SOURCES = testindex.c
+testindex_LDADD = libibex.la $(GLIB_LIBS) $(UNICODE_LIBS) -lm
+
 #noinst_PROGRAMS = mkindex lookup
 #
 #mkindex_SOURCES = mkindex.c
diff --git a/libibex/testindex.c b/libibex/testindex.c
new file mode 100644
index 0000000000..e21d73ff06
--- /dev/null
+++ b/libibex/testindex.c
@@ -0,0 +1,158 @@
+/* Test code for libibex */
+
+#include <stdio.h>
+#include <glib.h>
+#include <errno.h>
+#include <string.h>
+#include "ibex_internal.h"
+
+void word_index_mem_dump_info(struct _IBEXWord *idx);
+
+/*
+  The following is a routine to generate a Gaussian distribution
+  of pseudo random numbers, to make the results a little more
+  meaningful
+*/
+
+/* boxmuller.c           Implements the Polar form of the Box-Muller
+                         Transformation
+
+                      (c) Copyright 1994, Everett F. Carter Jr.
+                          Permission is granted by the author to use
+                          this software for any application provided this
+                          copyright notice is preserved.
+
+*/
+
+#include <stdlib.h>
+#include <math.h>
+
+#define ranf() ((float)rand()/(float)RAND_MAX)
+
+static float box_muller(float m, float s)      /* normal random variate generator */
+{                                       /* mean m, standard deviation s */
+        float x1, x2, w, y1;
+        static float y2;                /* spare variate kept for the next call */
+        static int use_last = 0;
+
+        if (use_last)                   /* use value from previous call */
+        {
+                y1 = y2;
+                use_last = 0;
+        }
+        else
+        {
+                /* point inside the unit circle; w == 0 is rejected too, it would give NaN */
+                do {
+                        x1 = 2.0 * ranf() - 1.0;
+                        x2 = 2.0 * ranf() - 1.0;
+                        w = x1 * x1 + x2 * x2;
+                } while ( w >= 1.0 || w == 0.0 );
+                w = sqrt( (-2.0 * log( w ) ) / w );
+                y1 = x1 * w;
+                y2 = x2 * w;
+                use_last = 1;
+        }
+
+        return( m + y1 * s );
+}
+
+/* returns a word from words at a normally distributed index (mean m, deviation s) */
+static char *getword(GPtrArray *words, float m, float s)
+{
+	int pos;
+	/* redraw until the sample lands inside the array */
+	for (;;) {
+		pos = (int)box_muller(m, s);
+		if (pos >= 0 && pos < words->len)
+			return words->pdata[pos];
+	}
+}
+
+
+/* Stress test: indexes 'files' synthetic messages built from a word list,
+   dumping word cache statistics every 1000 messages and once at the end.
+   argv[1], if given, replaces the default word list.  Returns 0, or 1 on error. */
+int main(int argc, char **argv)
+{
+	int i, j, len, files;
+	GPtrArray *words = g_ptr_array_new();
+	char line[256];
+	FILE *file;
+	float m, s;
+	ibex *ib;
+	GString *buffer = g_string_new("");
+	const char *dict;
+
+	srand(0xABADF00D);	/* fixed seed: every run draws the same sequence */
+	files = 80000;
+	dict = argc > 1 ? argv[1] : "/usr/dict/words";
+
+	/* read words into an array */
+	file = fopen(dict, "r");
+	if (file == NULL) {
+		fprintf(stderr, "Cannot open word file: %s: %s\n", dict, strerror(errno));
+		return 1;
+	}
+	while (fgets(line, sizeof(line), file) != NULL) {
+		len = strlen(line);
+		if (len>0 && line[len-1]=='\n') {
+			line[len-1]=0;
+		}
+		g_ptr_array_add(words, g_strdup(line));
+	}
+	fclose(file);
+	fprintf(stderr, "Read %d words\n", words->len);
+
+	/* an empty list would divide by zero below and spin forever in getword() */
+	if (words->len == 0) {
+		fprintf(stderr, "No words in word file: %s\n", dict);
+		return 1;
+	}
+
+	/* *shrug* arbitrary values really */
+	m = words->len/2;
+	/* well, the average vocabulary of a mailbox is about 10K words */
+	s = 1000.0;
+
+	printf("mean is %f, s is %f\n", m, s);
+
+	/* open ibex file */
+	ib = ibex_open("test.ibex", O_RDWR|O_CREAT, 0600);
+	if (ib == NULL) {
+		perror("Creating ibex file");
+		return 1;
+	}
+
+	printf("Adding %d files\n", files);
+
+	/* simulate adding new words to a bunch of files */
+	for (j=0;j<files;j++) {
+		/* names are taken in order, wrapping round once the list runs out */
+		char *name = words->pdata[j % words->len];
+		/* something like 60 words in a typical message, say */
+		int count = (int)box_muller(60.0, 20.0);
+
+		if (j%1000 == 0)
+			word_index_mem_dump_info(ib->words);
+
+		/* cache the name info */
+		ibex_contains_name(ib, name);
+		g_string_truncate(buffer, 0);
+
+		/* build up the word buffer */
+		for (i=0;i<count;i++) {
+			if (i>0)
+				g_string_append_c(buffer, ' ');
+			g_string_append(buffer, getword(words, m, s));
+		}
+
+		/* and index it */
+		ibex_index_buffer(ib, name, buffer->str, buffer->len, NULL);
+	}
+
+	word_index_mem_dump_info(ib->words);
+	ibex_close(ib);
+	return 0;
+}
+
diff --git a/libibex/wordindexmem.c b/libibex/wordindexmem.c
index 4c0bca7cef..9d26bb3697 100644
--- a/libibex/wordindexmem.c
+++ b/libibex/wordindexmem.c
@@ -650,11 +650,16 @@ static void add_list(struct _IBEXWord *idx, const char *name, GPtrArray *words)
 	d(cache_sanity(idx));
 
 	/* make sure we keep the namecache in sync, if it is active */
-	if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL)
+	if (idx->nameinit && g_hash_table_lookup(idx->namecache, name) == NULL) {
 		g_hash_table_insert(idx->namecache, g_strdup(name), (void *)TRUE);
-
-	/* get the nameid and block start for this name */
-	add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+		/* we know we dont have it in the disk hash either, so we insert anew (saves a lookup) */
+		nameid = idx->nameindex->klass->insert(idx->nameindex, name, strlen(name));
+		nameblock = 0;
+		nametail = 0;
+	} else {
+		/* get the nameid and block start for this name */
+		add_index_key(idx->nameindex, name, &nameid, &nameblock, &nametail);
+	}
 
 	d(cache_sanity(idx));
 
@@ -778,3 +783,102 @@ static int word_close(struct _IBEXWord *idx)
 
 	return 0;
 }
+
+/* debugging/tuning function */
+
+struct _stats {
+	int memcache;		/* total memory used by cache entries */
+	int memfile;		/* total mem used by file data (as allocated) */
+	int memfileused;	/* actual memory used by file data */
+	int memword;		/* total mem used by words */
+	int file1;		/* entries with 1 file and no file array allocated */
+	int total;		/* number of cache entries counted */
+};
+
+/* hash foreach callback: adds the cache entry 'value' to the struct _stats in 'data' */
+static void
+get_info(void *key, void *value, void *data)
+{
+	struct _stats *acc = data;
+	struct _wordcache *entry = value;
+	size_t wordlen = strlen(entry->word);
+
+	acc->total++;
+	acc->memword += wordlen;
+	/* round up to probable alignment, + malloc overheads */
+	acc->memcache += ((sizeof(struct _wordcache) + wordlen + 4 + 3) & ~3);
+	if (entry->filecount == 1 && entry->filealloc == 0)
+		acc->file1++;
+	if (entry->filealloc <= 0)
+		return;
+	/* size of the file array (+4 as above), then allocated vs. used bytes */
+	acc->memcache += sizeof(nameid_t) * entry->filealloc + 4;
+	acc->memfile += sizeof(nameid_t) * entry->filealloc;
+	acc->memfileused += sizeof(nameid_t) * entry->filecount;
+}
+
+/* Format 'num' for display: decimal digits in space-separated groups of
+   three, followed for values of 1000 and up by the size in units of 1024
+   (", x.yyK" or ", x.yyM").  Negative values are printed ungrouped.
+   Returns a static buffer which the next call overwrites. */
+static char *
+num(int num)
+{
+	/* static, as the caller reads it after we return; any int fits easily */
+	static char buf[256];
+	char *p = buf;
+	int n = num;
+	char type = 0;
+
+	if (n >= 1000000) {
+		p += sprintf(p, "%d ", n/1000000);
+		n %= 1000000;
+		type = 'M';
+	}
+	if (num >= 1000) {
+		/* once a millions group is out, the thousands group is zero padded */
+		p += sprintf(p, type ? "%03d " : "%d ", n/1000);
+		n %= 1000;
+		if (type == 0)
+			type = 'K';
+	}
+	p += sprintf(p, type ? "%03d" : "%d", n);
+
+	/* fraction = hundredths of the remainder, not of the whole value */
+	switch (type) {
+	case 'M':
+		sprintf(p, ", %d.%02dM", num/(1024*1024), (num%(1024*1024))*100/(1024*1024));
+		break;
+	case 'K':
+		sprintf(p, ", %d.%02dK", num/1024, (num%1024)*100/1024);
+		break;
+	default:
+		break;
+	}
+
+	return buf;
+}
+
+void word_index_mem_dump_info(struct _IBEXWord *idx);
+
+/* Walks idx->wordcache and prints a summary of its memory use on stdout. */
+void word_index_mem_dump_info(struct _IBEXWord *idx)
+{
+	struct _stats st = { 0 };
+	int payload, slack;
+
+	g_hash_table_foreach(idx->wordcache, get_info, &st);
+	payload = st.total * sizeof(struct _wordcache) + st.memword + st.memfile;
+	slack = st.memfile - st.memfileused;	/* file data allocated but not used */
+	printf("Word Index Stats:\n");
+	printf("Total word count: %d\n", st.total);
+	printf("Total memory used: %s\n", num(st.memcache));
+	printf("Total useful memory: %s\n", num(payload));
+	printf("Total malloc/alignment overhead: %s\n", num(st.memcache - payload));
+	printf("Total buffer overhead: %s\n", num(slack));
+	printf("Space taken by words: %s\n", num(st.memword + st.total));
+	printf("Number of 1-word entries: %s\n", num(st.file1));
+	if (st.memcache > 0)
+		printf("%% unused space: %d %%\n", slack * 100 / st.memcache);
+}
+
-- 
cgit v1.2.3