1 files changed, 263 insertions, 0 deletions
diff --git a/libibex/words.c b/libibex/words.c
new file mode 100644
index 0000000000..4776634251
--- /dev/null
+++ b/libibex/words.c
@@ -0,0 +1,263 @@
+/*
+	Copyright 2000 Helix Code Inc.
+*/
+/* words.c: low-level indexing ops */
+
+#include <ctype.h>
+#include <errno.h>
+#include <string.h>
+
+#include <unicode.h>
+
+#include "ibex_internal.h"
+
+/* Accent-stripped ASCII for U+00C0..U+017F, indexed by (code point - 0xc0), 16
+ * per row.  A negative entry -n stands for the digraph utf8_long_trans[n - 1].  */
+static const signed char utf8_trans[] = {
+  'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I', 'I', 'I',   /* U+00C0 */
+  -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U', 'U', 'Y', -3, -4,     /* U+00D0 */
+  'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e', 'e', 'e', 'i', 'i', 'i', 'i',   /* U+00E0 */
+  -6, 'n', 'o', 'o', 'o', 'o', 'o', '/', 'o', 'u', 'u', 'u', 'u', 'y', -7, 'y',    /* U+00F0 */
+  'A', 'a', 'A', 'a', 'A', 'a', 'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd',  /* U+0100 */
+  'D', 'd', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g',  /* U+0110 */
+  'G', 'g', 'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',  /* U+0120 */
+  'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L', 'l', 'L',    /* U+0130 */
+  'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11, 'O', 'o', 'O', 'o',  /* U+0140 */
+  'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r', 'S', 's', 'S', 's', 'S', 's',  /* U+0150 */
+  'S', 's', 'T', 't', 'T', 't', 'T', 't', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u',  /* U+0160 */
+  'U', 'u', 'U', 'u', 'W', 'w', 'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's'   /* U+0170 */
+};
+
+static char *utf8_long_trans[] = {             /* code -n in utf8_trans -> entry n - 1 */
+  "AE", "TH", "TH", "ss", "ae", "th", "th",     /* -1..-7:  U+00C6 D0 DE DF E6 F0 FE */
+  "IJ", "ij", "NG", "ng", "OE", "oe" };         /* -8..-13: U+0132 133 14A 14B 152 153 */
+
+/* Copy the UTF-8 word [START, END) into BUF in normalized form: US-ASCII
+ * bytes go through tolower() and apostrophes are dropped; characters that
+ * utf8_trans covers become their unaccented lowercase US-ASCII letter, or
+ * two letters where utf8_long_trans applies; every other multi-byte
+ * character is copied unchanged.  The result is NUL-terminated.
+ *
+ * BUF must hold at least (END - START) + 1 bytes.  That is always enough:
+ * a character covered by the table takes two bytes in UTF-8 and is
+ * replaced by at most two US-ASCII bytes.
+ *
+ * It is not safe to call this routine with bad UTF-8.
+ */
+static void
+normalize_word(char *start, char *end, char *buf)
+{
+  unsigned char *in = (unsigned char *)start, *out = (unsigned char *)buf;
+  unicode_char_t uc;
+  signed char code;
+  char *next, *pair;
+
+  while (in < (unsigned char *)end)
+    {
+      if (*in < 0x80)
+        {
+          if (*in != '\'')
+            *out++ = tolower(*in);
+          in++;
+          continue;
+        }
+      next = unicode_get_utf8((char *)in, &uc);
+      if (uc < 0xc0 || uc >= 0xc0 + sizeof(utf8_trans))
+        {
+          /* No translation for this character: keep its bytes as they are. */
+          while (in < (unsigned char *)next)
+            *out++ = *in++;
+          continue;
+        }
+      code = utf8_trans[uc - 0xc0];
+      if (code > 0)
+        *out++ = tolower(code);
+      else
+        {
+          pair = utf8_long_trans[-code - 1];
+          *out++ = tolower(pair[0]);
+          *out++ = tolower(pair[1]);
+        }
+      in = (unsigned char *)next;
+    }
+  *out = '\0';
+}
+
+enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE }; /* utf8_category() results */
+
+/* Classify the character that starts at SP without reading any byte at or
+ * beyond SEND.
+ *
+ * This incorporates parts of libunicode, because there's no way to
+ * force libunicode to not read past a certain point.
+ *
+ * sp:   first byte of the character.  It is read unconditionally, so the
+ *       caller must ensure sp < send.
+ * snp:  set to the byte following the character when the result is
+ *       IBEX_ALPHA or IBEX_NONALPHA; left untouched otherwise.
+ * send: one past the last byte of the buffer.
+ *
+ * Returns:
+ *   IBEX_ALPHA       ASCII byte accepted by isalpha(), an apostrophe, or a
+ *                    multi-byte character accepted by unicode_isalpha();
+ *   IBEX_NONALPHA    any other complete, well-formed character;
+ *   IBEX_INVALID     bad lead byte or bad continuation byte;
+ *   IBEX_INCOMPLETE  the multi-byte sequence is cut off by SEND.
+ *
+ * Overlong encodings and the 5- and 6-byte forms are not rejected.
+ */
+static int
+utf8_category(char *sp, char **snp, char *send)
+{
+  unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp;
+  unsigned char *end = (unsigned char *)send;
+  unicode_char_t uc;
+  int more;
+
+  if (isascii(*p))
+    {
+      *np = p + 1;
+      if (isalpha(*p) || *p == '\'')
+        return IBEX_ALPHA;
+      return IBEX_NONALPHA;
+    }
+
+  /* The lead byte tells how many continuation bytes follow (MORE) and
+   * supplies the top bits of the code point (UC).  */
+  if ((*p & 0xe0) == 0xc0)      { more = 1; uc = *p & 0x1f; }
+  else if ((*p & 0xf0) == 0xe0) { more = 2; uc = *p & 0x0f; }
+  else if ((*p & 0xf8) == 0xf0) { more = 3; uc = *p & 0x07; }
+  else if ((*p & 0xfc) == 0xf8) { more = 4; uc = *p & 0x03; }
+  else if ((*p & 0xfe) == 0xfc) { more = 5; uc = *p & 0x01; }
+  else
+    return IBEX_INVALID;
+
+  /* The sequence occupies p[0] .. p[more], and every one of those bytes
+   * has to lie before END.  Note ">=": with ">" the case p + more == end
+   * would slip through and the byte at END itself would be read.  */
+  if (more >= end - p)
+    return IBEX_INCOMPLETE;
+
+  /* Each continuation byte must look like 10xxxxxx and adds six bits.  */
+  while (more--)
+    {
+      if ((*++p & 0xc0) != 0x80)
+        return IBEX_INVALID;
+      uc <<= 6;
+      uc |= *p & 0x3f;
+    }
+
+  *np = p + 1;
+  if (unicode_isalpha(uc))
+    return IBEX_ALPHA;
+  return IBEX_NONALPHA;
+}
+
+/* Return the ibex_file record for NAME in IB->files.  If the tree has none, a
+ * new record (index 0) is created and inserted, and IB is marked dirty.  */
+static ibex_file *
+get_ibex_file(ibex *ib, char *name)
+{
+  ibex_file *ibf = g_tree_lookup(ib->files, name);
+  if (ibf)
+    return ibf;
+  ibf = g_malloc(sizeof(ibex_file));
+  /* g_strdup aborts on out-of-memory where strdup would return NULL.  */
+  ibf->name = g_strdup(name);
+  ibf->index = 0;
+  g_tree_insert(ib->files, ibf->name, ibf);
+  ib->dirty = TRUE;
+  return ibf;
+}
+
+/* Record that WORD occurs in IBF.  IB->words maps each word to a GPtrArray of
+ * files; IBF is appended unless it is already the last element, and a word
+ * seen for the first time gets a new array under a copy of WORD.  IB is
+ * marked dirty whenever anything is added.  */
+static void
+ref_word(ibex *ib, ibex_file *ibf, char *word)
+{
+  GPtrArray *files = g_hash_table_lookup(ib->words, word);
+
+  if (files && g_ptr_array_index(files, files->len - 1) == ibf)
+    return;
+  if (!files)
+    {
+      files = g_ptr_array_new();
+      g_hash_table_insert(ib->words, g_strdup(word), files);
+    }
+  g_ptr_array_add(files, ibf);
+  ib->dirty = TRUE;
+}
+
+/* Index the words of BUFFER (LEN bytes of UTF-8) under the file NAME.  A word
+ * is a maximal run of IBEX_ALPHA characters; each is normalized and given to
+ * ref_word().  A non-NULL UNREAD means more text may follow: a word or
+ * character touching the end of the buffer is not indexed, and *UNREAD is set
+ * to the number of trailing bytes left unprocessed (0 if there are none).
+ *
+ * Returns 0 on success.  Returns -1 with errno == EINVAL, without writing
+ * *UNREAD, on invalid UTF-8 or, if UNREAD is NULL, on a truncated character.
+ */
+int
+ibex_index_buffer(ibex *ib, char *name, char *buffer,
+                  size_t len, size_t *unread)
+{
+  char *p, *q, *nq, *word, *end = buffer + len;
+  ibex_file *ibf = get_ibex_file(ib, name);
+  int cat, wordsiz = 20;
+  size_t leftover = 0;
+
+  word = g_malloc(wordsiz);
+  for (p = buffer; p < end; p = q)
+    {
+      /* Skip whatever separates us from the next word.  */
+      while (p < end)
+        {
+          cat = utf8_category(p, &q, end);
+          if (cat != IBEX_NONALPHA)
+            break;
+          p = q;
+        }
+      /* Nothing but separators left: fall out so *UNREAD still gets its 0.  */
+      if (p == end)
+        break;
+      if (cat == IBEX_INVALID)
+        goto invalid;
+      if (cat == IBEX_INCOMPLETE)
+        q = end;
+
+      /* Q is now past the word's first character; advance it to the end.  */
+      while (q < end)
+        {
+          cat = utf8_category(q, &nq, end);
+          if (cat != IBEX_ALPHA)
+            break;
+          q = nq;
+        }
+      if (cat == IBEX_INVALID || (cat == IBEX_INCOMPLETE && !unread))
+        goto invalid;
+      if (cat == IBEX_INCOMPLETE || (q == end && unread))
+        {
+          /* The word may continue in the next buffer: hand it back.  */
+          leftover = end - p;
+          break;
+        }
+      if (wordsiz < q - p + 1)
+        {
+          wordsiz = q - p + 1;
+          word = g_realloc(word, wordsiz);
+        }
+      normalize_word(p, q, word);
+      ref_word(ib, ibf, word);
+    }
+
+  if (unread)
+    *unread = leftover;
+  g_free(word);
+  return 0;
+invalid:
+  g_free(word);
+  errno = EINVAL;
+  return -1;
+}