1 files changed, 0 insertions, 263 deletions
diff --git a/libibex/words.c b/libibex/words.c
deleted file mode 100644
index 4776634251..0000000000
--- a/libibex/words.c
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
-	Copyright 2000 Helix Code Inc.
-*/
-/* words.c: low-level indexing ops */
-
-#include <ctype.h>
-#include <errno.h>
-#include <string.h>
-
-#include <unicode.h>
-
-#include "ibex_internal.h"
-
-static signed char utf8_trans[] = {
-  'A', 'A', 'A', 'A', 'A', 'A', -1, 'C', 'E', 'E', 'E', 'E', 'I', 'I',
-  'I', 'I', -2, 'N', 'O', 'O', 'O', 'O', 'O', '*', 'O', 'U', 'U', 'U',
-  'U', 'Y', -3, -4, 'a', 'a', 'a', 'a', 'a', 'a', -5, 'c', 'e', 'e',
-  'e', 'e', 'i', 'i', 'i', 'i', -6, 'n', 'o', 'o', 'o', 'o', 'o', '/',
-  'o', 'u', 'u', 'u', 'u', 'y', -7, 'y', 'A', 'a', 'A', 'a', 'A', 'a',
-  'C', 'c', 'C', 'c', 'C', 'c', 'C', 'c', 'D', 'd', 'D', 'd', 'E', 'e',
-  'E', 'e', 'E', 'e', 'E', 'e', 'E', 'e', 'G', 'g', 'G', 'g', 'G', 'g',
-  'G', 'g', 'H', 'h', 'H', 'h', 'I', 'i', 'I', 'i', 'I', 'i', 'I', 'i',
-  'I', 'i', -8, -9, 'J', 'j', 'K', 'k', 'k', 'L', 'l', 'L', 'l', 'L',
-  'l', 'L', 'l', 'L', 'l', 'N', 'n', 'N', 'n', 'N', 'n', 'n', -10, -11,
-  'O', 'o', 'O', 'o', 'O', 'o', -12, -13, 'R', 'r', 'R', 'r', 'R', 'r',
-  'S', 'r', 'S', 's', 'S', 's', 'S', 's', 'T', 't', 'T', 't', 'T', 't',
-  'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'U', 'u', 'W', 'w',
-  'Y', 'y', 'Y', 'Z', 'z', 'Z', 'z', 'Z', 'z', 's'
-};
-
-static char *utf8_long_trans[] = {
-  "AE", "TH", "TH", "ss", "ae", "th", "th", "IJ", "ij", "NG", "ng", "OE", "oe"
-};
-
-/* This is a bit weird. It takes pointers to the start and end (actually
- * just past the end) of a UTF-8-encoded word, and a buffer at least 1
- * byte longer than the length of the word. It copies the word into the
- * buffer in all lowercase without accents, and splits up ligatures.
- * (Since any ligature would be a multi-byte character in UTF-8, splitting
- * them into two US-ASCII characters won't overrun the buffer.)
- *
- * It is not safe to call this routine with bad UTF-8.
- */
-static void
-normalize_word(char *start, char *end, char *buf)
-{
-  unsigned char *s, *d;
-  unicode_char_t uc;
-
-  s = (unsigned char *)start;
-  d = (unsigned char *)buf;
-  while (s < (unsigned char *)end)
-    {
-      if (*s < 0x80)
-	{
-	  /* US-ASCII character: copy unless it's an apostrophe. */
-	  if (*s != '\'')
-	    *d++ = tolower(*s);
-	  s++;
-	}
-      else
-	{
-	  char *next = unicode_get_utf8(s, &uc);
-	  if (uc >= 0xc0 && uc < 0xc0 + sizeof(utf8_trans))
-	    {
-	      signed char ch = utf8_trans[uc - 0xc0];
-	      if (ch > 0)
-		*d++ = tolower(ch);
-	      else
-		{
-		  *d++ = tolower(utf8_long_trans[-ch - 1][0]);
-		  *d++ = tolower(utf8_long_trans[-ch - 1][1]);
-		}
-	      s = next;
-	    }
-	  else
-	    {
-	      while (s < (unsigned char *)next)
-		*d++ = *s++;
-	    }
-	}
-    }
-  *d = '\0';
-}
-
-enum { IBEX_ALPHA, IBEX_NONALPHA, IBEX_INVALID, IBEX_INCOMPLETE };
-
-/* This incorporates parts of libunicode, because there's no way to
- * force libunicode to not read past a certain point.
- */
-static int
-utf8_category(char *sp, char **snp, char *send)
-{
-  unsigned char *p = (unsigned char *)sp, **np = (unsigned char **)snp;
-  unsigned char *end = (unsigned char *)send;
-
-  if (isascii(*p))
-    {
-      *np = p + 1;
-      if (isalpha(*p) || *p == '\'')
-	return IBEX_ALPHA;
-      return IBEX_NONALPHA;
-    }
-  else
-    {
-      unicode_char_t uc;
-      int more;
-
-      if ((*p & 0xe0) == 0xc0)
-	{
-	  more = 1;
-	  uc = *p & 0x1f;
-	}
-      else if ((*p & 0xf0) == 0xe0)
-	{
-	  more = 2;
-	  uc = *p & 0x0f;
-	}
-      else if ((*p & 0xf8) == 0xf0)
-	{
-	  more = 3;
-	  uc = *p & 0x07;
-	}
-      else if ((*p & 0xfc) == 0xf8)
-	{
-	  more = 4;
-	  uc = *p & 0x03;
-	}
-      else if ((*p & 0xfe) == 0xfc)
-	{
-	  more = 5;
-	  uc = *p & 0x01;
-	}
-      else
-	return IBEX_INVALID;
-
-      if (p + more > end)
-	return IBEX_INCOMPLETE;
-
-      while (more--)
-	{
-	  if ((*++p & 0xc0) != 0x80)
-	    return IBEX_INVALID;
-	  uc <<= 6;
-	  uc |= *p & 0x3f;
-	}
-
-      *np = p + 1;
-      if (unicode_isalpha(uc))
-	return IBEX_ALPHA;
-      else
-	return IBEX_NONALPHA;
-    }
-}
-
-static ibex_file *
-get_ibex_file(ibex *ib, char *name)
-{
-  ibex_file *ibf;
-
-  ibf = g_tree_lookup(ib->files, name);
-  if (!ibf)
-    {
-      ibf = g_malloc(sizeof(ibex_file));
-      ibf->name = strdup(name);
-      ibf->index = 0;
-      g_tree_insert(ib->files, ibf->name, ibf);
-      ib->dirty = TRUE;
-    }
-  return ibf;
-}
-
-static void
-ref_word(ibex *ib, ibex_file *ibf, char *word)
-{
-  GPtrArray *refs;
-
-  refs = g_hash_table_lookup(ib->words, word);
-  if (!refs)
-    {
-      refs = g_ptr_array_new();
-      g_hash_table_insert(ib->words, g_strdup(word), refs);
-      g_ptr_array_add(refs, ibf);
-      ib->dirty = TRUE;
-    }
-  else if (g_ptr_array_index(refs, refs->len - 1) != ibf)
-    {
-      g_ptr_array_add(refs, ibf);
-      ib->dirty = TRUE;
-    }
-}
-
-int
-ibex_index_buffer(ibex *ib, char *name, char *buffer,
-		  size_t len, size_t *unread)
-{
-  char *p, *q, *nq, *end, *word;
-  ibex_file *ibf = get_ibex_file(ib, name);
-  int wordsiz, cat;
-
-  end = buffer + len;
-  wordsiz = 20;
-  word = g_malloc(wordsiz);
-
-  p = buffer;
-  while (p < end)
-    {
-      while (p < end)
-	{
-	  cat = utf8_category(p, &q, end);
-	  if (cat != IBEX_NONALPHA)
-	    break;
-	  p = q;
-	}
-      if (p == end)
-	{
-	  g_free(word);
-	  return 0;
-	}
-      else if (cat == IBEX_INVALID)
-	{
-	  errno = EINVAL;
-	  g_free(word);
-	  return -1;
-	}
-      else if (cat == IBEX_INCOMPLETE)
-	q = end;
-
-      while (q < end)
-	{
-	  cat = utf8_category(q, &nq, end);
-	  if (cat != IBEX_ALPHA)
-	    break;
-	  q = nq;
-	}
-      if (cat == IBEX_INVALID || (cat == IBEX_INCOMPLETE && !unread))
-	{
-	  errno = EINVAL;
-	  g_free(word);
-	  return -1;
-	}
-      else if (cat == IBEX_INCOMPLETE || (q == end && unread))
-	{
-	  *unread = end - p;
-	  g_free(word);
-	  return 0;
-	}
-
-      if (wordsiz < q - p + 1)
-	{
-	  wordsiz = q - p + 1;
-	  word = g_realloc(word, wordsiz);
-	}
-      normalize_word(p, q, word);
-      ref_word(ib, ibf, word);
-      p = q;
-    }
-
-  if (unread)
-    *unread = 0;
-  g_free(word);
-  return 0;
-}