aboutsummaryrefslogblamecommitdiffstats
path: root/libibex/file.c
blob: e9eaab7f51e331a51ca41e28bc511a9283dbc969 (plain) (tree)






























































































































































































































































































































































































                                                                        
/*
    Copyright 2000 Helix Code Inc.
*/

/* file.c: index file read/write ops */

#include <ctype.h>
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "ibex_internal.h"

static unsigned long read_number(FILE *f);
static void write_number(FILE *f, unsigned long n);
static char *get_compressed_word(FILE *f, char **lastword);

static gint free_file(gpointer key, gpointer value, gpointer data);
static void free_word(gpointer key, gpointer value, gpointer data);

/* The file format is:
 *
 * version string (currently "ibex1")
 * file count
 * list of compressed filenames, separated by \0
 * word count
 * list of compressed words, each followed by \0, a count, and that
 *   many references.
 *
 * All numbers are stored 7-bit big-endian, with the high bit telling
 * whether or not the number continues to the next byte.
 *
 * compressed text consists of a byte telling how many characters the
 * line has in common with the line before it, followed by the rest of
 * the string. Obviously this only really works if the lists are sorted.
 */
ibex *
ibex_open(char *file, gboolean create)
{
  ibex *ib;
  FILE *f;
  char vbuf[sizeof(IBEX_VERSION) - 1];
  char *word, *lastword;
  unsigned long nfiles, nwords, nrefs, ref;
  ibex_file **ibfs = NULL;
  int i;
  GPtrArray *refs;

  f = fopen(file, "r");
  if (!f && (errno != ENOENT || !create))
    {
      if (errno == 0)
    errno = ENOMEM;
      return NULL;
    }

  ib = g_malloc(sizeof(ibex));
  ib->dirty = FALSE;
  ib->path = g_strdup(file);
  ib->files = g_tree_new(strcmp);
  ib->words = g_hash_table_new(g_str_hash, g_str_equal);
  ib->oldfiles = g_ptr_array_new();

  if (!f)
    return ib;

  /* Check version. */
  if (fread(vbuf, 1, sizeof(vbuf), f) != sizeof(vbuf))
    {
      if (feof(f))
    errno = EINVAL;
      goto errout;
    }
  if (strncmp(vbuf, IBEX_VERSION, sizeof(vbuf) != 0))
    {
      errno = EINVAL;
      goto errout;
    }

  /* Read list of files. */
  nfiles = read_number(f);
  ibfs = g_malloc(nfiles * sizeof(ibex_file *));
  lastword = NULL;
  for (i = 0; i < nfiles; i++)
    {
      ibfs[i] = g_malloc(sizeof(ibex_file));
      ibfs[i]->name = get_compressed_word(f, &lastword);
      if (!ibfs[i]->name)
    goto errout;
      ibfs[i]->index = 0;
      g_tree_insert(ib->files, ibfs[i]->name, ibfs[i]);
    }

  /* Read list of words. */
  nwords = read_number(f);
  lastword = NULL;
  for (i = 0; i < nwords; i++)
    {
      word = get_compressed_word(f, &lastword);
      if (!word)
    goto errout;

      nrefs = read_number(f);
      refs = g_ptr_array_new();
      g_ptr_array_set_size(refs, nrefs);
      while (nrefs--)
    {
      ref = read_number(f);
      if (ref >= nfiles)
        goto errout;
      refs->pdata[nrefs] = ibfs[ref];
    }

      g_hash_table_insert(ib->words, word, refs);
    }

  g_free(ibfs);
  fclose(f);
  return ib;

errout:
  fclose(f);
  g_tree_traverse(ib->files, free_file, G_IN_ORDER, NULL);
  g_tree_destroy(ib->files);
  g_hash_table_foreach(ib->words, free_word, NULL);
  g_hash_table_destroy(ib->words);
  g_ptr_array_free(ib->oldfiles, TRUE);
  if (ibfs)
    g_free(ibfs);
  g_free(ib->path);
  g_free(ib);

  return NULL;
}

struct ibex_write_data {
  unsigned long index;
  FILE *f;
  char *lastname;
};

static int
get_prefix(struct ibex_write_data *iwd, char *name)
{
  int i = 0;
  if (iwd->lastname)
    {
      while (!strncmp(iwd->lastname, name, i + 1))
    i++;
    }
  iwd->lastname = name;
  return i;
}

static gint
write_file(gpointer key, gpointer value, gpointer data)
{
  char *file = key;
  ibex_file *ibf = value;
  struct ibex_write_data *iwd = data;
  int prefix;

  ibf->index = iwd->index++;
  prefix = get_prefix(iwd, file);
  fprintf(iwd->f, "%c%s", prefix, file + prefix);
  fputc(0, iwd->f);
  return FALSE;
}

static void
store_word(gpointer key, gpointer value, gpointer data)
{
  GTree *wtree = data;

  g_tree_insert(wtree, key, value);
}

static gint
write_word(gpointer key, gpointer value, gpointer data)
{
  char *word = key;
  GPtrArray *refs = value;
  struct ibex_write_data *iwd = data;
  ibex_file *ibf;
  int i, ind, prefix;

  for (i = ind = 0; i < refs->len; i++)
    {
      ibf = g_ptr_array_index(refs, i);
      if (ibf->index == -1)
    {
      g_ptr_array_remove_index_fast(refs, i);
      i--;
    }
      else
    ind++;
    }

  if (ind != 0)
    {
      prefix = get_prefix(iwd, word);
      fprintf(iwd->f, "%c%s", prefix, word + prefix);
      fputc(0, iwd->f);

      write_number(iwd->f, ind);

      for (i = 0; i < refs->len; i++)
    {
      ibf = g_ptr_array_index(refs, i);
      write_number(iwd->f, ibf->index);
    }
    }
  return FALSE;
}      

int
ibex_write(ibex *ib)
{
  struct ibex_write_data iwd;
  GTree *wtree;
  char *tmpfile;

  tmpfile = g_strdup_printf("%s~", ib->path);
  iwd.f = fopen(tmpfile, "w");
  if (!iwd.f)
    {
      if (errno == 0)
    errno = ENOMEM;
      g_free(tmpfile);
      return -1;
    }

  fputs(IBEX_VERSION, iwd.f);
  if (ferror(iwd.f))
    goto lose;

  iwd.index = 0;
  iwd.lastname = NULL;
  write_number(iwd.f, g_tree_nnodes(ib->files));
  if (ferror(iwd.f))
    goto lose;
  g_tree_traverse(ib->files, write_file, G_IN_ORDER, &iwd);
  if (ferror(iwd.f))
    goto lose;

  iwd.lastname = NULL;
  write_number(iwd.f, g_hash_table_size(ib->words));
  if (ferror(iwd.f))
    goto lose;
  wtree = g_tree_new(strcmp);
  g_hash_table_foreach(ib->words, store_word, wtree);
  g_tree_traverse(wtree, write_word, G_IN_ORDER, &iwd);
  g_tree_destroy(wtree);
  if (ferror(iwd.f))
    goto lose;

  if (fclose(iwd.f) == 0 && rename(tmpfile, ib->path) == 0)
    {
      g_free(tmpfile);
      ib->dirty = FALSE;
      return 0;
    }

lose:
  unlink(tmpfile);
  g_free(tmpfile);
  return -1;
}

int
ibex_close(ibex *ib)
{
  ibex_file *ibf;

  if (ib->dirty && ibex_write(ib) == -1)
    return -1;

  g_tree_traverse(ib->files, free_file, G_IN_ORDER, NULL);
  g_tree_destroy(ib->files);
  g_hash_table_foreach(ib->words, free_word, NULL);
  g_hash_table_destroy(ib->words);

  while (ib->oldfiles->len)
    {
      ibf = g_ptr_array_remove_index(ib->oldfiles, 0);
      g_free(ibf->name);
      g_free(ibf);
    }
  g_ptr_array_free(ib->oldfiles, TRUE);
  g_free(ib->path);
  g_free(ib);

  return 0;
}

static gint
free_file(gpointer key, gpointer value, gpointer data)
{
  ibex_file *ibf = value;

  g_free(ibf->name);
  g_free(ibf);
  return FALSE;
}

static void
free_word(gpointer key, gpointer value, gpointer data)
{
  g_free(key);
  g_ptr_array_free(value, TRUE);
}

static char *
get_compressed_word(FILE *f, char **lastword)
{
  char *buf, *p;
  int c, size;

  c = getc(f);
  if (c == EOF)
    return NULL;

  size = c + 10;
  buf = g_malloc(size);
  if (*lastword)
    strncpy(buf, *lastword, c);
  p = buf + c;
  do
    {
      c = getc(f);
      if (c == EOF)
    return NULL;
      if (p == buf + size)
    {
      buf = g_realloc(buf, size + 10);
      p = buf + size;
      size += 10;
    }
      *p++ = c;
    }
  while (c != 0);

  *lastword = buf;
  return buf;
}

static void
write_number(FILE *f, unsigned long number)
{
  int i, flag = 0;
  char buf[4];

  i = 4;
  do
    {
      buf[--i] = (number & 0x7F) | flag;
      number = number >> 7;
      flag = 0x80;
    }
  while (number != 0);

  fwrite(buf + i, 1, 4 - i, f);
}

static unsigned long
read_number(FILE *f)
{
  int byte;
  unsigned long num;

  num = 0;
  do
    {
      byte = getc(f);
      num = num << 7 | (byte & 0x7F);
    }
  while (byte & 0x80);
  
  return num;
}