/* docvec.c - responsible for converting chunks of text (from files or buffers)
   to vectors.  See vector.doc.  */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <savant.h>
#include <savutil.h>

/* Head of the global index list ("GIVL"): a doubly linked list of IV_List
   cells, each holding a wordcode and the index assigned to that word.
   merge_with_global() inserts new cells at the position wordcode_cmp()
   dictates, so the list is walked as an ordered list throughout this file. */
IV_List *Global_IV_List;
/* The next index to hand out.  merge_with_global() post-increments it every
   time it creates a new IV_List cell; flatten_ivlist() uses it as the number
   of cells in the list. */
int Global_Num_Indices;

/* init_docvecs - put the global index state back to "nothing indexed yet":
   no cells in the global list and no indices handed out.  Does not free
   any cells the list may currently hold.  Always returns 0. */
int init_docvecs(void)
{
  Global_Num_Indices = 0;
  Global_IV_List = NULL;

  return 0;
}

/* vectorize_file - read one window of text from FILE and vectorize it.

   Starting at byte offset LOC, reads at most NUM_LINES lines (as delivered
   by fgets, i.e. in pieces of at most 1023 bytes) or LEN bytes, whichever
   limit is hit first, and hands the collected text to vectorize_buffer().

   NEXT_WINDOW may be NULL.  Otherwise it receives:
     - LOC plus the number of bytes read so far, at the moment the line with
       index NUM_LINES/2 is about to be read;
     - (size_t)-1 if fgets reports EOF/error before the window is complete,
       or if the text buffer cannot be allocated;
     - nothing (left untouched) if the LEN limit ends the window before the
       halfway line is reached, or if LEN is not positive.

   Returns whatever vectorize_buffer() returns for the collected text, or
   NULL when LEN <= 0 or memory is exhausted.  */
DV_Tree *vectorize_file(FILE *file,
                        size_t loc,
                        ssize_t len,
                        int num_lines,
                        size_t *next_window,
                        enum Field_Types field_type)
{
  char *buffer, line[1024];
  size_t limit;        /* LEN as an unsigned byte budget */
  size_t bytes_read;   /* total bytes fgets has delivered so far */
  size_t used;         /* bytes currently stored in buffer */
  size_t line_len, keep;
  int newline_count;
  DV_Tree *dv;

  if(!(len > 0)) {
    return(NULL);
  }
  limit = (size_t)len;

  buffer = (char *)malloc(limit + 1);
  if(buffer == NULL) {
    /* Out of memory: report it like EOF so a caller stepping through
       windows does not act on a stale offset. */
    if(next_window != NULL) {
      *next_window = (size_t)-1;
    }
    return(NULL);
  }
  buffer[0] = '\0';
  used = bytes_read = 0;
  fseek(file,loc,SEEK_SET);

  for(newline_count=0; newline_count<num_lines; newline_count++) {
    /* first, if this is the halfway point and the next window offset
       is desired by the caller (next_window != NULL), then set it */
    if((newline_count == num_lines/2) && (next_window != NULL)) {
      *next_window = loc + bytes_read;
    }

    /* now get the next line, and quit if we hit EOF or the len upper limit */
    if (fgets(line, sizeof(line), file) == NULL) { /*EOF*/
      if (SavantDebug) {
        fprintf(stderr, "docvec.c:vectorize_file():  EOF during fgets\n");
      }
      if(next_window != NULL) {
        *next_window = (size_t)-1;
      }
      break;
    }

    line_len = strlen(line);
    bytes_read += line_len;

    /* Keep the whole line, or only the part that still fits in LEN bytes.
       Everything read before this line was stored in full, so used plus
       keep never exceeds limit. */
    keep = line_len;
    if(bytes_read > limit) {
      keep = line_len - (bytes_read - limit);
    }
    /* Append at the known end instead of letting strcat rescan the
       buffer from the start for every line. */
    memcpy(buffer + used, line, keep);
    used += keep;
    buffer[used] = '\0';

    if(bytes_read > limit) {
      break;
    }
  }

  dv = vectorize_buffer(buffer, field_type);
  free(buffer);
  return(dv);
}

/* Decide whether character C belongs to a word.  With LETTERS_ONLY set
   only alphabetic characters count; otherwise anything that is neither
   whitespace nor the terminating NUL counts.  C must already be an
   unsigned char value, as the <ctype.h> functions require. */
static int docvec_is_word_char(int c, int letters_only)
{
  if (c == '\0') {
    return(0);
  }
  return(letters_only ? (isalpha(c) != 0) : (isspace(c) == 0));
}

/* vectorize_buffer - turn the NUL-terminated text in BUFFER into a DV_Tree
   of wordcodes and occurrence counts.

   DATE_FIELD, TIME_FIELD and DAY_FIELD: a non-empty buffer is encoded as
   one single token; the result is a one-node tree, or NULL if the buffer
   is empty or encode_word() returns -1.

   BODY_FIELD: words are runs of alphabetic characters.
   Any other field type: words are runs of non-whitespace characters.

   Words are lower-cased and cut to their first 15 characters.  Words that
   is_common() accepts, and words Stem() reduces to the empty string, are
   skipped.  BUFFER is only read; it still belongs to the caller.

   Returns the tree (NULL if nothing was added).  The caller releases it
   with destroy_dvtree().  */
DV_Tree *vectorize_buffer(char *buffer,
                          enum Field_Types field_type)
{
  /* NOTE(review): kept static as in the original, which makes this
     function non-reentrant; confirm before changing. */
  static char word[16];
  char *bufptr;
  unsigned int code[WORD_ENCODE_WIDTH];
  DV_Tree *tree = NULL;
  int letters_only;
  size_t i;

  if ((field_type == DATE_FIELD) ||
      (field_type == TIME_FIELD) ||
      (field_type == DAY_FIELD)) {
    /* Dates & Times are stored as a single token rather than a vector
       of tokens.  An empty buffer yields no token at all. */
    if ((*buffer != '\0') &&
        (encode_word(buffer, code, field_type) != -1)) {
      dvtree_increment(&tree, code);
    }
    return(tree);
  }

  /* Bodies only allow letters in words; the other fields also allow
     digits & punctuation. */
  letters_only = (field_type == BODY_FIELD);

  bufptr = buffer;
  while(*bufptr != '\0') { /* quit at the end of the buffer */
    /* Skip separators.  The unsigned char casts keep the <ctype.h> calls
       defined for bytes with the high bit set. */
    while((*bufptr != '\0') &&
          !docvec_is_word_char((unsigned char)*bufptr, letters_only)) {
      bufptr++;
    }

    /* Copy the first 15 characters of the word, consume the rest. */
    i = 0;
    while(docvec_is_word_char((unsigned char)*bufptr, letters_only)) {
      if (i < sizeof(word) - 1) {
        word[i++] = (char)tolower((unsigned char)*bufptr);
      }
      bufptr++;
    }
    word[i] = '\0';

    if(!is_common(word)) { /* skip "stop" words */
      Stem(word);
      if(*word != '\0') { /* Stem may stem word to nothing */
        /* Same failure check as the date branch: never count a code
           that encode_word() did not produce. */
        if (encode_word(word, code, field_type) != -1) {
          dvtree_increment(&tree, code);
        }
      }
    }
  }

  return(tree);
}

/* dvtree_increment - count one more occurrence of CODE in the binary
   search tree rooted at *TREE (ordered by wordcode_cmp()).

   If CODE is already present its weight is incremented and the weight it
   had BEFORE the increment is returned.  If it is absent a new node with
   weight 1 is linked in (possibly as the new root, through *TREE) and 1 is
   returned.  Returns -1, leaving the tree untouched, if the new node
   cannot be allocated.

   NOTE(review): "1" is therefore returned for both the first and the
   second occurrence of a word.  Kept as is because callers outside this
   file may depend on it; no caller in this file uses the return value.  */
int dvtree_increment(DV_Tree **tree,
                     Wordcode code)
{
  DV_Tree *node;
  int cmp;
  int i;

  /* Walk down to the matching node, or to the empty link where it belongs. */
  while(*tree != NULL) {
    cmp = wordcode_cmp(code, (*tree)->wordcode);
    if(cmp == 0) {
      return((*tree)->weight++);
    }
    tree = (cmp < 0) ? &((*tree)->left) : &((*tree)->right);
  }

  /* create new node */
  node = (DV_Tree *)malloc(sizeof(DV_Tree));
  if(node == NULL) {
    return(-1);
  }
  for (i=0; i<WORD_ENCODE_WIDTH; i++) {
    node->wordcode[i] = code[i];
  }
  node->weight = 1;
  node->left = node->right = NULL;
  *tree = node;
  return(1);
}

/* Add the word held in the single node SRC to the tree rooted at ROOT:
   add SRC's weight to the matching node, or link in a copy of SRC if the
   word is absent.  Returns the (possibly new) root.  If a needed node
   cannot be allocated the word is dropped and ROOT is returned unchanged. */
static DV_Tree *dvtree_absorb_node(DV_Tree *root, DV_Tree *src)
{
  DV_Tree **slot = &root;
  DV_Tree *node;
  int cmp;
  int i;

  while(*slot != NULL) {
    cmp = wordcode_cmp((*slot)->wordcode, src->wordcode);
    if(cmp == 0) { /* just add the weight */
      (*slot)->weight += src->weight;
      return(root);
    }
    slot = (cmp < 0) ? &((*slot)->right) : &((*slot)->left);
  }

  node = (DV_Tree *)malloc(sizeof(DV_Tree));
  if(node == NULL) {
    return(root);
  }
  for(i=0; i<WORD_ENCODE_WIDTH; i++) {
    node->wordcode[i] = src->wordcode[i];
  }
  node->weight = src->weight;
  node->left = node->right = NULL;
  *slot = node;
  return(root);
}

/* merge_dvtrees - fold every word of TREE2 into TREE1, summing the weights
   of words both trees contain.

   TREE2 is only read.  TREE1 is modified in place and returned.  When
   TREE2 is NULL, TREE1 is returned unchanged.  When TREE1 is NULL the
   result is a newly allocated copy of TREE2, which the caller owns.
   Always use the return value as the merged tree.

   TREE2 is visited parent-first so that merging into an empty or small
   tree reproduces TREE2's shape instead of building a degenerate chain
   out of its sorted words.  */
DV_Tree *merge_dvtrees(DV_Tree *tree1,
                       DV_Tree *tree2)
{
  if (tree2 == NULL) {
    return(tree1);
  }

  tree1 = dvtree_absorb_node(tree1, tree2);
  tree1 = merge_dvtrees(tree1, tree2->left);
  tree1 = merge_dvtrees(tree1, tree2->right);

  return(tree1);
}


DocVec *merge_with_global(DV_Tree *tree)
{
  /* this guy performs an in-order traversal of the tree, flattening the docvec
     according to GIVL, as well as adding cells to GIVL as appropriate.  Tricky.. */
  DV_List *dv_list_root, *dv_scan, *new_dvcell;
  IV_List *iv_scan, *new_ivcell;
  int cmp, high_index, i;
  DocVec *flat_docvec;

  /* initialize these */
  dv_list_root = dvtree_to_dvlist(tree);
  dv_scan = dv_list_root;
  iv_scan = Global_IV_List;
  flat_docvec = (DocVec *)malloc(sizeof(DocVec));
  flat_docvec->num_entries = 0;
  high_index = 0;

  /* This while loop merges the DV_List and IV_List (as far as wordcodes),
     up to the end of one of them, whichever end comes first.  We want to keep
     track of the highest index we encounter in iv, for later use.  */
  while((dv_scan != NULL) && (iv_scan != NULL)) {
    flat_docvec->num_entries++;
    cmp = wordcode_cmp(dv_scan->wordcode, iv_scan->wordcode);
    if(cmp < 0) { /* dv contains a word the iv doesn't */
      /* create a new cell in the iv list for this word */
      new_ivcell = (IV_List *)malloc(sizeof(IV_List));
      for(i=0; i<WORD_ENCODE_WIDTH; i++) {
	new_ivcell->wordcode[i] = dv_scan->wordcode[i];
      }
      high_index = new_ivcell->index = Global_Num_Indices++;
      if (iv_scan == Global_IV_List) { 
	/* make GIVL new_ivcell, stick new_ivcell before iv_scan */
	Global_IV_List = new_ivcell;
	new_ivcell->prev = NULL;
	new_ivcell->next = iv_scan;
	iv_scan->prev = new_ivcell;  
      }
      else { /* insert new_ivcell between iv_scan and iv_scan->prev */
	new_ivcell->prev = iv_scan->prev;
	new_ivcell->next = iv_scan;
	(iv_scan->prev)->next = new_ivcell;
	iv_scan->prev = new_ivcell;
      }
      dv_scan = dv_scan->next;
    }
    else if (cmp > 0) { /* iv contains a word dv doesn't */
      /* create a new cell in the dv list for this word */
      new_dvcell = (DV_List *)malloc(sizeof(DV_List));
      for(i=0; i<WORD_ENCODE_WIDTH; i++) {
	new_dvcell->wordcode[i] = iv_scan->wordcode[i];
      }
      new_dvcell->weight = 0;
      if (dv_scan == dv_list_root) {
	/* make dv_list_root new_dvcell, stick new_dvcell before dv_scan */
	dv_list_root = new_dvcell;
	new_dvcell->prev = NULL;
	new_dvcell->next = dv_scan;
	dv_scan->prev = new_dvcell;  
      }
      else { /* insert new_dvcell between dv_scan and dv_scan->prev */
	new_dvcell->prev = dv_scan->prev;
	new_dvcell->next = dv_scan;
	(dv_scan->prev)->next = new_dvcell;
	dv_scan->prev = new_dvcell;
      }
      if (iv_scan->index > high_index) {
	high_index = iv_scan->index;
      }
      iv_scan = iv_scan->next;
    }
    else { /* they share this word, no merge to do */
      dv_scan = dv_scan->next;
      if (iv_scan->index > high_index) {
	high_index = iv_scan->index;
      }
      iv_scan = iv_scan->next;
    }
  }
  /* We just dealt with flat_docvec->num_entries cells in each linked list,
     either by inserting a cell into one while skipping over the corresponding 
     cell of the other, or by skipping over the cell in both if they each
     had it.  Now there may be one of two things to do: either the dv list
     still has entries, in which case we should insert new indices for each
     of them into the iv list, or the iv list still has entries, in which
     case we should go through the dv list inserting 0 weights for words in
     the iv list whose index is less than high_index. */
     
  if (dv_scan != NULL) { 
    if (Global_IV_List == NULL) {
      /* we have to make this first cell "by hand" as it were */
      flat_docvec->num_entries++;
      iv_scan = Global_IV_List = (IV_List *)malloc(sizeof(IV_List));
      for(i=0; i<WORD_ENCODE_WIDTH; i++) {
	iv_scan->wordcode[i] = dv_scan->wordcode[i];
      }
      iv_scan->index = Global_Num_Indices++; 
      iv_scan->prev = iv_scan->next = NULL;
      dv_scan = dv_scan->next;
    }
    else { /* If there is a GIVL, we can just do this: */
      for(iv_scan = Global_IV_List; iv_scan->next != NULL; iv_scan = iv_scan->next) {
	/* do nothing; just get to the end of the list */
      }
    }
    while(dv_scan != NULL) {
      /* tack on a new cell at the end with a new index */
      flat_docvec->num_entries++;
      new_ivcell = (IV_List *)malloc(sizeof(IV_List));
      for(i=0; i<WORD_ENCODE_WIDTH; i++) {
	new_ivcell->wordcode[i] = dv_scan->wordcode[i];
      }
      new_ivcell->index = Global_Num_Indices++;
      new_ivcell->prev = iv_scan;
      new_ivcell->next = NULL;
      iv_scan->next = new_ivcell;
      iv_scan = iv_scan->next;
      dv_scan = dv_scan->next;
    }
  }
  else if (iv_scan != NULL) {
    for(dv_scan = dv_list_root; dv_scan->next != NULL; dv_scan = dv_scan->next) {
      /* do nothing; just get to the end of the list */
    }
    while(iv_scan != NULL) {
      if(iv_scan->index < high_index) {
	/* then tack a cell onto the dv list with a 0 weight */
	flat_docvec->num_entries++;
	new_dvcell = (DV_List *)malloc(sizeof(DV_List));
	for(i=0; i<WORD_ENCODE_WIDTH; i++) {
	  new_dvcell->wordcode[i] = iv_scan->wordcode[i];
	}
	new_dvcell->weight = 0;
	new_dvcell->prev = dv_scan;
	new_dvcell->next = NULL;
	dv_scan->next = new_dvcell;
	dv_scan = dv_scan->next;
	iv_scan = iv_scan->next;
      }
    }
  }

  /* Now just flatten out dv_list_root... */
  flat_docvec->weights = (unsigned int *)malloc(flat_docvec->num_entries*sizeof(unsigned int));
  dv_scan = dv_list_root;
  iv_scan = Global_IV_List;
  while(dv_scan != NULL) {
    flat_docvec->weights[iv_scan->index] = dv_scan->weight;
    dv_scan = dv_scan->next;
    iv_scan = iv_scan->next;
  }

  destroy_dvlist(dv_list_root);
  return(flat_docvec);
      
}

/* dvtree_to_dv - flatten TREE against an already flattened index vector IV,
   without touching the global index list.

   The result has IV->num_entries weights.  For each entry i of IV,
   weights[IV->indices[i]] is the weight TREE holds for that entry's
   wordcode, or 0 if TREE does not contain it.  Words of TREE that IV does
   not list are ignored.  Both the list made from TREE and IV->wordcodes
   are walked as sequences ordered by wordcode_cmp().

   Returns NULL if memory runs out.  The caller releases the result with
   destroy_dv(); TREE and IV are not modified.  */
DocVec *dvtree_to_dv(DV_Tree *tree,
                     IndexVec *iv)
{
  int i, cmp;
  unsigned int weight;
  DV_List *dv_list_root, *dv_scan;
  DocVec *dv;

  dv_scan = dv_list_root = dvtree_to_dvlist(tree);

  dv = (DocVec *)malloc(sizeof(DocVec));
  if (dv == NULL) {
    destroy_dvlist(dv_list_root);
    return(NULL);
  }
  dv->num_entries = iv->num_entries;
  dv->weights = (unsigned int *)malloc(iv->num_entries*sizeof(unsigned int));
  if ((dv->weights == NULL) && (iv->num_entries > 0)) {
    free(dv);
    destroy_dvlist(dv_list_root);
    return(NULL);
  }

  for (i=0; i<iv->num_entries; i++) {
    weight = 0;
    cmp = 1;

    /* Drop document words that sort before this index entry: they are not
       in iv.  This must not consume the index entry itself, or a match
       with the next document word would be missed. */
    while(dv_scan != NULL) {
      cmp = wordcode_cmp(dv_scan->wordcode, &(iv->wordcodes[WORD_ENCODE_WIDTH*i]));
      if (cmp >= 0) {
        break;
      }
      dv_scan = dv_scan->next;
    }

    if ((dv_scan != NULL) && (cmp == 0)) { /* match; fill in the weight */
      weight = dv_scan->weight;
      dv_scan = dv_scan->next;
    }
    /* otherwise the document word, if any, may still be coming up */

    dv->weights[iv->indices[i]] = weight;
  }

  destroy_dvlist(dv_list_root);
  return(dv);
}

/* Append the words of the subtree NODE, in order, to the doubly linked
   list described by *HEAD / *TAIL (both NULL for an empty list).  Keeping
   the tail avoids re-walking the list for every node.  Returns 0 on
   success, -1 if a cell could not be allocated; the list built so far
   then remains valid and is still owned by the caller. */
static int dvlist_append_inorder(DV_Tree *node, DV_List **head, DV_List **tail)
{
  DV_List *cell;
  int i;

  /* Recurse only into left subtrees; right subtrees are handled by
     continuing the loop. */
  while(node != NULL) {
    if (dvlist_append_inorder(node->left, head, tail) != 0) {
      return(-1);
    }

    cell = (DV_List *)malloc(sizeof(DV_List));
    if (cell == NULL) {
      return(-1);
    }
    for(i=0; i<WORD_ENCODE_WIDTH; i++) {
      cell->wordcode[i] = node->wordcode[i];
    }
    cell->weight = node->weight;
    cell->next = NULL;
    cell->prev = *tail;
    if (*tail == NULL) {
      *head = cell;
    }
    else {
      (*tail)->next = cell;
    }
    *tail = cell;

    node = node->right;
  }
  return(0);
}

/* dvtree_to_dvlist - copy TREE into a doubly linked DV_List by an in-order
   traversal, so the cells appear in the tree's sort order.  The first
   cell's prev and the last cell's next are NULL.

   Returns NULL for an empty tree, and also when memory runs out (any
   partial list is freed first).  The caller releases the list with
   destroy_dvlist(); TREE is not modified.  */
DV_List *dvtree_to_dvlist(DV_Tree *tree)
{
  DV_List *head = NULL, *tail = NULL;

  if (dvlist_append_inorder(tree, &head, &tail) != 0) {
    destroy_dvlist(head);
    return(NULL);
  }
  return(head);
}

/* dvtree_to_ddv - flatten DVTREE into a DenseDocVec: two parallel arrays
   holding, in the tree's sort order, each word's wordcode
   (WORD_ENCODE_WIDTH unsigned ints per word) and its weight.

   Returns NULL if memory runs out.  The caller releases the result with
   destroy_ddv(); DVTREE is not modified.  */
DenseDocVec *dvtree_to_ddv(DV_Tree *dvtree)
{
  int i,j;
  DV_List *dvlist, *ptr;
  DenseDocVec *ddv;

  dvlist = dvtree_to_dvlist(dvtree);
  ddv = (DenseDocVec *)malloc(sizeof(DenseDocVec));
  if (ddv == NULL) {
    destroy_dvlist(dvlist);
    return(NULL);
  }

  ddv->num_entries = 0;
  for (ptr=dvlist; ptr!=NULL; ptr=ptr->next) {
    ddv->num_entries++;
  }

  ddv->wordcodes = (unsigned int *)malloc(WORD_ENCODE_WIDTH*ddv->num_entries*sizeof(unsigned int));
  ddv->weights = (unsigned int *)malloc(ddv->num_entries*sizeof(unsigned int));
  /* malloc(0) may legitimately return NULL, so only a non-empty vector
     makes a NULL array an error. */
  if ((ddv->num_entries > 0) &&
      ((ddv->wordcodes == NULL) || (ddv->weights == NULL))) {
    free(ddv->wordcodes);
    free(ddv->weights);
    free(ddv);
    destroy_dvlist(dvlist);
    return(NULL);
  }

  for(i=0, ptr=dvlist; ptr!=NULL; ptr=ptr->next, i++) {
    for(j=0; j<WORD_ENCODE_WIDTH; j++) {
      ddv->wordcodes[WORD_ENCODE_WIDTH*i+j] = ptr->wordcode[j];
    }
    ddv->weights[i] = ptr->weight;
  }

  destroy_dvlist(dvlist);
  return(ddv);
}
  
  

/* flatten_ivlist - snapshot the global index list as an IndexVec.

   The result has Global_Num_Indices entries; entry i holds the wordcode
   (WORD_ENCODE_WIDTH unsigned ints at wordcodes[WORD_ENCODE_WIDTH*i]) and
   the index of the i-th cell of Global_IV_List, in list order.  With no
   indices both arrays are NULL.

   Assumes Global_IV_List really has Global_Num_Indices cells, as the
   original did.  Returns NULL if memory runs out.  The caller owns the
   result and its two arrays.  */
IndexVec *flatten_ivlist(void)
{
  /* here we want to just take GIVL and turn it into an IndexVec */
  int i,j;
  IV_List *list;
  IndexVec *iv = (IndexVec *)malloc(sizeof(IndexVec));

  if (iv == NULL) {
    return(NULL);
  }

  iv->num_entries = Global_Num_Indices;
  if (Global_Num_Indices == 0) {
    iv->wordcodes = NULL;
    iv->indices = NULL;
  }
  else {
    iv->wordcodes = (unsigned int *)malloc(WORD_ENCODE_WIDTH*Global_Num_Indices*sizeof(unsigned int));
    iv->indices = (int *)malloc(Global_Num_Indices*sizeof(int));
    if ((iv->wordcodes == NULL) || (iv->indices == NULL)) {
      free(iv->wordcodes);
      free(iv->indices);
      free(iv);
      return(NULL);
    }
  }

  for(i=0, list=Global_IV_List; i<Global_Num_Indices; i++) {
    for(j=0; j<WORD_ENCODE_WIDTH; j++) {
      iv->wordcodes[WORD_ENCODE_WIDTH*i+j] = list->wordcode[j];
    }
    iv->indices[i] = list->index;
    list = list->next;
  }

  return(iv);
}

/* print_docvec - dump VECTOR to STREAM: a "*** Vector ***" banner, then
   one line per entry made of a word column, a tab and the weight.  The
   word column is always empty for the time being (a flat DocVec carries
   no wordcodes to decode).  STREAM is flushed after every line.
   Always returns 0.  */
int print_docvec(FILE * stream,
                 DocVec *vector)
{
  const char *label = "";   /* placeholder until words can be looked up */
  int entry = 0;

  fprintf(stream, "*** Vector ***\n");
  fflush(stream);

  while (entry < vector->num_entries) {
    fprintf(stream, "%s\t%d\n", label, vector->weights[entry]);
    fflush(stream);
    entry++;
  }

  return(0);
}

/* print_dvtree - print TREE to STREAM in sorted (in-order) sequence.  Each
   node gives one line: its wordcode as WORD_ENCODE_WIDTH 8-digit hex
   groups, the decode_word() text in parentheses, and its weight.  STREAM
   is flushed after every line.  Returns 0 for an empty tree, 1 otherwise. */
int print_dvtree(FILE *stream,
                 DV_Tree *tree)
{
  char text[60];
  DV_Tree *node;
  int k;
  int printed = (tree != NULL) ? 1 : 0;

  /* Left subtrees are handled by recursion, right subtrees by moving on. */
  for (node = tree; node != NULL; node = node->right) {
    print_dvtree(stream, node->left);

    decode_word(node->wordcode, text);
    k = 0;
    while (k < WORD_ENCODE_WIDTH) {
      fprintf(stream, "%08x", node->wordcode[k]);
      k++;
    }
    fprintf(stream, " (%s): %d\n", text, node->weight);
    fflush(stream);
  }

  return(printed);
}

int print_givl(FILE *stream)
{
  return(print_ivlist(stream, Global_IV_List));

}

/* print_ivlist - print every cell of the index list IVL to STREAM, one line
   each: the wordcode as WORD_ENCODE_WIDTH 8-digit hex groups, the
   decode_word() text in parentheses, and the cell's index.  STREAM is
   flushed after every line.  Always returns 0.  */
int print_ivlist(FILE *stream,
                 IV_List *ivl)
{
  char text[60];
  IV_List *cell;
  int k;

  for (cell = ivl; cell != NULL; cell = cell->next) {
    decode_word(cell->wordcode, text);
    k = 0;
    while (k < WORD_ENCODE_WIDTH) {
      fprintf(stream, "%08x", cell->wordcode[k]);
      k++;
    }
    fprintf(stream, " (%s): index %d\n", text, cell->index);
    fflush(stream);
  }

  return(0);
}

/* print_dvlist - print every cell of the document-vector list DVL to
   STREAM, one line each: the wordcode as WORD_ENCODE_WIDTH 8-digit hex
   groups, the decode_word() text in parentheses, and the cell's weight.
   STREAM is flushed after every line.  Always returns 0.  */
int print_dvlist(FILE *stream,
                 DV_List *dvl)
{
  char text[60];
  DV_List *cell;
  int k;

  for (cell = dvl; cell != NULL; cell = cell->next) {
    decode_word(cell->wordcode, text);
    k = 0;
    while (k < WORD_ENCODE_WIDTH) {
      fprintf(stream, "%08x", cell->wordcode[k]);
      k++;
    }
    fprintf(stream, " (%s): weight %d\n", text, cell->weight);
    fflush(stream);
  }

  return(0);
}

/* destroy_dv - free a DocVec and its weights array.  A NULL VEC is
   accepted and ignored, matching destroy_dvtree() and destroy_dvlist().
   Always returns 0.  */
int destroy_dv(DocVec *vec)
{
  if (vec == NULL) {
    return(0);
  }

  free(vec->weights);
  free(vec);

  return(0);
}

/* destroy_ddv - free a DenseDocVec together with its weights and wordcodes
   arrays.  A NULL VEC is accepted and ignored, matching destroy_dvtree()
   and destroy_dvlist().  Always returns 0.  */
int destroy_ddv(DenseDocVec *vec)
{
  if (vec == NULL) {
    return(0);
  }

  free(vec->weights);
  free(vec->wordcodes);
  free(vec);

  return(0);
}

/* destroy_dvlist - free every cell of DVLIST, following the next links
   from the cell given.  A NULL list is fine.  Always returns 0.  */
int destroy_dvlist(DV_List *dvlist)
{
  DV_List *cell, *following;

  for (cell = dvlist; cell != NULL; cell = following) {
    following = cell->next;   /* read the link before the cell goes away */
    free(cell);
  }

  return(0);
}

/* destroy_dvtree - free every node of DVTREE.  A NULL tree is fine.
   Always returns 0.  */
int destroy_dvtree(DV_Tree *dvtree)
{
  DV_Tree *right_child;

  /* Left subtrees go by recursion; the right subtree is handled by
     continuing the loop once the node itself has been released. */
  while (dvtree != NULL) {
    right_child = dvtree->right;
    destroy_dvtree(dvtree->left);
    free(dvtree);
    dvtree = right_child;
  }

  return(0);
}


