/* Build a dictionary for Joggle lookup */

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>

/* Tool version, printed in the start-up banner by main() */
#define VERSION "v0.1"

/* Smaller of two values.  NOTE: an argument may be evaluated twice, so do
   not pass expressions with side effects. */
#define MIN(a,b) ((a)>(b)?(b):(a))
                                         
/* Markup that add_words_opted() searches for immediately before and
   immediately after each word in an OPTED file */
#define OPTED_PREWORD  "<P><B>"
#define OPTED_POSTWORD "</B>"

/* Shortest standardised word that add_word() keeps; gen_dictionary() also
   uses it as the number of leading letters that form a word's stem */
#define DCT_MIN_WORD_LENGTH 3

/* Word list shared by the whole tool: add_word() inserts into it and keeps
   it sorted, gen_dictionary() reads it. */
unsigned char **all_words = 0;  /* array of heap-allocated standardised words */
int allocated_words = 0;        /* number of pointer slots allocated in all_words */
int total_words = 0;            /* number of slots currently holding a word */



/* Input file formats that add_words() can be asked to read. */
typedef enum {
  FT_OPTED = 0   /* OPTED markup, read by add_words_opted() */
} dict_type;
                                                         
/* Read the words from one dictionary file of the given type into the
   global word list; returns how many new words were added. */
int add_words(char *filename, dict_type filetype);
/* Write the global word list to the file "words.dct". */
void gen_dictionary(void);
/* Debug aid: print the character replacement table. */
void dump_standardised(void);
                                              
/* --------------------------------------------- main */
/* Entry point.  Every command line argument is treated as the name of an
 * OPTED dictionary file whose words are merged into the global word list;
 * after the last file the dictionary is generated.  Always returns 0.
 */
int main(int argc, char **argv)
{
  int arg;

  printf("Joggle Dictionary Builder "VERSION"\n");

  /* Debug aid: call dump_standardised() here to inspect the table */

  for (arg = 1; arg < argc; arg++)
  {
    char *path = argv[arg];
    int   added;

    printf("Processing '%s'\n", path);

    added = add_words(path, FT_OPTED);

    printf("total_words=%d new_words=%d\n", total_words, added);
  }

  gen_dictionary();

  return 0;
}

/* --------------------------------------------- write_stem_record */
/* Write one stem's record to fp and release its suffix buckets.
 *
 * stem_list[len] holds every suffix of length len for this stem,
 * concatenated without separators (or is null when there are none).
 * For each len in 0..max_len the record contains an unsigned short count
 * followed by the concatenated suffixes; for len 0 the count is 1 when the
 * bare stem is itself a word.  The record ends with the marker 65535.
 *
 * Returns the number of bytes the record occupies in the file.
 */
static long write_stem_record(FILE *fp, const unsigned char *stem,
                              unsigned char **stem_list, int max_len)
{
  unsigned short val;
  long           written = 0;
  int            len;

  for (len = 0; len <= max_len; len++)
  {
    size_t bytes = 0;

    val = 0;

    if (stem_list[len])
    {
      bytes = strlen((const char *)stem_list[len]);

      if (!len)
        val = 1;
      else
      {
        /* 65535 is reserved as the end-of-record marker */
        if ((bytes / (size_t)len) > 65534)
          printf("*E* %s broke the less than 65534 rule!\n", (const char *)stem);

        val = (unsigned short)(bytes / (size_t)len);
      }
    }

    fwrite(&val, sizeof val, 1, fp);
    written += (long)sizeof val;

    /* Raw write: the suffix text is data, never a format string */
    if (bytes)
    {
      fwrite(stem_list[len], sizeof(unsigned char), bytes, fp);
      written += (long)bytes;
    }

    free(stem_list[len]);
    stem_list[len] = 0;
  }

  val = 65535;
  fwrite(&val, sizeof val, 1, fp);
  written += (long)sizeof val;

  return written;
}

/* --------------------------------------------- gen_dictionary */
/* Write the sorted global word list (all_words / total_words) to the file
 * "words.dct".
 *
 * File layout, all values written raw in the host's size and byte order:
 *   - 26*26 bytes: 1 where at least one word starts with that letter pair
 *   - 26*26*26 unsigned ints: file offset of the record for each
 *     three-letter stem (left 0 where no word has that stem)
 *   - one record per stem, see write_stem_record()
 *
 * The index arithmetic assumes every word consists of the letters 'A'-'Z'
 * and is at least DCT_MIN_WORD_LENGTH long; shorter words are skipped.
 * Errors are reported on stdout and abandon the build.
 */
void gen_dictionary(void)
{
  unsigned char   cur_stem[DCT_MIN_WORD_LENGTH+1] = "";
  int             cur_word;
  int             max_len        = 0;
  long            cur_stem_start = 0;
  int             stem_slots     = 0;
  int             slot;

  unsigned char **stem_list        = 0;
  unsigned char  *two_char_index   = 0;
  unsigned int   *three_char_index = 0;
  FILE           *fp               = 0;

  two_char_index   = calloc(26*26,    sizeof *two_char_index);
  three_char_index = calloc(26*26*26, sizeof *three_char_index);

  if (!two_char_index || !three_char_index)
  {
    printf("*E* Unable to allocate dictionary indexes\n");
    goto cleanup;
  }

  fp = fopen("words.dct","wb");

  if (!fp)
  {
    printf("*E* Couldn't open words.dct\n");
    goto cleanup;
  }

  /* Reserve room for both indexes; their real contents are written last */
  fwrite(two_char_index,  sizeof *two_char_index,  26*26,   fp);
  fwrite(three_char_index,sizeof *three_char_index,26*26*26,fp);

  cur_stem_start = ftell(fp);

  if (cur_stem_start < 0)
  {
    printf("*E* Couldn't find position in words.dct\n");
    goto cleanup;
  }

  /* One extra pass (word == 0) flushes the record of the final stem */
  for (cur_word = 0; cur_word <= total_words; cur_word++)
  {
    const char    *word = (cur_word < total_words) ? (const char *)all_words[cur_word] : 0;
    const char    *suffix;
    unsigned char *bucket;
    size_t         used;
    int            cur_len;

    if (word && strlen(word) < DCT_MIN_WORD_LENGTH)
      continue;

    /* New stem (or end of list): write out what was collected so far */
    if (!word || strncmp(word, (const char *)cur_stem, DCT_MIN_WORD_LENGTH))
    {
      if (cur_stem[0])
      {
        two_char_index[((cur_stem[0]-'A')*26)+(cur_stem[1]-'A')] = 1;
        three_char_index[((cur_stem[0]-'A')*676)+((cur_stem[1]-'A')*26)+(cur_stem[2]-'A')] = (unsigned int)cur_stem_start;

        printf("STEM: %s %08X max_len=%d\n", (const char *)cur_stem, (unsigned int)cur_stem_start, max_len);

        cur_stem_start += write_stem_record(fp, cur_stem, stem_list, max_len);

        max_len = 0;
      }

      if (!word)
        break;

      memcpy(cur_stem, word, DCT_MIN_WORD_LENGTH);
    }

    suffix  = word + DCT_MIN_WORD_LENGTH;
    cur_len = (int)strlen(suffix);

    if (cur_len > max_len)
      max_len = cur_len;

    /* Make sure there is a bucket slot for this suffix length */
    if (cur_len >= stem_slots)
    {
      int             new_slots = (cur_len + 1 > stem_slots * 2) ? cur_len + 1 : stem_slots * 2;
      unsigned char **grown     = realloc(stem_list, (size_t)new_slots * sizeof *grown);

      if (!grown)
      {
        printf("*E* Unable to allocate %d suffix lists\n", new_slots);
        goto cleanup;
      }

      memset(&grown[stem_slots], 0x00, (size_t)(new_slots - stem_slots) * sizeof *grown);

      stem_list  = grown;
      stem_slots = new_slots;
    }

    /* Append the suffix, terminator included, to the bucket for its length */
    used   = stem_list[cur_len] ? strlen((const char *)stem_list[cur_len]) : 0;
    bucket = realloc(stem_list[cur_len], used + (size_t)cur_len + 1);

    if (!bucket)
    {
      printf("*E* Unable to extend suffix list for %s\n", (const char *)cur_stem);
      goto cleanup;
    }

    memcpy(bucket + used, suffix, (size_t)cur_len + 1);
    stem_list[cur_len] = bucket;
  }

  /* Go back and replace the placeholder indexes with the real ones */
  if (fseek(fp, 0L, SEEK_SET))
    printf("*E* Couldn't rewind words.dct\n");
  else
  {
    fwrite(two_char_index,  sizeof *two_char_index,  26*26,   fp);
    fwrite(three_char_index,sizeof *three_char_index,26*26*26,fp);
  }

  if (ferror(fp))
    printf("*E* Error writing words.dct\n");

cleanup:
  if (fp && fclose(fp))
    printf("*E* Couldn't close words.dct\n");

  for (slot = 0; slot < stem_slots; slot++)
    free(stem_list[slot]);

  free(stem_list);
  free(two_char_index);
  free(three_char_index);
}

/* --------------------------------------------- cmp_word */
/* qsort() comparator for an array of word pointers: both arguments point
 * at array elements, and the strings those elements refer to are ordered
 * with strcmp().
 */
int cmp_word(const void *entry1, const void *entry2)
{
  unsigned char *const *left  = entry1;
  unsigned char *const *right = entry2;

  return strcmp((const char *)*left, (const char *)*right);
}

/* --------------------------------------------- bs_cmp_word */
/* bsearch() comparator for an array of word pointers: entry1 is the key
 * string itself, entry2 points at the array element being tested.  The
 * two strings are ordered with strcmp().
 */
int bs_cmp_word(const void *entry1, const void *entry2)
{
  const char           *key  = entry1;
  unsigned char *const *slot = entry2;

  return strcmp(key, (const char *)*slot);
}

/* --------------------------------------------- standardised */
/* Replacement table used by standardise_word(), indexed by byte value.
 * Each row holds 16 entries, so row N covers bytes 0xN0..0xNF.
 *
 *   "#"    skip this character and carry on with the next one
 *   "\0"   (an empty string) the word ends at this character
 *   other  append this string in place of the character
 *
 * '-' (0x2D) is the only skipped character; 'A'-'Z' and 'a'-'z' both map
 * to upper case letters.
 * NOTE(review): the entries at 0x80-0x9B, 0xA0-0xA5 and 0xE1 look like the
 * accented letters of IBM code page 437 folded to plain letters -- confirm
 * the encoding of the input files.
 */

unsigned char *standardised[256] =
{
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",    "#",   "\0",   "\0",
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",    "A",    "B",    "C",    "D",    "E",    "F",    "G",    "H",    "I",    "J",    "K",    "L",    "M",    "N",    "O",
    "P",    "Q",    "R",    "S",    "T",    "U",    "V",    "W",    "X",    "Y",    "Z",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",    "A",    "B",    "C",    "D",    "E",    "F",    "G",    "H",    "I",    "J",    "K",    "L",    "M",    "N",    "O",
    "P",    "Q",    "R",    "S",    "T",    "U",    "V",    "W",    "X",    "Y",    "Z",   "\0",   "\0",   "\0",   "\0",   "\0",
    "C",    "U",    "E",    "A",    "A",    "A",    "A",    "C",    "E",    "E",    "E",    "I",    "I",    "I",    "A",    "A",
    "E",   "AE",   "AE",    "O",    "O",    "O",    "U",    "U",    "Y",    "O",    "U",    "C",   "\0",   "\0",   "\0",   "\0",
    "A",    "I",    "O",    "U",    "N",    "N",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",   "SS",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",
   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0",   "\0"
};

/* --------------------------------------------- standardise_word */
/* Convert a raw word into its standardised form using the standardised[]
 * table: skipped characters are dropped, a character marked as a word
 * terminator ends the conversion, and every other character is replaced
 * by its table string.
 *
 * word     NUL-terminated input; it is not modified.
 *
 * Returns a newly allocated string that the caller must free(), or a null
 * pointer when the word is rejected: it is null, contains a space, starts
 * with '-', would not fit the conversion buffer, or memory ran out.
 */
unsigned char *standardise_word(unsigned char *word)
{
  unsigned char  standardised_word[1024];
  unsigned char *copy;
  size_t         out = 0;
  size_t         letter;

  /* ---------------------------------------- */
  /* If it contains a space, we don't want it */
  /* ---------------------------------------- */
  if (!word || strchr((const char *)word, ' ') || word[0] == '-')
    return (unsigned char *)0;

  for (letter = 0; word[letter]; letter++)
  {
    const unsigned char *replacement = standardised[word[letter]];
    size_t               rep_len;

    if (replacement[0] == '#')
      continue;

    if (replacement[0] == 0)
      break;

    rep_len = strlen((const char *)replacement);

    /* Always leave room for the terminator; over-long input is rejected */
    if (out + rep_len >= sizeof standardised_word)
      return (unsigned char *)0;

    memcpy(&standardised_word[out], replacement, rep_len);
    out += rep_len;
  }

  standardised_word[out] = 0;

  copy = malloc(out + 1);

  if (copy)
    memcpy(copy, standardised_word, out + 1);

  return copy;
}

/* Debug aid: for every byte value 0..255 print its decimal code, the raw
 * character and the replacement string held in the standardised[] table.
 */
void dump_standardised(void)
{
  int code = 0;

  while (code < 256)
  {
    printf("%03d %c == %s\n", code, code, standardised[code]);
    code++;
  }
}

/* --------------------------------------------- add_word */
/* Standardise a raw word and insert it into the sorted global word list
 * (all_words / total_words / allocated_words).
 *
 * The word is ignored when standardise_word() rejects it, when the
 * standardised form is shorter than DCT_MIN_WORD_LENGTH, when it is
 * already in the list, or when the list cannot be grown.  On success the
 * list takes ownership of the standardised string.
 *
 * Returns 1 if the word was added, 0 otherwise.
 */
int add_word(char *word)
{
  unsigned char *standardised_word;
  int            lo;
  int            hi;

  standardised_word = standardise_word((unsigned char *)word);

  if (!standardised_word)
    return 0;

  if (strlen((const char *)standardised_word) < DCT_MIN_WORD_LENGTH)
  {
    free(standardised_word);
    return 0;
  }

  /* One binary search finds either the duplicate or the insertion point */
  lo = 0;
  hi = total_words;

  while (lo < hi)
  {
    int mid = lo + (hi - lo) / 2;
    int cmp = strcmp((const char *)standardised_word, (const char *)all_words[mid]);

    if (!cmp)
    {
      free(standardised_word);
      return 0;
    }

    if (cmp < 0)
      hi = mid;
    else
      lo = mid + 1;
  }

  if (total_words == allocated_words)
  {
    int             new_allocated = allocated_words ? allocated_words * 2 : 1000;
    unsigned char **new_all_words = realloc(all_words, (size_t)new_allocated * sizeof *new_all_words);

    if (!new_all_words)
    {
      /* The existing list is still valid; just drop this word */
      printf("*E* Unable to realloc %d words\n", new_allocated);
      free(standardised_word);
      return 0;
    }

    memset(&new_all_words[total_words], 0x00, (size_t)(new_allocated-total_words)*sizeof *new_all_words);

    all_words       = new_all_words;
    allocated_words = new_allocated;
  }

  /* Open a gap at the insertion point so the list stays sorted */
  memmove(&all_words[lo + 1], &all_words[lo], (size_t)(total_words - lo) * sizeof *all_words);

  all_words[lo] = standardised_word;
  total_words++;

  return 1;
}

/* --------------------------------------------- add_words_opted */
/* Read an OPTED file and add every word found between OPTED_PREWORD and
 * OPTED_POSTWORD to the global word list via add_word().
 *
 * The whole file is loaded into memory and scanned as one NUL-terminated
 * string, so scanning stops at the first NUL byte in the data.
 *
 * filename  path of the file to read.
 *
 * Returns the number of new words added; failures to open, stat,
 * allocate or read are reported on stdout and add no words.
 */
int add_words_opted(char *filename)
{
  FILE        *fp;
  struct stat  filestat;
  char        *filedata  = 0;
  char        *strpos;
  size_t       filesize;
  int          new_words = 0;

  printf("Opening '%s' Type OPTED\n", filename);

  fp = fopen(filename,"rb");

  if (!fp)
  {
    printf("*E* Failed to open '%s' errno=%d\n", filename, errno);
    return 0;
  }

  if (fstat(fileno(fp),&filestat))
  {
    printf("*E* Failed to fstat '%s' errno=%d\n", filename, errno);
    goto done;
  }

  filesize = (size_t)filestat.st_size;

  /* One extra zeroed byte keeps the buffer NUL-terminated for strstr() */
  filedata = calloc(filesize+1, sizeof(char));

  if (!filedata)
  {
    printf("*E* Failed to malloc %lld bytes for '%s' errno=%d\n", (long long)filestat.st_size, filename, errno);
    goto done;
  }

  if (fread(filedata, sizeof(char), filesize, fp) != filesize)
  {
    printf("*E* Failed to fread %lld * %lu from %s\n", (long long)filestat.st_size, (unsigned long)sizeof(unsigned char), filename);
    goto done;
  }

  strpos = filedata;

  while ((strpos = strstr(strpos,OPTED_PREWORD)))
  {
    char *endpos;

    strpos += sizeof(OPTED_PREWORD)-1;

    endpos = strstr(strpos,OPTED_POSTWORD);

    /* No closing marker from here on means no further complete words */
    if (!endpos)
      break;

    /* Terminate the word in place so it can be passed on as a string */
    *endpos = 0;

    new_words += add_word(strpos);

    strpos = endpos+1;
  }

done:
  free(filedata);
  fclose(fp);

  return new_words;
}

/* --------------------------------------------- add_words */
/* Add the words from one dictionary file to the global word list,
 * dispatching on the file's format.
 *
 * filename  path of the file to read.
 * filetype  format of the file.
 *
 * Returns the number of new words added; an unrecognised filetype is
 * reported on stdout and adds nothing.
 */
int add_words(char *filename, dict_type filetype)
{
  switch(filetype)
  {
    case FT_OPTED:
      return add_words_opted(filename);

    default:
      /* Without this the function would end with no return value */
      printf("*E* Unknown dictionary type %d for '%s'\n", (int)filetype, filename);
      return 0;
  }
}
