/* Scan SRILM combined counts file
 * and print number of gram kinds and their totals
 * Copyright (C) 2008, Corpus Technologies
 * Author: Alexy Khrabrov deliverable gmail dot com */

#include <stdio.h>
#include <stdlib.h>
#define MAX_GRAM 100
#define MAX_WORD 1024

/* Read an SRILM combined counts file from stdin and print, for each n-gram
 * order, how many distinct n-gram lines were seen and the sum of their counts.
 *
 * Line format accepted by the scanner: one or more non-empty words separated
 * by single spaces, a tab, a non-zero decimal count, then a newline.  Input
 * is examined one character at a time; no word is ever stored, so MAX_WORD
 * only caps the length of a single token.
 *
 * On malformed input a diagnostic (with a 1-based line number) is printed to
 * stdout and the process exits with a status identifying the problem:
 *   1 too many words      2 token too long       3 non-digit in count
 *   4 missing/zero count  5 line without a tab-terminated n-gram
 *   6 space after tab     7 empty word (space)   8 empty word (tab)
 *   9 second tab on a line
 * Returns 0 after printing the summary. */
int main(void) {
  unsigned long grams[MAX_GRAM+1] = {0};  /* grams[k]: number of k-gram lines */
  unsigned long total[MAX_GRAM+1] = {0};  /* total[k]: sum of k-gram counts */
  unsigned long line = 0;    /* number of lines fully processed so far */
  unsigned long count = 0;   /* value of the count field being parsed */
  unsigned nchars = 0;       /* characters seen in the current token */
  unsigned n = 0;            /* words completed on the current line */
  unsigned nmax = 0;         /* highest n-gram order encountered */
  int inword = 0;            /* nonzero once the tab was seen (count field) */
  int at_eof = 0;

  while (!at_eof) {
    int c = getchar();

    if (c == EOF) {
      /* Nothing pending: clean end of input. */
      if (n == 0 && nchars == 0) break;
      /* Otherwise the last line lacks its newline; finish it like any other
       * line instead of silently dropping the record. */
      at_eof = 1;
      c = '\n';
    }

    if (c == '\n') {
      ++line;

      if (n == 0) {
        printf("empty line %lu\n", line);
        exit(5);
      }
      if (n > MAX_GRAM) {
        printf("exceeded ngram limit %d with %u on line %lu\n", MAX_GRAM, n, line);
        exit(1);
      }
      if (!(inword && nchars && count)) {
        printf("empty count at end of line %lu\n", line);
        exit(4);
      }

      if (nmax < n) nmax = n;
      ++grams[n];
      total[n] += count;

      /* Reset per-line state; count is reset when the next tab is seen. */
      n = 0;
      nchars = 0;
      inword = 0;
      continue;
    }

    /* From here on the current line is still open, so its 1-based number is
     * line + 1 (line only counts lines already terminated). */
    if (c == ' ') {
      if (inword) {
        printf("space after tab on line %lu\n", line + 1);
        exit(6);
      }
      if (!nchars) {
        printf("empty word on line %lu\n", line + 1);
        exit(7);
      }
      ++n;
      nchars = 0;

    } else if (c == '\t') {
      if (inword) {
        printf("in word already on line %lu\n", line + 1);
        exit(9);
      }
      if (!nchars) {
        printf("empty word on line %lu\n", line + 1);
        exit(8);
      }
      ++n;
      nchars = 0;

      /* The tab ends the last word and starts the count field. */
      inword = 1;
      count = 0;

    } else if (nchars > MAX_WORD-1) {
      printf("exceeded word buffer on line %lu\n", line + 1);
      exit(2);

    } else {
      ++nchars;
      if (inword) {
        if (c < '0' || c > '9') {
          printf("not a digit in count on line %lu: %c\n", line + 1, c);
          exit(3);
        }
        count = count*10 + (unsigned long)(c - '0');
      }
    }
  }

  printf("line=%lu, nmax=%u\n", line, nmax);
  for (n = 1; n <= nmax; ++n)
    printf("%u => grams kinds %lu, total %lu\n", n, grams[n], total[n]);

  return 0;
}