/*
 * Scan SRILM combined counts file
 * and print number of gram kinds and their totals
 * Copyright (C) 2008, Corpus Technologies
 * Author: Alexy Khrabrov deliverable gmail dot com
 */

#include <stdio.h>
#include <stdlib.h>

#define MAX_GRAM 100   /* highest n-gram order that can be tallied */
#define MAX_WORD 1024  /* longest accepted token, in bytes */

/*
 * Reads a counts file from stdin, one record per line:
 *
 *     word1 word2 ... wordN<TAB>count<NEWLINE>
 *
 * Words are separated by single spaces; the count is a run of decimal
 * digits following the one tab on the line.  For every order n that
 * occurs, the program tallies how many records have that order
 * ("kinds") and the sum of their counts ("total"), then prints a
 * summary to stdout.
 *
 * Malformed input prints a diagnostic to stdout and terminates with a
 * distinct exit status:
 *   1  record order exceeds MAX_GRAM
 *   2  token longer than MAX_WORD bytes
 *   3  non-digit character inside the count field
 *   4  line ended without a non-zero count
 *   5  line ended before any word separator was seen
 *   6  space inside the count field
 *   7  empty word before a space
 *   8  empty word before the tab
 *   9  second tab on the same line
 * Returns 0 when all input was consumed.
 *
 * NOTE(review): the source this was restored from had lost the text
 * between the table-zeroing loop header and the "n > 0" test.  The
 * zeroing loop body, the initial values of nchars/in_count, the
 * getchar() loop header, the newline test and the position of ++line
 * are reconstructed -- confirm against the original file.
 */
int main(void)
{
    unsigned long grams[MAX_GRAM + 1];  /* grams[n]: records of order n */
    unsigned long total[MAX_GRAM + 1];  /* total[n]: sum of their counts */
    unsigned long line = 0;             /* newlines consumed so far */
    unsigned long count = 0;            /* count field being parsed */
    unsigned nchars = 0;                /* bytes in the current token */
    unsigned n = 0;                     /* separators seen on this line */
    unsigned nmax = 0;                  /* highest order encountered */
    int in_count = 0;                   /* non-zero once the tab was seen */
    int c;

    for (n = 0; n <= MAX_GRAM; ++n) {
        grams[n] = 0;
        total[n] = 0;
    }
    n = 0;

    while ((c = getchar()) != EOF) {
        if (c == '\n') {
            /*
             * Incremented before validation, so end-of-line diagnostics
             * are 1-based while mid-line diagnostics below report the
             * number of lines already completed.
             */
            ++line;

            if (n == 0) {
                printf("empty line %lu\n", line);
                exit(5);
            }
            if (nmax < n)
                nmax = n;
            /* Must be checked before n is used as an array index. */
            if (n > MAX_GRAM) {
                printf("exceeded ngram limit %d with %u on line %lu\n",
                       MAX_GRAM, n, line);
                exit(1);
            }
            /* A count of zero is rejected the same way as a missing one. */
            if (!(in_count && nchars && count)) {
                printf("empty count at end of line %lu\n", line);
                exit(4);
            }

            ++grams[n];
            total[n] += count;

            n = 0;
            nchars = 0;
            in_count = 0;
            continue;
        }

        if (c == ' ') {
            if (in_count) {
                printf("space after tab on line %lu\n", line);
                exit(6);
            }
            if (!nchars) {
                printf("empty word on line %lu\n", line);
                exit(7);
            }
            ++n;
            nchars = 0;
        } else if (c == '\t') {
            if (in_count) {
                printf("in word already on line %lu\n", line);
                exit(9);
            }
            if (!nchars) {
                printf("empty word on line %lu\n", line);
                exit(8);
            }
            /* The tab terminates the last word and opens the count field. */
            ++n;
            nchars = 0;
            in_count = 1;
            count = 0;
        } else if (nchars > MAX_WORD - 1) {
            /* Length limit applies to words and to the count field alike. */
            printf("exceeded word buffer on line %lu\n", line);
            exit(2);
        } else {
            ++nchars;
            if (in_count) {
                if (c < '0' || c > '9') {
                    printf("not a digit in count on line %lu: %c\n", line, c);
                    exit(3);
                }
                count = count * 10 + (unsigned long)(c - '0');
            }
        }
    }

    printf("line=%lu, nmax=%u\n", line, nmax);
    for (n = 1; n <= nmax; ++n)
        printf("%u => grams kinds %lu, total %lu\n", n, grams[n], total[n]);

    return 0;
}