/****************************************************************************** IrstLM: IRST Language Model Toolkit, compile LM Copyright (C) 2006 Marcello Federico, ITC-irst Trento, Italy This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ using namespace std; #include #include #include #include #include #include #include "math.h" #include "util.h" #define MAX_LINE 1024 //---------------------------------------------------------------------- // Special type and global variable for the BIN CLUSTERING algorithm // // //---------------------------------------------------------------------- typedef struct{ double pt; int idx; short code; }BinEntry; int cmpBinEntry(const void* a,const void* b){ if (*(double *)a > *(double*)b) return 1; else if (*(double *)a < *(double*)b) return -1; else return 0; } BinEntry* bintable=NULL; //---------------------------------------------------------------------- // Global entry points //---------------------------------------------------------------------- int parseWords(char *sentence, char **words, int max); int ComputeCluster(int nc, double* cl,int N,double* Pts); //---------------------------------------------------------------------- // Global parameters (some are set in getArgs()) 
//---------------------------------------------------------------------- int k = 256; // number of centers const int MAXLEV = 11; //maximum n-gram size //---------------------------------------------------------------------- // Main program //---------------------------------------------------------------------- void usage(const char *msg = 0) { if (msg) { std::cerr << msg << std::endl; } std::cerr << "Usage: quantize-lm input-file.lm [output-file.qlm]" << std::endl; if (!msg) std::cerr << std::endl << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl << " a version of it with quantized probabilities and back-off weights"<< std::endl << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl; } int main(int argc, const char **argv) { //Process Parameters if (argc < 2) { usage(); exit(1); } std::vector files; for (int i=1; i < argc; i++) { std::string opt = argv[i]; files.push_back(opt); } if (files.size() > 2) { usage("Too many arguments"); exit(1); } if (files.size() < 1) { usage("Please specify a LM file to read from"); exit(1); } std::string infile = files[0]; std::string outfile=""; if (files.size() == 1) { outfile=infile; //remove path information std::string::size_type p = outfile.rfind('/'); if (p != std::string::npos && ((p+1) < outfile.size())) outfile.erase(0,p+1); //eventually strip .gz if (outfile.compare(outfile.size()-3,3,".gz")==0) outfile.erase(outfile.size()-3,3); outfile+=".qlm"; } else outfile = files[1]; std::cout << "Reading " << infile << "..." << std::endl; inputfilestream inp(infile.c_str()); if (!inp.good()) { std::cerr << "Failed to open " << infile << "!\n"; exit(1); } std::ofstream out(outfile.c_str()); std::cout << "Writing " << outfile << "..." << std::endl; //prepare temporary file to save n-gram blocks for multiple reads //this avoids using seeks which do not work with inputfilestream //it's odd but i need a bidirectional filestream! 
string filePath;ofstream dummy; createtempfile(dummy,filePath,ios::out); dummy.close(); fstream filebuff(filePath.c_str(),ios::out|ios::in); int nPts = 0; // actual number of points // *** Read ARPA FILE ** int numNgrams[MAXLEV + 1]; /* # n-grams for each order */ int Order,MaxOrder; int n; float logprob,logbow, logten=log(10.0); double* dataPts=NULL; double* centersP=NULL; double* centersB=NULL; int* mapP=NULL; int* mapB=NULL; int centers=k; streampos iposition; out << "qARPA\n"; //print output header for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0; char line[MAX_LINE]; while (inp.getline(line,MAX_LINE)){ bool backslash = (line[0] == '\\'); if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { numNgrams[Order] = n; MaxOrder=Order; } if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { out << line << "\n"; cerr << "-- Start processing of " << Order << "-grams\n"; assert(Order <= MAXLEV); int N=numNgrams[Order]; centers=k; if (Order==1) centers=256; // always use 256 centers char* words[MAXLEV+3]; dataPts=new double[N]; // allocate data //reset tempout file filebuff.seekg(0); for (nPts=0;nPts0){ currcode++; } } if (bintable[i].pt == bintable[i-1].pt) bintable[i].code=bintable[i-1].code; else{ bintable[i].code=currcode; species[currcode]++; } population[bintable[i].code]++; assert(bintable[i].code < centers); ctrs[bintable[i].code]+=bintable[i].pt; } for (int i=0;i0){ ctrs[i]/=(float)population[i]; if (ctrs[i]<1e-99){ cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n"; ctrs[i]=1e-99; } } //cout << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n"; } cout.flush(); delete [] population; delete [] species; return 1; } //---------------------------------------------------------------------- // Reading/Printing utilities // readPt - read a point from input stream into data storage // at position i. Returns false on error or EOF. 
//   printPt - prints a point to output file
//----------------------------------------------------------------------

// parseWords - split a line into whitespace-separated tokens, in place.
//
//   sentence : NUL-terminated, writable buffer. It is modified: strtok()
//              overwrites the separator following each token with '\0'.
//   words    : output array with room for at least `max` pointers; each
//              stored pointer refers to a token inside `sentence`.
//   max      : maximum number of tokens to store.
//
// Returns the number of tokens stored (0..max). When fewer than `max`
// tokens are found, words[count] is set to 0 as an end marker; when exactly
// `max` tokens are stored no terminator is written.
//
// Uses strtok(), so it shares strtok's hidden static state and must not be
// interleaved with another strtok() tokenization.
int parseWords(char *sentence, char **words, int max)
{
  // Separators are read-only; binding a string literal to a plain `char *`
  // is ill-formed since C++11, hence pointer-to-const.
  const char *const wordSeparators = " \t\r\n";

  int i = 0;
  for (char *word = strtok(sentence, wordSeparators);
       i < max && word != 0;
       word = strtok(0, wordSeparators)) {
    words[i++] = word;
  }

  // Mark the end of the token list when there is a free slot for it.
  if (i < max) {
    words[i] = 0;
  }

  return i;
}