diff options
author | mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-10-07 01:52:09 +0400 |
---|---|---|
committer | mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-10-07 01:52:09 +0400 |
commit | fc5e348fecb2b090f6102894f029ada8c564442d (patch) | |
tree | 02db5ac8e76524eea63a299ca8f83e0eae172b99 /irstlm/src | |
parent | 998a8216ba34075293b8d96bdf989c1e0c433f9c (diff) |
irstlm: extension to quantize-lm to work with gzipped LMs. Requires use of temporary bidirectional filestream. So, it's a bit slower and requires some disk space on /tmp
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@871 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'irstlm/src')
-rw-r--r-- | irstlm/src/lmtable.cpp | 8 | ||||
-rw-r--r-- | irstlm/src/quantize-lm.cpp | 87 |
2 files changed, 61 insertions, 34 deletions
diff --git a/irstlm/src/lmtable.cpp b/irstlm/src/lmtable.cpp index 2a5a1de91..050e86258 100644 --- a/irstlm/src/lmtable.cpp +++ b/irstlm/src/lmtable.cpp @@ -259,11 +259,9 @@ void lmtable::checkbounds(int level){ //re-order table at level+1 on disk //generate random filename to avoid collisions - - string filePath; - ofstream out; - createtempfile(out, filePath, ios::out); - + ofstream out;string filePath; + createtempfile(out,filePath,ios::out); + int start,end,newstart; //re-order table at level l+1 diff --git a/irstlm/src/quantize-lm.cpp b/irstlm/src/quantize-lm.cpp index 490ae55b6..8d6a78fa4 100644 --- a/irstlm/src/quantize-lm.cpp +++ b/irstlm/src/quantize-lm.cpp @@ -26,9 +26,10 @@ using namespace std; #include <string> #include <stdlib.h> #include <assert.h> - #include "math.h" -//#include "lmtable.h" +#include "util.h" + +#define MAX_LINE 1024 //---------------------------------------------------------------------- // Special type and global variable for the BIN CLUSTERING algorithm @@ -79,7 +80,7 @@ void usage(const char *msg = 0) { if (!msg) std::cerr << std::endl << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl << " a version of it with quantized probabilities and back-off weights"<< std::endl - << " that the IRST LMtoolkit can compile." << std::endl; + << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." 
<< std::endl; } @@ -99,28 +100,49 @@ int main(int argc, const char **argv) std::string infile = files[0]; - if (files.size() == 1) { - std::string::size_type p = infile.rfind('/'); - if (p != std::string::npos && ((p+1) < infile.size())) { - files.push_back(infile.substr(p+1) + ".qlm"); - } else { - files.push_back(infile + ".qlm"); - } + std::string outfile=""; + + if (files.size() == 1) { + outfile=infile; + + //remove path information + std::string::size_type p = outfile.rfind('/'); + if (p != std::string::npos && ((p+1) < outfile.size())) + outfile.erase(0,p+1); + + //eventually strip .gz + if (outfile.compare(outfile.size()-3,3,".gz")==0) + outfile.erase(outfile.size()-3,3); + + outfile+=".qlm"; } + else + outfile = files[1]; + std::cout << "Reading " << infile << "..." << std::endl; - std::fstream inp(infile.c_str()); + inputfilestream inp(infile.c_str()); if (!inp.good()) { std::cerr << "Failed to open " << infile << "!\n"; exit(1); } - std::string outfile = files[1]; + std::ofstream out(outfile.c_str()); std::cout << "Writing " << outfile << "..." << std::endl; + //prepare temporary file to save n-gram blocks for multiple reads + //this avoids using seeks which do not work with inputfilestream + //it's odd but i need a bidirectional filestream! 
+ + string filePath;ofstream dummy; + createtempfile(dummy,filePath,ios::out); + dummy.close(); + + fstream filebuff(filePath.c_str(),ios::out|ios::in); + int nPts = 0; // actual number of points // *** Read ARPA FILE ** @@ -141,11 +163,12 @@ int main(int argc, const char **argv) out << "qARPA\n"; //print output header + for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0; - char line[1024]; + char line[MAX_LINE]; - while (inp.getline(line,1024)){ + while (inp.getline(line,MAX_LINE)){ bool backslash = (line[0] == '\\'); @@ -165,18 +188,20 @@ int main(int argc, const char **argv) if (Order==1) centers=256; // always use 256 centers char* words[MAXLEV+3]; - dataPts=new double[N]; // allocate data - - iposition=inp.tellg(); + dataPts=new double[N]; // allocate data + //reset tempout file + filebuff.seekg(0); + for (nPts=0;nPts<N;nPts++){ - inp.getline(line,1024); + inp.getline(line,MAX_LINE); + filebuff << line << std::endl; int howmany = parseWords(line, words, Order + 3); assert(howmany == Order+2 || howmany == Order+1); sscanf(words[0],"%f",&logprob); dataPts[nPts]=exp(logprob * logten); } - + cerr << "quantizing " << N << " probabilities\n"; centersP=new double[centers]; @@ -184,18 +209,21 @@ int main(int argc, const char **argv) ComputeCluster(centers,centersP,N,dataPts); - + assert(bintable !=NULL); for (int p=0;p<N;p++){ mapP[bintable[p].idx]=bintable[p].code; } if (Order<MaxOrder){ - - inp.seekg(iposition); - + //second pass to read back-off weights + + filebuff.seekg(0); + for (nPts=0;nPts<N;nPts++){ - inp.getline(line,1024); + + filebuff.getline(line,MAX_LINE); + int howmany = parseWords(line, words, Order + 3); if (howmany==Order+2) //backoff is written sscanf(words[Order+1],"%f",&logbow); @@ -217,8 +245,7 @@ int main(int argc, const char **argv) } - inp.seekg(iposition); - + out << centers << "\n"; for (nPts=0;nPts<centers;nPts++){ out << log(centersP[nPts])/logten; @@ -226,9 +253,11 @@ int main(int argc, const char **argv) out << "\n"; } + filebuff.seekg(0); + for 
(nPts=0;nPts<numNgrams[Order];nPts++){ - inp.getline(line,1024); + filebuff.getline(line,MAX_LINE); parseWords(line, words, Order + 3); @@ -261,11 +290,11 @@ int main(int argc, const char **argv) cerr << "---- done\n"; out.flush(); - inp.flush(); - + out.close(); inp.close(); + removefile(filePath); } // Compute Clusters |