diff options
author | mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-10-07 01:52:09 +0400 |
---|---|---|
committer | mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> | 2006-10-07 01:52:09 +0400 |
commit | fc5e348fecb2b090f6102894f029ada8c564442d (patch) | |
tree | 02db5ac8e76524eea63a299ca8f83e0eae172b99 /irstlm/src | |
parent | 998a8216ba34075293b8d96bdf989c1e0c433f9c (diff) |
irstlm: extension to quantize-lm to work with gzipped LMs. Requires use of temporary bidirectional filestream. So, it's a bit slower and requires some disk space on /tmp
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@871 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'irstlm/src')
-rw-r--r-- | irstlm/src/lmtable.cpp | 8 | ||||
-rw-r--r-- | irstlm/src/quantize-lm.cpp | 87 |
2 files changed, 61 insertions, 34 deletions
diff --git a/irstlm/src/lmtable.cpp b/irstlm/src/lmtable.cpp index 2a5a1de91..050e86258 100644 --- a/irstlm/src/lmtable.cpp +++ b/irstlm/src/lmtable.cpp @@ -259,11 +259,9 @@ void lmtable::checkbounds(int level){ //re-order table at level+1 on disk //generate random filename to avoid collisions - - string filePath; - ofstream out; - createtempfile(out, filePath, ios::out); - + ofstream out;string filePath; + createtempfile(out,filePath,ios::out); + int start,end,newstart; //re-order table at level l+1 diff --git a/irstlm/src/quantize-lm.cpp b/irstlm/src/quantize-lm.cpp index 490ae55b6..8d6a78fa4 100644 --- a/irstlm/src/quantize-lm.cpp +++ b/irstlm/src/quantize-lm.cpp @@ -26,9 +26,10 @@ using namespace std; #include <string> #include <stdlib.h> #include <assert.h> - #include "math.h" -//#include "lmtable.h" +#include "util.h" + +#define MAX_LINE 1024 //---------------------------------------------------------------------- // Special type and global variable for the BIN CLUSTERING algorithm @@ -79,7 +80,7 @@ void usage(const char *msg = 0) { if (!msg) std::cerr << std::endl << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl << " a version of it with quantized probabilities and back-off weights"<< std::endl - << " that the IRST LMtoolkit can compile." << std::endl; + << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." 
<< std::endl; } @@ -99,28 +100,49 @@ int main(int argc, const char **argv) std::string infile = files[0]; - if (files.size() == 1) { - std::string::size_type p = infile.rfind('/'); - if (p != std::string::npos && ((p+1) < infile.size())) { - files.push_back(infile.substr(p+1) + ".qlm"); - } else { - files.push_back(infile + ".qlm"); - } + std::string outfile=""; + + if (files.size() == 1) { + outfile=infile; + + //remove path information + std::string::size_type p = outfile.rfind('/'); + if (p != std::string::npos && ((p+1) < outfile.size())) + outfile.erase(0,p+1); + + //eventually strip .gz + if (outfile.compare(outfile.size()-3,3,".gz")==0) + outfile.erase(outfile.size()-3,3); + + outfile+=".qlm"; } + else + outfile = files[1]; + std::cout << "Reading " << infile << "..." << std::endl; - std::fstream inp(infile.c_str()); + inputfilestream inp(infile.c_str()); if (!inp.good()) { std::cerr << "Failed to open " << infile << "!\n"; exit(1); } - std::string outfile = files[1]; + std::ofstream out(outfile.c_str()); std::cout << "Writing " << outfile << "..." << std::endl; + //prepare temporary file to save n-gram blocks for multiple reads + //this avoids using seeks which do not work with inputfilestream + //it's odd but i need a bidirectional filestream! 
+ + string filePath;ofstream dummy; + createtempfile(dummy,filePath,ios::out); + dummy.close(); + + fstream filebuff(filePath.c_str(),ios::out|ios::in); + int nPts = 0; // actual number of points // *** Read ARPA FILE ** @@ -141,11 +163,12 @@ int main(int argc, const char **argv) out << "qARPA\n"; //print output header + for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0; - char line[1024]; + char line[MAX_LINE]; - while (inp.getline(line,1024)){ + while (inp.getline(line,MAX_LINE)){ bool backslash = (line[0] == '\\'); @@ -165,18 +188,20 @@ int main(int argc, const char **argv) if (Order==1) centers=256; // always use 256 centers char* words[MAXLEV+3]; - dataPts=new double[N]; // allocate data - - iposition=inp.tellg(); + dataPts=new double[N]; // allocate data + //reset tempout file + filebuff.seekg(0); + for (nPts=0;nPts<N;nPts++){ - inp.getline(line,1024); + inp.getline(line,MAX_LINE); + filebuff << line << std::endl; int howmany = parseWords(line, words, Order + 3); assert(howmany == Order+2 || howmany == Order+1); sscanf(words[0],"%f",&logprob); dataPts[nPts]=exp(logprob * logten); } - + cerr << "quantizing " << N << " probabilities\n"; centersP=new double[centers]; @@ -184,18 +209,21 @@ int main(int argc, const char **argv) ComputeCluster(centers,centersP,N,dataPts); - + assert(bintable !=NULL); for (int p=0;p<N;p++){ mapP[bintable[p].idx]=bintable[p].code; } if (Order<MaxOrder){ - - inp.seekg(iposition); - + //second pass to read back-off weights + + filebuff.seekg(0); + for (nPts=0;nPts<N;nPts++){ - inp.getline(line,1024); + + filebuff.getline(line,MAX_LINE); + int howmany = parseWords(line, words, Order + 3); if (howmany==Order+2) //backoff is written sscanf(words[Order+1],"%f",&logbow); @@ -217,8 +245,7 @@ int main(int argc, const char **argv) } - inp.seekg(iposition); - + out << centers << "\n"; for (nPts=0;nPts<centers;nPts++){ out << log(centersP[nPts])/logten; @@ -226,9 +253,11 @@ int main(int argc, const char **argv) out << "\n"; } + filebuff.seekg(0); + for 
(nPts=0;nPts<numNgrams[Order];nPts++){ - inp.getline(line,1024); + filebuff.getline(line,MAX_LINE); parseWords(line, words, Order + 3); @@ -261,11 +290,11 @@ int main(int argc, const char **argv) cerr << "---- done\n"; out.flush(); - inp.flush(); - + out.close(); inp.close(); + removefile(filePath); } // Compute Clusters |