Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> 2006-10-07 01:52:09 +0400
committer mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> 2006-10-07 01:52:09 +0400
commit fc5e348fecb2b090f6102894f029ada8c564442d (patch)
tree 02db5ac8e76524eea63a299ca8f83e0eae172b99 /irstlm/src
parent 998a8216ba34075293b8d96bdf989c1e0c433f9c (diff)
irstlm: extension to quantize-lm to work with gzipped LMs. Requires use of a temporary bidirectional filestream, so it's a bit slower and requires some disk space on /tmp.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@871 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'irstlm/src')
-rw-r--r-- irstlm/src/lmtable.cpp 8
-rw-r--r-- irstlm/src/quantize-lm.cpp 87
2 files changed, 61 insertions, 34 deletions
diff --git a/irstlm/src/lmtable.cpp b/irstlm/src/lmtable.cpp
index 2a5a1de91..050e86258 100644
--- a/irstlm/src/lmtable.cpp
+++ b/irstlm/src/lmtable.cpp
@@ -259,11 +259,9 @@ void lmtable::checkbounds(int level){
//re-order table at level+1 on disk
//generate random filename to avoid collisions
-
- string filePath;
- ofstream out;
- createtempfile(out, filePath, ios::out);
-
+ ofstream out;string filePath;
+ createtempfile(out,filePath,ios::out);
+
int start,end,newstart;
//re-order table at level l+1
diff --git a/irstlm/src/quantize-lm.cpp b/irstlm/src/quantize-lm.cpp
index 490ae55b6..8d6a78fa4 100644
--- a/irstlm/src/quantize-lm.cpp
+++ b/irstlm/src/quantize-lm.cpp
@@ -26,9 +26,10 @@ using namespace std;
#include <string>
#include <stdlib.h>
#include <assert.h>
-
#include "math.h"
-//#include "lmtable.h"
+#include "util.h"
+
+#define MAX_LINE 1024
//----------------------------------------------------------------------
// Special type and global variable for the BIN CLUSTERING algorithm
@@ -79,7 +80,7 @@ void usage(const char *msg = 0) {
if (!msg) std::cerr << std::endl
<< " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
<< " a version of it with quantized probabilities and back-off weights"<< std::endl
- << " that the IRST LMtoolkit can compile." << std::endl;
+ << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl;
}
@@ -99,28 +100,49 @@ int main(int argc, const char **argv)
std::string infile = files[0];
- if (files.size() == 1) {
- std::string::size_type p = infile.rfind('/');
- if (p != std::string::npos && ((p+1) < infile.size())) {
- files.push_back(infile.substr(p+1) + ".qlm");
- } else {
- files.push_back(infile + ".qlm");
- }
+ std::string outfile="";
+
+ if (files.size() == 1) {
+ outfile=infile;
+
+ //remove path information
+ std::string::size_type p = outfile.rfind('/');
+ if (p != std::string::npos && ((p+1) < outfile.size()))
+ outfile.erase(0,p+1);
+
+ //eventually strip .gz
+ if (outfile.compare(outfile.size()-3,3,".gz")==0)
+ outfile.erase(outfile.size()-3,3);
+
+ outfile+=".qlm";
}
+ else
+ outfile = files[1];
+
std::cout << "Reading " << infile << "..." << std::endl;
- std::fstream inp(infile.c_str());
+ inputfilestream inp(infile.c_str());
if (!inp.good()) {
std::cerr << "Failed to open " << infile << "!\n";
exit(1);
}
- std::string outfile = files[1];
+
std::ofstream out(outfile.c_str());
std::cout << "Writing " << outfile << "..." << std::endl;
+ //prepare temporary file to save n-gram blocks for multiple reads
+ //this avoids using seeks which do not work with inputfilestream
+ //it's odd but i need a bidirectional filestream!
+
+ string filePath;ofstream dummy;
+ createtempfile(dummy,filePath,ios::out);
+ dummy.close();
+
+ fstream filebuff(filePath.c_str(),ios::out|ios::in);
+
int nPts = 0; // actual number of points
// *** Read ARPA FILE **
@@ -141,11 +163,12 @@ int main(int argc, const char **argv)
out << "qARPA\n"; //print output header
+
for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0;
- char line[1024];
+ char line[MAX_LINE];
- while (inp.getline(line,1024)){
+ while (inp.getline(line,MAX_LINE)){
bool backslash = (line[0] == '\\');
@@ -165,18 +188,20 @@ int main(int argc, const char **argv)
if (Order==1) centers=256; // always use 256 centers
char* words[MAXLEV+3];
- dataPts=new double[N]; // allocate data
-
- iposition=inp.tellg();
+ dataPts=new double[N]; // allocate data
+ //reset tempout file
+ filebuff.seekg(0);
+
for (nPts=0;nPts<N;nPts++){
- inp.getline(line,1024);
+ inp.getline(line,MAX_LINE);
+ filebuff << line << std::endl;
int howmany = parseWords(line, words, Order + 3);
assert(howmany == Order+2 || howmany == Order+1);
sscanf(words[0],"%f",&logprob);
dataPts[nPts]=exp(logprob * logten);
}
-
+
cerr << "quantizing " << N << " probabilities\n";
centersP=new double[centers];
@@ -184,18 +209,21 @@ int main(int argc, const char **argv)
ComputeCluster(centers,centersP,N,dataPts);
-
+
assert(bintable !=NULL);
for (int p=0;p<N;p++){
mapP[bintable[p].idx]=bintable[p].code;
}
if (Order<MaxOrder){
-
- inp.seekg(iposition);
-
+ //second pass to read back-off weights
+
+ filebuff.seekg(0);
+
for (nPts=0;nPts<N;nPts++){
- inp.getline(line,1024);
+
+ filebuff.getline(line,MAX_LINE);
+
int howmany = parseWords(line, words, Order + 3);
if (howmany==Order+2) //backoff is written
sscanf(words[Order+1],"%f",&logbow);
@@ -217,8 +245,7 @@ int main(int argc, const char **argv)
}
- inp.seekg(iposition);
-
+
out << centers << "\n";
for (nPts=0;nPts<centers;nPts++){
out << log(centersP[nPts])/logten;
@@ -226,9 +253,11 @@ int main(int argc, const char **argv)
out << "\n";
}
+ filebuff.seekg(0);
+
for (nPts=0;nPts<numNgrams[Order];nPts++){
- inp.getline(line,1024);
+ filebuff.getline(line,MAX_LINE);
parseWords(line, words, Order + 3);
@@ -261,11 +290,11 @@ int main(int argc, const char **argv)
cerr << "---- done\n";
out.flush();
- inp.flush();
-
+
out.close();
inp.close();
+ removefile(filePath);
}
// Compute Clusters