Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authormfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230>2007-01-04 02:25:53 +0300
committermfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230>2007-01-04 02:25:53 +0300
commit8700f33c9812749b360085f92ea174eae2db8065 (patch)
tree43a121973bd09faad736a4995fcef65d4d03d23c /irstlm/src
parent29705a2c818e57b78f63a0562a778fd4a4bd594c (diff)
a much more efficient version of quantize-lm
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1109 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'irstlm/src')
-rw-r--r--irstlm/src/ngramcache.cpp5
-rw-r--r--irstlm/src/quantize-lm.cpp166
-rw-r--r--irstlm/src/util.cpp2
-rw-r--r--irstlm/src/util.h8
4 files changed, 96 insertions, 85 deletions
diff --git a/irstlm/src/ngramcache.cpp b/irstlm/src/ngramcache.cpp
index 3b4234bcc..7c52d5206 100644
--- a/irstlm/src/ngramcache.cpp
+++ b/irstlm/src/ngramcache.cpp
@@ -57,15 +57,12 @@ void ngramcache::reset(int n){
entries=0;
}
-
-
-
char* ngramcache::get(const int* ngp,char* info){
char *found;
// cout << "ngramcache::get() ";
//for (int i=0;i<ngsize;i++) cout << ngp[i] << " "; cout <<"\n";
accesses++;
- if (found=ht->search((char *)ngp,HT_FIND)){
+ if ((found=ht->search((char *)ngp,HT_FIND))){
if (info) memcpy(info,found+ngsize*sizeof(int),infosize);
hits++;
};
diff --git a/irstlm/src/quantize-lm.cpp b/irstlm/src/quantize-lm.cpp
index 1b326ce97..e48c96b8d 100644
--- a/irstlm/src/quantize-lm.cpp
+++ b/irstlm/src/quantize-lm.cpp
@@ -17,7 +17,6 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
******************************************************************************/
-
using namespace std;
#include <iostream>
@@ -38,30 +37,28 @@ using namespace std;
//----------------------------------------------------------------------
typedef struct{
- double pt;
- int idx;
- short code;
-}BinEntry;
+ float pt;
+ unsigned int idx;
+ unsigned short code;
+}DataItem;
-int cmpBinEntry(const void* a,const void* b){
- if (*(double *)a > *(double*)b)
+int cmpFloatEntry(const void* a,const void* b){
+ if (*(float *)a > *(float*)b)
return 1;
- else if (*(double *)a < *(double*)b)
+ else if (*(float *)a < *(float *)b)
return -1;
else
return 0;
}
-BinEntry* bintable=NULL;
-
//----------------------------------------------------------------------
// Global entry points
//----------------------------------------------------------------------
int parseWords(char *sentence, char **words, int max);
-int ComputeCluster(int nc, double* cl,int N,double* Pts);
+int ComputeCluster(int nc, double* cl,int N,DataItem* Pts);
//----------------------------------------------------------------------
// Global parameters (some are set in getArgs())
@@ -81,12 +78,12 @@ void usage(const char *msg = 0) {
<< " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl
<< " a version of it with quantized probabilities and back-off weights"<< std::endl
<< " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl;
- }
+}
int main(int argc, const char **argv)
{
-
+
//Process Parameters
if (argc < 2) { usage(); exit(1); }
@@ -101,7 +98,7 @@ int main(int argc, const char **argv)
std::string infile = files[0];
std::string outfile="";
-
+
if (files.size() == 1) {
outfile=infile;
@@ -123,16 +120,16 @@ int main(int argc, const char **argv)
std::cout << "Reading " << infile << "..." << std::endl;
- inputfilestream inp(infile.c_str());
+ inputfilestream inp(infile.c_str());
if (!inp.good()) {
std::cerr << "Failed to open " << infile << "!\n";
exit(1);
}
-
+
std::ofstream out(outfile.c_str());
std::cout << "Writing " << outfile << "..." << std::endl;
-
+
//prepare temporary file to save n-gram blocks for multiple reads
//this avoids using seeks which do not work with inputfilestream
//it's odd but i need a bidirectional filestream!
@@ -142,29 +139,32 @@ int main(int argc, const char **argv)
dummy.close();
fstream filebuff(filePath.c_str(),ios::out|ios::in);
-
+
int nPts = 0; // actual number of points
// *** Read ARPA FILE **
int numNgrams[MAXLEV + 1]; /* # n-grams for each order */
- int Order,MaxOrder;
- int n;
+ int Order=0,MaxOrder=0;
+ int n=0;
- float logprob,logbow, logten=log(10.0);
+ float logprob,logbow;
- double* dataPts=NULL;
- double* centersP=NULL; double* centersB=NULL;
+ DataItem* dataPts;
- int* mapP=NULL; int* mapB=NULL;
+ double* centersP=NULL;
+ double* centersB=NULL;
- int centers=k;
+ //maps from point index to code
+ unsigned short* mapP=NULL; unsigned short* mapB=NULL;
+
+ int centers[MAXLEV + 1];
streampos iposition;
- out << "qARPA\n"; //print output header
+ for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0;
+ for (int i=1;i<=MAXLEV;i++) centers[i]=k;
-
- for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0;
+ /* all levels 256 centroids; in case read them as parameters */
char line[MAX_LINE];
@@ -175,53 +175,67 @@ int main(int argc, const char **argv)
if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) {
numNgrams[Order] = n;
MaxOrder=Order;
+ continue;
}
+ if (!strncmp(line, "\\data\\", 6) || strlen(line)==0)
+ continue;
+
if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) {
+ // print output header:
+ if (Order == 1) {
+ out << "qARPA " << MaxOrder;
+ for (int i=1;i<=MaxOrder;i++)
+ out << " " << centers[i];
+ out << "\n\n\\data\\\n";
+
+ for (int i=1;i<=MaxOrder;i++)
+ out << "ngram " << i << "= " << numNgrams[i] << "\n";
+ }
+
+ out << "\n";
out << line << "\n";
cerr << "-- Start processing of " << Order << "-grams\n";
assert(Order <= MAXLEV);
int N=numNgrams[Order];
- centers=k;
- if (Order==1) centers=256; // always use 256 centers
char* words[MAXLEV+3];
- dataPts=new double[N]; // allocate data
+ dataPts=new DataItem[N]; // allocate data
//reset tempout file
filebuff.seekg(0);
-
+
for (nPts=0;nPts<N;nPts++){
inp.getline(line,MAX_LINE);
filebuff << line << std::endl;
int howmany = parseWords(line, words, Order + 3);
assert(howmany == Order+2 || howmany == Order+1);
sscanf(words[0],"%f",&logprob);
- dataPts[nPts]=exp(logprob * logten);
+ dataPts[nPts].pt=logprob; //exp(logprob * logten);
+ dataPts[nPts].idx=nPts;
}
-
+
cerr << "quantizing " << N << " probabilities\n";
- centersP=new double[centers];
- mapP=new int[N];
+ centersP=new double[centers[Order]];
+ mapP=new unsigned short[N];
+
+ ComputeCluster(centers[Order],centersP,N,dataPts);
- ComputeCluster(centers,centersP,N,dataPts);
-
- assert(bintable !=NULL);
for (int p=0;p<N;p++){
- mapP[bintable[p].idx]=bintable[p].code;
+ mapP[dataPts[p].idx]=dataPts[p].code;
}
if (Order<MaxOrder){
//second pass to read back-off weights
-
+
filebuff.seekg(0);
-
+
for (nPts=0;nPts<N;nPts++){
-
+
filebuff.getline(line,MAX_LINE);
int howmany = parseWords(line, words, Order + 3);
@@ -229,27 +243,28 @@ int main(int argc, const char **argv)
sscanf(words[Order+1],"%f",&logbow);
else
logbow=0; // backoff is implicit
- dataPts[nPts]=exp(logbow * logten);
+
+ dataPts[nPts].pt=logbow;
+ dataPts[nPts].idx=nPts;
}
- centersB=new double[centers];
- mapB=new int[N];
+ centersB=new double[centers[Order]];
+ mapB=new unsigned short[N];
cerr << "quantizing " << N << " backoff weights\n";
- ComputeCluster(centers,centersB,N,dataPts);
+ ComputeCluster(centers[Order],centersB,N,dataPts);
- assert(bintable !=NULL);
for (int p=0;p<N;p++){
- mapB[bintable[p].idx]=bintable[p].code;
+ mapB[dataPts[p].idx]=dataPts[p].code;
}
}
-
- out << centers << "\n";
- for (nPts=0;nPts<centers;nPts++){
- out << log(centersP[nPts])/logten;
- if (Order<MaxOrder) out << " " << log(centersB[nPts])/logten;
+
+ out << centers[Order] << "\n";
+ for (nPts=0;nPts<centers[Order];nPts++){
+ out << centersP[nPts];
+ if (Order<MaxOrder) out << " " << centersB[nPts];
out << "\n";
}
@@ -270,7 +285,7 @@ int main(int argc, const char **argv)
out << "\n";
}
-
+
if (mapP){delete [] mapP;mapP=NULL;}
if (mapB){delete [] mapB;mapB=NULL;}
@@ -290,7 +305,7 @@ int main(int argc, const char **argv)
cerr << "---- done\n";
out.flush();
-
+
out.close();
inp.close();
@@ -299,22 +314,16 @@ int main(int argc, const char **argv)
// Compute Clusters
-int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){
+int ComputeCluster(int centers,double* ctrs,int N,DataItem* bintable){
//cerr << "\nExecuting Clutering Algorithm: k=" << centers<< "\n";
+ double log10=log(10.0);
- if (bintable) delete [] bintable;
-
- bintable=new BinEntry[N];
- for (int i=0;i<N;i++){
- bintable[i].pt=dataPts[i];
- bintable[i].idx=i;
- bintable[i].code=0;
- }
+ for (int i=0;i<N;i++) bintable[i].code=0;
//cout << "start sort \n";
- qsort(bintable,N,sizeof(BinEntry),cmpBinEntry);
+ qsort(bintable,N,sizeof(DataItem),cmpFloatEntry);
int different=1;
@@ -333,10 +342,10 @@ int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){
for (int i=0;i<centers;i++){
population[i]=species[i]=0;
- ctrs[i]=0.0;
+ ctrs[i]=0;
}
- // initial values
+ // initial values: this should catch up very low values: -99
bintable[0].code=0;
population[0]=1;
species[0]=1;
@@ -352,7 +361,7 @@ int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){
if ((currcode+1) < centers
&&
population[currcode]>0){
- currcode++;
+ currcode++;
}
}
@@ -367,19 +376,21 @@ int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){
assert(bintable[i].code < centers);
- ctrs[bintable[i].code]+=bintable[i].pt;
+ ctrs[bintable[i].code]=ctrs[bintable[i].code]+exp(bintable[i].pt * log10);
}
-
for (int i=0;i<centers;i++){
- if (population[i]>0){
- ctrs[i]/=(float)population[i];
- if (ctrs[i]<1e-99){
- cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n";
- ctrs[i]=1e-99;
- }
+ if (population[i]>0)
+ ctrs[i]=log(ctrs[i]/population[i])/log10;
+ else
+ ctrs[i]=-99;
+
+ if (ctrs[i]<-99){
+ cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n";
+ ctrs[i]=-99;
}
+
//cout << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n";
}
@@ -422,3 +433,4 @@ int parseWords(char *sentence, char **words, int max)
}
+
diff --git a/irstlm/src/util.cpp b/irstlm/src/util.cpp
index 8ecf91d56..1f6fff449 100644
--- a/irstlm/src/util.cpp
+++ b/irstlm/src/util.cpp
@@ -64,7 +64,7 @@ m_streambuf(0)
m_streambuf = new gzfilebuf(filePath.c_str());
} else {
std::filebuf* fb = new std::filebuf();
- fb->open(filePath.c_str(), std::ios::in);
+ _good=(fb->open(filePath.c_str(), std::ios::in)!=NULL);
m_streambuf = fb;
}
this->init(m_streambuf);
diff --git a/irstlm/src/util.h b/irstlm/src/util.h
index e8e9119e5..3fa66de9e 100644
--- a/irstlm/src/util.h
+++ b/irstlm/src/util.h
@@ -13,14 +13,16 @@ class inputfilestream : public std::istream
{
protected:
std::streambuf *m_streambuf;
+ bool _good;
public:
-
- inputfilestream(const std::string &filePath);
+
+ inputfilestream(const std::string &filePath);
~inputfilestream();
-
+ bool good(){return _good;}
void close();
};
+
void *MMap(int fd, int access, off_t offset, size_t len, off_t *gap);
int Munmap(void *p,size_t len,int sync);