diff options
author | mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> | 2007-01-04 02:25:53 +0300 |
---|---|---|
committer | mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> | 2007-01-04 02:25:53 +0300 |
commit | 8700f33c9812749b360085f92ea174eae2db8065 (patch) | |
tree | 43a121973bd09faad736a4995fcef65d4d03d23c /irstlm/src | |
parent | 29705a2c818e57b78f63a0562a778fd4a4bd594c (diff) |
a much more efficient version of quantize-lm
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@1109 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'irstlm/src')
-rw-r--r-- | irstlm/src/ngramcache.cpp | 5 | ||||
-rw-r--r-- | irstlm/src/quantize-lm.cpp | 166 | ||||
-rw-r--r-- | irstlm/src/util.cpp | 2 | ||||
-rw-r--r-- | irstlm/src/util.h | 8 |
4 files changed, 96 insertions, 85 deletions
diff --git a/irstlm/src/ngramcache.cpp b/irstlm/src/ngramcache.cpp index 3b4234bcc..7c52d5206 100644 --- a/irstlm/src/ngramcache.cpp +++ b/irstlm/src/ngramcache.cpp @@ -57,15 +57,12 @@ void ngramcache::reset(int n){ entries=0; } - - - char* ngramcache::get(const int* ngp,char* info){ char *found; // cout << "ngramcache::get() "; //for (int i=0;i<ngsize;i++) cout << ngp[i] << " "; cout <<"\n"; accesses++; - if (found=ht->search((char *)ngp,HT_FIND)){ + if ((found=ht->search((char *)ngp,HT_FIND))){ if (info) memcpy(info,found+ngsize*sizeof(int),infosize); hits++; }; diff --git a/irstlm/src/quantize-lm.cpp b/irstlm/src/quantize-lm.cpp index 1b326ce97..e48c96b8d 100644 --- a/irstlm/src/quantize-lm.cpp +++ b/irstlm/src/quantize-lm.cpp @@ -17,7 +17,6 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ******************************************************************************/ - using namespace std; #include <iostream> @@ -38,30 +37,28 @@ using namespace std; //---------------------------------------------------------------------- typedef struct{ - double pt; - int idx; - short code; -}BinEntry; + float pt; + unsigned int idx; + unsigned short code; +}DataItem; -int cmpBinEntry(const void* a,const void* b){ - if (*(double *)a > *(double*)b) +int cmpFloatEntry(const void* a,const void* b){ + if (*(float *)a > *(float*)b) return 1; - else if (*(double *)a < *(double*)b) + else if (*(float *)a < *(float *)b) return -1; else return 0; } -BinEntry* bintable=NULL; - //---------------------------------------------------------------------- // Global entry points //---------------------------------------------------------------------- int parseWords(char *sentence, char **words, int max); -int ComputeCluster(int nc, double* cl,int N,double* Pts); +int ComputeCluster(int nc, double* cl,int N,DataItem* Pts); //---------------------------------------------------------------------- // Global parameters (some are set in getArgs()) @@ -81,12 +78,12 @@ void usage(const char *msg = 0) { << " quantize-lm reads a standard LM file in ARPA format and produces" << std::endl << " a version of it with quantized probabilities and back-off weights"<< std::endl << " that the IRST LMtoolkit can compile. Accepts LMs with .gz suffix." << std::endl; - } +} int main(int argc, const char **argv) { - + //Process Parameters if (argc < 2) { usage(); exit(1); } @@ -101,7 +98,7 @@ int main(int argc, const char **argv) std::string infile = files[0]; std::string outfile=""; - + if (files.size() == 1) { outfile=infile; @@ -123,16 +120,16 @@ int main(int argc, const char **argv) std::cout << "Reading " << infile << "..." << std::endl; - inputfilestream inp(infile.c_str()); + inputfilestream inp(infile.c_str()); if (!inp.good()) { std::cerr << "Failed to open " << infile << "!\n"; exit(1); } - + std::ofstream out(outfile.c_str()); std::cout << "Writing " << outfile << "..." << std::endl; - + //prepare temporary file to save n-gram blocks for multiple reads //this avoids using seeks which do not work with inputfilestream //it's odd but i need a bidirectional filestream! @@ -142,29 +139,32 @@ int main(int argc, const char **argv) dummy.close(); fstream filebuff(filePath.c_str(),ios::out|ios::in); - + int nPts = 0; // actual number of points // *** Read ARPA FILE ** int numNgrams[MAXLEV + 1]; /* # n-grams for each order */ - int Order,MaxOrder; - int n; + int Order=0,MaxOrder=0; + int n=0; - float logprob,logbow, logten=log(10.0); + float logprob,logbow; - double* dataPts=NULL; - double* centersP=NULL; double* centersB=NULL; + DataItem* dataPts; - int* mapP=NULL; int* mapB=NULL; + double* centersP=NULL; + double* centersB=NULL; - int centers=k; + //maps from point index to code + unsigned short* mapP=NULL; unsigned short* mapB=NULL; + + int centers[MAXLEV + 1]; streampos iposition; - out << "qARPA\n"; //print output header + for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0; + for (int i=1;i<=MAXLEV;i++) centers[i]=k; - - for (int i=1;i<=MAXLEV;i++) numNgrams[i]=0; + /* all levels 256 centroids; in case read them as parameters */ char line[MAX_LINE]; @@ -175,53 +175,67 @@ int main(int argc, const char **argv) if (sscanf(line, "ngram %d=%d", &Order, &n) == 2) { numNgrams[Order] = n; MaxOrder=Order; + continue; } + if (!strncmp(line, "\\data\\", 6) || strlen(line)==0) + continue; + if (backslash && sscanf(line, "\\%d-grams", &Order) == 1) { + // print output header: + if (Order == 1) { + out << "qARPA " << MaxOrder; + for (int i=1;i<=MaxOrder;i++) + out << " " << centers[i]; + out << "\n\n\\data\\\n"; + + for (int i=1;i<=MaxOrder;i++) + out << "ngram " << i << "= " << numNgrams[i] << "\n"; + } + + out << "\n"; out << line << "\n"; cerr << "-- Start processing of " << Order << "-grams\n"; assert(Order <= MAXLEV); int N=numNgrams[Order]; - centers=k; - if (Order==1) centers=256; // always use 256 centers char* words[MAXLEV+3]; - dataPts=new double[N]; // allocate data + dataPts=new DataItem[N]; // allocate data //reset tempout file filebuff.seekg(0); - + for (nPts=0;nPts<N;nPts++){ inp.getline(line,MAX_LINE); filebuff << line << std::endl; int howmany = parseWords(line, words, Order + 3); assert(howmany == Order+2 || howmany == Order+1); sscanf(words[0],"%f",&logprob); - dataPts[nPts]=exp(logprob * logten); + dataPts[nPts].pt=logprob; //exp(logprob * logten); + dataPts[nPts].idx=nPts; } - + cerr << "quantizing " << N << " probabilities\n"; - centersP=new double[centers]; - mapP=new int[N]; + centersP=new double[centers[Order]]; + mapP=new unsigned short[N]; + + ComputeCluster(centers[Order],centersP,N,dataPts); - ComputeCluster(centers,centersP,N,dataPts); - - assert(bintable !=NULL); for (int p=0;p<N;p++){ - mapP[bintable[p].idx]=bintable[p].code; + mapP[dataPts[p].idx]=dataPts[p].code; } if (Order<MaxOrder){ //second pass to read back-off weights - + filebuff.seekg(0); - + for (nPts=0;nPts<N;nPts++){ - + filebuff.getline(line,MAX_LINE); int howmany = parseWords(line, words, Order + 3); @@ -229,27 +243,28 @@ int main(int argc, const char **argv) sscanf(words[Order+1],"%f",&logbow); else logbow=0; // backoff is implicit - dataPts[nPts]=exp(logbow * logten); + + dataPts[nPts].pt=logbow; + dataPts[nPts].idx=nPts; } - centersB=new double[centers]; - mapB=new int[N]; + centersB=new double[centers[Order]]; + mapB=new unsigned short[N]; cerr << "quantizing " << N << " backoff weights\n"; - ComputeCluster(centers,centersB,N,dataPts); + ComputeCluster(centers[Order],centersB,N,dataPts); - assert(bintable !=NULL); for (int p=0;p<N;p++){ - mapB[bintable[p].idx]=bintable[p].code; + mapB[dataPts[p].idx]=dataPts[p].code; } } - - out << centers << "\n"; - for (nPts=0;nPts<centers;nPts++){ - out << log(centersP[nPts])/logten; - if (Order<MaxOrder) out << " " << log(centersB[nPts])/logten; + + out << centers[Order] << "\n"; + for (nPts=0;nPts<centers[Order];nPts++){ + out << centersP[nPts]; + if (Order<MaxOrder) out << " " << centersB[nPts]; out << "\n"; } @@ -270,7 +285,7 @@ int main(int argc, const char **argv) out << "\n"; } - + if (mapP){delete [] mapP;mapP=NULL;} if (mapB){delete [] mapB;mapB=NULL;} @@ -290,7 +305,7 @@ int main(int argc, const char **argv) cerr << "---- done\n"; out.flush(); - + out.close(); inp.close(); @@ -299,22 +314,16 @@ int main(int argc, const char **argv) // Compute Clusters -int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){ +int ComputeCluster(int centers,double* ctrs,int N,DataItem* bintable){ //cerr << "\nExecuting Clutering Algorithm: k=" << centers<< "\n"; + double log10=log(10.0); - if (bintable) delete [] bintable; - - bintable=new BinEntry[N]; - for (int i=0;i<N;i++){ - bintable[i].pt=dataPts[i]; - bintable[i].idx=i; - bintable[i].code=0; - } + for (int i=0;i<N;i++) bintable[i].code=0; //cout << "start sort \n"; - qsort(bintable,N,sizeof(BinEntry),cmpBinEntry); + qsort(bintable,N,sizeof(DataItem),cmpFloatEntry); int different=1; @@ -333,10 +342,10 @@ int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){ for (int i=0;i<centers;i++){ population[i]=species[i]=0; - ctrs[i]=0.0; + ctrs[i]=0; } - // initial values + // initial values: this should catch up very low values: -99 bintable[0].code=0; population[0]=1; species[0]=1; @@ -352,7 +361,7 @@ int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){ if ((currcode+1) < centers && population[currcode]>0){ - currcode++; + currcode++; } } @@ -367,19 +376,21 @@ int ComputeCluster(int centers,double* ctrs,int N,double* dataPts){ assert(bintable[i].code < centers); - ctrs[bintable[i].code]+=bintable[i].pt; + ctrs[bintable[i].code]=ctrs[bintable[i].code]+exp(bintable[i].pt * log10); } - for (int i=0;i<centers;i++){ - if (population[i]>0){ - ctrs[i]/=(float)population[i]; - if (ctrs[i]<1e-99){ - cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n"; - ctrs[i]=1e-99; - } + if (population[i]>0) + ctrs[i]=log(ctrs[i]/population[i])/log10; + else + ctrs[i]=-99; + + if (ctrs[i]<-99){ + cerr << "Warning: adjusting center with too small prob " << ctrs[i] << "\n"; + ctrs[i]=-99; } + //cout << i << " ctr " << ctrs[i] << " population " << population[i] << " species " << species[i] <<"\n"; } @@ -422,3 +433,4 @@ int parseWords(char *sentence, char **words, int max) } + diff --git a/irstlm/src/util.cpp b/irstlm/src/util.cpp index 8ecf91d56..1f6fff449 100644 --- a/irstlm/src/util.cpp +++ b/irstlm/src/util.cpp @@ -64,7 +64,7 @@ m_streambuf(0) m_streambuf = new gzfilebuf(filePath.c_str()); } else { std::filebuf* fb = new std::filebuf(); - fb->open(filePath.c_str(), std::ios::in); + _good=(fb->open(filePath.c_str(), std::ios::in)!=NULL); m_streambuf = fb; } this->init(m_streambuf); diff --git a/irstlm/src/util.h b/irstlm/src/util.h index e8e9119e5..3fa66de9e 100644 --- a/irstlm/src/util.h +++ b/irstlm/src/util.h @@ -13,14 +13,16 @@ class inputfilestream : public std::istream { protected: std::streambuf *m_streambuf; + bool _good; public: - - inputfilestream(const std::string &filePath); + + inputfilestream(const std::string &filePath); ~inputfilestream(); - + bool good(){return _good;} void close(); }; + void *MMap(int fd, int access, off_t offset, size_t len, off_t *gap); int Munmap(void *p,size_t len,int sync); |