Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
path: root/irstlm
diff options
context:
space:
mode:
author mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-11 00:06:12 +0400
committer mfederico <mfederico@1f5c12ca-751b-0410-a591-d2e778427230> 2006-08-11 00:06:12 +0400
commit 688e4173e438e1238a7f12057db84eb339bd0b60 (patch)
tree cbf8e346439e78be114adda356925d8e78e9de2d /irstlm
parent b65eafacc671aa53d751325cda328991e139f26e (diff)
Added probability and LM state caching in IRST LM.
git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@623 1f5c12ca-751b-0410-a591-d2e778427230
Diffstat (limited to 'irstlm')
-rw-r--r-- irstlm/src/compile-lm.cpp 10
-rw-r--r-- irstlm/src/htable.cpp 29
-rw-r--r-- irstlm/src/lmtable.cpp 93
-rw-r--r-- irstlm/src/lmtable.h 89
4 files changed, 121 insertions, 100 deletions
diff --git a/irstlm/src/compile-lm.cpp b/irstlm/src/compile-lm.cpp
index 1985229bd..a3370a73f 100644
--- a/irstlm/src/compile-lm.cpp
+++ b/irstlm/src/compile-lm.cpp
@@ -27,10 +27,6 @@ using namespace std;
#include <stdlib.h>
#include "math.h"
-#include "mempool.h"
-#include "htable.h"
-#include "dictionary.h"
-#include "n_gram.h"
#include "lmtable.h"
@@ -139,15 +135,15 @@ int main(int argc, const char **argv)
double logPr=0,PP=0,PPwp=0,Pr;
int bos=ng.dict->encode(ng.dict->BoS());
-
+ lmt.init_prcache();
while(inptxt >> ng){
// reset ngram at begin of sentence
if (*ng.wordp(1)==bos) continue;
-
+
lmt.bo_state(0);
if (ng.size>=1){
- logPr+=(Pr=lmt.lprob(ng));
+ logPr+=(Pr=lmt.clprob(ng));
if (*ng.wordp(1) == lmt.dict->oovcode()) Noov++;
Nw++; if (lmt.bo_state()) Nbo++;
}
diff --git a/irstlm/src/htable.cpp b/irstlm/src/htable.cpp
index 793a1a59c..88194c409 100644
--- a/irstlm/src/htable.cpp
+++ b/irstlm/src/htable.cpp
@@ -31,14 +31,14 @@ htable::htable(int n,int kl,HTYPE ht,size_t (*klf)(const char* )){
cerr << "htable: key length must be specified for non-string entries!";
exit(1);
}
-
+
memory=new mempool( sizeof(entry) , BlockSize );
table = new entry* [ size=n ];
memset(table,0,sizeof(entry *) * n );
- keylen=kl;
+ keylen=(ht==INT || ht==INTPTR?kl/sizeof(int):kl);
htype=ht;
@@ -63,6 +63,8 @@ char *htable::search(char *item, HT_ACTION action)
i=(h % size);
+ //cout << "htable::search() hash i=" << i << "\n";
+
p = &table[h % size];
q=*p;
@@ -200,28 +202,25 @@ address htable::HashStr(char *key)
address htable::HashInt(char *key)
{
int *Key=(htype==INTPTR? *(int **)key:(int *)key);
- static int length=keylen/sizeof(int);
-
+
//cerr << "hash: " << Key << " length:" << length << "\n";
- register int h=0;
+ register int h;
register int i;
-
- for (i=0,h=0;i<length;i++){
- /*Thomas Wang's 32 bit Mix Function
+ /*Thomas Wang's 32 bit Mix Function*/
+ for (i=0,h=0;i<keylen;i++){
h+=Key[i];
h += ~(h << 15);
h ^= (h >> 10);
h += (h << 3);
h ^= (h >> 6);
h += ~(h << 11);
- h ^= (h >> 16);
- */
- h = h * Prime1 ^ Key[i];
+ h ^= (h >> 16);
};
- h %= Prime2;
+ //for (i=0,h=0;i<length;i++) h = h * Prime1 ^ Key[i];
+ //h %= Prime2;
return h;
}
@@ -255,12 +254,10 @@ int htable::CompInt(char *key1, char *key2)
int *Key2=(htype==INTPTR?*(int **)key2:(int*)key2);
assert(Key1 && Key2);
-
- static int length=keylen/sizeof(int);
-
+
register int i;
- for (i=0;i<length;i++)
+ for (i=0;i<keylen;i++)
if (Key1[i]!=Key2[i]) return 1;
return 0;
}
diff --git a/irstlm/src/lmtable.cpp b/irstlm/src/lmtable.cpp
index 737a07779..dfbe2e6d9 100644
--- a/irstlm/src/lmtable.cpp
+++ b/irstlm/src/lmtable.cpp
@@ -17,6 +17,7 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
******************************************************************************/
+using namespace std;
#include <iostream>
#include <fstream>
@@ -28,11 +29,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include "math.h"
#include "mempool.h"
#include "htable.h"
+#include "ngramcache.h"
#include "dictionary.h"
#include "n_gram.h"
#include "lmtable.h"
-using namespace std;
inline void error(char* message){
cerr << message << "\n";
@@ -53,12 +54,12 @@ lmtable::lmtable(){
memset(info, 0, sizeof(info));
memset(NumCenters, 0, sizeof(NumCenters));
- bicache=new bigramcache(1000000);
+ bicache=new ngramcache(2,sizeof(char*),1000000);
+ prcache=NULL;
};
-
//loadstd::istream& inp a lmtable from a lm file
void lmtable::load(fstream& inp){
@@ -567,18 +568,19 @@ int lmtable::get(ngram& ng,int n,int lev){
int offset=0,limit=cursize[1];
//information of table entries
- char* addr; char* found; LMT_TYPE ndt;
+ int hit;char* found; LMT_TYPE ndt;
ng.link=NULL;
ng.lev=0;
for (int l=1;l<=lev;l++){
//initialize entry information
- found = NULL; ndt=tbltype[l];
-
- if (l==2 && (addr=bicache->get(ng.wordp(n)))){
- found=*((char **)(addr + 2 * sizeof(int))); // get the information
- }
+ hit = 0 ; found = NULL; ndt=tbltype[l];
+
+ //if (l==2) cout <<"bicache: searching:" << ng <<"\n";
+
+ if (l==2 && bicache->get(ng.wordp(n),(char *)&found))
+ hit=1;
else
search(table[l] + (offset * nodesize(ndt)),
ndt,
@@ -589,14 +591,11 @@ int lmtable::get(ngram& ng,int n,int lev){
LMT_FIND,
&found);
- if (l==2 && !addr){
- if (bicache->isfull()){
- bicache->reset();
- cerr << "resetting the cache\n";
- }
- bicache->add(ng.wordp(n),found);
+ if (l==2 && hit==0){
+ if (bicache->isfull()) bicache->reset();
+ //cout << "bicache :" << ng <<"\n";
+ bicache->add(ng.wordp(n),(char *)&found);
}
-
if (!found) return 0;
@@ -721,8 +720,9 @@ const char *lmtable::maxsuffptr(ngram ong){
if (ong.size==0) return (char*) NULL;
if (ong.size>=maxlev) ong.size=maxlev-1;
- ngram ng(dict); //eventually use the <unk> word
- ng.trans(ong);
+ ngram ng=ong;
+ //ngram ng(dict); //eventually use the <unk> word
+ //ng.trans(ong);
if (get(ng,ng.size,ng.size))
return ng.link;
@@ -733,6 +733,29 @@ const char *lmtable::maxsuffptr(ngram ong){
}
+const char *lmtable::cmaxsuffptr(ngram ong){
+
+ if (ong.size==0) return (char*) NULL;
+ if (ong.size>=maxlev) ong.size=maxlev-1;
+
+ char* found;
+
+ if ((ong.size==maxlev-1) && statecache->get(ong.wordp(maxlev-1),(char *)&found))
+ return found;
+
+ found=(char *)maxsuffptr(ong);
+
+ if (ong.size==maxlev-1){
+ if (statecache->isfull()) statecache->reset();
+ // cout << "clprob: adding: " << ong <<"\n";
+ statecache->add(ong.wordp(maxlev-1),(char *)&found);
+ };
+
+ return found;
+
+}
+
+
// returns the probability of an n-gram
double lmtable::prob(ngram ong){
@@ -776,8 +799,9 @@ double lmtable::lprob(ngram ong){
if (ong.size==0) return 0.0;
if (ong.size>maxlev) ong.size=maxlev;
- ngram ng(dict);
- ng.trans(ong);
+ ngram ng=ong;
+ //ngram ng(dict); //avoid dictionary transfer
+ //ng.trans(ong);
double rbow;
int ibow,iprob;
@@ -805,6 +829,35 @@ double lmtable::lprob(ngram ong){
}
}
+//return log10 probsL use cache memory
+
+double lmtable::clprob(ngram ong){
+
+
+ if (ong.size==0) {
+ return 0.0;
+ }
+
+ if (ong.size>maxlev) ong.size=maxlev;
+
+ double pr;
+ if (ong.size==maxlev && prcache->get(ong.wordp(maxlev),(char *)&pr)){
+ //assert(pr==lprob(ong));
+ return pr;
+ };
+
+
+ pr=lprob(ong);
+
+ if (ong.size==maxlev){
+ if (prcache->isfull()) prcache->reset();
+ // cout << "clprob: adding: " << ong <<"\n";
+ prcache->add(ong.wordp(maxlev),(char *)&pr);
+ };
+
+ return pr;
+
+};
diff --git a/irstlm/src/lmtable.h b/irstlm/src/lmtable.h
index a66013fa2..7fe064bbb 100644
--- a/irstlm/src/lmtable.h
+++ b/irstlm/src/lmtable.h
@@ -22,6 +22,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#ifndef MF_LMTABLE_H
#define MF_LMTABLE_H
+#include "ngramcache.h"
+#include "dictionary.h"
+#include "n_gram.h"
+
#define LMTMAXLEV 11
#ifndef LMTCODESIZE
@@ -50,59 +54,6 @@ typedef enum {LMT_FIND, //!< search: find an entry
-//cache memory to store links to bigrams store in the tree
-#include "mempool.h"
-#include "htable.h"
-
-class bigramcache{
-private:
- htable* ht;
- mempool *mp;
- int maxn;
- int entries;
-public:
-
- bigramcache(int maxentries){
- maxn=maxentries;
- entries=0;
- ht=new htable(maxentries, 2 * sizeof(int),INT,NULL); //load factor
- mp=new mempool(2 * sizeof(int)+sizeof(char*),maxn/10);
- };
-
- ~bigramcache(){
- ht->stat();//ht->map();
- mp->stat();
- delete ht;delete mp;
- };
-
- void reset(){
- ht->stat();
- delete ht;delete mp;
- ht=new htable(maxn/5, 2 * sizeof(int),STR,NULL); //load factor
- mp=new mempool(2*sizeof(int)+sizeof(char*),maxn/10);
- entries=0;
- }
-
- char* get(int* ngp){
- char *found=ht->search((char *)ngp,HT_FIND);
- return found;
- };
-
- int add(int* ngp,char* link){
-
- char* entry=mp->alloc();
- memcpy(entry,(char*)ngp,sizeof(int) * 2);
- memcpy(entry + 2 * sizeof(int),(char *)&link,sizeof(char *));
- char *found=ht->search((char *)entry,HT_ENTER);
- assert(found==(char *)entry); //false if key is already insided
- entries++;
- return 1;
- }
-
- int isfull(){ return (entries >= maxn);};
-
-};
-
//disktable or accessing tables stored on disk
@@ -183,9 +134,12 @@ class lmtable{
int lmt_oov_code;
int lmt_oov_size;
int backoff_state;
+
//improve access speed
- bigramcache* bicache;
-
+ ngramcache* bicache;
+ ngramcache* prcache;
+ ngramcache* statecache;
+
public:
dictionary *dict; // dictionary
@@ -193,7 +147,20 @@ public:
lmtable();
~lmtable(){
- delete bicache;
+ if (bicache){
+ std::cerr << "Bigram Cache: "; bicache->stat();
+ delete bicache;
+ }
+ if (prcache){
+ std::cerr << "Prob Cache: "; prcache->stat();
+ delete prcache;
+ }
+ if (statecache){
+ std::cerr << "State Cache: "; statecache->stat();
+ delete statecache;
+ }
+
+
for (int i=1;i<=maxlev;i++){
if (table[i]) delete [] table[i];
if (isQtable){
@@ -203,7 +170,12 @@ public:
}
}
}
-
+
+ void init_prcache(){
+ assert(prcache==NULL);
+ prcache=new ngramcache(maxlev,sizeof(double),1000000);
+ statecache=new ngramcache(maxlev-1,sizeof(char *),1000000);
+ }
void configure(int n,bool quantized){
maxlev=n;
@@ -237,6 +209,8 @@ public:
double prob(ngram ng);
double lprob(ngram ng);
+ double clprob(ngram ng);
+
void *search(char *tb,LMT_TYPE ndt,int lev,int n,int sz,int *w,
LMT_ACTION action,char **found=(char **)NULL);
@@ -252,6 +226,7 @@ public:
int succscan(ngram& h,ngram& ng,LMT_ACTION action,int lev);
const char *maxsuffptr(ngram ong);
+ const char *cmaxsuffptr(ngram ong);
inline int putmem(char* ptr,int value,int offs,int size){
assert(ptr!=NULL);