/* EGYPT Toolkit for Statistical Machine Translation Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ /* --------------------------------------------------------------------------* * * * Module : TTables * * * * Prototypes File: TTables.h * * * * Objective: Defines clases and methods for handling I/O for Probability & * * Count tables and also alignment tables * *****************************************************************************/ #ifndef _ttables_h #define _ttables_h 1 #include "defs.h" #include "vocab.h" #include #include #include #include #include #include #include "Vector.h" #include #include #include "Globals.h" /* The tables defined in the following classes are defined as hash tables. For example. the t-table is a hash function of a word pair; an alignment is a hash function of a vector of integer numbers (sentence positions) and so on */ /*----------- Defnition of Hash Function for class tmodel ------- -----------*/ typedef pair wordPairIds; class hashpair : public unary_function< pair, size_t > { public: size_t operator() (const pair& key) const { return (size_t) MAX_W*key.first + key.second; /* hash function and it is guarnteed to have unique id for each unique pair */ } }; /* ------------------ Class Prototype Definitions ---------------------------* Class Name: tmodel Objective: This defines the underlying data structur for t Tables and t Count Tables. They are defined as a hash table. Each entry in the hash table is the probability (P(fj/ei) ) or count collected for ( C(fj/ei)). The probability and the count are represented as log integer probability as defined by the class LogProb . This class is used to represents t Tables (probabiliity) and n (fertility Tables and also their corresponding count tables . *---------------------------------------------------------------------------*/ //typedef float COUNT ; //typedef LogProb PROB ; template class LpPair { public: COUNT count ; PROB prob ; public: // constructor LpPair():count(0), prob(0){} ; LpPair(COUNT c, PROB p):count(c), prob(p){}; } ; #ifdef BINARY_SEARCH_FOR_TTABLE template T*mbinary_search(T*x,T*y,unsigned int val) { if( y-x==0 ) return 0; if( x->first==val) return x; if( y-x<2 ) return 0; T*mid=x+(y-x)/2; if( val < mid->first ) return mbinary_search(x,mid,val); else return mbinary_search(mid,y,val); } template const T*mbinary_search(const T*x,const T*y,unsigned int val) { if( y-x==0 ) return 0; if( x->first==val) return x; if( y-x<2 ) return 0; const T*mid=x+(y-x)/2; if( val < mid->first ) return mbinary_search(x,mid,val); else return mbinary_search(mid,y,val); } template class tmodel{ typedef LpPair CPPair; public: int noEnglishWords; // total number of unique source words int noFrenchWords; // total number of unique target words //vector > fs; //vector es; vector< vector >* > lexmat; void erase(WordIndex e, WordIndex f) { CPPair *p=find(e,f); if(p) *p=CPPair(0,0); }; CPPair*find(int e,int f) { //pair *be=&(fs[0])+es[e]; //pair *en=&(fs[0])+es[e+1]; pair *be=&(*lexmat[e])[0]; pair *en=&(*lexmat[e])[0]+(*lexmat[e]).size(); pair *x= mbinary_search(be,en,f); if( x==0 ) { //cerr << "A:DID NOT FIND ENTRY: " << e << " " << f << '\n'; //abort(); return 0; } return &(x->second); } const CPPair*find(int e,int f)const { const pair *be=&(*lexmat[e])[0]; const pair *en=&(*lexmat[e])[0]+(*lexmat[e]).size(); //const pair *be=&(fs[0])+es[e]; //const pair *en=&(fs[0])+es[e+1]; const pair *x= mbinary_search(be,en,f); if( x==0 ) { //cerr << "B:DID NOT FIND ENTRY: " << e << " " << f << '\n'; //abort(); return 0; } return &(x->second); } public: void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){ *find(e,f)=CPPair(cval,pval); } CPPair*getPtr(int e,int f){return find(e,f);} tmodel(const string&fn) { int count=0,count2=0; ifstream infile2(fn.c_str()); int e,f,olde=-1; pair cp; vector< pair > cps; while(infile2>>e>>f) { cp.first=f; assert(e>=olde); assert(e>olde ||f>oldf); if( e!=olde&&olde>=0 ) { int oldsize=lexmat.size(); lexmat.resize(olde+1); for(unsigned int i=oldsize;i > (cps); cps.clear(); if( !((*lexmat[olde]).size()==(*lexmat[olde]).capacity()) ) cerr << "eRROR: waste of memory: " << (*lexmat[olde]).size() << " " << (*lexmat[olde]).capacity() << endl; count2+=lexmat[olde]->capacity(); } cps.push_back(cp); olde=e; count++; } lexmat.resize(olde+1); lexmat[olde]=new vector< pair > (cps); count2+=lexmat[olde]->capacity(); cout << "There are " << count << " " << count2 << " entries in table" << '\n'; } /* tmodel(const string&fn) { size_t count=0; { ifstream infile1(fn.c_str()); if( !infile1 ) { cerr << "ERROR: can't read coocurrence file " << fn << '\n'; abort(); } int e,f; while(infile1>>e>>f) count++; } cout << "There are " << count << " entries in table" << '\n'; ifstream infile2(fn.c_str()); fs.resize(count); int e,f,olde=-1,oldf=-1; pair cp; count=0; while(infile2>>e>>f) { assert(e>=olde); assert(e>olde ||f>oldf); if( e!=olde ) { es.resize(e+1); for(unsigned int i=olde+1;int(i)<=e;++i) es[i]=count; } cp.first=f; assert(countcount += inc ; } } PROB getProb(WordIndex e, WordIndex f) const { const CPPair *p=find(e,f); if( p ) return max(p->prob, PROB_SMOOTH); else return PROB_SMOOTH; } COUNT getCount(WordIndex e, WordIndex f) const { const CPPair *p=find(e,f); if( p ) return p->count; else return 0.0; } void printProbTable(const char* filename, const Vector&, const Vector&,bool actual) const; void printCountTable(const char* filename, const Vector&, const Vector&,bool actual) const; void printProbTableInverse(const char *filename, const Vector& evlist, const Vector& fvlist, const double eTotal, const double fTotal, const bool actual = false ) const; void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2); void readProbTable(const char *filename); }; #else template class tmodel{ typedef LpPair CPPair; public: int noEnglishWords; // total number of unique source words int noFrenchWords; // total number of unique target words hash_map > ef; void erase(WordIndex e, WordIndex f) // In: a source and a target token ids. // removes the entry with that pair from table { ef.erase(wordPairIds(e, f)); }; public: Vector total2; Vector nFrench; Vector nEng; // methods; // insert: add entry P(fj/ei) to the hash function, Default value is 0.0 void insert(WordIndex e, WordIndex f, COUNT cval=0.0, PROB pval = 0.0){ ef[wordPairIds(e, f)].count = cval ; ef[wordPairIds(e, f)].prob = pval ; } // returns a reference to the word pair, if does not exists, it creates it. CPPair&getRe(WordIndex e, WordIndex f) {return ef[wordPairIds(e, f)];} // returns a pointer to an existing word pair. if pair does not exists, // the method returns the zero pointer (NULL) CPPair*getPtr(WordIndex e, WordIndex f) { // look up this pair and return its position typename hash_map >::iterator i = ef.find(wordPairIds(e, f)); if(i != ef.end()) // if it exists, return a pointer to it. return(&((*i).second)); else return(0) ; // else return NULL pointer } void incCount(WordIndex e, WordIndex f, COUNT inc) // increments the count of the given word pair. if the pair does not exist, // it creates it with the given value. { if( inc ) ef[wordPairIds(e, f)].count += inc ; } PROB getProb(WordIndex e, WordIndex f) const // read probability value for P(fj/ei) from the hash table // if pair does not exist, return floor value PROB_SMOOTH { typename hash_map >::const_iterator i= ef.find(wordPairIds(e, f)); if(i == ef.end()) return PROB_SMOOTH; else return max(((*i).second).prob, PROB_SMOOTH); } COUNT getCount(WordIndex e, WordIndex f) const /* read count value for entry pair (fj/ei) from the hash table */ { typename hash_map >::const_iterator i= ef.find(wordPairIds(e, f)); if(i == ef.end()) return 0; else return ((*i).second).count; } inline const hash_map >& getHash(void) const {return ef;}; /* get a refernece to the hash table */ //inline void resize(WordIndex n) {ef.resize(n);}; // to resize he hash table void printProbTable(const char* filename, const Vector&, const Vector&,bool actual) const; void printCountTable(const char* filename, const Vector&, const Vector&,bool actual) const; // print the t table to the given file but this time print actual source and // target words instead of thier token ids void printProbTableInverse(const char *filename, const Vector& evlist, const Vector& fvlist, const double eTotal, const double fTotal, const bool actual = false ) const; // dump inverse of t table (i.e P(ei/fj)) to the given file name, // if the given flag is true then actual words are printed not token ids void normalizeTable(const vcbList&engl, const vcbList&french, int iter=2); // to norlmalize the table i.e. make sure P(fj/ei) for all j is equal to 1 void readProbTable(const char *filename); // void readAsFertilityTable(const char *filename); }; /*--------------- End of Class Definition for tmodel -----------------------*/ #endif #endif