diff options
Diffstat (limited to 'experimental/bidirectional/src/model1.h')
-rw-r--r-- | experimental/bidirectional/src/model1.h | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/experimental/bidirectional/src/model1.h b/experimental/bidirectional/src/model1.h new file mode 100644 index 0000000..cadc1e3 --- /dev/null +++ b/experimental/bidirectional/src/model1.h @@ -0,0 +1,205 @@ +/* + +EGYPT Toolkit for Statistical Machine Translation +Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ +#ifndef _model1_h +#define _model1_h 1 + +#include <assert.h> + +#include <iostream> +#include <strstream> +#include <algorithm> +#include <functional> +#include <list> +#include <map> +#include <set> +#include <utility> + +#if __GNUC__>2 +#include <ext/hash_map> +using __gnu_cxx::hash_map; +#else +#include <hash_map> +#endif +#include <time.h> +#include <fstream> +#include <math.h> +#include <stdio.h> + +#include "Vector.h" +#include "vocab.h" +#include "TTables.h" +#include "getSentence.h" +#include "Perplexity.h" +#include "vocab.h" +#include "Dictionary.h" +#include "ttableDiff.hpp" +#include "syncObj.h" + +extern int NumberOfVALIalignments; + +class report_info{ + public: + Mutex alLock; + Perplexity& perp; + sentenceHandler& sHandler1; + Perplexity* testPerp; + sentenceHandler* testHandler; + Perplexity& trainViterbiPerp; + Perplexity* testViterbiPerp; + report_info(Perplexity& _perp, + sentenceHandler& _sHandler1, + Perplexity* _testPerp, + sentenceHandler* _testHandler, + Perplexity& _trainViterbiPerp, + Perplexity* _testViterbiPerp) + : perp(_perp),sHandler1(_sHandler1),testPerp(_testPerp),testHandler(_testHandler),trainViterbiPerp(_trainViterbiPerp),testViterbiPerp(_testViterbiPerp) + {} + + report_info(const report_info & rp): + perp(rp.perp),sHandler1(rp.sHandler1), testPerp(rp.testPerp), + trainViterbiPerp(rp.trainViterbiPerp), testViterbiPerp(rp.testViterbiPerp), + testHandler(rp.testHandler) + {} +}; + + +class model1 : public report_info{ +public: + string efFilename; + vcbList& Elist ; + vcbList& Flist ; + double eTotalWCount ; // size of source copus in number of words + double fTotalWCount ; // size of target corpus in number of words + int noEnglishWords; + int noFrenchWords; + tmodel<COUNT, PROB>&tTable; + Vector<WordEntry>& evlist ; + Vector<WordEntry>& fvlist ; + int threadID; +public: + int ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch; + int ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI; + int ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST; + model1 (const char* efname, vcbList& evcblist, vcbList& fvcblist,tmodel<COUNT, PROB>&_tTable,Perplexity& _perp, + sentenceHandler& _sHandler1, + Perplexity* _testPerp, + sentenceHandler* _testHandler, + Perplexity& _trainViterbiPerp, + Perplexity* _testViterbiPerp); + + model1 (const model1& m1, int _threadID=0); + void initialize_table_uniformly(sentenceHandler& sHandler1); + + int em_with_tricks(int noIterations, + bool seedModel1, Dictionary& dictionary, bool useDict, bool dumpCount = false, + const char* dumpCountName = NULL, bool useString = false); + int em_thread(int noIterations, int thread,Dictionary& dictionary, bool useDict); + bool load_table(const char* tname); + void readVocabFile(const char* fname, Vector<WordEntry>& vlist, int& vsize, + int& total); + inline Vector<WordEntry>& getEnglishVocabList(void)const {return Elist.getVocabList();}; + inline Vector<WordEntry>& getFrenchVocabList(void)const {return Flist.getVocabList();}; + inline double getETotalWCount(void) const {return eTotalWCount;}; + inline double getFTotalWCount(void) const {return fTotalWCount;}; + inline int getNoEnglishWords(void) const {return noEnglishWords;}; + inline int getNoFrenchWords(void) const {return noFrenchWords;}; + inline tmodel<COUNT, PROB>& getTTable(void) {return tTable;}; + inline string& getEFFilename(void) {return efFilename;}; + +//////////////////////////////////////////////////////////////// +// Added by Qin Gao To Enable Parallel Training +//////////////////////////////////////////////////////////////// + + CTTableDiff<COUNT,PROB>* one_step_em(int it ,bool seedModel1, Dictionary& dictionary, + bool useDict); + + void recombine(); + + void combine_one(CTTableDiff<COUNT,PROB>* cb); + + void save_table(const char* tname); + + + + +//////////////////////////////////////////////////////////////// +// END OF QIN GAO's CODE +//////////////////////////////////////////////////////////////// +private: + void em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, bool , const char*, Dictionary& dictionary, bool useDict, + Perplexity& viterbiperp, bool=false); + void em_loop_1(CTTableDiff<COUNT,PROB> *diff,int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, bool , const char*, Dictionary& dictionary, bool useDict, + Perplexity& viterbiperp, bool=false); + friend class model2; + friend class hmm; +public: + void addAL(const Vector<WordIndex>& viterbi_alignment,int pair_no,int l){ + alLock.lock(); + if( pair_no<=int(ReferenceAlignment.size()) ){ + //cerr << "AL: " << viterbi_alignment << " " << pair_no << endl; + ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no); + if( pair_no<=NumberOfVALIalignments ){ + ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingVALI,ALtoomuchVALI,ALeventsMissingVALI,ALeventsToomuchVALI,pair_no); + } + if( pair_no>NumberOfVALIalignments ){ + ErrorsInAlignment(ReferenceAlignment[pair_no-1],viterbi_alignment,l,ALmissingTEST,ALtoomuchTEST,ALeventsMissingTEST,ALeventsToomuchTEST,pair_no); + } + } + alLock.unlock(); + } + void initAL(){ALmissingVALI=ALtoomuchVALI=ALeventsMissingVALI=ALeventsToomuchVALI=ALmissingTEST=ALtoomuchTEST=ALeventsMissingTEST=ALeventsToomuchTEST=ALmissing=ALtoomuch=ALeventsMissing=ALeventsToomuch=0;} + double errorsAL()const{ + if( ALeventsMissingVALI+ALeventsToomuchVALI ){ + return (ALmissingVALI+ALtoomuchVALI)/double(ALeventsMissingVALI+ALeventsToomuchVALI); + }else{ + return 0.0; + } + } + void errorReportAL(ostream&out,string m)const{ + if( ALeventsMissing+ALeventsToomuch ){ + out << "alignmentErrors (" << m << "): " + << 100.0*(ALmissing+ALtoomuch)/double(ALeventsMissing+ALeventsToomuch) + << " recall: " << 100.0*(1.0-ALmissing/double(ALeventsMissing)) + << " precision: " << 100.0*(1.0-ALtoomuch/double(ALeventsToomuch)) + << " (missing:" << ALmissing << "/" << ALeventsMissing << " " << ALtoomuch + << " " << ALeventsToomuch << ")\n"; + } + if( ALeventsMissingVALI+ALeventsToomuchVALI ){ + out << "alignmentErrors VALI (" << m << "): " + << 100.0*(ALmissingVALI+ALtoomuchVALI)/double(ALeventsMissingVALI+ALeventsToomuchVALI) + << " recall: " << 100.0*(1.0-ALmissingVALI/double(ALeventsMissingVALI)) + << " precision: " << 100.0*(1.0-ALtoomuchVALI/double(ALeventsToomuchVALI)) + << " (missing:" << ALmissingVALI << "/" << ALeventsMissingVALI << " " << ALtoomuchVALI + << " " << ALeventsToomuchVALI << ")\n"; + } + if( ALeventsMissingTEST+ALeventsToomuchTEST ){ + out << "alignmentErrors TEST(" << m << "): " + << 100.0*(ALmissingTEST+ALtoomuchTEST)/double(ALeventsMissingTEST+ALeventsToomuchTEST) + << " recall: " << 100.0*(1.0-ALmissingTEST/double(ALeventsMissingTEST)) + << " precision: " << 100.0*(1.0-ALtoomuchTEST/double(ALeventsToomuchTEST)) + << " (missing:" << ALmissingTEST << "/" << ALeventsMissingTEST << " " << ALtoomuchTEST + << " " << ALeventsToomuchTEST << ")\n"; + } + } +}; + +#endif |