Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mgiza.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'v0.6.4/src/model2.cpp')
-rw-r--r--v0.6.4/src/model2.cpp238
1 file changed, 238 insertions, 0 deletions
diff --git a/v0.6.4/src/model2.cpp b/v0.6.4/src/model2.cpp
new file mode 100644
index 0000000..affa0bd
--- /dev/null
+++ b/v0.6.4/src/model2.cpp
@@ -0,0 +1,238 @@
+/*
+
+EGYPT Toolkit for Statistical Machine Translation
+Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky.
+
+This program is free software; you can redistribute it and/or
+modify it under the terms of the GNU General Public License
+as published by the Free Software Foundation; either version 2
+of the License, or (at your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with this program; if not, write to the Free Software
+Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
+USA.
+
+*/
+#include "model2.h"
+#include "Globals.h"
+#include "utility.h"
+#include "Parameter.h"
+#include "defs.h"
+
+extern short NoEmptyWord;
+
+
+GLOBAL_PARAMETER2(int,Model2_Dump_Freq,"MODEL 2 DUMP FREQUENCY","t2","dump frequency of Model 2",PARLEV_OUTPUT,0);
+
+// Build a Model 2 on top of an already-trained Model 1: the base-class copy
+// shares the lexical t-table and vocabularies, while the alignment
+// probability table (aTable) and its count accumulator (aCountTable) are
+// supplied by the caller and held by reference.
+model2::model2(model1& m,amodel<PROB>&_aTable,amodel<COUNT>&_aCountTable):
+  model1(m),aTable(_aTable),aCountTable(_aCountTable)
+{ }
+
+// Seed aTable with a uniform distribution: for every sentence-length pair
+// (l,m) seen in the corpus, set a(i|j,l,m) = 1/(l+1) for all source
+// positions i in 0..l (0 being the empty word) and targets j in 1..m.
+void model2::initialize_table_uniformly(sentenceHandler& sHandler1){
+  // initialize the aTable uniformly (run this before running em_with_tricks)
+  int n=0;
+  sentPair sent ;
+  sHandler1.rewind();
+  while(sHandler1.getNextSentence(sent)){
+    Vector<WordIndex>& es = sent.eSent;
+    Vector<WordIndex>& fs = sent.fSent;
+    WordIndex l = es.size() - 1;  // source length, excluding the empty-word slot
+    WordIndex m = fs.size() - 1;  // target length; target words are 1-based
+    n++;
+    // Only fill a length pair whose cells are still at (or below) the
+    // smoothing floor, i.e. not initialized by an earlier sentence of the
+    // same (l,m).
+    if(1<=m&&aTable.getValue(l,m,l,m)<=PROB_SMOOTH){
+      PROB uniform_val = 1.0 / (l+1) ;  // l real positions plus the empty word
+      for(WordIndex j=1; j <= m; j++)
+        for(WordIndex i=0; i <= l; i++)
+          aTable.setValue(i,j, l, m, uniform_val);
+    }
+  }
+}
+
+// Run `noIterations` EM iterations of Model 2 training over sHandler1.
+// When dumpCount is set, the final iteration additionally writes the raw
+// t- and a-count tables to <dumpCountName>.t.count / .a.count (useString
+// selects word-string vs. word-id output for the t-counts).
+// Returns the iteration number with the lowest alignment error rate as
+// measured by errorsAL() against the reference alignments, if any.
+// Change vs. original: removed dead locals `pair_no` (assigned, never read)
+// and `ofstream of2` (never used).
+int model2::em_with_tricks(int noIterations,bool dumpCount,
+			   const char* dumpCountName, bool useString){
+  double minErrors=1.0;int minIter=0;  // best AER so far and its iteration
+  string modelName="Model2",shortModelName="2";
+  time_t it_st, st, it_fn, fn;         // per-iteration and overall timestamps
+  string tfile, afile, number, alignfile, test_alignfile;
+  bool dump_files = false ;
+  st = time(NULL) ;
+  sHandler1.rewind();
+  cout << "\n==========================================================\n";
+  cout << modelName << " Training Started at: " << ctime(&st) << " iter: " << noIterations << "\n";
+  for(int it=1; it <= noIterations ; it++){
+    it_st = time(NULL) ;
+    cout << endl << "-----------\n" << modelName << ": Iteration " << it << '\n';
+    // Dump model files this iteration only if a dump frequency is configured,
+    // this iteration hits it, and dumping is not globally disabled.
+    dump_files = (Model2_Dump_Freq != 0) && ((it % Model2_Dump_Freq) == 0) && !NODUMPS;
+    // Render the iteration number as a decimal string (pre-C++11 code base,
+    // no std::to_string).
+    number = "";
+    int n = it;
+    do{
+      number.insert((size_t)0, 1, (char)(n % 10 + '0'));
+    } while((n /= 10) > 0);
+    tfile = Prefix + ".t" + shortModelName + "." + number ;
+    afile = Prefix + ".a" + shortModelName + "." + number ;
+    alignfile = Prefix + ".A" + shortModelName + "." + number ;
+    test_alignfile = Prefix + ".tst.A" + shortModelName + "." + number ;
+    aCountTable.clear();
+    initAL();
+    // E-step over the training corpus: collects t- and a-counts and, when
+    // dumping, writes Viterbi alignments to alignfile.
+    em_loop(perp, sHandler1, dump_files, alignfile.c_str(), trainViterbiPerp, false);
+    if( errorsAL()<minErrors ){
+      minErrors=errorsAL();
+      minIter=it;
+    }
+    // Evaluate on held-out data without collecting counts (test=true).
+    if (testPerp && testHandler)
+      em_loop(*testPerp, *testHandler, dump_files, test_alignfile.c_str(), *testViterbiPerp, true);
+    if (dump_files&&OutputInAachenFormat==1)
+      tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1);
+
+    // On the last iteration, optionally dump the raw count tables before
+    // they are normalized away.
+    if(dumpCount && it == noIterations){
+      string realTableName = dumpCountName;
+      realTableName += ".t.count";
+      tTable.printCountTable(realTableName.c_str(),Elist.getVocabList(),Flist.getVocabList(),useString);
+      string realATableName = dumpCountName;
+      realATableName += ".a.count";
+      aCountTable.printRealTable(realATableName.c_str());
+    }
+    // M-step: turn accumulated counts into the next iteration's probabilities.
+    tTable.normalizeTable(Elist, Flist);
+    aCountTable.normalize(aTable);
+    cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy()
+	 << " PERPLEXITY " << perp.perplexity() << '\n';
+    if (testPerp && testHandler)
+      cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << (*testPerp).cross_entropy()
+	   << " PERPLEXITY " << (*testPerp).perplexity()
+	   << '\n';
+    cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy()
+	 << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n';
+    if (testPerp && testHandler)
+      cout << modelName << ": ("<<it<<") VITERBI TEST CROSS-ENTROPY " << testViterbiPerp->cross_entropy()
+	   << " PERPLEXITY " << testViterbiPerp->perplexity()
+	   << '\n';
+    if (dump_files) {
+      if(OutputInAachenFormat==0)
+	tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat);
+      aCountTable.printTable(afile.c_str());
+    }
+    it_fn = time(NULL) ;
+    cout << modelName << " Iteration: " << it<< " took: " << difftime(it_fn, it_st) << " seconds\n";
+  } // end of iterations
+  aCountTable.clear();
+  fn = time(NULL) ;
+  cout << endl << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n";
+  // cout << "tTable contains " << tTable.getHash().bucket_count()
+  //     << " buckets and " << tTable.getHash().size() << " entries." ;
+  cout << "==========================================================\n";
+  return minIter;
+}
+
+// Load a previously trained alignment (a) table from `aname` via
+// amodel::readTable, so training can be skipped or resumed.
+void model2::load_table(const char* aname){
+  /* This function loads the a table from the given file; use it
+     when you want to load results from previous a training without
+     doing any new training.
+     NAS, 7/11/99
+  */
+  cout << "Model2: loading a table \n";
+  aTable.readTable(aname);
+}
+
+
+// One EM pass over the corpus served by sHandler1.
+// `perp` accumulates the corpus cross-entropy summed over all alignments;
+// `viterbi_perp` accumulates it for the single best (Viterbi) alignment.
+// When `test` is true, only perplexities/alignments are computed and no
+// counts are collected.  Viterbi alignments go to `alignfile` when
+// dump_alignment (or the FEWDUMPS debug flag) is set.
+// Change vs. original: removed dead locals `pair_no` (only incremented,
+// never read) and `ferts` (allocated, never used).
+void model2::em_loop(Perplexity& perp, sentenceHandler& sHandler1,
+		     bool dump_alignment, const char* alignfile, Perplexity& viterbi_perp,
+		     bool test)
+{
+  massert( aTable.is_distortion==0 );
+  massert( aCountTable.is_distortion==0 );
+  WordIndex i, j, l, m ;
+  double cross_entropy;
+  perp.clear();
+  viterbi_perp.clear();
+  ofstream of2;
+  // for each sentence pair in the corpus
+  if (dump_alignment||FEWDUMPS )
+    of2.open(alignfile);
+  sentPair sent ;
+
+  sHandler1.rewind();
+  while(sHandler1.getNextSentence(sent)){
+    Vector<WordIndex>& es = sent.eSent;
+    Vector<WordIndex>& fs = sent.fSent;
+    const float so = sent.getCount();  // corpus weight of this sentence pair
+    l = es.size() - 1;                 // source length; position 0 is the empty word
+    m = fs.size() - 1;                 // target length; target words are 1-based
+    cross_entropy = log(1.0);
+    Vector<WordIndex> viterbi_alignment(fs.size());
+    double viterbi_score = 1;
+    for(j=1; j <= m; j++){
+      // Cache t-table entries mapping fs[j] to each es[i] so the count
+      // update below does not repeat the lookups.
+      Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0);
+      PROB denom = 0.0;                // sum_i t(f_j|e_i) * a(i|j,l,m)
+      PROB e = 0.0, word_best_score = 0;
+      WordIndex best_i = 0 ;           // i for which fj is best mapped to ei
+      for(i=0; i <= l; i++){
+	sPtrCache[i] = tTable.getPtr(es[i], fs[j]) ;
+	// Fall back to the smoothing floor for unseen or tiny lexical probs.
+	if (sPtrCache[i] != 0 &&(*(sPtrCache[i])).prob > PROB_SMOOTH )
+	  e = (*(sPtrCache[i])).prob * aTable.getValue(i,j, l, m) ;
+	else e = PROB_SMOOTH * aTable.getValue(i,j, l, m);
+	denom += e ;
+	if (e > word_best_score){
+	  word_best_score = e ;
+	  best_i = i ;
+	}
+      }
+      viterbi_alignment[j] = best_i ;
+      viterbi_score *= word_best_score; ///denom ;
+      cross_entropy += log(denom) ;
+      if (denom == 0){
+	if (test)
+	  cerr << "WARNING: denom is zero (TEST)\n";
+	else
+	  cerr << "WARNING: denom is zero (TRAIN)\n";
+      }
+      if (!test){
+	// Count collection: each e_i receives the posterior mass
+	// p(a_j = i | f_j) = e_i / denom, scaled by the sentence weight.
+	if(denom > 0){
+	  COUNT val = COUNT(so) / (COUNT) double(denom) ;
+	  for( i=0; i <= l; i++){
+	    PROB e(0.0);
+	    if (sPtrCache[i] != 0 && (*(sPtrCache[i])).prob > PROB_SMOOTH)
+	      e = (*(sPtrCache[i])).prob ;
+	    else e = PROB_SMOOTH ;
+	    e *= aTable.getValue(i,j, l, m);
+	    COUNT temp = COUNT(e) * val ;
+	    // Skip lexical (t-table) counts for the empty word when
+	    // NoEmptyWord is set; the `else` binds to the inner `if`
+	    // (create the entry when the cache pointer is null).
+	    if( NoEmptyWord==0 || i!=0 )
+	      if (sPtrCache[i] != 0)
+		(*(sPtrCache[i])).count += temp ;
+	      else
+		tTable.incCount(es[i], fs[j], temp);
+	    // NOTE(review): alignment counts are accumulated even when the
+	    // empty-word lexical count above is suppressed — preserved
+	    // original behavior.
+	    aCountTable.addValue(i,j, l, m,temp) ;
+	  } /* end of for i */
+	} // end of if (denom > 0)
+      }// if (!test)
+    } // end of for (j) ;
+    sHandler1.setProbOfSentence(sent,cross_entropy);
+    perp.addFactor(cross_entropy, so, l, m,1);
+    viterbi_perp.addFactor(log(viterbi_score), so, l, m,1);
+    if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000) )
+      printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, viterbi_alignment, sent.sentenceNo, viterbi_score);
+    addAL(viterbi_alignment,sent.sentenceNo,l);
+  } /* of while */
+  sHandler1.rewind();
+  perp.record("Model2");
+  viterbi_perp.record("Model2");
+  errorReportAL(cout,"IBM-2");
+}
+
+
+
+
+