diff options
Diffstat (limited to 'mgizapp/src/model1.cpp')
-rw-r--r-- | mgizapp/src/model1.cpp | 601 |
1 files changed, 601 insertions, 0 deletions
diff --git a/mgizapp/src/model1.cpp b/mgizapp/src/model1.cpp new file mode 100644 index 0000000..e649f8d --- /dev/null +++ b/mgizapp/src/model1.cpp @@ -0,0 +1,601 @@ +/* + +EGYPT Toolkit for Statistical Machine Translation +Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. + +This program is free software; you can redistribute it and/or +modify it under the terms of the GNU General Public License +as published by the Free Software Foundation; either version 2 +of the License, or (at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +You should have received a copy of the GNU General Public License +along with this program; if not, write to the Free Software +Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, +USA. + +*/ +#include "model1.h" +#include "Globals.h" +#include "utility.h" +#include "Parameter.h" + +extern short NoEmptyWord; +extern int VerboseSentence; + +extern short NCPUS; + +GLOBAL_PARAMETER2(int,Model1_Dump_Freq,"MODEL 1 DUMP FREQUENCY","t1","dump frequency of Model 1",PARLEV_OUTPUT,0); +int NumberOfVALIalignments=100; + +model1::model1(const char* efname, vcbList& evcblist, vcbList& fvcblist,tmodel<COUNT, PROB>&_tTable,Perplexity& _perp, + sentenceHandler& _sHandler1, + Perplexity* _testPerp, + sentenceHandler* _testHandler, + Perplexity& _trainViterbiPerp, + Perplexity* _testViterbiPerp): + report_info(_perp,_sHandler1,_testPerp,_testHandler,_trainViterbiPerp,_testViterbiPerp), + efFilename(efname), Elist(evcblist), Flist(fvcblist), + eTotalWCount(Elist.totalVocab()), fTotalWCount(Flist.totalVocab()), + noEnglishWords(Elist.size()), noFrenchWords(Flist.size()), tTable(_tTable), + evlist(Elist.getVocabList()), fvlist(Flist.getVocabList()) +{} + +model1::model1 (const model1& m1, int _threadID): +report_info(m1),efFilename(m1.efFilename), +Elist(m1.Elist),Flist(m1.Flist),eTotalWCount(m1.eTotalWCount),fTotalWCount(m1.fTotalWCount), +noEnglishWords(m1.noEnglishWords),noFrenchWords(m1.noFrenchWords),tTable(m1.tTable), +evlist(m1.evlist),fvlist(m1.fvlist) +{} + +void model1::initialize_table_uniformly(sentenceHandler& sHandler1){ + WordIndex i, j; + + cout << "Initialize tTable\n"; + + sentPair sent ; + sHandler1.rewind(); + while(sHandler1.getNextSentence(sent)){ + Vector<WordIndex>& es = sent.eSent; + Vector<WordIndex>& fs = sent.fSent; + PROB uniform = 1.0/es.size() ; + for( i=0; i < es.size(); i++) + for(j=1; j < fs.size(); j++) + tTable.insert(es[i],fs[j],0,uniform); + } +} + +struct em_loop_t{ + model1 *m1; + int it; + int nthread; + Dictionary *dict; + bool useDict; + int result; + pthread_t thread; + int valid ; +}; + +void* exe_emloop(void *arg){ + em_loop_t* em =(em_loop_t *) arg; + em->result = em->m1->em_thread(em->it,em->nthread,*em->dict,em->useDict); + return arg; +} + +int model1::em_thread(int noIterations, int nthread, /*Perplexity& perp, sentenceHandler& sHandler1, */ + Dictionary& dictionary, bool useDict /*Perplexity* testPerp, sentenceHandler* testHandler, + Perplexity& trainViterbiPerp, Perplexity* testViterbiPerp */ ) +{ + double minErrors=1.0;int minIter=0; + string modelName="Model1",shortModelName="1"; + char b[2]; + b[1] = '\0'; + b[0] = '0' + nthread; + time_t st, it_st, fn, it_fn; + string tfile, number, alignfile, test_alignfile; + int pair_no; + bool dump_files = false ; + cout << "==========================================================\n"; + cout << modelName << " Training Started at: "<< ctime(&st) << "\n"; + int it = noIterations; + pair_no = 0 ; + it_st = time(NULL); + cout << "-----------\n" << modelName << ": Iteration " << it << '\n'; + dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0) && !NODUMPS ; +// dump_files = true; + number = ""; + int n = it; + do{ + number.insert((size_t)0, 1, (char)(n % 10 + '0')); + } while((n /= 10) > 0); + alignfile = Prefix + ".A" + shortModelName + "." + number + ".part" ; + alignfile = alignfile + b; + + em_loop(it,perp, sHandler1, false, dump_files, alignfile.c_str(), dictionary, useDict, trainViterbiPerp); + return minIter; +} + +int model1::em_with_tricks(int noIterations, /*Perplexity& perp, sentenceHandler& sHandler1, */ + bool seedModel1, Dictionary& dictionary, bool useDict /*Perplexity* testPerp, sentenceHandler* testHandler, + Perplexity& trainViterbiPerp, Perplexity* testViterbiPerp */ +, bool dumpCount , const char* dumpCountName, bool useString) // If specified, then will dump files before last iteration +{ + double minErrors=1.0;int minIter=0; + string modelName="Model1",shortModelName="1"; + time_t st, it_st, fn, it_fn; + string tfile, number, alignfile, test_alignfile; + int pair_no; + bool dump_files = false ; + st = time(NULL); + sHandler1.rewind(); + cout << "==========================================================\n"; + cout << modelName << " Training Started at: "<< ctime(&st) << "\n"; + for(int it = 1; it <= noIterations; it++){ + pair_no = 0 ; + it_st = time(NULL); + cout << "-----------\n" << modelName << ": Iteration " << it << '\n'; + dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0) && !NODUMPS ; + //dump_files = true; + number = ""; + int n = it; + do{ + number.insert((size_t)0, 1, (char)(n % 10 + '0')); + } while((n /= 10) > 0); + tfile = Prefix + ".t" + shortModelName + "." + number ; + alignfile = Prefix + ".A" + shortModelName + "." + number+".part0" ; + test_alignfile = Prefix +".tst.A" + shortModelName + "." + number ; + initAL(); + threadID = 0; + int th; + vector<em_loop_t> ths; + ths.resize(NCPUS); + sHandler1.rewind(); + for (th=1;th<NCPUS;th++){ + ths[th].m1=this; + ths[th].it = it; + ths[th].nthread = th; + ths[th].dict = & dictionary; + ths[th].useDict = useDict; + ths[th].result = 0; + ths[th].valid = pthread_create(&(ths[th].thread),NULL,exe_emloop,&(ths[th])); + if(ths[th].valid){ + cerr << "Error starting thread " << th << endl; + } + } + em_loop(it,perp, sHandler1, seedModel1, dump_files, alignfile.c_str(), dictionary, useDict, trainViterbiPerp); + perp.record("Model1"); + trainViterbiPerp.record("Model1"); + errorReportAL(cout, "IBM-1"); + + cerr << "Main thread done, waiting" << endl;; + for (th=1;th<NCPUS;th++){ + pthread_join((ths[th].thread),NULL); + cerr << "Thread " << th << "done" << endl; + } + if (testPerp && testHandler) // calculate test perplexity + em_loop(it,*testPerp, *testHandler, seedModel1, dump_files, test_alignfile.c_str(), dictionary, useDict, *testViterbiPerp, true); + if( errorsAL()<minErrors ) { + minErrors=errorsAL(); + minIter=it; + } + //if (dump_files){ + // if( OutputInAachenFormat==1 ) + // tTable.printCountTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),1); + //} + cerr << "Normalizing T " << endl; + + /** + If asked for dumping count table, just dump it. + */ + if(dumpCount && it == noIterations){ + string realTableName = dumpCountName; + realTableName += ".t.count"; + tTable.printCountTable(realTableName.c_str(),Elist.getVocabList(),Flist.getVocabList(),useString); + } + + tTable.normalizeTable(Elist, Flist); + //cout << tTable.getProb(2,2) << endl; + cerr << " DONE Normalizing " << endl; + cout << modelName << ": ("<<it<<") TRAIN CROSS-ENTROPY " << perp.cross_entropy() + << " PERPLEXITY " << perp.perplexity() << '\n'; + if (testPerp && testHandler) + cout << modelName << ": ("<<it<<") TEST CROSS-ENTROPY " << (*testPerp).cross_entropy() + << " PERPLEXITY " << (*testPerp).perplexity() + << '\n'; + cout << modelName << ": ("<<it<<") VITERBI TRAIN CROSS-ENTROPY " << trainViterbiPerp.cross_entropy() + << " PERPLEXITY " << trainViterbiPerp.perplexity() << '\n'; + if (testPerp && testHandler) + cout << modelName << ": ("<< + it<<") VITERBI TEST CROSS-ENTROPY " + << (*testViterbiPerp).cross_entropy() + << " PERPLEXITY " << (*testViterbiPerp).perplexity() + << '\n'; + if (dump_files){ + if( OutputInAachenFormat==0 ) + tTable.printProbTable(tfile.c_str(),Elist.getVocabList(), + Flist.getVocabList(),OutputInAachenFormat); + } + it_fn = time(NULL); + cout << "Model 1 Iteration: " << it<< " took: " << difftime(it_fn, it_st) << " seconds\n"; + + + } + fn = time(NULL) ; + cout << "Entire " << modelName << " Training took: " << difftime(fn, st) << " seconds\n"; + return minIter; +} + +bool model1::load_table(const char* tname){ + /* This function loads the t table from the given file; use it + when you want to load results from previous t training + without doing any new training. + NAS, 7/11/99 + */ + cout << "Model1: loading t table \n" ; + return tTable.readProbTable(tname); +} + + +extern float MINCOUNTINCREASE; +void model1::em_loop(int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, + bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Perplexity& viterbi_perp, bool test) +{ + WordIndex i, j, l, m ; + double cross_entropy; + int pair_no=0 ; + perp.clear(); + viterbi_perp.clear(); + ofstream of2; + // for each sentence pair in the corpus + if (dump_alignment||FEWDUMPS) + of2.open(alignfile); + PROB uniform = 1.0/noFrenchWords ; + sentPair sent ; + + while(sHandler1.getNextSentence(sent)){ + Vector<WordIndex>& es = sent.eSent; + Vector<WordIndex>& fs = sent.fSent; + const float so = sent.getCount(); + l = es.size() - 1; + m = fs.size() - 1; + cross_entropy = log(1.0); + Vector<WordIndex> viterbi_alignment(fs.size()); + double viterbi_score = 1 ; + + bool eindict[l + 1]; + bool findict[m + 1]; + bool indict[m + 1][l + 1]; + if(it == 1 && useDict){ + for(unsigned int dummy = 0; dummy <= l; dummy++) eindict[dummy] = false; + for(unsigned int dummy = 0; dummy <= m; dummy++){ + findict[dummy] = false; + for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++) + indict[dummy][dummy2] = false; + } + for(j = 0; j <= m; j++) + for(i = 0; i <= l; i++) + if(dict.indict(fs[j], es[i])){ + eindict[i] = findict[j] = indict[j][i] = true; + } + } + + for(j=1; j <= m; j++){ + // entries that map fs to all possible ei in this sentence. + Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0); // cache pointers to table + LpPair<COUNT,PROB> **sPtrCachePtr; + + PROB denom = 0.0; + WordIndex best_i = 0 ; // i for which fj is best maped to ei + PROB word_best_score = 0 ; // score for the best mapping of fj + if (it == 1 && !seedModel1){ + denom = uniform * es.size() ; + word_best_score = uniform ; + } + else + for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ + PROB e(0.0) ; + (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ; + if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) + e = (*((*sPtrCachePtr))).prob; + else e = PROB_SMOOTH ; + denom += e ; + if (e > word_best_score){ + word_best_score = e ; + best_i = i ; + } + } + viterbi_alignment[j] = best_i ; + viterbi_score *= word_best_score ; /// denom ; + if (denom == 0){ + if (test) + cerr << "WARNING: denom is zero (TEST)\n"; + else + cerr << "WARNING: denom is zero (TRAIN)\n"; + } + cross_entropy += log(denom) ; + if (!test){ + if(denom > 0){ + COUNT val = COUNT(so) / (COUNT) double(denom) ; + /* this if loop implements a constraint on counting: + count(es[i], fs[j]) is implemented if and only if + es[i] and fs[j] occur together in the dictionary, + OR + es[i] does not occur in the dictionary with any fs[x] and + fs[j] does not occur in the dictionary with any es[y] + */ + if(it == 1 && useDict){ + for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ + if(indict[j][i] || (!findict[j] && !eindict[i])){ + PROB e(0.0) ; + if (it == 1 && !seedModel1) + e = uniform ; + else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) + e = (*((*sPtrCachePtr))).prob; + else e = PROB_SMOOTH ; + COUNT x=e*val; + if( (it==1 && !seedModel1)||x>MINCOUNTINCREASE ) + /* if ((*sPtrCachePtr) != 0) + (*((*sPtrCachePtr))).count += x; + else */ + tTable.incCount(es[i], fs[j], x); + } /* end of if */ + } /* end of for i */ + } /* end of it == 1 */ + // Old code: + else{ + for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ + //for(i=0; i <= l; i++) { + PROB e(0.0) ; + if (it == 1 && !seedModel1) + e = uniform ; + else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) + e = (*((*sPtrCachePtr))).prob; + else e = PROB_SMOOTH ; + //if( !(i==0) ) + //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl; + COUNT x=e*val; + if( pair_no==VerboseSentence ) + cout << i << "(" << evlist[es[i]].word << ")," << j << "(" << fvlist[fs[j]].word << ")=" << x << endl; + if( (it==1 && !seedModel1)||x>MINCOUNTINCREASE ){ + /*if( NoEmptyWord==0 || i!=0 ) + if ((*sPtrCachePtr) != 0) + (*((*sPtrCachePtr))).count += x; + else */ + //cerr << i << " " << j << " (+) " << endl; + //cerr.flush(); + //cerr << es[i] << " " << fs[j] << " (=) "<< endl; + //cerr.flush(); + tTable.incCount(es[i], fs[j], x); + //cerr << es[i] << " " << fs[j] << " (-) "<< endl; + //cerr.flush(); + } + } /* end of for i */ + } // end of else + } // end of if (denom > 0) + }// if (!test) + } // end of for (j) ; + sHandler1.setProbOfSentence(sent,cross_entropy); + //cerr << sent << "CE: " << cross_entropy << " " << so << endl; + perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1); + viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1); + if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000)) + printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score); + addAL(viterbi_alignment,sent.sentenceNo,l); + pair_no++; + } /* of while */ +} + +CTTableDiff<COUNT,PROB>* model1::one_step_em(int it, bool seedModel1, + Dictionary& dictionary, bool useDict){ + CTTableDiff<COUNT,PROB> *diff = new CTTableDiff<COUNT,PROB>(); + double minErrors=1.0;int minIter=0; + string modelName="Model1",shortModelName="1"; + time_t st, it_st, fn, it_fn; + string tfile, number, alignfile, test_alignfile; + int pair_no; + bool dump_files = false ; + st = time(NULL); + sHandler1.rewind(); + cout << "==========================================================\n"; + cout << modelName << " Training Started at: "<< ctime(&st) << "\n"; + pair_no = 0 ; + it_st = time(NULL); + cout << "-----------\n" << modelName << ": Iteration " << it << '\n'; + dump_files = (Model1_Dump_Freq != 0) && ((it % Model1_Dump_Freq) == 0) && !NODUMPS ; + number = ""; + int n = it; + do{ + number.insert((size_t)0, 1, (char)(n % 10 + '0')); + } while((n /= 10) > 0); + tfile = Prefix + ".t" + shortModelName + "." + number ; + alignfile = Prefix + ".A1" ; + test_alignfile = Prefix +".tst.A" + shortModelName + "." + number ; + initAL(); + em_loop_1(diff,it,perp, sHandler1, seedModel1, + dump_files, alignfile.c_str(), dictionary, useDict, trainViterbiPerp); + //if (testPerp && testHandler) // calculate test perplexity + // em_loop(it,*testPerp, *testHandler, seedModel1, dump_files, test_alignfile.c_str(), dictionary, useDict, *testViterbiPerp, true); + if( errorsAL()<minErrors ){ + minErrors=errorsAL(); + minIter=it; + } + fn = time(NULL) ; + cout << "Partial " << modelName << " Training took: " << difftime(fn, it_st) << " seconds\n"; + return diff; + } + + void model1::combine_one(CTTableDiff<COUNT,PROB>* cb){ + cb->AugmentTTable(tTable); + } + + void model1::recombine(){ + tTable.normalizeTable(Elist, Flist); + } + + void save_table(const char* tname){ +/* if (dump_files){ + * if( OutputInAachenFormat==0 ) + * tTable.printProbTable(tfile.c_str(),Elist.getVocabList(),Flist.getVocabList(),OutputInAachenFormat); + */ + + } + + +void model1::em_loop_1(CTTableDiff<COUNT,PROB> *diff,int it,Perplexity& perp, sentenceHandler& sHandler1, bool seedModel1, + bool dump_alignment, const char* alignfile, Dictionary& dict, bool useDict, Perplexity& viterbi_perp, bool test) { + WordIndex i, j, l, m ; + double cross_entropy; + int pair_no=0 ; + perp.clear(); + viterbi_perp.clear(); + ofstream of2; + // for each sentence pair in the corpus + if (dump_alignment||FEWDUMPS) + of2.open(alignfile); + PROB uniform = 1.0/noFrenchWords ; + sentPair sent ; + sHandler1.rewind(); + while(sHandler1.getNextSentence(sent)){ + Vector<WordIndex>& es = sent.eSent; + Vector<WordIndex>& fs = sent.fSent; + const float so = sent.getCount(); + l = es.size() - 1; + m = fs.size() - 1; + cross_entropy = log(1.0); + Vector<WordIndex> viterbi_alignment(fs.size()); + double viterbi_score = 1 ; + + bool eindict[l + 1]; + bool findict[m + 1]; + bool indict[m + 1][l + 1]; + if(it == 1 && useDict){ + for(unsigned int dummy = 0; dummy <= l; dummy++) eindict[dummy] = false; + for(unsigned int dummy = 0; dummy <= m; dummy++){ + findict[dummy] = false; + for(unsigned int dummy2 = 0; dummy2 <= l; dummy2++) + indict[dummy][dummy2] = false; + } + for(j = 0; j <= m; j++) + for(i = 0; i <= l; i++) + if(dict.indict(fs[j], es[i])){ + eindict[i] = findict[j] = indict[j][i] = true; + } + } + + for(j=1; j <= m; j++){ + // entries that map fs to all possible ei in this sentence. + Vector<LpPair<COUNT,PROB> *> sPtrCache(es.size(),0); // cache pointers to table + //Vector<COUNT *> sPtrCacheDif(es.size(),0); // cache pointers to table + LpPair<COUNT,PROB> **sPtrCachePtr; + //COUNT **sPtrCachePtrDif; + + PROB denom = 0.0; + WordIndex best_i = 0 ; // i for which fj is best maped to ei + PROB word_best_score = 0 ; // score for the best mapping of fj + if (it == 1 && !seedModel1){ + denom = uniform * es.size() ; + word_best_score = uniform ; + } + else { + for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ + PROB e(0.0) ; + (*sPtrCachePtr) = tTable.getPtr(es[i], fs[j]) ; + //(*sPtrCachePtrDif) = diff->GetPtr(es[i], fs[j]) ; + if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) + e = (*((*sPtrCachePtr))).prob; + else e = PROB_SMOOTH ; + denom += e ; + if (e > word_best_score){ + word_best_score = e ; + best_i = i ; + } + } + } + viterbi_alignment[j] = best_i ; + viterbi_score *= word_best_score ; /// denom ; + if (denom == 0){ + if (test) + cerr << "WARNING: denom is zero (TEST)\n"; + else + cerr << "WARNING: denom is zero (TRAIN)\n"; + } + cross_entropy += log(denom) ; + if (!test){ + if(denom > 0){ + COUNT val = COUNT(so) / (COUNT) double(denom) ; + /* this if loop implements a constraint on counting: + count(es[i], fs[j]) is implemented if and only if + es[i] and fs[j] occur together in the dictionary, + OR + es[i] does not occur in the dictionary with any fs[x] and + fs[j] does not occur in the dictionary with any es[y] + */ + if(it == 1 && useDict){ + for((i=0),(sPtrCachePtr=&sPtrCache[0]); + i <= l; i++,sPtrCachePtr++){ + if(indict[j][i] || (!findict[j] && !eindict[i])){ + PROB e(0.0) ; + if (it == 1 && !seedModel1) + e = uniform ; + else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) + e = (*((*sPtrCachePtr))).prob; + else e = PROB_SMOOTH ; + COUNT x=e*val; + if( it==1||x>MINCOUNTINCREASE ){ + /*if ((*sPtrCachePtr) != 0){ + (*((*sPtrCachePtr))).count += x; + } else {*/ + tTable.incCount(es[i], fs[j], x); + //} + diff->incCount(es[i], fs[j], x); + } + } /* end of if */ + } /* end of for i */ + } /* end of it == 1 */ + // Old code: + else{ + for((i=0),(sPtrCachePtr=&sPtrCache[0]); i <= l; i++,sPtrCachePtr++){ + //for(i=0; i <= l; i++) { + PROB e(0.0) ; + if (it == 1 && !seedModel1) + e = uniform ; + else if ((*sPtrCachePtr) != 0 && (*((*sPtrCachePtr))).prob > PROB_SMOOTH) + e = (*((*sPtrCachePtr))).prob; + else e = PROB_SMOOTH ; + //if( !(i==0) ) + //cout << "COUNT(e): " << e << " " << MINCOUNTINCREASE << endl; + COUNT x=e*val; + if( pair_no==VerboseSentence ) + cout << i << "(" << evlist[es[i]].word << ")," + << j << "(" << fvlist[fs[j]].word << ")=" << x << endl; + if( it==1||x>MINCOUNTINCREASE ) + if( NoEmptyWord==0 || ( NoEmptyWord==0 || i!=0 )){ + /*if ((*sPtrCachePtr) != 0){ + (*((*sPtrCachePtr))).count += x; + } else */ + tTable.incCount(es[i], fs[j], x); + diff->incCount(es[i], fs[j], x); + } + } /* end of for i */ + } // end of else + } // end of if (denom > 0) + }// if (!test) + } // end of for (j) ; + sHandler1.setProbOfSentence(sent,cross_entropy); + //cerr << sent << "CE: " << cross_entropy << " " << so << endl; + perp.addFactor(cross_entropy-m*log(l+1.0), so, l, m,1); + viterbi_perp.addFactor(log(viterbi_score)-m*log(l+1.0), so, l, m,1); + if (dump_alignment||(FEWDUMPS&&sent.sentenceNo<1000)) + printAlignToFile(es, fs, evlist, fvlist, of2, viterbi_alignment, sent.sentenceNo, viterbi_score); + addAL(viterbi_alignment,sent.sentenceNo,l); + pair_no++; + } /* of while */ + sHandler1.rewind(); + perp.record("Model1"); + viterbi_perp.record("Model1"); + errorReportAL(cout, "IBM-1"); + } |