/* EGYPT Toolkit for Statistical Machine Translation Written by Yaser Al-Onaizan, Jan Curin, Michael Jahr, Kevin Knight, John Lafferty, Dan Melamed, David Purdy, Franz Och, Noah Smith, and David Yarowsky. This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation; either version 2 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with this program; if not, write to the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. */ #include "mystl.h" #include "model3.h" #include "collCounts.h" #include "utility.h" #include "Globals.h" #include "D5Tables.h" #include "transpair_model5.h" #include "transpair_modelhmm.h" #include "myassert.h" #include "Parameter.h" GLOBAL_PARAMETER(float,PrintN,"nbestalignments","for printing the n best alignments",PARLEV_OUTPUT,0) ; const short LogHillClimb = 0, LogPeg = 0; const short UseHMMViterbiAlignmentIfPossible = 1; short DoViterbiTraining = 0; GLOBAL_PARAMETER(int,VerboseSentence,"VerboseSentence","number of sentence for which a lot of information should be printed (negative: no output)",PARLEV_OUTPUT,-10) ; GLOBAL_PARAMETER(double,PEGGED_CUTOFF,"PEGGED_CUTOFF","relative cutoff probability for alignment-centers in pegging",PARLEV_OPTHEUR,3e-2) ; GLOBAL_PARAMETER2(float, COUNTINCREASE_CUTOFF_AL,"COUNTINCREASE CUTOFF AL","countCutoffAl","Counts increment cutoff threshold for alignments in training of fertility models",PARLEV_OPTHEUR,1e-5) ; //int SentNr; bool UseLinkCache = 1; /// optimization for pegging int NumberOfAlignmentsInSophisticatedCountCollection; extern bool ONLYALDUMPS; int PrintHillClimbWarning = 0; int PrintZeroScoreWarning = 0; LogProb model3::viterbi_model2(const transpair_modelhmm&ef, alignment&output, int #ifdef STORE_HMM_ALIGNMENTS pair_no #endif , int i_peg, int j_peg) const { static Vector > viterbis; Vector vit; int m = ef.get_m(); int l = ef.get_l(); double ret = 0.0; //#define STORE_HMM_ALIGNMENTS #ifdef STORE_HMM_ALIGNMENTS if( i_peg==-1 && j_peg==-1 && viterbis.size()>pair_no ) { output=viterbis[pair_no].first; ret=viterbis[pair_no].second; massert( ret==HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply ); } else { ret=HMMRealViterbi(*ef.net,vit,i_peg-1,j_peg-1)*ef.net->finalMultiply; for(int j=1;j<=m;j++) { if( vit[j-1]+1>l ) output.set(j,0); else output.set(j,vit[j-1]+1); massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j); } if( i_peg==-1 && j_peg==-1 ) { iassert(viterbis.size()==pair_no); viterbis.push_back(make_pair(output,ret)); } } #else ret = HMMRealViterbi(*ef.net, vit, i_peg - 1, j_peg - 1) * ef.net->finalMultiply; for (int j = 1; j <= m; j++) { if (vit[j - 1] + 1 > l) output.set(j, 0); else output.set(j, vit[j - 1] + 1); massert( (j==j_peg&&int(output(j))==i_peg) || j_peg!=j); } #endif massert( j_peg==-1 || int(output(j_peg))==i_peg ); if (j_peg != -1) massert(int(output(j_peg))==i_peg); if (output.valid()) return ret; else { return _viterbi_model2(ef, output, i_peg, j_peg); } } LogProb model3::_viterbi_model2(const transpair_model2&ef, alignment&output, int i_peg, int j_peg) const { WordIndex best_i = 0; LogProb ss = 1; PositionIndex l = ef.get_l(), m = ef.get_m(); Vector Fert(l + 1, (WordIndex) 0); if ((j_peg != -1) && (i_peg != -1)) { output.set(j_peg, i_peg); ss *= ef.get_t(i_peg, j_peg) * ef.get_a(i_peg, j_peg); if (ss == 0) cerr << "WARNING: already starting is zero: " << ef.get_t(i_peg, j_peg) << " " << ef.get_a(i_peg, j_peg) << '\n'; } else ss = 1; for (PositionIndex j = 1; j <= m; j++) if (int(j) != j_peg) { LogProb score = 0; for (PositionIndex i = 0; i <= l; i++) { if (Fert[i] + 1 < MAX_FERTILITY && (i != 0 || m >= (2 * (Fert[0] + 1)))) { LogProb temp = ef.get_t(i, j) * ef.get_a(i, j); if (temp > score) { best_i = i; score = temp; } } } if (score == 0) { cerr << "WARNING: In searching for model2 best alignment\n"; cerr << "Nothing was set for target token at position j: " << j << "\n"; for (PositionIndex i = 0; i <= l; i++) { cerr << "i: " << i << "ttable(" << i << ", " << j << ") = " << ef.get_t(i, j) << " atable(" << i << ", " << j << ", " << l << ", " << m << ") = " << ef.get_a(i, j) << " product " << ef.get_t(i, j) * ef.get_a(i, j); if ((Fert[i] + 1 < MAX_FERTILITY) && ((i == 0 && (m >= 2 * (Fert[0] + 1))) || (i != 0))) cerr << "Passed fertility condition \n"; else cerr << "Failed fertility condition \n"; } } else { output.set(j, best_i); Fert[best_i]++; } ss *= score; } if (ss <= 0) { //cerr << ef; cerr << "WARNING: Model2 viterbi alignment has zero score.\n"; cerr << "Here are the different elements that made this alignment probability zero \n"; cerr << "Source length " << l << " target length " << m << '\n'; LogProb gg = 1; // for debugging only ..... for (PositionIndex j = 1; j <= m; j++) if (int(j) != j_peg) { LogProb score = 0; LogProb a = 0, t = 0; for (PositionIndex i = 0; i <= l; i++) { // if( Debug_Fert[i]+1=(2 * (Debug_Fert[0] + 1)))){ LogProb temp = ef.get_t(i, j) * ef.get_a(i, j); if (temp > score) { score = temp; best_i = i; a = ef.get_a(i, j); t = ef.get_t(i, j); } // } } gg *= score; cerr << "best: fs[" << j << "] " << j << " : es[" << best_i << "] " << best_i << " , a: " << ef.get_a(best_i, j) << " t: " << t << " score " << score << " product : " << gg << " ss " << ss << '\n'; } for (PositionIndex i = 0; i <= l; i++) cerr << "Fert[" << i << "] selected " << Fert[i] << '\n'; } massert(output.valid()); return ss; } LogProb model3::viterbi_model2(const transpair_model3&ef, alignment&output, int pair_no, int i_peg, int j_peg) const { if (h && UseHMMViterbiAlignmentIfPossible) { transpair_modelhmm efhmm(ef.E, ef.F, tTable, aTable, dTable, nTable, 0.0, 0.0, h); LogProb ret = viterbi_model2(efhmm, output, pair_no, i_peg, j_peg); massert(output.valid()); return ret; } return _viterbi_model2(ef, output, i_peg, j_peg); } //int HillClimbingSteps=0; template LogProb greedyClimb_WithIBM3Scoring(MoveSwapMatrix&msc2, int& HillClimbingSteps, int j_peg = -1) { PositionIndex l = msc2.get_l(), m = msc2.get_m(); int changed = 0; int iter = 0; bool hereVERB = 0; do { MoveSwapMatrix msc_IBM3( msc2.get_ef(), alignment(msc2)); vector > msvec; for (PositionIndex j = 1; j <= m; j++) if (int(j) != j_peg) { WordIndex aj = msc2(j); for (PositionIndex j1 = j + 1; j1 <= m; j1++) if ((aj != msc2(j1)) && (int(j1) != j_peg)) msvec.push_back(pair ( -msc_IBM3.cswap(j, j1), OneMoveSwap(1, j, j1))); for (PositionIndex i = 0; i <= l; i++) if (i != aj && (i != 0 || (m >= 2 * (msc2.fert(0) + 1))) && msc2.fert(i) + 1 < MAX_FERTILITY) msvec.push_back(pair ( -msc_IBM3.cmove(i, j), OneMoveSwap(2, i, j))); } sort(msvec.begin(), msvec.end()); HillClimbingSteps++; int iused = -1; changed = 0; for (unsigned int i = 0; i < msvec.size() && changed == 0; ++i) { LogProb csts; const OneMoveSwap &oms = msvec[i].second; if (oms.type == 1 && (csts = msc2.cswap(oms.a, oms.b)) > 1.0001) { if (hereVERB == 1) cerr << "SWAP: " << csts << '\n'; msc2.doSwap(oms.a, oms.b); changed = 1; iused = i; break; } if (oms.type == 2 && (csts = msc2.cmove(oms.a, oms.b)) > 1.0001) { if (hereVERB == 1) cerr << "MOVE: " << csts << '\n'; msc2.doMove(oms.a, oms.b); changed = 1; iused = i; break; } } if (++iter > 30) { //msc2.ef.verboseTP=1; hereVERB = 1; cerr << "ERROR: more than 30 iterations in hill-climbing: " << iused << " improvement: " << msvec[iused].first << " value:" << msvec[iused].second << '\n' << msc2 << '\n'; for (int a = 0; a < 20; ++a) cout << a << ' ' << msvec[a].first << ' ' << msvec[a].second << '\n'; //cerr << msvec << '\n'; } if (iter > 50) break; } while (changed); return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); } template LogProb greedyClimb(MoveSwapMatrix&msc2, int& HillClimbingSteps, int j_peg = -1) { if (msc2.get_ef().greedyHillClimbing() == 1) return greedyClimb_WithIBM3Scoring(msc2, HillClimbingSteps, j_peg); PositionIndex l = msc2.get_l(), m = msc2.get_m(); int changed = 0; do { HillClimbingSteps++; changed = 0; for (PositionIndex j = 1; j <= m; j++) if (int(j) != j_peg) { WordIndex aj = msc2(j); for (PositionIndex j1 = j + 1; j1 <= m; j1++) if ((aj != msc2(j1)) && (int(j1) != j_peg) && msc2.cswap(j, j1) > 1.0) msc2.doSwap(j, j1), changed = 1; for (PositionIndex i = 0; i <= l; i++) if (i != aj && (i != 0 || (m >= 2 * (msc2.fert(0) + 1))) && msc2.fert(i) + 1 < MAX_FERTILITY && msc2.cmove( i, j) > 1.0) msc2.doMove(i, j), changed = 1; } } while (changed); return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); } template LogProb hillClimb_std(MoveSwapMatrix&msc2, int &HillClimbingSteps, int = -1, int j_peg = -1) { if (msc2.isLazy()) return greedyClimb_WithIBM3Scoring(msc2, HillClimbingSteps, j_peg); if (LogHillClimb > 1) cout << msc2 << '\n'; PositionIndex l = msc2.get_l(), m = msc2.get_m(); int changes = 0; int best_change_type = -1, best_change_v1 = -1, best_change_v2 = -1; do { HillClimbingSteps++; LogProb best_change_so_far = 1.00001; best_change_type = 0; for (PositionIndex j = 1; j <= m; j++) if (int(j) != j_peg) { WordIndex aj = msc2(j); for (PositionIndex j1 = j + 1; j1 <= m; j1++) if ((aj != msc2(j1)) && (int(j1) != j_peg)) { LogProb change = msc2.cswap(j, j1); if (change > best_change_so_far) { best_change_so_far = change; best_change_type = 1; best_change_v1 = j; best_change_v2 = j1; if (LogHillClimb) cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n'; massert(msc2.get_ef().isSubOptimal()==1); } } for (PositionIndex i = 0; i <= l; i++) if (i != aj && (i != 0 || (m >= 2 * (msc2.fert(0) + 1))) && msc2.fert(i) + 1 < MAX_FERTILITY) { LogProb change = msc2.cmove(i, j); if (change > best_change_so_far) { best_change_so_far = change; best_change_type = 2; best_change_v1 = j; best_change_v2 = i; if (LogHillClimb) cerr << "CLIMB: " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << " " << best_change_so_far << msc2 << '\n'; massert(msc2.get_ef().isSubOptimal()==1); } } } if (best_change_type == 1) { msc2.doSwap(best_change_v1, best_change_v2); if (LogHillClimb) cerr << "SW-CLIMB-DONE: " << j_peg << msc2 << '\n'; } if (best_change_type == 2) { msc2.doMove(best_change_v2, best_change_v1); if (LogHillClimb) cerr << "MO-CLIMB-DONE: " << j_peg << msc2 << '\n'; } changes++; if (changes > 40) { if (PrintHillClimbWarning++ < 1000) cerr << "WARNING: already " << changes << " iterations in hillclimb: " << best_change_so_far << " " << best_change_type << " " << best_change_v1 << " " << best_change_v2 << '\n'; else if (PrintHillClimbWarning == 1000) cerr << "ERROR: too many hill climbing warnings => I do not print more.\n"; } if (changes > 60) { cerr << msc2 << '\n'; break; } } while (best_change_type); return msc2.get_ef().prob_of_target_and_alignment_given_source(msc2); } template bool extendCenterList( Vector*, LogProb> >&setOfGoodCenters, MoveSwapMatrix *msc, double peggedAlignmentScore) { unsigned int l = msc->get_ef().get_l(); set alreadyCovered; for (unsigned int nr = 0; nr < setOfGoodCenters.size(); nr++) makeOneMoveSwap(*setOfGoodCenters[nr].first, *msc, alreadyCovered); for (set::const_iterator i = alreadyCovered.begin(); i != alreadyCovered.end(); ++i) { if (i->type == 1 || i->type == 4) msc->delCenter(); if (i->type == 1) { for (unsigned int ii = 0; ii <= l; ++ii) if ((*msc)(i->a) != ii) msc->delMove(ii, i->a); } else if (i->type == 2 || i->type == 4) msc->delSwap(i->a, i->b); else if (i->type == 3) msc->delMove(i->b, i->a); else abort(); } setOfGoodCenters.push_back(make_pair(msc, peggedAlignmentScore)); return 1; } bool OldLog = 0; short OldLogPeg = 0, OldLogHillClimb = 0; class Als { public: int s, a, b; double v; Als(int _s, int _a, int _b, double _v) : s(_s), a(_a), b(_b), v(_v) { } }; inline bool operator<(const Als&x, const Als&y) { return x.v > y.v; } template void model3::viterbi_loop_with_tricks(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, bool dump_files, const char* alignfile, bool collect_counts, string model, bool final, ADDITIONAL_MODEL_DATA_IN*dm_in, ADDITIONAL_MODEL_DATA_OUT*dm_out) { ofstream *writeNBestErrorsFile = 0; if ((dump_files || FEWDUMPS) && PrintN && ReferenceAlignment.size() > 0) { string x = alignfile + string("NBEST"); writeNBestErrorsFile = new ofstream(x.c_str()); } ofstream *of3 = 0; PositionIndex i, j, l, m; ofstream of2; int pair_no; int HillClimbingSteps = 0; NumberOfAlignmentsInSophisticatedCountCollection = 0; if (dump_files || FEWDUMPS || (final && (ONLYALDUMPS))) { of2.open(alignfile); if (of2.is_open()) { cout << "I will write alignment to " << alignfile << endl; } } /* if(!of2.is_open()){ cerr << "I don't know why you do not let me dump file " << alignfile << endl; }*/ if (dump_files && PrintN && final) { string x = alignfile + string("NBEST"); of3 = new ofstream(x.c_str()); } pair_no = 0; // sentence pair number // for each sentence pair in the corpus perp.clear(); // clears cross_entrop & perplexity viterbiPerp.clear(); // clears cross_entrop & perplexity sentPair sent; int NCenter = 0, NHillClimbed = 0, NAlignment = 0, NTotal = 0, NBetterByPegging = 0; while (sHandler1.getNextSentence(sent)) { //cerr << sent.fAnchor.size()<< endl; if (sent.eSent.size() == 1 || sent.fSent.size() == 1) continue; // SentNr=sent.sentenceNo; Vector& es = sent.eSent; Vector& fs = sent.fSent; const float count = sent.getCount(); if ((sent.sentenceNo % 10000) == 0) cerr << sent.sentenceNo << '\n'; time_t sent_s = time(NULL); pair_no++; l = es.size() - 1; m = fs.size() - 1; LogProb align_total_count = 0; alignment viterbi2alignment(l, m); MODEL_TYPE ef(es, fs, tTable, aTable, dTable, nTable, p1, p0, dm_in); viterbi_model2(ef, viterbi2alignment, pair_no - 1); // after getting viterbi alignment, we manually modify it to make sure the whitelist is not violated // for(int _i = 0; i < sent.fAnchor.size();i++){ // if(sent.fAnchor[_i]) // viterbi2alignment.set(sent.fAnchor[_i],sent.eAnchor[_i]); // else{ // the link means no word should be aligned to the target word, so we eliminate all the links. // for(int _j = 1 ; _j < m; _j++){ // int aj = viterbi2alignment.get_al(_j); // if(aj == sent.eAnchor[_i]){ // viterbi2alignment.set(_j,0); // } // } // } // } Vector*, LogProb> > setOfGoodCenters(1); set alignments; MoveSwapMatrix *best = (setOfGoodCenters[0].first = new MoveSwapMatrix (ef, viterbi2alignment)); // after getting viterbi alignment, we manually modify it to make sure the whitelist is not violated set visitedJ ; visitedJ.clear(); for(int _i = 0; _i < sent.fAnchor.size();_i++){ if(sent.fAnchor[_i] > m || sent.eAnchor[_i] > l) continue; if(sent.fAnchor[_i]){ if(visitedJ.find(sent.fAnchor[_i])==visitedJ.end() && best->get_al(sent.fAnchor[_i])!=sent.eAnchor[_i]) best->doMove(sent.eAnchor[_i],sent.fAnchor[_i]); setOfGoodCenters[0].first->addUnTouchJ(sent.fAnchor[_i],sent.eAnchor[_i]); //setOfGoodCenters[0].first->addUnTouchI(sent.eAnchor[_i],sent.fAnchor[_i]); }else{ // the link means no word should be aligned to the target word, so we eliminate all the links. //best->doMove(0,sent.fAnchor[_j]); for (int _j = 1; _j < m; _j++) { int aj = best->get_al(_j); if (aj == sent.eAnchor[_i]) { best->doMove(0,_j); } } setOfGoodCenters[0].first->addUnTouchI(sent.eAnchor[_i],sent.fAnchor[_i]); } } MoveSwapMatrix _viterbi(*best), *viterbi = &_viterbi; // please, don't delete this line (FJO) // We prepare the move matrix ignore list if (ef.isSubOptimal()) setOfGoodCenters[0].second = hillClimb_std(*best, HillClimbingSteps); else { setOfGoodCenters[0].second = best->get_ef().prob_of_target_and_alignment_given_source( *best); if (setOfGoodCenters[0].second == 0) { cerr << "PROBLEM: alignment is 0.\n"; best->get_ef().prob_of_target_and_alignment_given_source(*best, 1); } } int bestAlignment = 0; for (unsigned int i = 0; i < setOfGoodCenters.size(); ++i) setOfGoodCenters[i].first->check(); alignments.insert(*best); if (setOfGoodCenters[bestAlignment].second <= 0) { if (PrintZeroScoreWarning++ < 100) { cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; cerr << alignment(*setOfGoodCenters[bestAlignment].first); printSentencePair(es, fs, cerr); } else if (PrintZeroScoreWarning == 100) { cerr << "ERROR: too many zero score warnings => no additional one will be printed\n"; } setOfGoodCenters[bestAlignment].second = 1e-300; continue; } int nHillClimbed = 1, nAlignment = 1; bool flagBetterByPegging = 0; if (Peg) { const MoveSwapMatrix *useMatrix = viterbi; // it is faster using 'best', ... (FJO) Array2 > linkCache(l + 1, m + 1, false); if (UseLinkCache) for (unsigned int j = 1; j <= m; j++) linkCache((*useMatrix)(j), j) = 1; for (PositionIndex j = 1; j <= m; j++) for (PositionIndex i = 0; i <= l; i++) { nAlignment++; if (i != (*useMatrix)(j) && (UseLinkCache == 0 || linkCache(i, j) == 0) && ef.get_t(i, j) > ef.get_t((*useMatrix)(j), j) * PEGGED_CUTOFF && (i != 0 || (m >= 2 * (useMatrix->fert(0) + 1)))) { MoveSwapMatrix *BESTPEGGED = 0; LogProb peggedAlignmentScore; nHillClimbed++; if (ef.isSubOptimal()) { BESTPEGGED = new MoveSwapMatrix ( *useMatrix); BESTPEGGED->doMove(i, j); peggedAlignmentScore = hillClimb_std(*BESTPEGGED, HillClimbingSteps, i, j); } else { alignment pegAlignment(l, m); peggedAlignmentScore = viterbi_model2(ef, pegAlignment, pair_no - 1, i, j); BESTPEGGED = new MoveSwapMatrix (ef, pegAlignment); massert( pegAlignment(j)==i ); } if (UseLinkCache) for (unsigned int j = 1; j <= m; j++) linkCache((*BESTPEGGED)(j), j) = 1; if (peggedAlignmentScore > setOfGoodCenters[bestAlignment].second * (LogProb) PEGGED_CUTOFF && alignments.count(*BESTPEGGED) == 0) { if (extendCenterList(setOfGoodCenters, BESTPEGGED, peggedAlignmentScore)) { alignments.insert(*BESTPEGGED); if (peggedAlignmentScore > 1.00001 * setOfGoodCenters[bestAlignment].second) { if (LogPeg) { cerr << "found better alignment by pegging " << pair_no << " " << peggedAlignmentScore / setOfGoodCenters[bestAlignment].second << '\n'; cerr << "NEW BEST: " << alignment( *BESTPEGGED); cerr << "OLD : " << alignment( *setOfGoodCenters[bestAlignment].first); } flagBetterByPegging = 1; bestAlignment = alignments.size() - 1; } } assert( differences(*BESTPEGGED, *best)!=0 ); BESTPEGGED = 0; } else delete BESTPEGGED; } } } // end of if(Peg) NBetterByPegging += flagBetterByPegging; for (unsigned int i = 0; i < setOfGoodCenters.size(); ++i) setOfGoodCenters[i].first->check(); if (LogPeg > 1) cout << "PEGGED: " << setOfGoodCenters.size() << " HILLCLIMBED:" << nHillClimbed << " TOTAL:" << nAlignment << " alignments." << '\n'; int alTotal = collectCountsOverNeighborhood(setOfGoodCenters, es, fs, tTable, aCountTable, dCountTable, nCountTable, p1_count, p0_count, align_total_count, count, collect_counts, dm_out); if (LogPeg > 1) { cout << "ALL: " << alTotal << " from " << pow(float(l + 1), float(m)) << '\n'; massert(alTotal<=pow(double(l+1),double(m))); } NCenter += setOfGoodCenters.size(); NHillClimbed += nHillClimbed; NAlignment += nAlignment; NTotal += alTotal; perp.addFactor(log(double(align_total_count)), count, l, m, 0); viterbiPerp.addFactor(log( double(setOfGoodCenters[bestAlignment].second)), count, l, m, 0); massert(log(double(setOfGoodCenters[bestAlignment].second)) <= log(double(align_total_count))); if (dump_files || (FEWDUMPS && sent.sentenceNo < 1000) || (final && (ONLYALDUMPS))) printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, (setOfGoodCenters[bestAlignment].first)->getAlignment(), sent.sentenceNo, setOfGoodCenters[bestAlignment].second); for (unsigned int i = 0; i < setOfGoodCenters.size(); ++i) setOfGoodCenters[i].first->check(); if (of3 || (writeNBestErrorsFile && pair_no < int(ReferenceAlignment.size()))) { vector als; for (unsigned int s = 0; s < setOfGoodCenters.size(); ++s) { const MoveSwapMatrix&msc = *setOfGoodCenters[s].first; msc.check(); double normalized_ascore = setOfGoodCenters[s].second; if (!msc.isCenterDeleted()) als.push_back(Als(s, 0, 0, normalized_ascore)); for (WordIndex j = 1; j <= m; j++) for (WordIndex i = 0; i <= l; i++) if (i != msc(j) && !msc.isDelMove(i, j)) als.push_back(Als(s, i, j, msc.cmove(i, j) * normalized_ascore)); for (PositionIndex j1 = 1; j1 <= m; j1++) for (PositionIndex j2 = j1 + 1; j2 <= m; j2++) if (msc(j1) != msc(j2) && !msc.isDelSwap(j1, j2)) als.push_back(Als(s, -j1, -j2, msc.cswap(j1, j2) * normalized_ascore)); } sort(als.begin(), als.end()); double sum = 0, sum2 = 0; for (unsigned int i = 0; i < als.size(); ++i) sum += als[i].v; for (unsigned int i = 0; i < min((unsigned int) als.size(), (unsigned int) PrintN); ++i) { alignment x = *setOfGoodCenters[als[i].s].first; if (!(als[i].a == 0 && als[i].b == 0)) { if (als[i].a <= 0 && als[i].b <= 0) x.doSwap(-als[i].a, -als[i].b); else x.doMove(als[i].a, als[i].b); } if (of3 && i < PrintN) printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), *of3, x.getAlignment(), sent.sentenceNo, als[i].v / sum * count); sum2 += als[i].v; if (writeNBestErrorsFile) { if (pair_no < int(ReferenceAlignment.size())) { int ALmissing = 0, ALtoomuch = 0, ALeventsMissing = 0, ALeventsToomuch = 0; vector scores; ErrorsInAlignment(ReferenceAlignment[pair_no - 1], x.getAlignment(), l, ALmissing, ALtoomuch, ALeventsMissing, ALeventsToomuch, pair_no); ef.computeScores(x, scores); *writeNBestErrorsFile << ALmissing + ALtoomuch << ' '; for (unsigned int i = 0; i < scores.size(); ++i) *writeNBestErrorsFile << ((scores[i] > 0.0) ? (-log(scores[i])) : 1.0e6) << ' '; *writeNBestErrorsFile << '\n'; } } } if (writeNBestErrorsFile) *writeNBestErrorsFile << '\n'; } addAL((setOfGoodCenters[bestAlignment].first)->getAlignment(), sent.sentenceNo, l); for (unsigned int i = 0; i < setOfGoodCenters.size(); i++) delete setOfGoodCenters[i].first; double period = difftime(time(NULL), sent_s); if (Verbose) cerr << "processing this sentence pair took : " << period << " seconds\n"; } /* of sentence pair E, F */ //sHandler1.rewind(); if (dump_files || FEWDUMPS || (final && (ONLYALDUMPS))) of2.close(); delete of3; delete writeNBestErrorsFile; double FSent = pair_no; cout << "#centers(pre/hillclimbed/real): " << NAlignment / FSent << " " << NHillClimbed / FSent << " " << NCenter / FSent << " #al: " << NTotal / FSent << " #alsophisticatedcountcollection: " << NumberOfAlignmentsInSophisticatedCountCollection / FSent << " #hcsteps: " << HillClimbingSteps / FSent << '\n'; cout << "#peggingImprovements: " << NBetterByPegging / FSent << '\n'; } /*Perform only one step of viterbi alignment*/ #if 0 template void model3::viterbi_loop_with_tricks_1(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, bool dump_files, const char* alignfile, bool collect_counts, string model, bool final, ADDITIONAL_MODEL_DATA_IN*dm_in, ADDITIONAL_MODEL_DATA_OUT*dm_out) { ofstream *writeNBestErrorsFile=0; if( (dump_files||FEWDUMPS)&&PrintN&&ReferenceAlignment.size()>0 ) { string x=alignfile+string("NBEST"); writeNBestErrorsFile= new ofstream(x.c_str()); } ofstream *of3=0; ofstream of2; int pair_no; HillClimbingSteps=0; NumberOfAlignmentsInSophisticatedCountCollection=0; if (dump_files||FEWDUMPS||(final&&(ONLYALDUMPS)) ) of2.open(alignfile); if( dump_files&&PrintN&&final ) { string x=alignfile+string("NBEST"); of3= new ofstream(x.c_str()); } pair_no = 0; // sentence pair number // for each sentence pair in the corpus perp.clear(); // clears cross_entrop & perplexity viterbiPerp.clear(); // clears cross_entrop & perplexity sentPair sent; int NCenter=0,NHillClimbed=0,NAlignment=0,NTotal=0,NBetterByPegging=0; while(sHandler1.getNextSentence(sent)) { if( sent.eSent.size()==1||sent.fSent.size()==1 ) continue; SentNr=sent.sentenceNo; Vector& es = sent.eSent; Vector& fs = sent.fSent; const float count = sent.getCount(); if ((sent.sentenceNo % 10000) == 0) cerr <*,LogProb> >setOfGoodCenters(1); set alignments; MoveSwapMatrix *best = (setOfGoodCenters[0].first = new MoveSwapMatrix(ef, viterbi2alignment)); MoveSwapMatrix _viterbi(*best), *viterbi=&_viterbi; // please, don't delete this line (FJO) if (Log) logmsg << "VITERBI: " << alignment(_viterbi); if( ef.isSubOptimal() ) setOfGoodCenters[0].second = hillClimb_std(*best); else { setOfGoodCenters[0].second = best->get_ef().prob_of_target_and_alignment_given_source(*best); if( setOfGoodCenters[0].second==0 ) { cerr << "PROBLEM: alignment is 0.\n"; best->get_ef().prob_of_target_and_alignment_given_source(*best,1); } } int bestAlignment=0; for(unsigned int i=0;icheck(); alignments.insert(*best); if (setOfGoodCenters[bestAlignment].second <= 0) { if( PrintZeroScoreWarning++<100 ) { cerr << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; cerr << alignment(*setOfGoodCenters[bestAlignment].first); printSentencePair(es, fs, cerr); if(Log) { logmsg << "WARNING: Hill Climbing yielded a zero score viterbi alignment for the following pair:\n"; printSentencePair(es, fs, logmsg); } } else if(PrintZeroScoreWarning==100) { cerr << "ERROR: too many zero score warnings => no additional one will be printed\n"; } setOfGoodCenters[bestAlignment].second=1e-300; continue; } int nHillClimbed=1,nAlignment=1; bool flagBetterByPegging=0; if ( Peg ) { const MoveSwapMatrix *useMatrix=viterbi; // it is faster using 'best', ... (FJO) Array2 > linkCache(l+1, m+1, false); if(UseLinkCache)for(unsigned int j=1;j<=m;j++)linkCache((*useMatrix)(j), j)=1; for(PositionIndex j=1;j<=m;j++)for(PositionIndex i=0;i<=l;i++) { nAlignment++; if( i!=(*useMatrix)(j) && (UseLinkCache==0||linkCache(i,j)==0) && ef.get_t(i,j)>ef.get_t((*useMatrix)(j),j)*PEGGED_CUTOFF && (i != 0 || (m >= 2 * (useMatrix->fert(0)+1)))) { MoveSwapMatrix *BESTPEGGED=0; LogProb peggedAlignmentScore; nHillClimbed++; if( ef.isSubOptimal() ) { BESTPEGGED = new MoveSwapMatrix(*useMatrix); BESTPEGGED->doMove(i, j); peggedAlignmentScore= hillClimb_std(*BESTPEGGED, i,j); } else { alignment pegAlignment(l,m); peggedAlignmentScore=viterbi_model2(ef,pegAlignment,pair_no-1,i,j); BESTPEGGED = new MoveSwapMatrix(ef,pegAlignment); massert( pegAlignment(j)==i ); } if(UseLinkCache) for(unsigned int j=1;j<=m;j++) linkCache((*BESTPEGGED)(j), j)=1; if( peggedAlignmentScore>setOfGoodCenters[bestAlignment].second*(LogProb)PEGGED_CUTOFF && alignments.count(*BESTPEGGED)==0 ) { if(extendCenterList(setOfGoodCenters,BESTPEGGED,peggedAlignmentScore)) { alignments.insert(*BESTPEGGED); if( peggedAlignmentScore>1.00001*setOfGoodCenters[bestAlignment].second ) { if( LogPeg ) { cerr << "found better alignment by pegging " << pair_no << " " << peggedAlignmentScore/setOfGoodCenters[bestAlignment].second << '\n'; cerr << "NEW BEST: " << alignment(*BESTPEGGED); cerr << "OLD : " << alignment(*setOfGoodCenters[bestAlignment].first); } flagBetterByPegging=1; bestAlignment=alignments.size()-1; } } assert( differences(*BESTPEGGED, *best)!=0 ); BESTPEGGED=0; } else delete BESTPEGGED; } } } // end of if(Peg) NBetterByPegging+=flagBetterByPegging; for(unsigned int i=0;icheck(); if( LogPeg>1 ) cout << "PEGGED: " << setOfGoodCenters.size() << " HILLCLIMBED:" << nHillClimbed << " TOTAL:" << nAlignment << " alignments." << '\n'; int alTotal=collectCountsOverNeighborhood(setOfGoodCenters,es, fs, tTable, aCountTable, dCountTable, nCountTable, p1_count, p0_count, align_total_count, count, collect_counts, dm_out); if( LogPeg>1 ) { cout << "ALL: " << alTotal << " from " << pow(float(l+1),float(m)) << '\n'; massert(alTotal<=pow(double(l+1),double(m))); } NCenter+=setOfGoodCenters.size();NHillClimbed+=nHillClimbed;NAlignment+=nAlignment;NTotal+=alTotal; perp.addFactor(log(double(align_total_count)), count, l, m,0); viterbiPerp.addFactor(log(double(setOfGoodCenters[bestAlignment].second)), count, l, m,0); massert(log(double(setOfGoodCenters[bestAlignment].second)) <= log(double(align_total_count))); if (dump_files||(FEWDUMPS&&sent.sentenceNo<1000)||(final&&(ONLYALDUMPS)) ) printAlignToFile(es, fs, Elist.getVocabList(), Flist.getVocabList(), of2, (setOfGoodCenters[bestAlignment].first)->getAlignment(), sent.sentenceNo, setOfGoodCenters[bestAlignment].second); for(unsigned int i=0;icheck(); if( of3||(writeNBestErrorsFile&&pair_no als; for(unsigned int s=0;s&msc= *setOfGoodCenters[s].first; msc.check(); double normalized_ascore=setOfGoodCenters[s].second; if( !msc.isCenterDeleted() ) als.push_back( Als(s,0,0,normalized_ascore) ); for(WordIndex j=1;j<=m;j++) for(WordIndex i=0;i<=l;i++) if( i!=msc(j)&& !msc.isDelMove(i,j) ) als.push_back( Als(s,i,j,msc.cmove(i,j)*normalized_ascore)); for(PositionIndex j1=1;j1<=m;j1++) for(PositionIndex j2=j1+1;j2<=m;j2++) if( msc(j1)!=msc(j2) && !msc.isDelSwap(j1,j2) ) als.push_back( Als(s,-j1,-j2,msc.cswap(j1,j2)*normalized_ascore)); } sort(als.begin(),als.end()); double sum=0,sum2=0; for(unsigned int i=0;i scores; ErrorsInAlignment(ReferenceAlignment[pair_no-1],x.getAlignment(),l,ALmissing,ALtoomuch,ALeventsMissing,ALeventsToomuch,pair_no); ef.computeScores(x,scores); *writeNBestErrorsFile << ALmissing+ALtoomuch << ' '; for(unsigned int i=0;i0.0)?(-log(scores[i])):1.0e6) << ' '; *writeNBestErrorsFile << '\n'; } } } if( writeNBestErrorsFile ) *writeNBestErrorsFile << '\n'; } addAL((setOfGoodCenters[bestAlignment].first)->getAlignment(),sent.sentenceNo,l); for(unsigned int i=0;i(Perplexity& perp, Perplexity& viterbiPerp, sentenceHandler& sHandler1, \ bool dump_files, const char* alignfile,bool collect_counts, string, bool final,\ B*d4m,C*d5m); INSTANTIATE(transpair_model3, void, void) ; INSTANTIATE(transpair_modelhmm, const hmm, void) ; INSTANTIATE(transpair_modelhmm, const hmm, d4model) ; INSTANTIATE(transpair_modelhmm, const hmm, d5model) ; INSTANTIATE(transpair_model3, void,d4model) ; INSTANTIATE(transpair_model3, void,d5model) ; INSTANTIATE(transpair_model4, d4model,d4model) ; INSTANTIATE(transpair_model4, d4model,d5model) ; INSTANTIATE(transpair_model5, d5model,d5model) ;