/********************************* tercpp: an open-source Translation Edit Rate (TER) scorer tool for Machine Translation. Copyright 2010-2013, Christophe Servan, LIUM, University of Le Mans, France Contact: christophe.servan@lium.univ-lemans.fr The tercpp tool and library are free software: you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation, either version 3 of the licence, or (at your option) any later version. This program and library are distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA **********************************/ // // C++ Implementation: tercalc // // Description: // // // Author: <>, (C) 2010 // // Copyright: See COPYING file that comes with this distribution // // #include "tercalc.h" using namespace std; using namespace TERCPPNS_Tools; namespace TERCPPNS_TERCpp { terCalc::terCalc() { TAILLE_PERMUT_MAX = 10; NBR_PERMUT_MAX = 10; infinite = 99999.0; shift_cost = 1.0; insert_cost = 1.0; delete_cost = 1.0; substitute_cost = 1.0; match_cost = 0.0; NBR_SEGS_EVALUATED = 0; NBR_PERMUTS_CONSID = 0; NBR_BS_APPELS = 0; TAILLE_BEAM = 10; DIST_MAX_PERMUT = 25; PRINT_DEBUG = false; hypSpans.clear(); refSpans.clear(); CALL_TER_ALIGN=0; CALL_CALC_PERMUT=0; CALL_FIND_BSHIFT=0; MAX_LENGTH_SENTENCE=10; S = new vector < vector < double > >(MAX_LENGTH_SENTENCE, std::vector(MAX_LENGTH_SENTENCE,0.0)); P = new vector < vector < char > >(MAX_LENGTH_SENTENCE, std::vector(MAX_LENGTH_SENTENCE,' ')); } terCalc::~terCalc() { delete(S); delete(P); } terAlignment terCalc::WERCalculation ( vector< string >& hyp , vector< string >& ref ) { return minimizeDistanceEdition ( hyp, ref, hypSpans ); } terAlignment terCalc::TER ( vector< int >& hyp, vector< int >& ref ) { stringstream s; s.str ( "" ); string stringRef ( "" ); string stringHyp ( "" ); for ( vector::iterator l_it = ref.begin(); l_it != ref.end(); l_it++ ) { if ( l_it == ref.begin() ) { s << ( *l_it ); } else { s << " " << ( *l_it ); } } stringRef = s.str(); s.str ( "" ); for ( vector::iterator l_itHyp = hyp.begin(); l_itHyp != hyp.end(); l_itHyp++ ) { if ( l_itHyp == hyp.begin() ) { s << ( *l_itHyp ); } else { s << " " << ( *l_itHyp ); } } stringHyp = s.str(); s.str ( "" ); vector l_vref=stringToVector ( stringRef , " " ); vector l_vhyp=stringToVector ( stringHyp , " " ); return TER ( l_vhyp , l_vref); } hashMapInfos terCalc::createConcordMots ( vector< string >& hyp, vector< string >& ref ) { hashMap tempHash; hashMapInfos retour; for ( int i = 0; i < ( int ) hyp.size(); i++ ) { tempHash.addHasher ( hyp.at ( i ), "" ); } bool cor[ref.size() ]; for ( int i = 0; i < ( int ) ref.size(); i++ ) { if ( tempHash.trouve ( ( string ) ref.at ( i ) ) ) { cor[i] = true; } else { cor[i] = false; } } for ( int start = 0; start < ( int ) ref.size(); start++ ) { if ( cor[start] ) { for ( int end = start; ( ( end < ( int ) ref.size() ) && ( end - start <= TAILLE_PERMUT_MAX ) && ( cor[end] ) ); end++ ) { vector ajouter = subVector ( ref, start, end + 1 ); string ajouterString = vectorToString ( ajouter ); vector values = retour.getValue ( ajouterString ); values.push_back ( start ); if ( values.size() > 1 ) { retour.setValue ( ajouterString, values ); } else { retour.addValue ( ajouterString, values ); } } } } return retour; } bool terCalc::trouverIntersection ( vecInt& refSpan, vecInt& hypSpan ) { if ( ( refSpan.at ( 1 ) >= hypSpan.at ( 0 ) ) && ( refSpan.at ( 0 ) <= hypSpan.at ( 1 ) ) ) { return true; } return false; } terAlignment terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans ) { double current_best = infinite; double last_best = infinite; int first_good = 0; int current_first_good = 0; int last_good = -1; int cur_last_good = 0; int last_peak = 0; int cur_last_peak = 0; int i=0; int j=0; int ref_size=0 ; ref_size=( int ) ref.size(); int hyp_size=0; hyp_size=( int ) hyp.size(); double cost, icost, dcost; double score; delete(S); delete(P); S = new vector < vector < double > >(ref_size+1, std::vector(hyp_size+1,-1.0)); P = new vector < vector < char > >(ref_size+1, std::vector(hyp_size+1,'0')); NBR_BS_APPELS++; // cerr << "Appels : " << NBR_BS_APPELS << endl; // for ( i = 0; i <= ref_size; i++ ) // { // for ( j = 0; j <= hyp_size; j++ ) // { // S->at(i).at(j) = -1.0; // P->at(i).at(j) = '0'; // } // } S->at(0).at(0) = 0.0; for ( j = 0; j <= hyp_size; j++ ) { last_best = current_best; current_best = infinite; first_good = current_first_good; current_first_good = -1; last_good = cur_last_good; cur_last_good = -1; last_peak = cur_last_peak; cur_last_peak = 0; for ( i = first_good; i <= ref_size; i++ ) { if ( i > last_good ) { break; } if ( S->at(i).at(j) < 0 ) { continue; } score = S->at(i).at(j); if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) { continue; } if ( current_first_good == -1 ) { current_first_good = i ; } if ( ( i < ref_size ) && ( j < hyp_size ) ) { if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) { if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) { cost = match_cost + score; if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) { S->at(i+1).at(j+1) = cost; P->at(i+1).at(j+1) = 'A'; } if ( cost < current_best ) { current_best = cost; } if ( current_best == cost ) { cur_last_peak = i + 1; } } else { cost = substitute_cost + score; if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) { S->at(i+1).at(j+1) = cost; P->at(i+1).at(j+1) = 'S'; if ( cost < current_best ) { current_best = cost; } if ( current_best == cost ) { cur_last_peak = i + 1 ; } } } } } cur_last_good = i + 1; if ( j < hyp_size ) { icost = score + insert_cost; if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) { S->at(i).at(j+1) = icost; P->at(i).at(j+1) = 'I'; if ( ( cur_last_peak < i ) && ( current_best == icost ) ) { cur_last_peak = i; } } } if ( i < ref_size ) { dcost = score + delete_cost; if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) { S->at(i+1).at(j) = dcost; P->at(i+1).at(j) = 'D'; if ( i >= last_good ) { last_good = i + 1 ; } } } } } int tracelength = 0; i = ref.size(); j = hyp.size(); while ( ( i > 0 ) || ( j > 0 ) ) { tracelength++; if ( P->at(i).at(j) == 'A' ) { i--; j--; } else if ( P->at(i).at(j) == 'S' ) { i--; j--; } else if ( P->at(i).at(j) == 'D' ) { i--; } else if ( P->at(i).at(j) == 'I' ) { j--; } else { cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " << P->at(i).at(j) << endl; exit ( -1 ); } } vector path ( tracelength ); i = ref.size(); j = hyp.size(); while ( ( i > 0 ) || ( j > 0 ) ) { path[--tracelength] = P->at(i).at(j); if ( P->at(i).at(j) == 'A' ) { i--; j--; } else if ( P->at(i).at(j) == 'S' ) { i--; j--; } else if ( P->at(i).at(j) == 'D' ) { i--; } else if ( P->at(i).at(j) == 'I' ) { j--; } } terAlignment to_return; to_return.numWords = ref_size; to_return.alignment = path; to_return.numEdits = S->at(ref_size).at(hyp_size); to_return.hyp = hyp; to_return.ref = ref; to_return.averageWords = ref_size; if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return.toString() << endl << "END DEBUG" << endl; } return to_return; } void terCalc::minimizeDistanceEdition ( vector< string >& hyp, vector< string >& ref, vector< vecInt >& curHypSpans, terAlignment* to_return ) { double current_best = infinite; double last_best = infinite; int first_good = 0; int current_first_good = 0; int last_good = -1; int cur_last_good = 0; int last_peak = 0; int cur_last_peak = 0; int i=0; int j=0; int ref_size=0 ; ref_size=( int ) ref.size(); int hyp_size=0; hyp_size=( int ) hyp.size(); double cost, icost, dcost; double score; delete(S); delete(P); S = new vector < vector < double > >(ref_size+1, std::vector(hyp_size+1,-1.0)); P = new vector < vector < char > >(ref_size+1, std::vector(hyp_size+1,'0')); NBR_BS_APPELS++; // cerr << "Appels : " << NBR_BS_APPELS << endl; // for ( i = 0; i <= ref_size; i++ ) // { // for ( j = 0; j <= hyp_size; j++ ) // { // S->at(i).at(j) = -1.0; // P->at(i).at(j) = '0'; // } // } S->at(0).at(0) = 0.0; for ( j = 0; j <= hyp_size; j++ ) { last_best = current_best; current_best = infinite; first_good = current_first_good; current_first_good = -1; last_good = cur_last_good; cur_last_good = -1; last_peak = cur_last_peak; cur_last_peak = 0; for ( i = first_good; i <= ref_size; i++ ) { if ( i > last_good ) { break; } if (S->at(i).at(j) < 0 ) { continue; } score = S->at(i).at(j); if ( ( j < hyp_size ) && ( score > last_best + TAILLE_BEAM ) ) { continue; } if ( current_first_good == -1 ) { current_first_good = i ; } if ( ( i < ref_size ) && ( j < hyp_size ) ) { if ( ( int ) refSpans.size() == 0 || ( int ) hypSpans.size() == 0 || trouverIntersection ( refSpans.at ( i ), curHypSpans.at ( j ) ) ) { if ( ( int ) ( ref.at ( i ).compare ( hyp.at ( j ) ) ) == 0 ) { cost = match_cost + score; if ( ( S->at(i+1).at(j+1) == -1 ) || ( cost < S->at(i+1).at(j+1) ) ) { S->at(i+1).at(j+1) = cost; P->at(i+1).at(j+1) = 'A'; } if ( cost < current_best ) { current_best = cost; } if ( current_best == cost ) { cur_last_peak = i + 1; } } else { cost = substitute_cost + score; if ( ( S->at(i+1).at(j+1) < 0 ) || ( cost < S->at(i+1).at(j+1) ) ) { S->at(i+1).at(j+1) = cost; P->at(i+1).at(j+1) = 'S'; if ( cost < current_best ) { current_best = cost; } if ( current_best == cost ) { cur_last_peak = i + 1 ; } } } } } cur_last_good = i + 1; if ( j < hyp_size ) { icost = score + insert_cost; if ( ( S->at(i).at(j+1) < 0 ) || ( S->at(i).at(j+1) > icost ) ) { S->at(i).at(j+1) = icost; P->at(i).at(j+1) = 'I'; if ( ( cur_last_peak < i ) && ( current_best == icost ) ) { cur_last_peak = i; } } } if ( i < ref_size ) { dcost = score + delete_cost; if ( ( S->at(i+1).at(j) < 0.0 ) || ( S->at(i+1).at(j) > dcost ) ) { S->at(i+1).at(j) = dcost; P->at(i+1).at(j) = 'D'; if ( i >= last_good ) { last_good = i + 1 ; } } } } } int tracelength = 0; i = ref_size;; j = hyp_size; while ( ( i > 0 ) || ( j > 0 ) ) { tracelength++; if (P->at(i).at(j) == 'A' ) { i--; j--; } else if (P->at(i).at(j) == 'S' ) { i--; j--; } else if (P->at(i).at(j) == 'D' ) { i--; } else if (P->at(i).at(j) == 'I' ) { j--; } else { cerr << "ERROR : terCalc::minimizeDistanceEdition : Invalid path : " <at(i).at(j) << endl; exit ( -1 ); } } vector path ( tracelength ); i = ref_size; j = hyp_size; while ( ( i > 0 ) || ( j > 0 ) ) { path[--tracelength] =P->at(i).at(j); if (P->at(i).at(j) == 'A' ) { i--; j--; } else if (P->at(i).at(j) == 'S' ) { i--; j--; } else if (P->at(i).at(j) == 'D' ) { i--; } else if (P->at(i).at(j) == 'I' ) { j--; } } // terAlignment to_return; to_return->numWords = ref_size; to_return->alignment = path; to_return->numEdits = S->at(ref_size).at(hyp_size); to_return->hyp = hyp; to_return->ref = ref; to_return->averageWords = ref_size; if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::minimizeDistanceEdition : to_return :" << endl << to_return->toString() << endl << "END DEBUG" << endl; } // return to_return; } terAlignment terCalc::TER ( vector& hyp, vector& ref ) { hashMapInfos rloc = createConcordMots ( hyp, ref ); terAlignment cur_align = minimizeDistanceEdition ( hyp, ref, hypSpans ); vector cur = hyp; cur_align.hyp = hyp; cur_align.ref = ref; cur_align.aftershift = hyp; double edits = 0; // int numshifts = 0; vector * allshifts=new vector(0); bestShiftStruct * returns=new bestShiftStruct(); // cerr << "Initial Alignment:" << endl << cur_align.toString() <getEmpty() << endl; if ( returns->getEmpty()) { break; } terShift bestShift = (*(returns->m_best_shift)); cur_align = (*(returns->m_best_align)); edits += bestShift.cost; bestShift.alignment = cur_align.alignment; bestShift.aftershift = cur_align.aftershift; allshifts->push_back ( bestShift ); cur = cur_align.aftershift; delete(returns); } if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::TER : Final to return :" << endl << cur_align.toString() << endl << "END DEBUG" << endl; } terAlignment to_return; to_return = cur_align; to_return.allshifts = (*(allshifts)); to_return.numEdits += edits; NBR_SEGS_EVALUATED++; return to_return; } bestShiftStruct * terCalc::findBestShift ( vector& cur, vector& hyp, vector& ref, hashMapInfos& rloc, terAlignment& med_align ) { CALL_FIND_BSHIFT++; // cerr << "CALL_FIND_BSHIFT " << CALL_FIND_BSHIFT <m_empty = new bool(false); bool anygain = false; vector * herr = new vector(( int ) hyp.size() + 1 ); vector * rerr = new vector( ( int ) ref.size() + 1 ); vector * ralign = new vector( ( int ) ref.size() + 1 ); int l_i,i,j,s; for (i = 0 ; i< ( int ) hyp.size() + 1 ; i++) { herr->at(i)=false; } for (i = 0 ; i< ( int ) ref.size() + 1 ; i++) { rerr->at(i)=false; ralign->at(i)=-1; } calculateTerAlignment ( med_align, herr, rerr, ralign ); vector * poss_shifts = new vector< vector >(0) ; terAlignment * cur_best_align = new terAlignment(); terShift * cur_best_shift = new terShift(); double cur_best_shift_cost = 0.0; vector shiftarr; vector curHypSpans; terShift * curshift = new terShift(); alignmentStruct shiftReturns; terAlignment * curalign = new terAlignment() ; if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::findBestShift (after the calculateTerAlignment call) :" << endl; cerr << "indices: "; for (l_i=0; l_i < ( int ) ref.size() ; l_i++) { cerr << l_i << "\t"; } cerr << endl; cerr << "hyp : \t"<size() - 1; i >= 0; i-- ) { for ( j = 0; j < ( int ) ( poss_shifts->at ( i ) ).size(); j++ ) { cerr << " [" << i << "] " << ( ( poss_shifts->at ( i ) ).at ( j ) ).toString() << endl; } } cerr << endl; cerr << "END DEBUG " << endl; } // exit(0); cur_best_align->set(med_align); for ( i = ( int ) poss_shifts->size() - 1; i >= 0; i-- ) { if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; cerr << "Considering shift of length " << i << " (" << ( poss_shifts->at ( i ) ).size() << ")" << endl; cerr << "END DEBUG " << endl; } /* Consider shifts of length i+1 */ double curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits ); double maxfix = ( 2 * ( 1 + i ) ); if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) { break; } else { for ( s = 0; s < ( int ) ( poss_shifts->at ( i ) ).size(); s++ ) { curfix = curerr - ( cur_best_shift_cost + cur_best_align->numEdits ); if ( ( curfix > maxfix ) || ( ( cur_best_shift_cost != 0 ) && ( curfix == maxfix ) ) ) { break; } else { curshift->set(( poss_shifts->at ( i ) ).at ( s )); if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; cerr << "cur : "<< join(" ",cur) << endl; cerr << "shift size : "<< i << endl; cerr << "shift number : "<< s << endl; cerr << "size of shift size : "<< ( int ) ( poss_shifts->at ( i ) ).size() << endl; cerr << "curshift : "<< curshift->toString() << endl; } // alignmentStruct shiftReturns; shiftReturns.set(permuter ( cur, curshift )); shiftarr = shiftReturns.nwords; curHypSpans = shiftReturns.aftershift; if ( PRINT_DEBUG ) { cerr << "shiftarr : "<< join(" ",shiftarr) << endl; cerr << "curHypSpans size : "<< (int)curHypSpans.size() << endl; cerr << "END DEBUG " << endl; } // terAlignment tmp=minimizeDistanceEdition ( shiftarr, ref, curHypSpans ); minimizeDistanceEdition ( shiftarr, ref, curHypSpans, curalign ); // curalign->set(tmp); curalign->hyp = hyp; curalign->ref = ref; curalign->aftershift = shiftarr; double gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost ); if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; cerr << "Gain for " << curshift->toString() << " is " << gain << ". (result: [" << curalign->join ( " ", shiftarr ) << "]" << endl; cerr << "Details of gains : gain = ( cur_best_align->numEdits + cur_best_shift_cost ) - ( curalign->numEdits + curshift->cost )"<numEdits << "+" << cur_best_shift_cost << ") - (" << curalign->numEdits << "+" << curshift->cost << ")"<toString() << "\n" << endl; cerr << "END DEBUG " << endl; } if ( ( gain > 0 ) || ( ( cur_best_shift_cost == 0 ) && ( gain == 0 ) ) ) { anygain = true; cur_best_shift->set(curshift); cur_best_shift_cost = curshift->cost; cur_best_align->set(curalign); if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; cerr << "Tmp Choosing shift: " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl; cerr << "END DEBUG " << endl; } } } } } } bestShiftStruct * to_return=new bestShiftStruct(); if ( anygain ) { to_return->setEmpty(false); if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::findBestShift :" << endl; cerr << "Final shift chosen : " << cur_best_shift->toString() << " gives:\n" << cur_best_align->toString() << "\n" << endl; cerr << "END DEBUG " << endl; } to_return->m_best_shift->set(cur_best_shift); // terAlignment tmp=cur_best_align; // cur_best_align->toString(); // to_return.m_best_align.toString(); // if ((int)cur_best_align->alignment.size() == 0) // { // to_return.m_best_align = cur_best_align; // } // else // { // cerr << "Warning: cur_best_align->alignment.size() = 0 !!!"<m_best_align->set(cur_best_align); // to_return.m_best_align.toString(); } else { to_return->setEmpty(true); } // // cerr << to_return->toString() << endl; delete(poss_shifts); delete(cur_best_align); delete(cur_best_shift); delete(curshift); delete(curalign) ; return to_return; } void terCalc::calculateTerAlignment ( terAlignment& align, vector* herr, vector* rerr, vector* ralign ) { int hpos = -1; int rpos = -1; CALL_TER_ALIGN++; // cerr << "CALL_TER_ALIGN " << CALL_TER_ALIGN << endl; if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::calculateTerAlignment : " << endl << align.toString() << endl; cerr << "END DEBUG " << endl; } // cerr << (int)herr->size() <size() <at(i) = false; // rerr->at(i) = false; // ralign->at(i) = -1; // } for ( int i = 0; i < ( int ) align.alignment.size(); i++ ) { char sym = align.alignment.at(i); if ( sym == 'A' ) { hpos++; rpos++; herr->at(hpos) = false; rerr->at(rpos) = false; ralign->at(rpos) = hpos; } else if ( sym == 'S' ) { hpos++; rpos++; herr->at(hpos) = true; rerr->at(rpos) = true; ralign->at(rpos) = hpos; } else if ( sym == 'I' ) { hpos++; herr->at(hpos) = true; } else if ( sym == 'D' ) { rpos++; rerr->at(rpos) = true; ralign->at(rpos) = hpos+1; } else { cerr << "ERROR : terCalc::calculateTerAlignment : Invalid mini align sequence " << sym << " at pos " << i << endl; exit ( -1 ); } } } vector * terCalc::calculerPermutations ( vector< string >& hyp, vector< string >& ref, hashMapInfos& rloc, TERCPPNS_TERCpp::terAlignment& align, vector* herr, vector* rerr, vector* ralign ) { vector * allshifts = new vector(0); // to_return.clear(); CALL_CALC_PERMUT++; // cerr << "CALL_CALC_PERMUT " << CALL_CALC_PERMUT << endl; if ( ( TAILLE_PERMUT_MAX <= 0 ) || ( DIST_MAX_PERMUT <= 0 ) ) { return allshifts; } allshifts = new vector( TAILLE_PERMUT_MAX + 1 ); int start=0; int end=0; bool ok = false; vector mtiVec(0); vector::iterator mti; int moveto=0; vector cand(0); bool any_herr = false; bool any_rerr = false; int i=0; int l_nbr_permuts=0; // for (i=0; i< (int)ref.size() +1 ; i++) {cerr << " " << ralign[i] ;} cerr < movetoitVec(0); string subVectorHypString=""; terShift * topush; for ( start = 0; start < ( int ) hyp.size(); start++ ) { subVectorHypString = vectorToString ( subVector ( hyp, start, start + 1 ) ); if ( ! rloc.trouve ( subVectorHypString ) ) { continue; } ok = false; mtiVec = rloc.getValue ( subVectorHypString ); mti = mtiVec.begin(); while ( mti != mtiVec.end() && ( ! ok ) ) { moveto = ( *mti ); mti++; if ( ( start != ralign->at(moveto) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) - 1 ) <= DIST_MAX_PERMUT ) ) { ok = true; } } if ( ! ok ) { continue; } ok = true; for ( end = start; ( ok && ( end < ( int ) hyp.size() ) && ( end < start + TAILLE_PERMUT_MAX ) ); end++ ) { /* check if cand is good if so, add it */ cand = subVector ( hyp, start, end + 1 ); ok = false; if ( ! ( rloc.trouve ( vectorToString ( cand ) ) ) ) { continue; } any_herr = false; for ( i = 0; ( ( i <= ( end - start ) ) && ( ! any_herr ) ); i++ ) { if ( herr->at(start+i) ) { any_herr = true; } } if ( any_herr == false ) { ok = true; continue; } movetoitVec = rloc.getValue ( ( string ) vectorToString ( cand ) ); // cerr << "CANDIDATE " << ( string ) vectorToString ( cand ) <<" PLACED : " << ( string ) vectorToString ( movetoitVec," ") << endl; vector::iterator movetoit; movetoit = movetoitVec.begin(); while ( movetoit != movetoitVec.end() ) { moveto = ( *movetoit ); movetoit++; if ( ! ( ( ralign->at(moveto) != start ) && ( ( ralign->at(moveto) < start ) || ( ralign->at(moveto) > end ) ) && ( ( ralign->at(moveto) - start ) <= DIST_MAX_PERMUT ) && ( ( start - ralign->at(moveto) ) <= DIST_MAX_PERMUT ) ) ) { continue; } ok = true; /* check to see if there are any errors in either string (only move if this is the case!) */ any_rerr = false; for ( int i = 0; ( i <= end - start ) && ( ! any_rerr ); i++ ) { if ( rerr->at(moveto+i) ) { any_rerr = true; } } if ( ! any_rerr ) { continue; } for ( int roff = -1; roff <= ( end - start ); roff++ ) { topush = new terShift(); bool topushNull = true; if ( ( roff == -1 ) && ( moveto == 0 ) ) { if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::calculerPermutations 01 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: -1" << endl << "END DEBUG" << endl; } // terShift t01 ( start, end, -1, -1 ); // topush = t01; topush->start=start; topush->end=end; topush->moveto=-1; topush->newloc=-1; topushNull = false; } else if ( ( start != ralign->at(moveto+roff) ) && ( ( roff == 0 ) || ( ralign->at(moveto+roff) != ralign->at(moveto) ) ) ) { int newloc = ralign->at(moveto+roff); if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl << "Consider making " << start << "..." << end << " (" << vectorToString(cand," ")<< ") moveto: " << moveto << " roff: " << roff << " ralign[mt+roff]: " << newloc << endl << "END DEBUG" << endl; } // terShift t02 ( start, end, moveto + roff, newloc ); // topush = t02; topush->start=start; topush->end=end; topush->moveto=moveto + roff; topush->newloc=newloc; topushNull = false; } if ( !topushNull ) { topush->shifted = cand; topush->cost = shift_cost; l_nbr_permuts++; if ( PRINT_DEBUG ) { cerr << "BEGIN DEBUG : terCalc::calculerPermutations 02 : " << endl; cerr << "start : " << start << endl; cerr << "end : " << end << endl; cerr << "end - start : " << end - start << endl; cerr << "nbr Permutations added: " << l_nbr_permuts << endl; cerr << "END DEBUG " << endl; } if (l_nbr_permuts < NBR_PERMUT_MAX + 1) { ( allshifts->at ( end - start ) ).push_back ( (*(topush)) ); } // else // { // break; // } } delete(topush); } } } } // to_return.clear(); // for ( int i = 0; i < TAILLE_PERMUT_MAX + 1; i++ ) // { // to_return.push_back ( ( vecTerShift ) allshifts.at ( i ) ); // } return allshifts; } alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift& s ) { return permuter ( words, s.start, s.end, s.newloc ); } alignmentStruct terCalc::permuter ( vector< string >& words, TERCPPNS_TERCpp::terShift* s ) { return permuter ( words, s->start, s->end, s->newloc ); } alignmentStruct terCalc::permuter ( vector< string >& words, int start, int end, int newloc ) { int c = 0; vector nwords ( words ); vector spans ( ( int ) hypSpans.size() ); alignmentStruct to_return; if ( PRINT_DEBUG ) { if ( ( int ) hypSpans.size() > 0 ) { cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: " << ( int ) hypSpans.size() << endl ; } else { cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << "word length: " << ( int ) words.size() << " span length: null" << endl ; } cerr << "BEGIN DEBUG : terCalc::permuter :" << endl << join(" ",words) << " start: " << start << " end: " << end << " newloc "<< newloc << endl << "END DEBUG " << endl; } if (newloc >= ( int ) words.size()) { if ( PRINT_DEBUG ) { cerr << "WARNING: Relocation over the size of the hypothesis, replacing at the end of it."< 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = 0; i <= start - 1; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = end + 1; i < ( int ) words.size(); i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } } else { if ( newloc < start ) { for ( int i = 0; i < newloc; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = start; i <= end; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = newloc ; i < start ; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = end + 1; i < ( int ) words.size(); i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } } else { if ( newloc > end ) { for ( int i = 0; i <= start - 1; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = end + 1; i <= newloc; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = start; i <= end; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = newloc + 1; i < ( int ) words.size(); i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } } else { // we are moving inside of ourselves for ( int i = 0; i <= start - 1; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = end + 1; ( i < ( int ) words.size() ) && ( i <= ( end + ( newloc - start ) ) ); i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = start; i <= end; i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } for ( int i = ( end + ( newloc - start ) + 1 ); i < ( int ) words.size(); i++ ) { nwords.at ( c++ ) = words.at ( i ); if ( ( int ) hypSpans.size() > 0 ) { spans.at ( c - 1 ) = hypSpans.at ( i ); } } } } } NBR_PERMUTS_CONSID++; if ( PRINT_DEBUG ) { cerr << "nwords" << join(" ",nwords) << endl; // cerr << "spans" << spans. << endl; } to_return.nwords = nwords; to_return.aftershift = spans; return to_return; } void terCalc::setDebugMode ( bool b ) { PRINT_DEBUG = b; } }