#include "PhrasePairCollection.h"

// NOTE(review): three #include directives in this copy of the file had lost
// their targets. The standard headers below are the ones the visible code
// needs (sort, cout/endl, map, string, vector) -- confirm against the
// original file.
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

#include "Vocabulary.h"
#include "SuffixArray.h"
#include "TargetCorpus.h"
#include "Alignment.h"
#include "PhrasePair.h"
#include "Mismatch.h"

using namespace std;

// Stores the corpus structures to query and the output limits.
//   sa, tc, a        suffix array, target corpus and alignment that
//                    GetCollection() reads; the pointers are stored as given
//                    and are not deleted by this class
//   max_translation  max number of distinct translations returned
//   max_example      max number of examples returned per distinct translation
PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example )
  :m_suffixArray(sa)
  ,m_targetCorpus(tc)
  ,m_alignment(a)
  ,m_size(0)
  ,m_max_lookup(10000)                // maximum number of source occurrences sampled
  ,m_max_translation(max_translation) // max number of different distinct translations returned
  ,m_max_example(max_example)         // max number of examples returned for each distinct translation
{}

// Releases every PhrasePair and Mismatch allocated by GetCollection(); they
// are created with new there and kept only as raw pointers in m_collection,
// m_unaligned and m_mismatch.
// NOTE(review): assumes a PhrasePairCollection is never copied (a copy would
// share these pointers and free them twice) -- confirm against the header.
PhrasePairCollection::~PhrasePairCollection()
{
  for (size_t t = 0; t < m_collection.size(); t++) {
    for (size_t e = 0; e < m_collection[t].size(); e++) {
      delete m_collection[t][e];
    }
  }
  for (size_t u = 0; u < m_unaligned.size(); u++) {
    delete m_unaligned[u];
  }
  for (size_t m = 0; m < m_mismatch.size(); m++) {
    delete m_mismatch[m];
  }
}

// Looks up the occurrences of sourceString in the suffix array and groups
// them by their aligned target phrase.
//
// An occurrence for which m_alignment->PhraseAlignment() succeeds becomes a
// PhrasePair in m_collection, one bucket per distinct target word sequence.
// Any other occurrence becomes a Mismatch, stored in m_unaligned when
// Mismatch::Unaligned() is true and in m_mismatch otherwise. If there are
// more than m_max_lookup occurrences, only every (found/m_max_lookup)-th one
// is examined. The buckets are finally ordered with CompareBySize.
//
// Returns the number of occurrences examined, or 0 if FindMatches() reports
// no match.
int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
{
  INDEX first_match, last_match;
  if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
    return 0;
  }

  const INDEX found = last_match - first_match + 1;

  // maps a target word sequence to the position of its bucket in m_collection
  typedef map< vector< WORD_ID >, INDEX > TargetIndex;
  TargetIndex index;

  // While this is false the pre/post loops below run for (0,0) only, i.e. the
  // target span is never widened over the neighbouring words counted in
  // pre_null / post_null.
  const bool null_boundary_words = false;

  int real_count = 0;
  for( INDEX i=first_match; i<=last_match; i++ ) {
    int position = m_suffixArray->GetPosition( i );
    int source_start = m_suffixArray->GetWordInSentence( position );
    int source_end = source_start + sourceString.size()-1;
    INDEX sentence_id = m_suffixArray->GetSentence( position );
    int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
    int target_length = m_targetCorpus->GetSentenceLength( sentence_id );

    int target_start, target_end, pre_null, post_null;
    if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
      for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
        for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
          // word ids of the target span; this is the grouping key
          vector< WORD_ID > targetString;
          for (int target = target_start - pre; target <= target_end + post; target++) {
            targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
          }

          PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);

          // One lookup: insert() either creates the entry for a new target
          // phrase (and we open its bucket) or returns the existing entry.
          pair< TargetIndex::iterator, bool > slot =
            index.insert( make_pair( targetString, (INDEX)m_collection.size() ) );
          if (slot.second) {
            m_collection.push_back( vector< PhrasePair* >() );
          }
          m_collection[ slot.first->second ].push_back( phrasePair );
          m_size++;
        }
      }
    } else {
      Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
      if (mismatch->Unaligned())
        m_unaligned.push_back( mismatch );
      else
        m_mismatch.push_back( mismatch );
    }

    // too many occurrences: skip ahead so that only a sample is examined
    if (found > (INDEX)m_max_lookup) {
      i += found/m_max_lookup-1;
    }
    real_count++;
  }

  sort(m_collection.begin(), m_collection.end(), CompareBySize());
  return real_count;
}

// NOTE(review): Print() and PrintHTML() below are kept exactly as found. This
// copy has evidently lost the text between some '<' and '>' characters
// (template arguments, loop conditions, string contents), so they must be
// restored from the original file rather than edited here.
void PhrasePairCollection::Print(bool pretty) const { vector< vector >::const_iterator ppWithSameTarget; int i=0; for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && ibegin()))->PrintTarget( &cout ); int count = ppWithSameTarget->size(); cout << "(" << count << ")" << endl; vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin(); for(int j=0; jsize() && jPrintPretty( &cout, 100 ); } else { (*p)->Print( &cout ); } if (ppWithSameTarget->size() > m_max_example) { p += ppWithSameTarget->size()/m_max_example-1; } } } } void PhrasePairCollection::PrintHTML() const { int pp_target = 0; bool singleton = false; // loop over all translations vector< vector >::const_iterator ppWithSameTarget; for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_targetsize(); if (!singleton) { if (count == 1) { singleton = true; cout << "

singleton" << (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" << (m_collection.end() - ppWithSameTarget) << "/" << m_size << ")

"; } else { cout << "

"; (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); cout << " (" << count << "/" << m_size << ")" << endl; cout << "

"; } cout << ""; } vector< PhrasePair* >::const_iterator p; // loop over all sentences where translation occurs int pp=0; int i=0; for(p = ppWithSameTarget->begin(); i<10 && ppend(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); if (count > m_max_example) { p += count/m_max_example-1; pp += count/m_max_example-1; } } if (i == 10 && pp < count) { // extended table cout << "
(more)
"; cout << "
"; cout << ""; for(i=0, pp=0, p = ppWithSameTarget->begin(); iend(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); if (count > m_max_example) { p += count/m_max_example-1; pp += count/m_max_example-1; } } } if (!singleton) cout << "
\n"; if (!singleton && pp_target == 9) { cout << "
"; cout << "

(more)

"; cout << "
"; } } if (singleton) cout << "
\n"; else if (pp_target > 9) cout << ""; size_t max_mismatch = m_max_example/3; // unaligned phrases if (m_unaligned.size() > 0) { cout << "

unaligned" << " (" << (m_unaligned.size()) << ")

"; cout << ""; int step_size = 1; if (m_unaligned.size() > max_mismatch) step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; for(size_t i=0; iPrintClippedHTML( &cout, 160 ); cout << "
"; } // mismatched phrases if (m_mismatch.size() > 0) { cout << "

mismatched" << " (" << (m_mismatch.size()) << ")

"; cout << ""; int step_size = 1; if (m_mismatch.size() > max_mismatch) step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; for(size_t i=0; iPrintClippedHTML( &cout, 160 ); cout << "
"; } }