// Text and HTML rendering of a PhrasePair: the source sentence, the target
// sentence, the aligned phrase spans and the word alignment.
//
// NOTE(review): the copy of this file that was reviewed had every "<...>" span
// stripped (the <iostream> include, most loop bounds and every HTML literal).
// The loop bounds were re-derived from the surrounding code. Two things could
// NOT be re-derived and must be confirmed against PhrasePair.h and the
// stylesheet used by the caller before merging:
//   * m_target_length - assumed to be the member holding the number of words
//     in the target sentence;
//   * the HTML tags and class names in PrintHTML / PrintClippedHTML.
#include "PhrasePair.h"

#include <algorithm>
#include <iostream>
#include <string>

#include "TargetCorpus.h"
#include "Alignment.h"
#include "Vocabulary.h"
#include "SuffixArray.h"

using namespace std;

namespace
{

// Converts a possibly negative width request into a character count.
size_t ToWidth( int value )
{
  return value > 0 ? static_cast<size_t>(value) : 0;
}

// Splits the room that is left in a column of column_width characters, after
// the phrase and `reserved` separator characters are placed, into a left and a
// right share (the right share receives the odd character).
// Both shares are 0 when the phrase does not fit. Doing the check before the
// subtraction matters: the operands are unsigned, so subtracting first would
// wrap around to a huge width instead of going negative.
void SplitContextRoom( size_t column_width, size_t phrase_size, size_t reserved,
                       size_t &left, size_t &right )
{
  left = 0;
  right = 0;
  if (phrase_size + reserved > column_width) return;
  const size_t room = column_width - phrase_size - reserved;
  left = room / 2;
  right = (room + 1) / 2;
}

// Cuts the phrase itself to column_width-2 characters; columns narrower than
// two characters leave the phrase untouched.
string ClipPhrase( const string &phrase, size_t column_width )
{
  return column_width >= 2 ? phrase.substr( 0, column_width-2 ) : phrase;
}

// Keeps at most `width` trailing characters of a left context. When text has
// to be dropped, the cut is moved right to the next space and "..." is put in
// front. On return `width` holds the number of characters of `text` kept.
string ClipLeftContext( const string &text, size_t &width )
{
  if (text.size() <= width) {
    width = text.size();
    return text;
  }
  while( width>0 && text[ text.size()-width ] != ' ' ) {
    width--;
  }
  return "..." + text.substr( text.size()-width );
}

// Keeps at most `width` leading characters of a right context. When text has
// to be dropped, the cut is moved left to the previous space and "..." is
// appended. On return `width` holds the number of characters of `text` kept.
string ClipRightContext( const string &text, size_t &width )
{
  if (text.size() <= width) {
    width = text.size();
    return text;
  }
  while( width>0 && text[ width-1 ] != ' ' ) {
    width--;
  }
  return text.substr( 0, width ) + "...";
}

} // namespace

// Writes one line in the format
//   source sentence ||| target sentence ||| source span ||| target span ||| alignment
// where the spans are "start end" word positions and the alignment is a list
// of "source-target" word index pairs. The line is terminated with endl.
void PhrasePair::Print( ostream* out ) const
{
  // source
  const int sentence_start = m_source_position - m_source_start;
  const int source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );

  for( int i=0; i<source_length; i++ ) {
    if (i>0) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }

  // target
  *out << " |||";
  for( int i=0; i<m_target_length; i++ ) {
    *out << " " << m_targetCorpus->GetWord( m_sentence_id, i);
  }

  // source span
  *out << " ||| " << (int)m_source_start << " " << (int)m_source_end;

  // target span
  *out << " ||| " << (int)m_target_start << " " << (int)m_target_end;

  // word alignment
  *out << " |||";

  const INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
  for( INDEX i=0; i<ap_points; i++) {
    *out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
         << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
  }

  *out << endl;
}

// Writes a fixed-width, plain-text concordance line: the source phrase
// centred in the left half, " | ", the target phrase centred in the right
// half. Each half is (width-3)/2 characters wide. Context words fill the space
// around the phrase and are cut at the column edge; a phrase that does not fit
// is cut and shown without context.
void PhrasePair::PrintPretty( ostream* out, int width ) const
{
  // source
  const int sentence_start = m_source_position - m_source_start;
  const int source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
  const size_t source_width = ToWidth( (width-3)/2 );

  // the leading blanks guarantee that the left context is never shorter than
  // the share of the column it has to fill
  string source_pre( source_width/2, ' ' );
  for( int i=0; i<m_source_start; i++ ) {
    source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
  }

  string source;
  for( int i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) source += " ";
    source += m_suffixArray->GetWord( sentence_start + i );
  }

  string source_post;
  for( int i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) source_post += " ";
    source_post += m_suffixArray->GetWord( sentence_start + i );
  }
  // trailing blanks keep the " | " separator in the same column on every line
  source_post.append( source_width/2, ' ' );

  size_t source_pre_width;
  size_t source_post_width;
  SplitContextRoom( source_width, source.size(), 2, source_pre_width, source_post_width );

  *out << source_pre.substr( source_pre.size()-source_pre_width ) << " "
       << ClipPhrase( source, source_width ) << " "
       << source_post.substr( 0, source_post_width ) << " | ";

  // target
  const size_t target_width = ToWidth( (width-3)/2 );

  string target_pre( target_width/2, ' ' );
  for( int i=0; i<m_target_start; i++ ) {
    target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
  }

  string target;
  for( int i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) target += " ";
    target += m_targetCorpus->GetWord( m_sentence_id, i);
  }

  // no padding here: the right context ends the line
  string target_post;
  for( int i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) target_post += " ";
    target_post += m_targetCorpus->GetWord( m_sentence_id, i);
  }

  size_t target_pre_width;
  size_t target_post_width;
  SplitContextRoom( target_width, target.size(), 2, target_pre_width, target_post_width );

  *out << target_pre.substr( target_pre.size()-target_pre_width ) << " "
       << ClipPhrase( target, target_width ) << " "
       << target_post.substr( 0, target_post_width ) << endl;
}

// Writes the words of the target phrase, separated by single blanks, without
// a line terminator.
void PhrasePair::PrintTarget( ostream* out ) const
{
  for( int i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
}

// Writes one table row with six cells: left context, phrase and right context
// of the source sentence, then the same three for the target sentence. Nothing
// is clipped.
// NOTE(review): words are written as-is, without HTML escaping.
void PhrasePair::PrintHTML( ostream* out ) const
{
  // source
  const int sentence_start = m_source_position - m_source_start;
  const int source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );

  *out << "<tr><td align=right class=\"pp_source_left\">";
  for( int i=0; i<m_source_start; i++ ) {
    if (i>0) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }
  *out << "</td><td class=\"pp_source\">";
  for( int i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }
  *out << "</td><td class=\"pp_source_right\">";
  for( int i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) *out << " ";
    *out << m_suffixArray->GetWord( sentence_start + i );
  }

  // target
  *out << "</td><td class=\"pp_target_left\">";
  for( int i=0; i<m_target_start; i++ ) {
    if (i>0) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td><td class=\"pp_target\">";
  for( int i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td><td class=\"pp_target_right\">";
  for( int i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) *out << " ";
    *out << m_targetCorpus->GetWord( m_sentence_id, i);
  }
  *out << "</td></tr>\n";
}

// Writes the same six-cell table row as PrintHTML, but limits the text to
// roughly `width` characters: the source side gets (width+1)/2 of them and the
// target side width/2. Context that does not fit is cut at a word boundary and
// marked with "...". Target context words counted by m_pre_null / m_post_null
// (the words directly next to the phrase) are wrapped in a span so that they
// can be styled separately.
void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
{
  // source
  const int sentence_start = m_source_position - m_source_start;
  const int source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
  const size_t source_width = ToWidth( (width+1)/2 );

  string source_pre;
  for( int i=0; i<m_source_start; i++ ) {
    source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
  }

  string source;
  for( int i=m_source_start; i<=m_source_end; i++ ) {
    if (i>m_source_start) source += " ";
    source += m_suffixArray->GetWord( sentence_start + i );
  }

  string source_post;
  for( int i=m_source_end+1; i<source_length; i++ ) {
    if (i>m_source_end+1) source_post += " ";
    source_post += m_suffixArray->GetWord( sentence_start + i );
  }

  // if the phrase is too long, no context is shown at all
  size_t source_pre_width;
  size_t source_post_width;
  SplitContextRoom( source_width, source.size(), 0, source_pre_width, source_post_width );

  // too long -> truncate and add "..."
  source_pre = ClipLeftContext( source_pre, source_pre_width );
  source_post = ClipRightContext( source_post, source_post_width );

  *out << "<tr><td class=\"pp_source_left\">"
       << source_pre
       << "</td><td class=\"pp_source\">"
       << ClipPhrase( source, source_width )
       << "</td><td class=\"pp_source_right\">"
       << source_post
       << "</td>";

  // target
  const size_t target_width = ToWidth( width/2 );
  size_t target_pre_null_width = 0;
  size_t target_post_null_width = 0;

  string target_pre;
  for( int i=0; i<m_target_start; i++ ) {
    const WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
    target_pre += " " + word;
    // the last m_pre_null words before the phrase form the marked tail
    if (i >= m_target_start-m_pre_null)
      target_pre_null_width += word.size() + 1;
  }

  string target;
  for( int i=m_target_start; i<=m_target_end; i++ ) {
    if (i>m_target_start) target += " ";
    target += m_targetCorpus->GetWord( m_sentence_id, i);
  }

  string target_post;
  for( int i=m_target_end+1; i<m_target_length; i++ ) {
    if (i>m_target_end+1) target_post += " ";
    const WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
    target_post += word;
    // the first m_post_null words after the phrase form the marked head
    if (i-(m_target_end+1) < m_post_null) {
      target_post_null_width += word.size() + 1;
    }
  }

  size_t target_pre_width;
  size_t target_post_width;
  SplitContextRoom( target_width, target.size(), 0, target_pre_width, target_post_width );

  // after these calls the widths hold the number of context characters that
  // are still shown (the "..." marker is not counted)
  target_pre = ClipLeftContext( target_pre, target_pre_width );
  target_post = ClipRightContext( target_post, target_post_width );

  if (m_pre_null) {
    // mark the tail of the left context, but never more than what survived
    // the clipping; otherwise the split position would wrap around
    const size_t marked = min( target_pre_null_width, target_pre_width );
    const size_t split = target_pre.size() - marked;
    target_pre = target_pre.substr( 0, split )
                 + "<span class=\"null_aligned\">"
                 + target_pre.substr( split )
                 + "</span>";
  }
  if (m_post_null) {
    const size_t marked = min( target_post_null_width, target_post.size() );
    target_post = "<span class=\"null_aligned\">"
                  + target_post.substr( 0, marked )
                  + "</span>"
                  + target_post.substr( marked );
  }

  *out << "<td class=\"pp_target_left\">"
       << target_pre
       << "</td><td class=\"pp_target\">"
       << ClipPhrase( target, target_width )
       << "</td><td class=\"pp_target_right\">"
       << target_post
       << "</td></tr>"<< endl;
}