#include "Mismatch.h" #include #include #include #include #include #include "SuffixArray.h" #include "TargetCorpus.h" #include "Alignment.h" #include "Vocabulary.h" using namespace std; enum { UNANNOTATED = 0, PRE_ALIGNED = 1, POST_ALIGNED = 2, UNALIGNED = 3, MISALIGNED = 4, ALIGNED = 5 }; Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end ) :m_suffixArray(sa) ,m_targetCorpus(tc) ,m_alignment(a) ,m_sentence_id(sentence_id) ,m_source_length(source_length) ,m_target_length(target_length) ,m_source_position(position) ,m_source_start(source_start) ,m_source_end(source_end) ,m_unaligned(true) { // initialize unaligned indexes for (int i = 0; i < m_source_length; i++) { m_source_unaligned[i] = true; } for (int i = 0; i < m_target_length; i++) { m_target_unaligned[i] = true; } m_num_alignment_points = m_alignment->GetNumberOfAlignmentPoints( sentence_id ); for(INDEX ap=0; apGetSourceWord( sentence_id, ap ) ] = false; m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false; } for(int i = source_start; i <= source_end; i++) { if (!m_source_unaligned[ i ]) { m_unaligned = false; } } } Mismatch::~Mismatch () {} void Mismatch::PrintClippedHTML( ostream* out, int width ) { int source_annotation[256], target_annotation[256]; vector< string > label_class; label_class.push_back( "" ); label_class.push_back( "mismatch_pre_aligned" ); label_class.push_back( "mismatch_post_aligned" ); label_class.push_back( "null_aligned" ); label_class.push_back( "mismatch_misaligned" ); label_class.push_back( "mismatch_aligned" ); for(int i=0; i= 0) { int word_id = m_source_start-i; source_annotation[ word_id ] = UNALIGNED; if (!m_source_unaligned[ word_id ]) { found_aligned = true; LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED ); } } if (m_source_end+i < m_source_length) { int word_id = m_source_end+i; source_annotation[ word_id ] = UNALIGNED; if (!m_source_unaligned[ word_id ]) { found_aligned = true; LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED ); } } } } // misalignment else { // label aligned output words for(int i=m_source_start; i<=m_source_end; i++) LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED ); // find first and last int target_start = -1; int target_end; for(int i=0; iGetTargetWord( m_sentence_id, ap ) == i) { int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); // if not part of the source phrase -> also misaligned if (source_word < m_source_start || source_word > m_source_end) source_annotation[ source_word ] = MISALIGNED; } } } } // closure bool change = true; while(change) { change = false; for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ); int target_word = m_alignment->GetTargetWord( m_sentence_id, ap ); if (source_annotation[source_word] != UNANNOTATED && target_annotation[target_word] == UNANNOTATED) { target_annotation[target_word] = MISALIGNED; change = true; } if (source_annotation[source_word] == UNANNOTATED && target_annotation[target_word] != UNANNOTATED) { source_annotation[source_word] = MISALIGNED; change = true; } } } } // print source // shorten source context if too long int sentence_start = m_source_position - m_source_start; int context_space = width/2; for(int i=m_source_start; i<=m_source_end; i++) context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1; context_space /= 2; int remaining = context_space; int start_word = m_source_start; for(; start_word>0 && remaining>0; start_word--) remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1; if (remaining<0 || start_word == -1) start_word++; remaining = context_space; int end_word = m_source_end; for(; end_word0; end_word++) remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1; end_word--; // output with markup *out << ""; char current_label = UNANNOTATED; if (start_word>0) { current_label = source_annotation[start_word-1]; *out << "... "; } for(int i=start_word; i<=end_word; i++) { // change to phrase block if (i == m_source_start) { if (current_label != UNANNOTATED && i!=start_word) *out << ""; *out << ""; current_label = UNANNOTATED; } // change to labeled word else if (source_annotation[i] != current_label && source_annotation[i] != ALIGNED) { if (current_label != UNANNOTATED && i!=start_word) *out << ""; if (source_annotation[i] != UNANNOTATED) *out << ""; current_label = source_annotation[i]; } // output word *out << m_suffixArray->GetWord( sentence_start + i ) << " "; // change to right context block if (i == m_source_end) { *out << ""; current_label = UNANNOTATED; } } if (current_label != UNANNOTATED && end_word>m_source_end) *out << ""; if (end_wordGetWord( m_sentence_id, i ).size() + 1; while (context_space < 0) { // shorten matched part, if too long context_space += m_targetCorpus->GetWord( m_sentence_id, target_start ).size() + m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2; target_start++; target_end--; } context_space /= 2; remaining = context_space; start_word = target_start; for(; start_word>0 && remaining>0; start_word--) { //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl; remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1; } if (remaining<0 || start_word == -1) start_word++; remaining = context_space; end_word = target_end; for(; end_word0; end_word++) { //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl; remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1; } end_word--; // output with markup *out << ""; current_label = UNANNOTATED; if (start_word>0) { current_label = target_annotation[start_word-1]; *out << "... "; } for(int i=start_word; i<=end_word; i++) { if (target_annotation[i] != current_label) { if (current_label != UNANNOTATED && i!=start_word) *out << ""; if (target_annotation[i] != UNANNOTATED) *out << ""; current_label = target_annotation[i]; } // output word *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " "; } if (current_label != UNANNOTATED && end_word>target_end) *out << ""; if (end_word"; } void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) { for(INDEX ap=0; apGetSourceWord( m_sentence_id, ap ) == source_id) { source_annotation[ source_id ] = label; target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label; } } }