diff options
author | Hieu Hoang <hieu@hoang.co.uk> | 2013-05-29 21:16:15 +0400 |
---|---|---|
committer | Hieu Hoang <hieu@hoang.co.uk> | 2013-05-29 21:16:15 +0400 |
commit | 6249432407af8730c10bccc7894c0725fcaf5e47 (patch) | |
tree | 3ac1f094b9fdc199b04bc5ef209ce00e3596e37d /biconcor | |
parent | 59bd7deb4b6b9c4f7b3b7dbb055783528fbc31ca (diff) |
beautify
Diffstat (limited to 'biconcor')
-rw-r--r-- | biconcor/Alignment.cpp | 11 | ||||
-rw-r--r-- | biconcor/Mismatch.cpp | 443 | ||||
-rw-r--r-- | biconcor/Mismatch.h | 4 | ||||
-rw-r--r-- | biconcor/PhrasePair.cpp | 123 | ||||
-rw-r--r-- | biconcor/PhrasePairCollection.cpp | 175 | ||||
-rw-r--r-- | biconcor/SuffixArray.cpp | 21 | ||||
-rw-r--r-- | biconcor/TargetCorpus.cpp | 13 | ||||
-rw-r--r-- | biconcor/Vocabulary.cpp | 3 | ||||
-rw-r--r-- | biconcor/base64.cpp | 25 | ||||
-rw-r--r-- | biconcor/biconcor.cpp | 11 |
10 files changed, 416 insertions, 413 deletions
diff --git a/biconcor/Alignment.cpp b/biconcor/Alignment.cpp index e73e18840..814802531 100644 --- a/biconcor/Alignment.cpp +++ b/biconcor/Alignment.cpp @@ -5,7 +5,8 @@ #include <stdlib.h> #include <cstring> -namespace { +namespace +{ const int LINE_MAX_LENGTH = 10000; @@ -84,10 +85,10 @@ void Alignment::Create(const string& fileName) } Alignment::Alignment() - : m_array(NULL), - m_sentenceEnd(NULL), - m_size(0), - m_sentenceCount(0) {} + : m_array(NULL), + m_sentenceEnd(NULL), + m_size(0), + m_sentenceCount(0) {} Alignment::~Alignment() { diff --git a/biconcor/Mismatch.cpp b/biconcor/Mismatch.cpp index 31140b200..c3afec781 100644 --- a/biconcor/Mismatch.cpp +++ b/biconcor/Mismatch.cpp @@ -23,16 +23,16 @@ enum { }; Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end ) - :m_suffixArray(sa) - ,m_targetCorpus(tc) - ,m_alignment(a) - ,m_sentence_id(sentence_id) - ,m_source_length(source_length) - ,m_target_length(target_length) - ,m_source_position(position) - ,m_source_start(source_start) - ,m_source_end(source_end) - ,m_unaligned(true) + :m_suffixArray(sa) + ,m_targetCorpus(tc) + ,m_alignment(a) + ,m_sentence_id(sentence_id) + ,m_source_length(source_length) + ,m_target_length(target_length) + ,m_source_position(position) + ,m_source_start(source_start) + ,m_source_end(source_end) + ,m_unaligned(true) { // initialize unaligned indexes for (int i = 0; i < m_source_length; i++) { @@ -42,7 +42,7 @@ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sente m_target_unaligned[i] = true; } m_num_alignment_points = - m_alignment->GetNumberOfAlignmentPoints( sentence_id ); + m_alignment->GetNumberOfAlignmentPoints( sentence_id ); for(INDEX ap=0; ap<m_num_alignment_points; ap++) { m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false; m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false; @@ -58,234 +58,235 @@ Mismatch::~Mismatch () {} void Mismatch::PrintClippedHTML( ostream* out, int width ) { - int source_annotation[256], target_annotation[256]; - vector< string > label_class; - label_class.push_back( "" ); - label_class.push_back( "mismatch_pre_aligned" ); - label_class.push_back( "mismatch_post_aligned" ); - label_class.push_back( "null_aligned" ); - label_class.push_back( "mismatch_misaligned" ); - label_class.push_back( "mismatch_aligned" ); + int source_annotation[256], target_annotation[256]; + vector< string > label_class; + label_class.push_back( "" ); + label_class.push_back( "mismatch_pre_aligned" ); + label_class.push_back( "mismatch_post_aligned" ); + label_class.push_back( "null_aligned" ); + label_class.push_back( "mismatch_misaligned" ); + label_class.push_back( "mismatch_aligned" ); - for(int i=0; i<m_source_length;i++) source_annotation[i] = UNANNOTATED; - for(int i=0; i<m_target_length;i++) target_annotation[i] = UNANNOTATED; - - if (m_unaligned) { - // find alignment points for prior and next word(s) and - // center target phrase around those. - bool found_aligned = false; - for(int i=1; i<m_source_length && !found_aligned; i++) { - if (m_source_start-i >= 0) { - int word_id = m_source_start-i; - source_annotation[ word_id ] = UNALIGNED; - if (!m_source_unaligned[ word_id ]) { - found_aligned = true; - LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED ); - } - } + for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED; + for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED; - if (m_source_end+i < m_source_length) { - int word_id = m_source_end+i; - source_annotation[ word_id ] = UNALIGNED; - if (!m_source_unaligned[ word_id ]) { - found_aligned = true; - LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED ); - } - } - } - - } - // misalignment - else { - // label aligned output words - for(int i=m_source_start; i<=m_source_end; i++) - LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED ); + if (m_unaligned) { + // find alignment points for prior and next word(s) and + // center target phrase around those. + bool found_aligned = false; + for(int i=1; i<m_source_length && !found_aligned; i++) { + if (m_source_start-i >= 0) { + int word_id = m_source_start-i; + source_annotation[ word_id ] = UNALIGNED; + if (!m_source_unaligned[ word_id ]) { + found_aligned = true; + LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED ); + } + } - // find first and last - int target_start = -1; - int target_end; - for(int i=0; i<m_target_length; i++) - if (target_annotation[i] == ALIGNED) { - if (target_start == -1) - target_start = i; - target_end = i; - } - // go over all enclosed target words - for(int i=target_start; i<=target_end; i++) { - // label other target words as unaligned or misaligned - if (m_target_unaligned[ i ]) - target_annotation[ i ] = UNALIGNED; - else { - if (target_annotation[ i ] != ALIGNED) - target_annotation[ i ] = MISALIGNED; - // loop over aligned source words - for(INDEX ap=0; ap<m_num_alignment_points; ap++) { - if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) { - int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); - // if not part of the source phrase -> also misaligned - if (source_word < m_source_start || source_word > m_source_end) - source_annotation[ source_word ] = MISALIGNED; - } - } - } - } - // closure - bool change = true; - while(change) { - change = false; - for(INDEX ap=0; ap<m_num_alignment_points; ap++) { - int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); - int target_word = m_alignment->GetTargetWord( m_sentence_id, ap ); - if (source_annotation[source_word] != UNANNOTATED && - target_annotation[target_word] == UNANNOTATED) { - target_annotation[target_word] = MISALIGNED; - change = true; - } - if (source_annotation[source_word] == UNANNOTATED && - target_annotation[target_word] != UNANNOTATED) { - source_annotation[source_word] = MISALIGNED; - change = true; - } - } - } - } - - // print source - // shorten source context if too long + if (m_source_end+i < m_source_length) { + int word_id = m_source_end+i; + source_annotation[ word_id ] = UNALIGNED; + if (!m_source_unaligned[ word_id ]) { + found_aligned = true; + LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED ); + } + } + } + + } + // misalignment + else { + // label aligned output words + for(int i=m_source_start; i<=m_source_end; i++) + LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED ); + + // find first and last + int target_start = -1; + int target_end; + for(int i=0; i<m_target_length; i++) + if (target_annotation[i] == ALIGNED) { + if (target_start == -1) + target_start = i; + target_end = i; + } + // go over all enclosed target words + for(int i=target_start; i<=target_end; i++) { + // label other target words as unaligned or misaligned + if (m_target_unaligned[ i ]) + target_annotation[ i ] = UNALIGNED; + else { + if (target_annotation[ i ] != ALIGNED) + target_annotation[ i ] = MISALIGNED; + // loop over aligned source words + for(INDEX ap=0; ap<m_num_alignment_points; ap++) { + if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) { + int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); + // if not part of the source phrase -> also misaligned + if (source_word < m_source_start || source_word > m_source_end) + source_annotation[ source_word ] = MISALIGNED; + } + } + } + } + // closure + bool change = true; + while(change) { + change = false; + for(INDEX ap=0; ap<m_num_alignment_points; ap++) { + int source_word = m_alignment->GetSourceWord( m_sentence_id, ap ); + int target_word = m_alignment->GetTargetWord( m_sentence_id, ap ); + if (source_annotation[source_word] != UNANNOTATED && + target_annotation[target_word] == UNANNOTATED) { + target_annotation[target_word] = MISALIGNED; + change = true; + } + if (source_annotation[source_word] == UNANNOTATED && + target_annotation[target_word] != UNANNOTATED) { + source_annotation[source_word] = MISALIGNED; + change = true; + } + } + } + } + + // print source + // shorten source context if too long int sentence_start = m_source_position - m_source_start; - int context_space = width/2; - for(int i=m_source_start;i<=m_source_end;i++) - context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1; - context_space /= 2; + int context_space = width/2; + for(int i=m_source_start; i<=m_source_end; i++) + context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1; + context_space /= 2; - int remaining = context_space; - int start_word = m_source_start; - for(;start_word>0 && remaining>0; start_word--) - remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1; - if (remaining<0 || start_word == -1) start_word++; + int remaining = context_space; + int start_word = m_source_start; + for(; start_word>0 && remaining>0; start_word--) + remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1; + if (remaining<0 || start_word == -1) start_word++; - remaining = context_space; - int end_word = m_source_end; - for(;end_word<m_source_length && remaining>0; end_word++) - remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1; - end_word--; + remaining = context_space; + int end_word = m_source_end; + for(; end_word<m_source_length && remaining>0; end_word++) + remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1; + end_word--; - // output with markup - *out << "<tr><td class=\"pp_source_left\">"; - char current_label = UNANNOTATED; - if (start_word>0) { - current_label = source_annotation[start_word-1]; - *out << "... "; - } - for(int i=start_word; i<=end_word; i++) { - // change to phrase block - if (i == m_source_start) { - if (current_label != UNANNOTATED && i!=start_word) - *out << "</span>"; - *out << "</td><td class=\"pp_source\">"; - current_label = UNANNOTATED; - } + // output with markup + *out << "<tr><td class=\"pp_source_left\">"; + char current_label = UNANNOTATED; + if (start_word>0) { + current_label = source_annotation[start_word-1]; + *out << "... "; + } + for(int i=start_word; i<=end_word; i++) { + // change to phrase block + if (i == m_source_start) { + if (current_label != UNANNOTATED && i!=start_word) + *out << "</span>"; + *out << "</td><td class=\"pp_source\">"; + current_label = UNANNOTATED; + } - // change to labeled word - else if (source_annotation[i] != current_label && - source_annotation[i] != ALIGNED) { - if (current_label != UNANNOTATED && i!=start_word) - *out << "</span>"; - if (source_annotation[i] != UNANNOTATED) - *out << "<span class=\"" - << label_class[ source_annotation[i] ] - << "\">"; - current_label = source_annotation[i]; - } + // change to labeled word + else if (source_annotation[i] != current_label && + source_annotation[i] != ALIGNED) { + if (current_label != UNANNOTATED && i!=start_word) + *out << "</span>"; + if (source_annotation[i] != UNANNOTATED) + *out << "<span class=\"" + << label_class[ source_annotation[i] ] + << "\">"; + current_label = source_annotation[i]; + } - // output word - *out << m_suffixArray->GetWord( sentence_start + i ) << " "; + // output word + *out << m_suffixArray->GetWord( sentence_start + i ) << " "; - // change to right context block - if (i == m_source_end) { - *out << "</td><td class=\"pp_source_right\">"; - current_label = UNANNOTATED; - } - } + // change to right context block + if (i == m_source_end) { + *out << "</td><td class=\"pp_source_right\">"; + current_label = UNANNOTATED; + } + } - if (current_label != UNANNOTATED && end_word>m_source_end) - *out << "</span>"; - if (end_word<m_source_length-1) - *out << "... "; + if (current_label != UNANNOTATED && end_word>m_source_end) + *out << "</span>"; + if (end_word<m_source_length-1) + *out << "... "; - // print target - // shorten target context if too long - int target_start = -1; - int target_end; - for(int i=0; i<m_target_length; i++) - if (target_annotation[i] != UNANNOTATED) { - if (target_start == -1) - target_start = i; - target_end = i; - } + // print target + // shorten target context if too long + int target_start = -1; + int target_end; + for(int i=0; i<m_target_length; i++) + if (target_annotation[i] != UNANNOTATED) { + if (target_start == -1) + target_start = i; + target_end = i; + } - context_space = width/2; - for(int i=target_start;i<=target_end;i++) - context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1; - while (context_space < 0) { // shorten matched part, if too long - context_space += - m_targetCorpus->GetWord( m_sentence_id, target_start ).size() + - m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2; - target_start++; - target_end--; - } - context_space /= 2; + context_space = width/2; + for(int i=target_start; i<=target_end; i++) + context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1; + while (context_space < 0) { // shorten matched part, if too long + context_space += + m_targetCorpus->GetWord( m_sentence_id, target_start ).size() + + m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2; + target_start++; + target_end--; + } + context_space /= 2; - remaining = context_space; - start_word = target_start; - for(;start_word>0 && remaining>0; start_word--) { - //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl; - remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1; - } - if (remaining<0 || start_word == -1) start_word++; + remaining = context_space; + start_word = target_start; + for(; start_word>0 && remaining>0; start_word--) { + //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl; + remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1; + } + if (remaining<0 || start_word == -1) start_word++; - remaining = context_space; - end_word = target_end; - for(;end_word<m_target_length && remaining>0; end_word++) { - //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl; - remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1; - } - end_word--; + remaining = context_space; + end_word = target_end; + for(; end_word<m_target_length && remaining>0; end_word++) { + //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl; + remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1; + } + end_word--; - // output with markup - *out << "</td><td class=\"mismatch_target\">"; - current_label = UNANNOTATED; - if (start_word>0) { - current_label = target_annotation[start_word-1]; - *out << "... "; - } - for(int i=start_word; i<=end_word; i++) { - if (target_annotation[i] != current_label) { - if (current_label != UNANNOTATED && i!=start_word) - *out << "</span>"; - if (target_annotation[i] != UNANNOTATED) - *out << "<span class=\"" - << label_class[ target_annotation[i] ] - << "\">"; - current_label = target_annotation[i]; - } + // output with markup + *out << "</td><td class=\"mismatch_target\">"; + current_label = UNANNOTATED; + if (start_word>0) { + current_label = target_annotation[start_word-1]; + *out << "... "; + } + for(int i=start_word; i<=end_word; i++) { + if (target_annotation[i] != current_label) { + if (current_label != UNANNOTATED && i!=start_word) + *out << "</span>"; + if (target_annotation[i] != UNANNOTATED) + *out << "<span class=\"" + << label_class[ target_annotation[i] ] + << "\">"; + current_label = target_annotation[i]; + } - // output word - *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " "; - } + // output word + *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " "; + } - if (current_label != UNANNOTATED && end_word>target_end) - *out << "</span>"; - if (end_word<m_target_length-1) - *out << "... "; - *out << "</td></tr>"; + if (current_label != UNANNOTATED && end_word>target_end) + *out << "</span>"; + if (end_word<m_target_length-1) + *out << "... "; + *out << "</td></tr>"; } -void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) { - for(INDEX ap=0; ap<m_num_alignment_points; ap++) { - if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) { - source_annotation[ source_id ] = label; - target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label; - } - } +void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) +{ + for(INDEX ap=0; ap<m_num_alignment_points; ap++) { + if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) { + source_annotation[ source_id ] = label; + target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label; + } + } } diff --git a/biconcor/Mismatch.h b/biconcor/Mismatch.h index c0063d049..1277ed95a 100644 --- a/biconcor/Mismatch.h +++ b/biconcor/Mismatch.h @@ -34,7 +34,9 @@ public: Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end ); ~Mismatch(); - bool Unaligned() const { return m_unaligned; } + bool Unaligned() const { + return m_unaligned; + } void PrintClippedHTML(std::ostream* out, int width ); void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ); }; diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp index 038fa3a31..b6409258b 100644 --- a/biconcor/PhrasePair.cpp +++ b/biconcor/PhrasePair.cpp @@ -37,7 +37,7 @@ void PhrasePair::Print( ostream* out ) const INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id ); for( INDEX i=0; i<ap_points; i++) { *out << " " << m_alignment->GetSourceWord( m_sentence_id, i ) - << "-" << m_alignment->GetTargetWord( m_sentence_id, i ); + << "-" << m_alignment->GetTargetWord( m_sentence_id, i ); } *out << endl; @@ -185,27 +185,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const size_t source_pre_width = (source_width-source.size())/2; size_t source_post_width = (source_width-source.size()+1)/2; - // if phrase is too long, don't show any context + // if phrase is too long, don't show any context if (source.size() > (size_t)width) { source_pre_width = 0; source_post_width = 0; } - // too long -> truncate and add "..." + // too long -> truncate and add "..." if (source_pre.size() > source_pre_width) { - // first skip up to a space - while(source_pre_width>0 && - source_pre.substr(source_pre.size()-source_pre_width,1) != " ") { - source_pre_width--; - } + // first skip up to a space + while(source_pre_width>0 && + source_pre.substr(source_pre.size()-source_pre_width,1) != " ") { + source_pre_width--; + } source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ); - } + } if (source_post.size() > source_post_width) { - while(source_post_width>0 && - source_post.substr(source_post_width-1,1) != " ") { - source_post_width--; - } + while(source_post_width>0 && + source_post.substr(source_post_width-1,1) != " ") { + source_post_width--; + } source_post = source_post.substr( 0, source_post_width ) + "..."; - } + } *out << "<tr><td class=\"pp_source_left\">" << source_pre @@ -220,13 +220,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const string target_pre = ""; string target = ""; string target_post = ""; - size_t target_pre_null_width = 0; - size_t target_post_null_width = 0; + size_t target_pre_null_width = 0; + size_t target_post_null_width = 0; for( char i=0; i<m_target_start; i++ ) { - WORD word = m_targetCorpus->GetWord( m_sentence_id, i); + WORD word = m_targetCorpus->GetWord( m_sentence_id, i); target_pre += " " + word; - if (i >= m_target_start-m_pre_null) - target_pre_null_width += word.size() + 1; + if (i >= m_target_start-m_pre_null) + target_pre_null_width += word.size() + 1; } for( char i=m_target_start; i<=m_target_end; i++ ) { if (i>m_target_start) target += " "; @@ -234,11 +234,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const } for( char i=m_target_end+1; i<m_target_length; i++ ) { if (i>m_target_end+1) target_post += " "; - WORD word = m_targetCorpus->GetWord( m_sentence_id, i); + WORD word = m_targetCorpus->GetWord( m_sentence_id, i); target_post += word; - if (i-(m_target_end+1) < m_post_null) { - target_post_null_width += word.size() + 1; - } + if (i-(m_target_end+1) < m_post_null) { + target_post_null_width += word.size() + 1; + } } size_t target_pre_width = (target_width-target.size())/2; @@ -249,46 +249,45 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const target_post_width = 0; } - if (target_pre.size() < target_pre_width) - target_pre_width = target_pre.size(); - else { - while(target_pre_width>0 && - target_pre.substr(target_pre.size()-target_pre_width,1) != " ") { - target_pre_width--; - } + if (target_pre.size() < target_pre_width) + target_pre_width = target_pre.size(); + else { + while(target_pre_width>0 && + target_pre.substr(target_pre.size()-target_pre_width,1) != " ") { + target_pre_width--; + } target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ); - } - - if (target_post.size() < target_post_width) { - target_post_width = target_post.size(); - } - else { - while(target_post_width>0 && - target_post.substr(target_post_width-1,1) != " ") { - target_post_width--; - } - target_post = target_post.substr( 0, target_post_width ) + "..."; - } - - if (m_pre_null) { - //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl; - if (target_pre_width < target_pre.size()) - target_pre_null_width -= target_pre.size()-target_pre_width; - target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width) - + "<span class=\"null_aligned\">" - + target_pre.substr(target_pre_width-target_pre_null_width) - + "</span>"; - } - if (m_post_null) { - //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl; - if (target_post_null_width > target_post.size()) { - target_post_null_width = target_post.size(); - } - target_post = "<span class=\"null_aligned\">" - + target_post.substr(0,target_post_null_width) - + "</span>" - + target_post.substr(target_post_null_width); - } + } + + if (target_post.size() < target_post_width) { + target_post_width = target_post.size(); + } else { + while(target_post_width>0 && + target_post.substr(target_post_width-1,1) != " ") { + target_post_width--; + } + target_post = target_post.substr( 0, target_post_width ) + "..."; + } + + if (m_pre_null) { + //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl; + if (target_pre_width < target_pre.size()) + target_pre_null_width -= target_pre.size()-target_pre_width; + target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width) + + "<span class=\"null_aligned\">" + + target_pre.substr(target_pre_width-target_pre_null_width) + + "</span>"; + } + if (m_post_null) { + //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl; + if (target_post_null_width > target_post.size()) { + target_post_null_width = target_post.size(); + } + target_post = "<span class=\"null_aligned\">" + + target_post.substr(0,target_post_null_width) + + "</span>" + + target_post.substr(target_post_null_width); + } *out << "<td class=\"pp_target_left\">" << target_pre diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp index 7497b2af8..dd21faad3 100644 --- a/biconcor/PhrasePairCollection.cpp +++ b/biconcor/PhrasePairCollection.cpp @@ -47,15 +47,15 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString ) int sentence_length = m_suffixArray->GetSentenceLength( sentence_id ); int target_length = m_targetCorpus->GetSentenceLength( sentence_id ); //cerr << "match " << (i-first_match) - //<< " in sentence " << sentence_id - //<< ", starting at word " << source_start - //<< " of " << sentence_length - //<< ". target sentence has " << target_length << " words."; + //<< " in sentence " << sentence_id + //<< ", starting at word " << source_start + //<< " of " << sentence_length + //<< ". target sentence has " << target_length << " words."; int target_start, target_end, pre_null, post_null; if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) { //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]"; //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")"; - bool null_boundary_words = false; + bool null_boundary_words = false; for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) { for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) { vector< WORD_ID > targetString; @@ -75,19 +75,18 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString ) m_size++; } } + } else { + //cerr << "mismatch " << (i-first_match) + // << " in sentence " << sentence_id + // << ", starting at word " << source_start + // << " of " << sentence_length + // << ". target sentence has " << target_length << " words."; + Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); + if (mismatch->Unaligned()) + m_unaligned.push_back( mismatch ); + else + m_mismatch.push_back( mismatch ); } - else { - //cerr << "mismatch " << (i-first_match) - // << " in sentence " << sentence_id - // << ", starting at word " << source_start - // << " of " << sentence_length - // << ". target sentence has " << target_length << " words."; - Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end ); - if (mismatch->Unaligned()) - m_unaligned.push_back( mismatch ); - else - m_mismatch.push_back( mismatch ); - } //cerr << endl; if (found > (INDEX)m_max_lookup) { @@ -111,8 +110,7 @@ void PhrasePairCollection::Print(bool pretty) const for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) { if (pretty) { (*p)->PrintPretty( &cout, 100 ); - } - else { + } else { (*p)->Print( &cout ); } if (ppWithSameTarget->size() > m_max_example) { @@ -125,33 +123,32 @@ void PhrasePairCollection::Print(bool pretty) const void PhrasePairCollection::PrintHTML() const { int pp_target = 0; - bool singleton = false; - // loop over all translations + bool singleton = false; + // loop over all translations vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget; for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) { - int count = ppWithSameTarget->size(); - if (!singleton) { - if (count == 1) { - singleton = true; - cout << "<p class=\"pp_singleton_header\">singleton" - << (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" - << (m_collection.end() - ppWithSameTarget) - << "/" << m_size << ")</p>"; - } - else { - cout << "<p class=\"pp_target_header\">"; - (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); - cout << " (" << count << "/" << m_size << ")" << endl; - cout << "<p><div id=\"pp_" << pp_target << "\">"; - } - cout << "<table align=\"center\">"; - } + int count = ppWithSameTarget->size(); + if (!singleton) { + if (count == 1) { + singleton = true; + cout << "<p class=\"pp_singleton_header\">singleton" + << (m_collection.end() - ppWithSameTarget==1?"":"s") << " (" + << (m_collection.end() - ppWithSameTarget) + << "/" << m_size << ")</p>"; + } else { + cout << "<p class=\"pp_target_header\">"; + (*(ppWithSameTarget->begin()))->PrintTarget( &cout ); + cout << " (" << count << "/" << m_size << ")" << endl; + cout << "<p><div id=\"pp_" << pp_target << "\">"; + } + cout << "<table align=\"center\">"; + } vector< PhrasePair* >::const_iterator p; - // loop over all sentences where translation occurs + // loop over all sentences where translation occurs int pp=0; - int i=0; + int i=0; for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { (*p)->PrintClippedHTML( &cout, 160 ); if (count > m_max_example) { @@ -159,54 +156,54 @@ void PhrasePairCollection::PrintHTML() const pp += count/m_max_example-1; } } - if (i == 10 && pp < count) { - // extended table - cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>"; - cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">"; - cout << "<table align=\"center\">"; - for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { - (*p)->PrintClippedHTML( &cout, 160 ); - if (count > m_max_example) { - p += count/m_max_example-1; - pp += count/m_max_example-1; - } - } - } - if (!singleton) cout << "</table></div>\n"; - - if (!singleton && pp_target == 9) { - cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">"; - cout << "<p class=\"pp_target_header\">(more)</p></div>"; - cout << "<div id=\"pp_additional\" style=\"display:none;\";\">"; - } + if (i == 10 && pp < count) { + // extended table + cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>"; + cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">"; + cout << "<table align=\"center\">"; + for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) { + (*p)->PrintClippedHTML( &cout, 160 ); + if (count > m_max_example) { + p += count/m_max_example-1; + pp += count/m_max_example-1; + } + } + } + if (!singleton) cout << "</table></div>\n"; + + if (!singleton && pp_target == 9) { + cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">"; + cout << "<p class=\"pp_target_header\">(more)</p></div>"; + cout << "<div id=\"pp_additional\" style=\"display:none;\";\">"; + } + } + if (singleton) cout << "</table></div>\n"; + else if (pp_target > 9) cout << "</div>"; + + size_t max_mismatch = m_max_example/3; + // unaligned phrases + if (m_unaligned.size() > 0) { + cout << "<p class=\"pp_singleton_header\">unaligned" + << " (" << (m_unaligned.size()) << ")</p>"; + cout << "<table align=\"center\">"; + int step_size = 1; + if (m_unaligned.size() > max_mismatch) + step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; + for(size_t i=0; i<m_unaligned.size(); i+=step_size) + m_unaligned[i]->PrintClippedHTML( &cout, 160 ); + cout << "</table>"; + } + + // mismatched phrases + if (m_mismatch.size() > 0) { + cout << "<p class=\"pp_singleton_header\">mismatched" + << " (" << (m_mismatch.size()) << ")</p>"; + cout << "<table align=\"center\">"; + int step_size = 1; + if (m_mismatch.size() > max_mismatch) + step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; + for(size_t i=0; i<m_mismatch.size(); i+=step_size) + m_mismatch[i]->PrintClippedHTML( &cout, 160 ); + cout << "</table>"; } - if (singleton) cout << "</table></div>\n"; - else if (pp_target > 9) cout << "</div>"; - - size_t max_mismatch = m_max_example/3; - // unaligned phrases - if (m_unaligned.size() > 0) { - cout << "<p class=\"pp_singleton_header\">unaligned" - << " (" << (m_unaligned.size()) << ")</p>"; - cout << "<table align=\"center\">"; - int step_size = 1; - if (m_unaligned.size() > max_mismatch) - step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch; - for(size_t i=0;i<m_unaligned.size();i+=step_size) - m_unaligned[i]->PrintClippedHTML( &cout, 160 ); - cout << "</table>"; - } - - // mismatched phrases - if (m_mismatch.size() > 0) { - cout << "<p class=\"pp_singleton_header\">mismatched" - << " (" << (m_mismatch.size()) << ")</p>"; - cout << "<table align=\"center\">"; - int step_size = 1; - if (m_mismatch.size() > max_mismatch) - step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch; - for(size_t i=0;i<m_mismatch.size();i+=step_size) - m_mismatch[i]->PrintClippedHTML( &cout, 160 ); - cout << "</table>"; - } } diff --git a/biconcor/SuffixArray.cpp b/biconcor/SuffixArray.cpp index 15e6b47b0..f4122a2d8 100644 --- a/biconcor/SuffixArray.cpp +++ b/biconcor/SuffixArray.cpp @@ -5,7 +5,8 @@ #include <stdlib.h> #include <cstring> -namespace { +namespace +{ const int LINE_MAX_LENGTH = 10000; @@ -14,15 +15,15 @@ const int LINE_MAX_LENGTH = 10000; using namespace std; SuffixArray::SuffixArray() - : m_array(NULL), - m_index(NULL), - m_buffer(NULL), - m_wordInSentence(NULL), - m_sentence(NULL), - m_sentenceLength(NULL), - m_vcb(), - m_size(0), - m_sentenceCount(0) { } + : m_array(NULL), + m_index(NULL), + m_buffer(NULL), + m_wordInSentence(NULL), + m_sentence(NULL), + m_sentenceLength(NULL), + m_vcb(), + m_size(0), + m_sentenceCount(0) { } SuffixArray::~SuffixArray() { diff --git a/biconcor/TargetCorpus.cpp b/biconcor/TargetCorpus.cpp index d331a548a..06468007f 100644 --- a/biconcor/TargetCorpus.cpp +++ b/biconcor/TargetCorpus.cpp @@ -5,7 +5,8 @@ #include <stdlib.h> #include <cstring> -namespace { +namespace +{ const int LINE_MAX_LENGTH = 10000; @@ -14,11 +15,11 @@ const int LINE_MAX_LENGTH = 10000; using namespace std; TargetCorpus::TargetCorpus() - : m_array(NULL), - m_sentenceEnd(NULL), - m_vcb(), - m_size(0), - m_sentenceCount(0) {} + : m_array(NULL), + m_sentenceEnd(NULL), + m_vcb(), + m_size(0), + m_sentenceCount(0) {} TargetCorpus::~TargetCorpus() { diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp index 9c35b3feb..9d52ee44e 100644 --- a/biconcor/Vocabulary.cpp +++ b/biconcor/Vocabulary.cpp @@ -2,7 +2,8 @@ #include "Vocabulary.h"
#include <fstream>
-namespace {
+namespace
+{
const int MAX_LENGTH = 10000;
diff --git a/biconcor/base64.cpp b/biconcor/base64.cpp index 2a863d161..8032399b5 100644 --- a/biconcor/base64.cpp +++ b/biconcor/base64.cpp @@ -1,4 +1,4 @@ -/* +/* base64.cpp and base64.h Copyright (C) 2004-2008 René Nyffenegger @@ -28,17 +28,19 @@ #include "base64.h" #include <iostream> -static const std::string base64_chars = - "ABCDEFGHIJKLMNOPQRSTUVWXYZ" - "abcdefghijklmnopqrstuvwxyz" - "0123456789+/"; +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; -static inline bool is_base64(unsigned char c) { +static inline bool is_base64(unsigned char c) +{ return (isalnum(c) || (c == '+') || (c == '/')); } -std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) { +std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) +{ std::string ret; int i = 0; int j = 0; @@ -59,8 +61,7 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_ } } - if (i) - { + if (i) { for(j = i; j < 3; j++) char_array_3[j] = '\0'; @@ -81,7 +82,8 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_ } -std::string base64_decode(std::string const& encoded_string) { +std::string base64_decode(std::string const& encoded_string) +{ int in_len = encoded_string.size(); int i = 0; int j = 0; @@ -90,7 +92,8 @@ std::string base64_decode(std::string const& encoded_string) { std::string ret; while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { - char_array_4[i++] = encoded_string[in_]; in_++; + char_array_4[i++] = encoded_string[in_]; + in_++; if (i ==4) { for (i = 0; i <4; i++) char_array_4[i] = base64_chars.find(char_array_4[i]); diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp index f4e7c03fb..cb63e855d 100644 --- a/biconcor/biconcor.cpp +++ b/biconcor/biconcor.cpp @@ -150,22 +150,19 @@ int main(int argc, char* argv[]) cout << "TOTAL: " << total << endl; if (htmlFlag) { ppCollection.PrintHTML(); - } - else { - ppCollection.Print(prettyFlag); + } else { + ppCollection.Print(prettyFlag); } cout << "-|||- BICONCOR END -|||-" << endl << flush; } - } - else if (queryFlag) { + } else if (queryFlag) { cerr << "query is " << query << endl; vector< string > queryString = alignment.Tokenize( query.c_str() ); PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example ); ppCollection.GetCollection( queryString ); if (htmlFlag) { ppCollection.PrintHTML(); - } - else { + } else { ppCollection.Print(prettyFlag); } } |