Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <hieu@hoang.co.uk>, 2013-05-29 21:16:15 +0400
committer: Hieu Hoang <hieu@hoang.co.uk>, 2013-05-29 21:16:15 +0400
commit: 6249432407af8730c10bccc7894c0725fcaf5e47 (patch)
tree: 3ac1f094b9fdc199b04bc5ef209ce00e3596e37d /biconcor
parent: 59bd7deb4b6b9c4f7b3b7dbb055783528fbc31ca (diff)
beautify
Diffstat (limited to 'biconcor')
-rw-r--r-- biconcor/Alignment.cpp 11
-rw-r--r-- biconcor/Mismatch.cpp 443
-rw-r--r-- biconcor/Mismatch.h 4
-rw-r--r-- biconcor/PhrasePair.cpp 123
-rw-r--r-- biconcor/PhrasePairCollection.cpp 175
-rw-r--r-- biconcor/SuffixArray.cpp 21
-rw-r--r-- biconcor/TargetCorpus.cpp 13
-rw-r--r-- biconcor/Vocabulary.cpp 3
-rw-r--r-- biconcor/base64.cpp 25
-rw-r--r-- biconcor/biconcor.cpp 11
10 files changed, 416 insertions, 413 deletions
diff --git a/biconcor/Alignment.cpp b/biconcor/Alignment.cpp
index e73e18840..814802531 100644
--- a/biconcor/Alignment.cpp
+++ b/biconcor/Alignment.cpp
@@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
-namespace {
+namespace
+{
const int LINE_MAX_LENGTH = 10000;
@@ -84,10 +85,10 @@ void Alignment::Create(const string& fileName)
}
Alignment::Alignment()
- : m_array(NULL),
- m_sentenceEnd(NULL),
- m_size(0),
- m_sentenceCount(0) {}
+ : m_array(NULL),
+ m_sentenceEnd(NULL),
+ m_size(0),
+ m_sentenceCount(0) {}
Alignment::~Alignment()
{
diff --git a/biconcor/Mismatch.cpp b/biconcor/Mismatch.cpp
index 31140b200..c3afec781 100644
--- a/biconcor/Mismatch.cpp
+++ b/biconcor/Mismatch.cpp
@@ -23,16 +23,16 @@ enum {
};
Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end )
- :m_suffixArray(sa)
- ,m_targetCorpus(tc)
- ,m_alignment(a)
- ,m_sentence_id(sentence_id)
- ,m_source_length(source_length)
- ,m_target_length(target_length)
- ,m_source_position(position)
- ,m_source_start(source_start)
- ,m_source_end(source_end)
- ,m_unaligned(true)
+ :m_suffixArray(sa)
+ ,m_targetCorpus(tc)
+ ,m_alignment(a)
+ ,m_sentence_id(sentence_id)
+ ,m_source_length(source_length)
+ ,m_target_length(target_length)
+ ,m_source_position(position)
+ ,m_source_start(source_start)
+ ,m_source_end(source_end)
+ ,m_unaligned(true)
{
// initialize unaligned indexes
for (int i = 0; i < m_source_length; i++) {
@@ -42,7 +42,7 @@ Mismatch::Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sente
m_target_unaligned[i] = true;
}
m_num_alignment_points =
- m_alignment->GetNumberOfAlignmentPoints( sentence_id );
+ m_alignment->GetNumberOfAlignmentPoints( sentence_id );
for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
m_source_unaligned[ (int)m_alignment->GetSourceWord( sentence_id, ap ) ] = false;
m_target_unaligned[ (int)m_alignment->GetTargetWord( sentence_id, ap ) ] = false;
@@ -58,234 +58,235 @@ Mismatch::~Mismatch () {}
void Mismatch::PrintClippedHTML( ostream* out, int width )
{
- int source_annotation[256], target_annotation[256];
- vector< string > label_class;
- label_class.push_back( "" );
- label_class.push_back( "mismatch_pre_aligned" );
- label_class.push_back( "mismatch_post_aligned" );
- label_class.push_back( "null_aligned" );
- label_class.push_back( "mismatch_misaligned" );
- label_class.push_back( "mismatch_aligned" );
+ int source_annotation[256], target_annotation[256];
+ vector< string > label_class;
+ label_class.push_back( "" );
+ label_class.push_back( "mismatch_pre_aligned" );
+ label_class.push_back( "mismatch_post_aligned" );
+ label_class.push_back( "null_aligned" );
+ label_class.push_back( "mismatch_misaligned" );
+ label_class.push_back( "mismatch_aligned" );
- for(int i=0; i<m_source_length;i++) source_annotation[i] = UNANNOTATED;
- for(int i=0; i<m_target_length;i++) target_annotation[i] = UNANNOTATED;
-
- if (m_unaligned) {
- // find alignment points for prior and next word(s) and
- // center target phrase around those.
- bool found_aligned = false;
- for(int i=1; i<m_source_length && !found_aligned; i++) {
- if (m_source_start-i >= 0) {
- int word_id = m_source_start-i;
- source_annotation[ word_id ] = UNALIGNED;
- if (!m_source_unaligned[ word_id ]) {
- found_aligned = true;
- LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
- }
- }
+ for(int i=0; i<m_source_length; i++) source_annotation[i] = UNANNOTATED;
+ for(int i=0; i<m_target_length; i++) target_annotation[i] = UNANNOTATED;
- if (m_source_end+i < m_source_length) {
- int word_id = m_source_end+i;
- source_annotation[ word_id ] = UNALIGNED;
- if (!m_source_unaligned[ word_id ]) {
- found_aligned = true;
- LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
- }
- }
- }
-
- }
- // misalignment
- else {
- // label aligned output words
- for(int i=m_source_start; i<=m_source_end; i++)
- LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
+ if (m_unaligned) {
+ // find alignment points for prior and next word(s) and
+ // center target phrase around those.
+ bool found_aligned = false;
+ for(int i=1; i<m_source_length && !found_aligned; i++) {
+ if (m_source_start-i >= 0) {
+ int word_id = m_source_start-i;
+ source_annotation[ word_id ] = UNALIGNED;
+ if (!m_source_unaligned[ word_id ]) {
+ found_aligned = true;
+ LabelSourceMatches( source_annotation, target_annotation, word_id, PRE_ALIGNED );
+ }
+ }
- // find first and last
- int target_start = -1;
- int target_end;
- for(int i=0; i<m_target_length; i++)
- if (target_annotation[i] == ALIGNED) {
- if (target_start == -1)
- target_start = i;
- target_end = i;
- }
- // go over all enclosed target words
- for(int i=target_start; i<=target_end; i++) {
- // label other target words as unaligned or misaligned
- if (m_target_unaligned[ i ])
- target_annotation[ i ] = UNALIGNED;
- else {
- if (target_annotation[ i ] != ALIGNED)
- target_annotation[ i ] = MISALIGNED;
- // loop over aligned source words
- for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
- if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
- int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
- // if not part of the source phrase -> also misaligned
- if (source_word < m_source_start || source_word > m_source_end)
- source_annotation[ source_word ] = MISALIGNED;
- }
- }
- }
- }
- // closure
- bool change = true;
- while(change) {
- change = false;
- for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
- int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
- int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
- if (source_annotation[source_word] != UNANNOTATED &&
- target_annotation[target_word] == UNANNOTATED) {
- target_annotation[target_word] = MISALIGNED;
- change = true;
- }
- if (source_annotation[source_word] == UNANNOTATED &&
- target_annotation[target_word] != UNANNOTATED) {
- source_annotation[source_word] = MISALIGNED;
- change = true;
- }
- }
- }
- }
-
- // print source
- // shorten source context if too long
+ if (m_source_end+i < m_source_length) {
+ int word_id = m_source_end+i;
+ source_annotation[ word_id ] = UNALIGNED;
+ if (!m_source_unaligned[ word_id ]) {
+ found_aligned = true;
+ LabelSourceMatches( source_annotation, target_annotation, word_id, POST_ALIGNED );
+ }
+ }
+ }
+
+ }
+ // misalignment
+ else {
+ // label aligned output words
+ for(int i=m_source_start; i<=m_source_end; i++)
+ LabelSourceMatches( source_annotation, target_annotation, i, ALIGNED );
+
+ // find first and last
+ int target_start = -1;
+ int target_end;
+ for(int i=0; i<m_target_length; i++)
+ if (target_annotation[i] == ALIGNED) {
+ if (target_start == -1)
+ target_start = i;
+ target_end = i;
+ }
+ // go over all enclosed target words
+ for(int i=target_start; i<=target_end; i++) {
+ // label other target words as unaligned or misaligned
+ if (m_target_unaligned[ i ])
+ target_annotation[ i ] = UNALIGNED;
+ else {
+ if (target_annotation[ i ] != ALIGNED)
+ target_annotation[ i ] = MISALIGNED;
+ // loop over aligned source words
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
+ if (m_alignment->GetTargetWord( m_sentence_id, ap ) == i) {
+ int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
+ // if not part of the source phrase -> also misaligned
+ if (source_word < m_source_start || source_word > m_source_end)
+ source_annotation[ source_word ] = MISALIGNED;
+ }
+ }
+ }
+ }
+ // closure
+ bool change = true;
+ while(change) {
+ change = false;
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
+ int source_word = m_alignment->GetSourceWord( m_sentence_id, ap );
+ int target_word = m_alignment->GetTargetWord( m_sentence_id, ap );
+ if (source_annotation[source_word] != UNANNOTATED &&
+ target_annotation[target_word] == UNANNOTATED) {
+ target_annotation[target_word] = MISALIGNED;
+ change = true;
+ }
+ if (source_annotation[source_word] == UNANNOTATED &&
+ target_annotation[target_word] != UNANNOTATED) {
+ source_annotation[source_word] = MISALIGNED;
+ change = true;
+ }
+ }
+ }
+ }
+
+ // print source
+ // shorten source context if too long
int sentence_start = m_source_position - m_source_start;
- int context_space = width/2;
- for(int i=m_source_start;i<=m_source_end;i++)
- context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
- context_space /= 2;
+ int context_space = width/2;
+ for(int i=m_source_start; i<=m_source_end; i++)
+ context_space -= m_suffixArray->GetWord( sentence_start + i ).size() + 1;
+ context_space /= 2;
- int remaining = context_space;
- int start_word = m_source_start;
- for(;start_word>0 && remaining>0; start_word--)
- remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
- if (remaining<0 || start_word == -1) start_word++;
+ int remaining = context_space;
+ int start_word = m_source_start;
+ for(; start_word>0 && remaining>0; start_word--)
+ remaining -= m_suffixArray->GetWord( sentence_start + start_word-1 ).size() + 1;
+ if (remaining<0 || start_word == -1) start_word++;
- remaining = context_space;
- int end_word = m_source_end;
- for(;end_word<m_source_length && remaining>0; end_word++)
- remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
- end_word--;
+ remaining = context_space;
+ int end_word = m_source_end;
+ for(; end_word<m_source_length && remaining>0; end_word++)
+ remaining -= m_suffixArray->GetWord( sentence_start + end_word ).size() + 1;
+ end_word--;
- // output with markup
- *out << "<tr><td class=\"pp_source_left\">";
- char current_label = UNANNOTATED;
- if (start_word>0) {
- current_label = source_annotation[start_word-1];
- *out << "... ";
- }
- for(int i=start_word; i<=end_word; i++) {
- // change to phrase block
- if (i == m_source_start) {
- if (current_label != UNANNOTATED && i!=start_word)
- *out << "</span>";
- *out << "</td><td class=\"pp_source\">";
- current_label = UNANNOTATED;
- }
+ // output with markup
+ *out << "<tr><td class=\"pp_source_left\">";
+ char current_label = UNANNOTATED;
+ if (start_word>0) {
+ current_label = source_annotation[start_word-1];
+ *out << "... ";
+ }
+ for(int i=start_word; i<=end_word; i++) {
+ // change to phrase block
+ if (i == m_source_start) {
+ if (current_label != UNANNOTATED && i!=start_word)
+ *out << "</span>";
+ *out << "</td><td class=\"pp_source\">";
+ current_label = UNANNOTATED;
+ }
- // change to labeled word
- else if (source_annotation[i] != current_label &&
- source_annotation[i] != ALIGNED) {
- if (current_label != UNANNOTATED && i!=start_word)
- *out << "</span>";
- if (source_annotation[i] != UNANNOTATED)
- *out << "<span class=\""
- << label_class[ source_annotation[i] ]
- << "\">";
- current_label = source_annotation[i];
- }
+ // change to labeled word
+ else if (source_annotation[i] != current_label &&
+ source_annotation[i] != ALIGNED) {
+ if (current_label != UNANNOTATED && i!=start_word)
+ *out << "</span>";
+ if (source_annotation[i] != UNANNOTATED)
+ *out << "<span class=\""
+ << label_class[ source_annotation[i] ]
+ << "\">";
+ current_label = source_annotation[i];
+ }
- // output word
- *out << m_suffixArray->GetWord( sentence_start + i ) << " ";
+ // output word
+ *out << m_suffixArray->GetWord( sentence_start + i ) << " ";
- // change to right context block
- if (i == m_source_end) {
- *out << "</td><td class=\"pp_source_right\">";
- current_label = UNANNOTATED;
- }
- }
+ // change to right context block
+ if (i == m_source_end) {
+ *out << "</td><td class=\"pp_source_right\">";
+ current_label = UNANNOTATED;
+ }
+ }
- if (current_label != UNANNOTATED && end_word>m_source_end)
- *out << "</span>";
- if (end_word<m_source_length-1)
- *out << "... ";
+ if (current_label != UNANNOTATED && end_word>m_source_end)
+ *out << "</span>";
+ if (end_word<m_source_length-1)
+ *out << "... ";
- // print target
- // shorten target context if too long
- int target_start = -1;
- int target_end;
- for(int i=0; i<m_target_length; i++)
- if (target_annotation[i] != UNANNOTATED) {
- if (target_start == -1)
- target_start = i;
- target_end = i;
- }
+ // print target
+ // shorten target context if too long
+ int target_start = -1;
+ int target_end;
+ for(int i=0; i<m_target_length; i++)
+ if (target_annotation[i] != UNANNOTATED) {
+ if (target_start == -1)
+ target_start = i;
+ target_end = i;
+ }
- context_space = width/2;
- for(int i=target_start;i<=target_end;i++)
- context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
- while (context_space < 0) { // shorten matched part, if too long
- context_space +=
- m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
- m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
- target_start++;
- target_end--;
- }
- context_space /= 2;
+ context_space = width/2;
+ for(int i=target_start; i<=target_end; i++)
+ context_space -= m_targetCorpus->GetWord( m_sentence_id, i ).size() + 1;
+ while (context_space < 0) { // shorten matched part, if too long
+ context_space +=
+ m_targetCorpus->GetWord( m_sentence_id, target_start ).size() +
+ m_targetCorpus->GetWord( m_sentence_id, target_end ).size() + 2;
+ target_start++;
+ target_end--;
+ }
+ context_space /= 2;
- remaining = context_space;
- start_word = target_start;
- for(;start_word>0 && remaining>0; start_word--) {
- //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
- remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
- }
- if (remaining<0 || start_word == -1) start_word++;
+ remaining = context_space;
+ start_word = target_start;
+ for(; start_word>0 && remaining>0; start_word--) {
+ //cerr << "remaining: " << remaining << ", start_word: " << start_word << endl;
+ remaining -= m_targetCorpus->GetWord( m_sentence_id, start_word-1 ).size() + 1;
+ }
+ if (remaining<0 || start_word == -1) start_word++;
- remaining = context_space;
- end_word = target_end;
- for(;end_word<m_target_length && remaining>0; end_word++) {
- //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
- remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
- }
- end_word--;
+ remaining = context_space;
+ end_word = target_end;
+ for(; end_word<m_target_length && remaining>0; end_word++) {
+ //cerr << "remaining: " << remaining << ", end_word: " << end_word << endl;
+ remaining -= m_targetCorpus->GetWord( m_sentence_id, end_word ).size() + 1;
+ }
+ end_word--;
- // output with markup
- *out << "</td><td class=\"mismatch_target\">";
- current_label = UNANNOTATED;
- if (start_word>0) {
- current_label = target_annotation[start_word-1];
- *out << "... ";
- }
- for(int i=start_word; i<=end_word; i++) {
- if (target_annotation[i] != current_label) {
- if (current_label != UNANNOTATED && i!=start_word)
- *out << "</span>";
- if (target_annotation[i] != UNANNOTATED)
- *out << "<span class=\""
- << label_class[ target_annotation[i] ]
- << "\">";
- current_label = target_annotation[i];
- }
+ // output with markup
+ *out << "</td><td class=\"mismatch_target\">";
+ current_label = UNANNOTATED;
+ if (start_word>0) {
+ current_label = target_annotation[start_word-1];
+ *out << "... ";
+ }
+ for(int i=start_word; i<=end_word; i++) {
+ if (target_annotation[i] != current_label) {
+ if (current_label != UNANNOTATED && i!=start_word)
+ *out << "</span>";
+ if (target_annotation[i] != UNANNOTATED)
+ *out << "<span class=\""
+ << label_class[ target_annotation[i] ]
+ << "\">";
+ current_label = target_annotation[i];
+ }
- // output word
- *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
- }
+ // output word
+ *out << m_targetCorpus->GetWord( m_sentence_id, i ) << " ";
+ }
- if (current_label != UNANNOTATED && end_word>target_end)
- *out << "</span>";
- if (end_word<m_target_length-1)
- *out << "... ";
- *out << "</td></tr>";
+ if (current_label != UNANNOTATED && end_word>target_end)
+ *out << "</span>";
+ if (end_word<m_target_length-1)
+ *out << "... ";
+ *out << "</td></tr>";
}
-void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label ) {
- for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
- if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
- source_annotation[ source_id ] = label;
- target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
- }
- }
+void Mismatch::LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label )
+{
+ for(INDEX ap=0; ap<m_num_alignment_points; ap++) {
+ if (m_alignment->GetSourceWord( m_sentence_id, ap ) == source_id) {
+ source_annotation[ source_id ] = label;
+ target_annotation[ m_alignment->GetTargetWord( m_sentence_id, ap ) ] = label;
+ }
+ }
}
diff --git a/biconcor/Mismatch.h b/biconcor/Mismatch.h
index c0063d049..1277ed95a 100644
--- a/biconcor/Mismatch.h
+++ b/biconcor/Mismatch.h
@@ -34,7 +34,9 @@ public:
Mismatch( SuffixArray *sa, TargetCorpus *tc, Alignment *a, INDEX sentence_id, INDEX position, int source_length, int target_length, int source_start, int source_end );
~Mismatch();
- bool Unaligned() const { return m_unaligned; }
+ bool Unaligned() const {
+ return m_unaligned;
+ }
void PrintClippedHTML(std::ostream* out, int width );
void LabelSourceMatches(int *source_annotation, int *target_annotation, int source_id, int label );
};
diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp
index 038fa3a31..b6409258b 100644
--- a/biconcor/PhrasePair.cpp
+++ b/biconcor/PhrasePair.cpp
@@ -37,7 +37,7 @@ void PhrasePair::Print( ostream* out ) const
INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
for( INDEX i=0; i<ap_points; i++) {
*out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
- << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
+ << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
}
*out << endl;
@@ -185,27 +185,27 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
size_t source_pre_width = (source_width-source.size())/2;
size_t source_post_width = (source_width-source.size()+1)/2;
- // if phrase is too long, don't show any context
+ // if phrase is too long, don't show any context
if (source.size() > (size_t)width) {
source_pre_width = 0;
source_post_width = 0;
}
- // too long -> truncate and add "..."
+ // too long -> truncate and add "..."
if (source_pre.size() > source_pre_width) {
- // first skip up to a space
- while(source_pre_width>0 &&
- source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
- source_pre_width--;
- }
+ // first skip up to a space
+ while(source_pre_width>0 &&
+ source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
+ source_pre_width--;
+ }
source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
- }
+ }
if (source_post.size() > source_post_width) {
- while(source_post_width>0 &&
- source_post.substr(source_post_width-1,1) != " ") {
- source_post_width--;
- }
+ while(source_post_width>0 &&
+ source_post.substr(source_post_width-1,1) != " ") {
+ source_post_width--;
+ }
source_post = source_post.substr( 0, source_post_width ) + "...";
- }
+ }
*out << "<tr><td class=\"pp_source_left\">"
<< source_pre
@@ -220,13 +220,13 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
string target_pre = "";
string target = "";
string target_post = "";
- size_t target_pre_null_width = 0;
- size_t target_post_null_width = 0;
+ size_t target_pre_null_width = 0;
+ size_t target_post_null_width = 0;
for( char i=0; i<m_target_start; i++ ) {
- WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
+ WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_pre += " " + word;
- if (i >= m_target_start-m_pre_null)
- target_pre_null_width += word.size() + 1;
+ if (i >= m_target_start-m_pre_null)
+ target_pre_null_width += word.size() + 1;
}
for( char i=m_target_start; i<=m_target_end; i++ ) {
if (i>m_target_start) target += " ";
@@ -234,11 +234,11 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
}
for( char i=m_target_end+1; i<m_target_length; i++ ) {
if (i>m_target_end+1) target_post += " ";
- WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
+ WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
target_post += word;
- if (i-(m_target_end+1) < m_post_null) {
- target_post_null_width += word.size() + 1;
- }
+ if (i-(m_target_end+1) < m_post_null) {
+ target_post_null_width += word.size() + 1;
+ }
}
size_t target_pre_width = (target_width-target.size())/2;
@@ -249,46 +249,45 @@ void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
target_post_width = 0;
}
- if (target_pre.size() < target_pre_width)
- target_pre_width = target_pre.size();
- else {
- while(target_pre_width>0 &&
- target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
- target_pre_width--;
- }
+ if (target_pre.size() < target_pre_width)
+ target_pre_width = target_pre.size();
+ else {
+ while(target_pre_width>0 &&
+ target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
+ target_pre_width--;
+ }
target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
- }
-
- if (target_post.size() < target_post_width) {
- target_post_width = target_post.size();
- }
- else {
- while(target_post_width>0 &&
- target_post.substr(target_post_width-1,1) != " ") {
- target_post_width--;
- }
- target_post = target_post.substr( 0, target_post_width ) + "...";
- }
-
- if (m_pre_null) {
- //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
- if (target_pre_width < target_pre.size())
- target_pre_null_width -= target_pre.size()-target_pre_width;
- target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
- + "<span class=\"null_aligned\">"
- + target_pre.substr(target_pre_width-target_pre_null_width)
- + "</span>";
- }
- if (m_post_null) {
- //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
- if (target_post_null_width > target_post.size()) {
- target_post_null_width = target_post.size();
- }
- target_post = "<span class=\"null_aligned\">"
- + target_post.substr(0,target_post_null_width)
- + "</span>"
- + target_post.substr(target_post_null_width);
- }
+ }
+
+ if (target_post.size() < target_post_width) {
+ target_post_width = target_post.size();
+ } else {
+ while(target_post_width>0 &&
+ target_post.substr(target_post_width-1,1) != " ") {
+ target_post_width--;
+ }
+ target_post = target_post.substr( 0, target_post_width ) + "...";
+ }
+
+ if (m_pre_null) {
+ //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
+ if (target_pre_width < target_pre.size())
+ target_pre_null_width -= target_pre.size()-target_pre_width;
+ target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
+ + "<span class=\"null_aligned\">"
+ + target_pre.substr(target_pre_width-target_pre_null_width)
+ + "</span>";
+ }
+ if (m_post_null) {
+ //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
+ if (target_post_null_width > target_post.size()) {
+ target_post_null_width = target_post.size();
+ }
+ target_post = "<span class=\"null_aligned\">"
+ + target_post.substr(0,target_post_null_width)
+ + "</span>"
+ + target_post.substr(target_post_null_width);
+ }
*out << "<td class=\"pp_target_left\">"
<< target_pre
diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp
index 7497b2af8..dd21faad3 100644
--- a/biconcor/PhrasePairCollection.cpp
+++ b/biconcor/PhrasePairCollection.cpp
@@ -47,15 +47,15 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
//cerr << "match " << (i-first_match)
- //<< " in sentence " << sentence_id
- //<< ", starting at word " << source_start
- //<< " of " << sentence_length
- //<< ". target sentence has " << target_length << " words.";
+ //<< " in sentence " << sentence_id
+ //<< ", starting at word " << source_start
+ //<< " of " << sentence_length
+ //<< ". target sentence has " << target_length << " words.";
int target_start, target_end, pre_null, post_null;
if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
//cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
//cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
- bool null_boundary_words = false;
+ bool null_boundary_words = false;
for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
vector< WORD_ID > targetString;
@@ -75,19 +75,18 @@ int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
m_size++;
}
}
+ } else {
+ //cerr << "mismatch " << (i-first_match)
+ // << " in sentence " << sentence_id
+ // << ", starting at word " << source_start
+ // << " of " << sentence_length
+ // << ". target sentence has " << target_length << " words.";
+ Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
+ if (mismatch->Unaligned())
+ m_unaligned.push_back( mismatch );
+ else
+ m_mismatch.push_back( mismatch );
}
- else {
- //cerr << "mismatch " << (i-first_match)
- // << " in sentence " << sentence_id
- // << ", starting at word " << source_start
- // << " of " << sentence_length
- // << ". target sentence has " << target_length << " words.";
- Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
- if (mismatch->Unaligned())
- m_unaligned.push_back( mismatch );
- else
- m_mismatch.push_back( mismatch );
- }
//cerr << endl;
if (found > (INDEX)m_max_lookup) {
@@ -111,8 +110,7 @@ void PhrasePairCollection::Print(bool pretty) const
for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
if (pretty) {
(*p)->PrintPretty( &cout, 100 );
- }
- else {
+ } else {
(*p)->Print( &cout );
}
if (ppWithSameTarget->size() > m_max_example) {
@@ -125,33 +123,32 @@ void PhrasePairCollection::Print(bool pretty) const
void PhrasePairCollection::PrintHTML() const
{
int pp_target = 0;
- bool singleton = false;
- // loop over all translations
+ bool singleton = false;
+ // loop over all translations
vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
- int count = ppWithSameTarget->size();
- if (!singleton) {
- if (count == 1) {
- singleton = true;
- cout << "<p class=\"pp_singleton_header\">singleton"
- << (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
- << (m_collection.end() - ppWithSameTarget)
- << "/" << m_size << ")</p>";
- }
- else {
- cout << "<p class=\"pp_target_header\">";
- (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
- cout << " (" << count << "/" << m_size << ")" << endl;
- cout << "<p><div id=\"pp_" << pp_target << "\">";
- }
- cout << "<table align=\"center\">";
- }
+ int count = ppWithSameTarget->size();
+ if (!singleton) {
+ if (count == 1) {
+ singleton = true;
+ cout << "<p class=\"pp_singleton_header\">singleton"
+ << (m_collection.end() - ppWithSameTarget==1?"":"s") << " ("
+ << (m_collection.end() - ppWithSameTarget)
+ << "/" << m_size << ")</p>";
+ } else {
+ cout << "<p class=\"pp_target_header\">";
+ (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
+ cout << " (" << count << "/" << m_size << ")" << endl;
+ cout << "<p><div id=\"pp_" << pp_target << "\">";
+ }
+ cout << "<table align=\"center\">";
+ }
vector< PhrasePair* >::const_iterator p;
- // loop over all sentences where translation occurs
+ // loop over all sentences where translation occurs
int pp=0;
- int i=0;
+ int i=0;
for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
(*p)->PrintClippedHTML( &cout, 160 );
if (count > m_max_example) {
@@ -159,54 +156,54 @@ void PhrasePairCollection::PrintHTML() const
pp += count/m_max_example-1;
}
}
- if (i == 10 && pp < count) {
- // extended table
- cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
- cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
- cout << "<table align=\"center\">";
- for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
- (*p)->PrintClippedHTML( &cout, 160 );
- if (count > m_max_example) {
- p += count/m_max_example-1;
- pp += count/m_max_example-1;
- }
- }
- }
- if (!singleton) cout << "</table></div>\n";
-
- if (!singleton && pp_target == 9) {
- cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
- cout << "<p class=\"pp_target_header\">(more)</p></div>";
- cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
- }
+ if (i == 10 && pp < count) {
+ // extended table
+ cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
+ cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
+ cout << "<table align=\"center\">";
+ for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
+ (*p)->PrintClippedHTML( &cout, 160 );
+ if (count > m_max_example) {
+ p += count/m_max_example-1;
+ pp += count/m_max_example-1;
+ }
+ }
+ }
+ if (!singleton) cout << "</table></div>\n";
+
+ if (!singleton && pp_target == 9) {
+ cout << "<div id=\"pp_toggle\" onclick=\"javascript:document.getElementById('pp_toggle').style.display = 'none'; document.getElementById('pp_additional').style.display = 'block';\">";
+ cout << "<p class=\"pp_target_header\">(more)</p></div>";
+ cout << "<div id=\"pp_additional\" style=\"display:none;\";\">";
+ }
+ }
+ if (singleton) cout << "</table></div>\n";
+ else if (pp_target > 9) cout << "</div>";
+
+ size_t max_mismatch = m_max_example/3;
+ // unaligned phrases
+ if (m_unaligned.size() > 0) {
+ cout << "<p class=\"pp_singleton_header\">unaligned"
+ << " (" << (m_unaligned.size()) << ")</p>";
+ cout << "<table align=\"center\">";
+ int step_size = 1;
+ if (m_unaligned.size() > max_mismatch)
+ step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
+ for(size_t i=0; i<m_unaligned.size(); i+=step_size)
+ m_unaligned[i]->PrintClippedHTML( &cout, 160 );
+ cout << "</table>";
+ }
+
+ // mismatched phrases
+ if (m_mismatch.size() > 0) {
+ cout << "<p class=\"pp_singleton_header\">mismatched"
+ << " (" << (m_mismatch.size()) << ")</p>";
+ cout << "<table align=\"center\">";
+ int step_size = 1;
+ if (m_mismatch.size() > max_mismatch)
+ step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
+ for(size_t i=0; i<m_mismatch.size(); i+=step_size)
+ m_mismatch[i]->PrintClippedHTML( &cout, 160 );
+ cout << "</table>";
}
- if (singleton) cout << "</table></div>\n";
- else if (pp_target > 9) cout << "</div>";
-
- size_t max_mismatch = m_max_example/3;
- // unaligned phrases
- if (m_unaligned.size() > 0) {
- cout << "<p class=\"pp_singleton_header\">unaligned"
- << " (" << (m_unaligned.size()) << ")</p>";
- cout << "<table align=\"center\">";
- int step_size = 1;
- if (m_unaligned.size() > max_mismatch)
- step_size = (m_unaligned.size()+max_mismatch-1) / max_mismatch;
- for(size_t i=0;i<m_unaligned.size();i+=step_size)
- m_unaligned[i]->PrintClippedHTML( &cout, 160 );
- cout << "</table>";
- }
-
- // mismatched phrases
- if (m_mismatch.size() > 0) {
- cout << "<p class=\"pp_singleton_header\">mismatched"
- << " (" << (m_mismatch.size()) << ")</p>";
- cout << "<table align=\"center\">";
- int step_size = 1;
- if (m_mismatch.size() > max_mismatch)
- step_size = (m_mismatch.size()+max_mismatch-1) / max_mismatch;
- for(size_t i=0;i<m_mismatch.size();i+=step_size)
- m_mismatch[i]->PrintClippedHTML( &cout, 160 );
- cout << "</table>";
- }
}
diff --git a/biconcor/SuffixArray.cpp b/biconcor/SuffixArray.cpp
index 15e6b47b0..f4122a2d8 100644
--- a/biconcor/SuffixArray.cpp
+++ b/biconcor/SuffixArray.cpp
@@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
-namespace {
+namespace
+{
const int LINE_MAX_LENGTH = 10000;
@@ -14,15 +15,15 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std;
SuffixArray::SuffixArray()
- : m_array(NULL),
- m_index(NULL),
- m_buffer(NULL),
- m_wordInSentence(NULL),
- m_sentence(NULL),
- m_sentenceLength(NULL),
- m_vcb(),
- m_size(0),
- m_sentenceCount(0) { }
+ : m_array(NULL),
+ m_index(NULL),
+ m_buffer(NULL),
+ m_wordInSentence(NULL),
+ m_sentence(NULL),
+ m_sentenceLength(NULL),
+ m_vcb(),
+ m_size(0),
+ m_sentenceCount(0) { }
SuffixArray::~SuffixArray()
{
diff --git a/biconcor/TargetCorpus.cpp b/biconcor/TargetCorpus.cpp
index d331a548a..06468007f 100644
--- a/biconcor/TargetCorpus.cpp
+++ b/biconcor/TargetCorpus.cpp
@@ -5,7 +5,8 @@
#include <stdlib.h>
#include <cstring>
-namespace {
+namespace
+{
const int LINE_MAX_LENGTH = 10000;
@@ -14,11 +15,11 @@ const int LINE_MAX_LENGTH = 10000;
using namespace std;
TargetCorpus::TargetCorpus()
- : m_array(NULL),
- m_sentenceEnd(NULL),
- m_vcb(),
- m_size(0),
- m_sentenceCount(0) {}
+ : m_array(NULL),
+ m_sentenceEnd(NULL),
+ m_vcb(),
+ m_size(0),
+ m_sentenceCount(0) {}
TargetCorpus::~TargetCorpus()
{
diff --git a/biconcor/Vocabulary.cpp b/biconcor/Vocabulary.cpp
index 9c35b3feb..9d52ee44e 100644
--- a/biconcor/Vocabulary.cpp
+++ b/biconcor/Vocabulary.cpp
@@ -2,7 +2,8 @@
#include "Vocabulary.h"
#include <fstream>
-namespace {
+namespace
+{
const int MAX_LENGTH = 10000;
diff --git a/biconcor/base64.cpp b/biconcor/base64.cpp
index 2a863d161..8032399b5 100644
--- a/biconcor/base64.cpp
+++ b/biconcor/base64.cpp
@@ -1,4 +1,4 @@
-/*
+/*
base64.cpp and base64.h
Copyright (C) 2004-2008 René Nyffenegger
@@ -28,17 +28,19 @@
#include "base64.h"
#include <iostream>
-static const std::string base64_chars =
- "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
- "abcdefghijklmnopqrstuvwxyz"
- "0123456789+/";
+static const std::string base64_chars =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+ "abcdefghijklmnopqrstuvwxyz"
+ "0123456789+/";
-static inline bool is_base64(unsigned char c) {
+static inline bool is_base64(unsigned char c)
+{
return (isalnum(c) || (c == '+') || (c == '/'));
}
-std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) {
+std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len)
+{
std::string ret;
int i = 0;
int j = 0;
@@ -59,8 +61,7 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
}
}
- if (i)
- {
+ if (i) {
for(j = i; j < 3; j++)
char_array_3[j] = '\0';
@@ -81,7 +82,8 @@ std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_
}
-std::string base64_decode(std::string const& encoded_string) {
+std::string base64_decode(std::string const& encoded_string)
+{
int in_len = encoded_string.size();
int i = 0;
int j = 0;
@@ -90,7 +92,8 @@ std::string base64_decode(std::string const& encoded_string) {
std::string ret;
while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
- char_array_4[i++] = encoded_string[in_]; in_++;
+ char_array_4[i++] = encoded_string[in_];
+ in_++;
if (i ==4) {
for (i = 0; i <4; i++)
char_array_4[i] = base64_chars.find(char_array_4[i]);
diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp
index f4e7c03fb..cb63e855d 100644
--- a/biconcor/biconcor.cpp
+++ b/biconcor/biconcor.cpp
@@ -150,22 +150,19 @@ int main(int argc, char* argv[])
cout << "TOTAL: " << total << endl;
if (htmlFlag) {
ppCollection.PrintHTML();
- }
- else {
- ppCollection.Print(prettyFlag);
+ } else {
+ ppCollection.Print(prettyFlag);
}
cout << "-|||- BICONCOR END -|||-" << endl << flush;
}
- }
- else if (queryFlag) {
+ } else if (queryFlag) {
cerr << "query is " << query << endl;
vector< string > queryString = alignment.Tokenize( query.c_str() );
PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
ppCollection.GetCollection( queryString );
if (htmlFlag) {
ppCollection.PrintHTML();
- }
- else {
+ } else {
ppCollection.Print(prettyFlag);
}
}