Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <fishandfrolick@gmail.com>2012-06-01 02:32:24 +0400
committerHieu Hoang <fishandfrolick@gmail.com>2012-06-01 02:32:24 +0400
commit417d5ee57ba980b4a88031550163a218bf9c5a1c (patch)
tree29662a0db328b4fc5fa3ece97cb5bbf931d79fee /biconcor/PhrasePair.cpp
parent48faedb3bdfe1507d19cf2b76fbc81f3be2ff244 (diff)
move biconcor to /
Diffstat (limited to 'biconcor/PhrasePair.cpp')
-rw-r--r--biconcor/PhrasePair.cpp266
1 files changed, 266 insertions, 0 deletions
diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp
new file mode 100644
index 000000000..9c16be77c
--- /dev/null
+++ b/biconcor/PhrasePair.cpp
@@ -0,0 +1,266 @@
+#include "PhrasePair.h"
+
+#include <iostream>
+#include "TargetCorpus.h"
+#include "Alignment.h"
+#include "Vocabulary.h"
+#include "SuffixArray.h"
+
+using namespace std;
+
+void PhrasePair::Print( ostream* out, int width ) const
+{
+ vector< WORD_ID >::const_iterator t;
+
+ // source
+ int sentence_start = m_source_position - m_source_start;
+ size_t source_width = (width-3)/2;
+ string source_pre = "";
+ string source = "";
+ string source_post = "";
+ for( size_t space=0; space<source_width/2; space++ ) source_pre += " ";
+ for( char i=0; i<m_source_start; i++ ) {
+ source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
+ }
+ for( char i=m_source_start; i<=m_source_end; i++ ) {
+ if (i>m_source_start) source += " ";
+ source += m_suffixArray->GetWord( sentence_start + i );
+ }
+ char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
+ for( char i=m_source_end+1; i<source_length; i++ ) {
+ if (i>m_source_end+1) source_post += " ";
+ source_post += m_suffixArray->GetWord( sentence_start + i );
+ }
+ for( size_t space=0; space<source_width/2; space++ ) source_post += " ";
+
+ size_t source_pre_width = (source_width-source.size()-2)/2;
+ size_t source_post_width = (source_width-source.size()-2+1)/2;
+
+ if (source.size() > (size_t)width) {
+ source_pre_width = 0;
+ source_post_width = 0;
+ }
+
+ *out << source_pre.substr( source_pre.size()-source_pre_width, source_pre_width ) << " "
+ << source.substr( 0, source_width -2 ) << " "
+ << source_post.substr( 0, source_post_width ) << " | ";
+
+ // target
+ size_t target_width = (width-3)/2;
+ string target_pre = "";
+ string target = "";
+ string target_post = "";
+ for( size_t space=0; space<target_width/2; space++ ) target_pre += " ";
+ for( char i=0; i<m_target_start; i++ ) {
+ target_pre += " " + m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+ for( char i=m_target_start; i<=m_target_end; i++ ) {
+ if (i>m_target_start) target += " ";
+ target += m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+ for( char i=m_target_end+1; i<m_target_length; i++ ) {
+ if (i>m_target_end+1) target_post += " ";
+ target_post += m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+
+ size_t target_pre_width = (target_width-target.size()-2)/2;
+ size_t target_post_width = (target_width-target.size()-2+1)/2;
+
+ if (target.size() > (size_t)width) {
+ target_pre_width = 0;
+ target_post_width = 0;
+ }
+
+ *out << target_pre.substr( target_pre.size()-target_pre_width, target_pre_width ) << " "
+ << target.substr( 0, target_width -2 ) << " "
+ << target_post.substr( 0, target_post_width ) << endl;
+}
+
+void PhrasePair::PrintTarget( ostream* out ) const
+{
+ for( char i=m_target_start; i<=m_target_end; i++ ) {
+ if (i>m_target_start) *out << " ";
+ *out << m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+}
+
+void PhrasePair::PrintHTML( ostream* out ) const
+{
+ // source
+ int sentence_start = m_source_position - m_source_start;
+ char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
+
+ *out << "<tr><td align=right class=\"pp_source_left\">";
+ for( char i=0; i<m_source_start; i++ ) {
+ if (i>0) *out << " ";
+ *out << m_suffixArray->GetWord( sentence_start + i );
+ }
+ *out << "</td><td class=\"pp_source\">";
+ for( char i=m_source_start; i<=m_source_end; i++ ) {
+ if (i>m_source_start) *out << " ";
+ *out << m_suffixArray->GetWord( sentence_start + i );
+ }
+ *out << "</td><td class=\"pp_source_right\">";
+ for( char i=m_source_end+1; i<source_length; i++ ) {
+ if (i>m_source_end+1) *out << " ";
+ *out << m_suffixArray->GetWord( sentence_start + i );
+ }
+
+ // target
+ *out << "</td><td class=\"pp_target_left\">";
+ for( char i=0; i<m_target_start; i++ ) {
+ if (i>0) *out << " ";
+ *out << m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+ *out << "</td><td class=\"pp_target\">";
+ for( char i=m_target_start; i<=m_target_end; i++ ) {
+ if (i>m_target_start) *out << " ";
+ *out << m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+ *out << "</td><td class=\"pp_target_right\">";
+ for( char i=m_target_end+1; i<m_target_length; i++ ) {
+ if (i>m_target_end+1) *out << " ";
+ *out << m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+ *out << "</td></tr>\n";
+}
+
+void PhrasePair::PrintClippedHTML( ostream* out, int width ) const
+{
+ vector< WORD_ID >::const_iterator t;
+
+ // source
+ int sentence_start = m_source_position - m_source_start;
+ size_t source_width = (width+1)/2;
+ string source_pre = "";
+ string source = "";
+ string source_post = "";
+ for( char i=0; i<m_source_start; i++ ) {
+ source_pre += " " + m_suffixArray->GetWord( sentence_start + i );
+ }
+ for( char i=m_source_start; i<=m_source_end; i++ ) {
+ if (i>m_source_start) source += " ";
+ source += m_suffixArray->GetWord( sentence_start + i );
+ }
+ char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
+ for( char i=m_source_end+1; i<source_length; i++ ) {
+ if (i>m_source_end+1) source_post += " ";
+ source_post += m_suffixArray->GetWord( sentence_start + i );
+ }
+ size_t source_pre_width = (source_width-source.size())/2;
+ size_t source_post_width = (source_width-source.size()+1)/2;
+
+ // if phrase is too long, don't show any context
+ if (source.size() > (size_t)width) {
+ source_pre_width = 0;
+ source_post_width = 0;
+ }
+ // too long -> truncate and add "..."
+ if (source_pre.size() > source_pre_width) {
+ // first skip up to a space
+ while(source_pre_width>0 &&
+ source_pre.substr(source_pre.size()-source_pre_width,1) != " ") {
+ source_pre_width--;
+ }
+ source_pre = "..." + source_pre.substr( source_pre.size()-source_pre_width, source_pre_width );
+ }
+ if (source_post.size() > source_post_width) {
+ while(source_post_width>0 &&
+ source_post.substr(source_post_width-1,1) != " ") {
+ source_post_width--;
+ }
+ source_post = source_post.substr( 0, source_post_width ) + "...";
+ }
+
+ *out << "<tr><td class=\"pp_source_left\">"
+ << source_pre
+ << "</td><td class=\"pp_source\">"
+ << source.substr( 0, source_width -2 )
+ << "</td><td class=\"pp_source_right\">"
+ << source_post
+ << "</td>";
+
+ // target
+ size_t target_width = width/2;
+ string target_pre = "";
+ string target = "";
+ string target_post = "";
+ size_t target_pre_null_width = 0;
+ size_t target_post_null_width = 0;
+ for( char i=0; i<m_target_start; i++ ) {
+ WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
+ target_pre += " " + word;
+ if (i >= m_target_start-m_pre_null)
+ target_pre_null_width += word.size() + 1;
+ }
+ for( char i=m_target_start; i<=m_target_end; i++ ) {
+ if (i>m_target_start) target += " ";
+ target += m_targetCorpus->GetWord( m_sentence_id, i);
+ }
+ for( char i=m_target_end+1; i<m_target_length; i++ ) {
+ if (i>m_target_end+1) target_post += " ";
+ WORD word = m_targetCorpus->GetWord( m_sentence_id, i);
+ target_post += word;
+ if (i-(m_target_end+1) < m_post_null) {
+ target_post_null_width += word.size() + 1;
+ }
+ }
+
+ size_t target_pre_width = (target_width-target.size())/2;
+ size_t target_post_width = (target_width-target.size()+1)/2;
+
+ if (target.size() > (size_t)width) {
+ target_pre_width = 0;
+ target_post_width = 0;
+ }
+
+ if (target_pre.size() < target_pre_width)
+ target_pre_width = target_pre.size();
+ else {
+ while(target_pre_width>0 &&
+ target_pre.substr(target_pre.size()-target_pre_width,1) != " ") {
+ target_pre_width--;
+ }
+ target_pre = "..." + target_pre.substr( target_pre.size()-target_pre_width, target_pre_width );
+ }
+
+ if (target_post.size() < target_post_width) {
+ target_post_width = target_post.size();
+ }
+ else {
+ while(target_post_width>0 &&
+ target_post.substr(target_post_width-1,1) != " ") {
+ target_post_width--;
+ }
+ target_post = target_post.substr( 0, target_post_width ) + "...";
+ }
+
+ if (m_pre_null) {
+ //cerr << endl << "target_pre_width=" << target_pre_width << ", target_pre_null_width=" << target_pre_null_width << ", target_pre.size()=" << target_pre.size() << endl;
+ if (target_pre_width < target_pre.size())
+ target_pre_null_width -= target_pre.size()-target_pre_width;
+ target_pre = target_pre.substr(0,target_pre_width-target_pre_null_width)
+ + "<span class=\"null_aligned\">"
+ + target_pre.substr(target_pre_width-target_pre_null_width)
+ + "</span>";
+ }
+ if (m_post_null) {
+ //cerr << endl << "target_post_width=" << target_post_width << ", target_post_null_width=" << target_post_null_width << ", target_post.size()=" << target_post.size() << endl;
+ if (target_post_null_width > target_post.size()) {
+ target_post_null_width = target_post.size();
+ }
+ target_post = "<span class=\"null_aligned\">"
+ + target_post.substr(0,target_post_null_width)
+ + "</span>"
+ + target_post.substr(target_post_null_width);
+ }
+
+ *out << "<td class=\"pp_target_left\">"
+ << target_pre
+ << "</td><td class=\"pp_target\">"
+ << target.substr( 0, target_width -2 )
+ << "</td><td class=\"pp_target_right\">"
+ << target_post
+ << "</td></tr>"<< endl;
+}
+