extended display options for biconcor

author: phikoehn <pkoehn@inf.ed.ac.uk> 2013-04-05 14:26:00 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2013-04-05 14:26:00 +0400
commit: c016b6e04b9a0c6eddc423b0b839021f00599bbe (patch)
tree: 0f3d3bd32fb4d00d5c7d619db2a0d5027549b15f /biconcor/PhrasePairCollection.cpp
parent: ac82be3120ac0b143039e2e36dff8b8538bdcb68 (diff)
1 files changed, 49 insertions, 38 deletions
diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp
index 17c95d24a..7497b2af8 100644
--- a/biconcor/PhrasePairCollection.cpp
+++ b/biconcor/PhrasePairCollection.cpp
@@ -13,31 +13,32 @@
 
 using namespace std;
 
-PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a )
+PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example )
   :m_suffixArray(sa)
   ,m_targetCorpus(tc)
   ,m_alignment(a)
   ,m_size(0)
-  ,m_max_lookup(10000)
-  ,m_max_pp_target(50)
-  ,m_max_pp(50)
+  ,m_max_lookup(10000)          // maximum number of source occurrences sampled
+  ,m_max_translation(max_translation)    // max number of different distinct translations returned
+  ,m_max_example(max_example) // max number of examples returned for each distinct translation
 {}
 
 PhrasePairCollection::~PhrasePairCollection()
 {}
 
-bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
+int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
 {
   INDEX first_match, last_match;
   if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
-    return false;
+    return 0;
   }
-  cerr << "\tfirst match " << first_match << endl;
-  cerr << "\tlast match " << last_match << endl;
+  //cerr << "\tfirst match " << first_match << endl;
+  //cerr << "\tlast match " << last_match << endl;
 
   INDEX found = last_match - first_match +1;
 
   map< vector< WORD_ID >, INDEX > index;
+  int real_count = 0;
   for( INDEX i=first_match; i<=last_match; i++ ) {
     int position = m_suffixArray->GetPosition( i );
     int source_start = m_suffixArray->GetWordInSentence( position );
@@ -45,23 +46,23 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
     INDEX sentence_id = m_suffixArray->GetSentence( position );
     int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
     int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
-    cerr << "match " << (i-first_match)
-         << " in sentence " << sentence_id
-         << ", starting at word " << source_start
-         << " of " << sentence_length
-         << ". target sentence has " << target_length << " words.";
+    //cerr << "match " << (i-first_match)
+         //<< " in sentence " << sentence_id
+         //<< ", starting at word " << source_start
+         //<< " of " << sentence_length
+         //<< ". target sentence has " << target_length << " words.";
     int target_start, target_end, pre_null, post_null;
     if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
-      cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
-      cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
+      //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
+      //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
 			bool null_boundary_words = false;
       for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
         for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
           vector< WORD_ID > targetString;
-          cerr << "; ";
+          //cerr << "; ";
           for (int target = target_start - pre; target <= target_end + post; target++) {
             targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
-            cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
+            //cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
           }
           PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);
           // matchCollection.Add( sentence_id, )
@@ -76,37 +77,47 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
       }
     }
 		else {
-			cerr << "mismatch " << (i-first_match)
-					 << " in sentence " << sentence_id
-					 << ", starting at word " << source_start
-					 << " of " << sentence_length
-					 << ". target sentence has " << target_length << " words.";
+			//cerr << "mismatch " << (i-first_match)
+			//		 << " in sentence " << sentence_id
+			//		 << ", starting at word " << source_start
+			//		 << " of " << sentence_length
+			//		 << ". target sentence has " << target_length << " words.";
 			Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
 			if (mismatch->Unaligned())
 				m_unaligned.push_back( mismatch );
 			else
 				m_mismatch.push_back( mismatch );
 		}
-    cerr << endl;
+    //cerr << endl;
 
     if (found > (INDEX)m_max_lookup) {
       i += found/m_max_lookup-1;
     }
+    real_count++;
   }
   sort(m_collection.begin(), m_collection.end(), CompareBySize());
-  return true;
+  return real_count;
 }
 
-void PhrasePairCollection::Print() const
+void PhrasePairCollection::Print(bool pretty) const
 {
   vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
-  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) {
+  int i=0;
+  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && i<m_max_translation; i++, ppWithSameTarget++ ) {
     (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
     int count = ppWithSameTarget->size();
     cout << "(" << count << ")" << endl;
-    vector< PhrasePair* >::const_iterator p;
-    for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) {
-      (*p)->Print( &cout, 100 );
+    vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin();
+    for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
+      if (pretty) {
+        (*p)->PrintPretty( &cout, 100 );
+      }
+      else {
+        (*p)->Print( &cout );
+      }
+      if (ppWithSameTarget->size() > m_max_example) {
+        p += ppWithSameTarget->size()/m_max_example-1;
+      }
     }
   }
 }
@@ -117,7 +128,7 @@ void PhrasePairCollection::PrintHTML() const
 	bool singleton = false;
 	// loop over all translations
   vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
-  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
+  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
 
 		int count = ppWithSameTarget->size();
 		if (!singleton) {
@@ -143,9 +154,9 @@ void PhrasePairCollection::PrintHTML() const
 		int i=0;
     for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
       (*p)->PrintClippedHTML( &cout, 160 );
-      if (count > m_max_pp) {
-        p += count/m_max_pp-1;
-        pp += count/m_max_pp-1;
+      if (count > m_max_example) {
+        p += count/m_max_example-1;
+        pp += count/m_max_example-1;
       }
     }
 		if (i == 10 && pp < count) {			
@@ -153,11 +164,11 @@ void PhrasePairCollection::PrintHTML() const
 			cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
 			cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
 			cout << "<table align=\"center\">";
-			for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
+			for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
 				(*p)->PrintClippedHTML( &cout, 160 );
-				if (count > m_max_pp) {
-					p += count/m_max_pp-1;
-					pp += count/m_max_pp-1;
+				if (count > m_max_example) {
+					p += count/m_max_example-1;
+					pp += count/m_max_example-1;
 				}
 			}
 		}
@@ -172,7 +183,7 @@ void PhrasePairCollection::PrintHTML() const
 	if (singleton) cout << "</table></div>\n";
 	else if (pp_target > 9)	cout << "</div>";
 
-	size_t max_mismatch = m_max_pp/3;
+	size_t max_mismatch = m_max_example/3;
 	// unaligned phrases
 	if (m_unaligned.size() > 0) {
 		cout << "<p class=\"pp_singleton_header\">unaligned"
author	phikoehn <pkoehn@inf.ed.ac.uk>	2013-04-05 14:26:00 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2013-04-05 14:26:00 +0400
commit	c016b6e04b9a0c6eddc423b0b839021f00599bbe (patch)
tree	0f3d3bd32fb4d00d5c7d619db2a0d5027549b15f /biconcor/PhrasePairCollection.cpp
parent	ac82be3120ac0b143039e2e36dff8b8538bdcb68 (diff)