Print out span widths of non-terminals. Adds the extra argument --OutputNTLengths.

git-svn-id: https://mosesdecoder.svn.sourceforge.net/svnroot/mosesdecoder/trunk@4224 1f5c12ca-751b-0410-a591-d2e778427230
author: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2011-09-14 14:23:14 +0400
committer: hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230> 2011-09-14 14:23:14 +0400
commit: 149208ecbaed011a4555c55fcd5b828b4a9a8cc0 (patch)
tree: 355d3446d370a38777d2866c1c1d0c7aea526da2 /scripts
parent: d68274d2178196a4d1f9ffa3347336eec0e50f93 (diff)
3 files changed, 97 insertions, 3 deletions
diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp
index c6e6fb92c..6ff7cc97a 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.cpp
+++ b/scripts/training/phrase-extract/PhraseAlignment.cpp
@@ -7,6 +7,7 @@
  *
  */
 
+#include <sstream>
 #include "PhraseAlignment.h"
 #include "SafeGetline.h"
 #include "tables-core.h"
@@ -19,6 +20,59 @@ extern Vocabulary vcbS;
 
 extern bool hierarchicalFlag;
 
+//! Convert a string to a value of type T via operator>> on a std::stringstream. Used for reading floats, ints etc. from files. The stream state is not checked, so a failed extraction is not reported to the caller.
+template<typename T>
+inline T Scan(const std::string &input)
+{
+	std::stringstream stream(input);
+	T ret;
+	stream >> ret;
+	return ret;
+}
+
+
+//! Vector version of above: resizes output to input.size() and fills element i with Scan<T>(input[i]).
+template<typename T>
+inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
+{
+	output.resize(input.size());
+	for (size_t i = 0 ; i < input.size() ; i++)
+	{
+		output[i] = Scan<T>( input[i] );
+	}
+}
+
+//! Split str on any character in delimiters and append each non-empty token to output (output is not cleared first).
+inline void Tokenize(std::vector<std::string> &output
+                     , const std::string& str
+                     , const std::string& delimiters = " \t")
+{
+  // Skip delimiters at beginning; lastPos is the start of the first token (npos if there is none).
+  std::string::size_type lastPos = str.find_first_not_of(delimiters, 0);
+  // Find the first delimiter after that, i.e. the end of the first token (npos if it runs to the end).
+  std::string::size_type pos     = str.find_first_of(delimiters, lastPos);
+  
+  while (std::string::npos != pos || std::string::npos != lastPos) {
+    // Found a token [lastPos, pos), add it to the vector.
+    output.push_back(str.substr(lastPos, pos - lastPos));
+    // Skip the run of delimiters to reach the start of the next token.  Note the "not_of"
+    lastPos = str.find_first_not_of(delimiters, pos);
+    // Find the delimiter that ends the next token (npos if none follows).
+    pos = str.find_first_of(delimiters, lastPos);
+  }
+}
+
+// Typed version of above: tokenizes input into strings, then converts every token to T with Scan<T>; output is resized to the token count.
+template<typename T>
+inline void Tokenize( std::vector<T> &output
+										 , const std::string &input
+										 , const std::string& delimiters = " \t")
+{
+	std::vector<std::string> stringVector;
+	Tokenize(stringVector, input, delimiters);
+	return Scan<T>(output, stringVector );
+}
+
 // read in a phrase pair and store it
 void PhraseAlignment::create( char line[], int lineID )
 {
@@ -37,7 +91,6 @@ void PhraseAlignment::create( char line[], int lineID )
     else if (item == 2) { // target phrase
       phraseT.push_back( vcbT.storeIfNew( token[j] ) );
     }
-
     else if (item == 3) { // alignment
       int s,t;
       sscanf(token[j].c_str(), "%d-%d", &s, &t);
@@ -56,6 +109,9 @@ void PhraseAlignment::create( char line[], int lineID )
     } else if (item == 4) { // count
       sscanf(token[j].c_str(), "%f", &count);
     }
+    else if (item == 5) { // non-term lengths
+      addNTLength(token[j]);
+    }
   }
 
   createAlignVec(phraseS.size(), phraseT.size());
@@ -63,11 +119,28 @@ void PhraseAlignment::create( char line[], int lineID )
   if (item == 3) {
     count = 1.0;
   }
-  if (item < 3 || item > 4) {
+  if (item < 3 || item > 5) {
     cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
   }
 }
 
+void PhraseAlignment::addNTLength(const std::string &tok)
+{
+  vector< string > tokens;
+  // Parse one token of the form "<sourcePos>=<len>,<len>" and store it in m_ntLengths.
+  Tokenize(tokens, tok, "=");
+  assert(tokens.size() == 2);
+  // The position must index into the source phrase (only checked when asserts are enabled).
+  size_t sourcePos = Scan<size_t>(tokens[0]);
+  assert(sourcePos < phraseS.size());
+  // Exactly two comma-separated lengths are expected; presumably source and target span widths -- TODO confirm.
+  vector< size_t > ntLengths;
+  Tokenize<size_t>(ntLengths, tokens[1], ",");
+  assert(ntLengths.size() == 2);
+  // Record the pair; a repeated sourcePos overwrites the earlier entry.
+  m_ntLengths[sourcePos] = std::pair<size_t, size_t>(ntLengths[0], ntLengths[1]);
+}
+
 void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
 {
   // in case of no align info. always need align info, even if blank
diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h
index d3557e4a8..68b82563a 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.h
+++ b/scripts/training/phrase-extract/PhraseAlignment.h
@@ -11,6 +11,7 @@
 
 #include <vector>
 #include <set>
+#include <map>
 
 // data structure for a single phrase pair
 class PhraseAlignment
@@ -19,7 +20,10 @@ protected:
   PHRASE phraseS;
   PHRASE phraseT;
 
+  std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
+  
   void createAlignVec(size_t sourceSize, size_t targetSize);
+  void addNTLength(const std::string &tok);
 public:
   float count;
   std::vector< std::set<size_t> > alignedToT;
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp
index fbb27b944..5ae714aa4 100644
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@@ -65,6 +65,7 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment
 set<string> functionWordList;
 void loadFunctionWords( const char* fileNameFunctionWords );
 double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePair, map<size_t, pair<float, float> > &ntLengthsProb);
 
 LexicalTable lexTable;
 bool inverseFlag = false;
@@ -78,6 +79,7 @@ int negLogProb = 1;
 bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
+bool outputNTLengths = false;
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
@@ -148,6 +150,8 @@ int main(int argc, char* argv[])
       minCountHierarchical = atof(argv[++i]);
       cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
       minCountHierarchical -= 0.00001; // account for rounding
+    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
+      outputNTLengths = true;
     } else {
       cerr << "ERROR: unknown option " << argv[i] << endl;
       exit(1);
@@ -325,12 +329,17 @@ PhraseAlignment* findBestAlignment( vector< PhraseAlignment* > &phrasePair )
   return bestAlignment;
 }
 
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePair, map<size_t, pair<float, float> > &ntLengthsProb)
+{
+  // Stub: the body is empty, so ntLengthsProb is left unmodified.
+}
+
 void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
 {
   if (phrasePair.size() == 0) return;
 
   PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
-
+    
   // compute count
   float count = 0;
   for(size_t i=0; i<phrasePair.size(); i++) {
@@ -433,6 +442,14 @@ void outputPhrasePair( vector< PhraseAlignment* > &phrasePair, float totalCount,
   phraseTableFile << " ||| " << totalCount << " " << count;
   if (kneserNeyFlag) 
     phraseTableFile << " " << distinctCount;
+  
+  // nt lengths
+  map<size_t, pair<float, float> > ntLengthsProb;
+  if (outputNTLengths)
+  {
+    calcNTLengthProb(phrasePair, ntLengthsProb);
+  }
+  
   phraseTableFile << endl;
 }
author	hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>	2011-09-14 14:23:14 +0400
committer	hieuhoang1972 <hieuhoang1972@1f5c12ca-751b-0410-a591-d2e778427230>	2011-09-14 14:23:14 +0400
commit	149208ecbaed011a4555c55fcd5b828b4a9a8cc0 (patch)
tree	355d3446d370a38777d2866c1c1d0c7aea526da2 /scripts
parent	d68274d2178196a4d1f9ffa3347336eec0e50f93 (diff)