set source phrase in RuleTableLoader, read rule count info from phrase table

author: Eva Hasler <evahasler@gmail.com> 2012-04-10 18:45:18 +0400
committer: Eva Hasler <evahasler@gmail.com> 2012-04-10 18:45:18 +0400
commit: a729e2447df1be10cc71093a3d08a954d32811dc (patch)
tree: 202503544297a9c5c3eeca1771903e6c67fb377f /moses
parent: 2b9c250d3665b17ee8c9537db40e8b357ce4840c (diff)
4 files changed, 40 insertions, 2 deletions
diff --git a/moses/src/RuleTableLoaderCompact.cpp b/moses/src/RuleTableLoaderCompact.cpp
index 21d146bec..dce3382e7 100644
--- a/moses/src/RuleTableLoaderCompact.cpp
+++ b/moses/src/RuleTableLoaderCompact.cpp
@@ -226,6 +226,7 @@ bool RuleTableLoaderCompact::LoadRuleSection(
     targetPhrase->SetTargetLHS(targetLhs);
     targetPhrase->SetScoreChart(ruleTable.GetFeature(), scoreVector, weights,
                                 languageModels, wpProducer);
+    targetPhrase->SetSourcePhrase(sourcePhrase);
 
     // Insert rule into table.
     TargetPhraseCollection &coll = GetOrCreateTargetPhraseCollection(
diff --git a/moses/src/RuleTableLoaderStandard.cpp b/moses/src/RuleTableLoaderStandard.cpp
index 190241a13..dc4fb7235 100644
--- a/moses/src/RuleTableLoaderStandard.cpp
+++ b/moses/src/RuleTableLoaderStandard.cpp
@@ -185,7 +185,8 @@ bool RuleTableLoaderStandard::Load(FormatType format
     const string &sourcePhraseString = tokens[0]
                , &targetPhraseString = tokens[1]
                , &scoreString        = tokens[2]
-               , &alignString        = tokens[3];
+               , &alignString        = tokens[3]
+               , &ruleCountString    = tokens[4];
 
     bool isLHSEmpty = (sourcePhraseString.find_first_not_of(" \t", 0) == string::npos);
     if (isLHSEmpty && !staticData.IsWordDeletionEnabled()) {
@@ -216,10 +217,12 @@ bool RuleTableLoaderStandard::Load(FormatType format
     // create target phrase obj
     TargetPhrase *targetPhrase = new TargetPhrase(Output);
     targetPhrase->CreateFromStringNewFormat(Output, output, targetPhraseString, factorDelimiter, targetLHS);
+    targetPhrase->SetSourcePhrase(sourcePhrase);
 
     // rest of target phrase
     targetPhrase->SetAlignmentInfo(alignString);
     targetPhrase->SetTargetLHS(targetLHS);
+    targetPhrase->SetRuleCount(ruleCountString, scoreVector);
     //targetPhrase->SetDebugOutput(string("New Format pt ") + line);
 
     // component score, for n-best output
diff --git a/moses/src/TargetPhrase.cpp b/moses/src/TargetPhrase.cpp
index 8dae1c694..ce0623e5b 100644
--- a/moses/src/TargetPhrase.cpp
+++ b/moses/src/TargetPhrase.cpp
@@ -34,6 +34,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "Util.h"
 #include "DummyScoreProducers.h"
 #include "AlignmentInfoCollection.h"
+#include <boost/algorithm/string.hpp>
+
 
 using namespace std;
 
@@ -332,5 +334,34 @@ std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp)
   return os;
 }
 
+/** Derive the joint rule count from the phrase-table count field and store it in m_ruleCount.
+ *  \param ruleCountString blank/tab separated counts: "targetCount sourceCount [ruleCount]"
+ *  \param scoreVector translation scores; scoreVector[0] is read as p(f|e)
+ *  With 2 tokens the count is round(p(f|e) * targetCount); with 3 tokens the third token
+ *  is used directly; with any other number of tokens the count is set to 0. */
+void TargetPhrase::SetRuleCount(const StringPiece &ruleCountString, std::vector<float> &scoreVector) {
+  // only index scoreVector[0] when it exists; an empty vector yields p(f|e) = 0
+  const float p_f_given_e = scoreVector.empty() ? 0.0f : scoreVector[0];
+  if (scoreVector.size() < 4) {
+    std::cerr << "Warning: possibly wrong format of phrase translation scores" << endl;
+  }
+
+  std::vector<std::string> tokens;
+  boost::split(tokens, ruleCountString, boost::is_any_of("\t "));
+
+  // start from 0 so the member is well defined even if the field cannot be interpreted
+  double count = 0;
+  if (tokens.size() == 2) {
+    // reconstruct the joint count from p(f|e) and the target count, rounded to nearest
+    count = floor(p_f_given_e * Scan<float>(tokens[0]) + 0.5);
+  }
+  else if (tokens.size() == 3) {
+    count = Scan<float>(tokens[2]);
+  }
+
+  // converting a negative (or NaN) value to size_t is undefined, so clamp at 0
+  m_ruleCount = (count > 0) ? static_cast<size_t>(count) : 0;
+}
+
 }
 
diff --git a/moses/src/TargetPhrase.h b/moses/src/TargetPhrase.h
index f5e20271c..9226b2636 100644
--- a/moses/src/TargetPhrase.h
+++ b/moses/src/TargetPhrase.h
@@ -59,11 +59,12 @@ protected:
 	Phrase m_sourcePhrase; 
 	const AlignmentInfo* m_alignmentInfo;
 	Word m_lhsTarget;
+	size_t m_ruleCount;
 
 public:
   TargetPhrase();
   TargetPhrase(std::string out_string);
-  TargetPhrase(const Phrase &);
+  TargetPhrase(const Phrase &targetPhrase);
   ~TargetPhrase();
 
   //! used by the unknown word handler- these targets
@@ -159,6 +160,8 @@ public:
 	const AlignmentInfo &GetAlignmentInfo() const
 	{ return *m_alignmentInfo; }
 	
+	void SetRuleCount(const StringPiece &ruleCountString, std::vector<float> &scoreVector); //!< derives m_ruleCount from the phrase-table count field and the scores
+	size_t GetRuleCount() const { return m_ruleCount; } //!< returns the stored m_ruleCount
 
   TO_STRING();
author	Eva Hasler <evahasler@gmail.com>	2012-04-10 18:45:18 +0400
committer	Eva Hasler <evahasler@gmail.com>	2012-04-10 18:45:18 +0400
commit	a729e2447df1be10cc71093a3d08a954d32811dc (patch)
tree	202503544297a9c5c3eeca1771903e6c67fb377f /moses
parent	2b9c250d3665b17ee8c9537db40e8b357ce4840c (diff)