Support for using factors in mert and evaluator

example: Use --factors "0|2" to use only the first and third factors from the n-best list and from the reference. If you use an interpolated scorer, separate the per-scorer specifications with commas (e.g. --factors "0|2,1").
author: Matous Machacek <machacekmatous@gmail.com> 2012-02-28 05:27:23 +0400
committer: Matous Machacek <machacekmatous@gmail.com> 2012-02-28 05:27:23 +0400
commit: ba987c94ba9be5e7c8eb9c3e7c83d8f971fbd3aa (patch)
tree: 4140bd7b8fcef62012d2fe59956dcc81bb25c2bd /mert
parent: e38cd12ef3385304a4b363ca9b9ab16ed886a2ff (diff)
10 files changed, 123 insertions, 7 deletions
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index cf2b84242..baf8a0f8b 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -139,6 +139,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
     string line;
     size_t sid = 0; //sentence counter
     while (getline(refin,line)) {
+      line = this->applyFactors(line);
       if (i == 0) {
         NgramCounts *counts = new NgramCounts; //these get leaked
         m_ref_counts.push_back(counts);
@@ -183,8 +184,9 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
   }
   NgramCounts testcounts;
   // stats for this line
-  vector<ScoreStatsType> stats(kLENGTH * 2);;
-  const size_t length = countNgrams(text, testcounts, kLENGTH);
+  vector<ScoreStatsType> stats(kLENGTH * 2);
+  string sentence = this->applyFactors(text);
+  const size_t length = countNgrams(sentence, testcounts, kLENGTH);
 
   // Calculate effective reference length.
   switch (m_ref_length_type) {
diff --git a/mert/CderScorer.cpp b/mert/CderScorer.cpp
index 2105820e9..424c210b3 100644
--- a/mert/CderScorer.cpp
+++ b/mert/CderScorer.cpp
@@ -31,6 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
     m_ref_sentences.push_back(vector<sent_t>());
     string line;
     while (getline(refin,line)) {
+      line = this->applyFactors(line);
       sent_t encoded;
       TokenizeAndEncode(line, encoded);
       m_ref_sentences[rid].push_back(encoded);
@@ -40,8 +41,10 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
 
 void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
 {
+  string sentence = this->applyFactors(text);
+
   vector<int> stats;
-  prepareStatsVector(sid, text, stats);
+  prepareStatsVector(sid, sentence, stats);
   entry.set(stats);
 }
 
diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
index 5ba410539..1951e4234 100644
--- a/mert/InterpolatedScorer.cpp
+++ b/mert/InterpolatedScorer.cpp
@@ -159,3 +159,24 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
   string str = buff.str();
   entry.set(str);
 }
+
+// Forward each comma-separated factor specification to the scorer at the same position.
+void InterpolatedScorer::setFactors(const string& factors)
+{
+  if (factors.empty()) {
+    return;
+  }
+  vector<string> specs;
+  split(factors, ',', specs);
+  const size_t scorer_count = _scorers.size();
+  if (specs.size() != scorer_count) {
+    throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
+  }
+  for (size_t k = 0; k != scorer_count; ++k) _scorers[k]->setFactors(specs[k]);
+}
+
+
+
+
+
+
diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h
index d8eb87e3f..2a538bc39 100644
--- a/mert/InterpolatedScorer.h
+++ b/mert/InterpolatedScorer.h
@@ -42,6 +42,11 @@ public:
 
   virtual void setScoreData(ScoreData* data);
 
+  /**
+   * Set the factors, which should be used for this metric
+   */
+  virtual void setFactors(const string& factors);
+
 protected:
   ScopedVector<Scorer> _scorers;
 
diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp
index 76c2765dd..06a83bd2f 100644
--- a/mert/PerScorer.cpp
+++ b/mert/PerScorer.cpp
@@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
   string line;
   int sid = 0;
   while (getline(in,line)) {
+    line = this->applyFactors(line);
     vector<int> tokens;
     TokenizeAndEncode(line, tokens);
     m_ref_tokens.push_back(multiset<int>());
@@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
     msg << "Sentence id (" << sid << ") not found in reference set";
     throw runtime_error(msg.str());
   }
+
+  string sentence = this->applyFactors(text);
+
   // Calculate correct, output_length and ref_length for
   // the line and store it in entry
   vector<int> testtokens;
-  TokenizeAndEncode(text, testtokens);
+  TokenizeAndEncode(sentence, testtokens);
   multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
   set<int> testtokens_unique(testtokens.begin(),testtokens.end());
   int correct = 0;
diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp
index a2bb4720c..6b36c8f14 100644
--- a/mert/Scorer.cpp
+++ b/mert/Scorer.cpp
@@ -1,5 +1,6 @@
 #include "Scorer.h"
 #include <limits>
+#include "Util.h"
 
 namespace {
 
@@ -95,6 +96,55 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
   }
 }
 
+/** Parse '|'-separated factor indices (e.g. "0|2") and append them to m_factors.
+ *  Empty input is a no-op; a non-numeric, negative or too large index throws runtime_error. */
+void Scorer::setFactors(const string& factors)
+{
+  if (factors.empty()) return;
+  vector<string> factors_vec;
+  split(factors, '|', factors_vec);
+  for (vector<string>::const_iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) {
+    char* end = NULL;
+    const long factor = strtol(it->c_str(), &end, 10);
+    if (end == it->c_str() || *end != '\0' || factor < 0 || factor > numeric_limits<int>::max()) throw runtime_error("Invalid factor index: " + *it);
+    m_factors.push_back(static_cast<int>(factor));
+  }
+}
+
+/**
+ * Take the factored sentence and return only the factors selected through
+ * setFactors(), e.g. "a|b|c d|e|f" with factors 0 and 2 becomes "a|c d|f".
+ * The sentence is returned unchanged when no factors were configured.
+ * Throws runtime_error if a token lacks one of the selected factors.
+ */
+string Scorer::applyFactors(const string& sentence)
+{
+  if (m_factors.empty()) return sentence;
+  vector<string> tokens;
+  split(sentence, ' ', tokens);
+  stringstream sstream;
+  bool first_token = true;
+  for (size_t i = 0; i < tokens.size(); ++i)
+  {
+    // Skip empty tokens so they contribute nothing to the output.
+    if (tokens[i].empty()) continue;
+    vector<string> factors;
+    split(tokens[i], '|', factors);
+    // Separate on tokens actually written, so skipped leading empty
+    // tokens do not produce a leading space in the result.
+    if (!first_token) sstream << " ";
+    first_token = false;
+    for (size_t j = 0; j < m_factors.size(); ++j)
+    {
+      const int findex = m_factors[j];
+      if (findex < 0 || static_cast<size_t>(findex) >= factors.size()) throw runtime_error("Factor index is out of range.");
+      if (j > 0) sstream << "|";
+      sstream << factors[findex];
+    }
+  }
+  return sstream.str();
+}
+
 StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
     : Scorer(name,config) {
   //configure regularisation
diff --git a/mert/Scorer.h b/mert/Scorer.h
index ad44fab0e..880f4e228 100644
--- a/mert/Scorer.h
+++ b/mert/Scorer.h
@@ -97,6 +97,16 @@ class Scorer
     m_score_data = data;
   }
 
+  /**
+   * Set the factors, which should be used for this metric
+   */
+  virtual void setFactors(const string& factors);
+
+  /**
+   * Take the factored sentence and return the desired factors
+   */
+  virtual string applyFactors(const string& sentece);
+
  private:
   class Encoder {
    public:
@@ -114,6 +124,7 @@ class Scorer
   string m_name;
   Encoder* m_encoder;
   map<string, string> m_config;
+  vector<int> m_factors;
 
  protected:
   ScoreData* m_score_data;
diff --git a/mert/TerScorer.cpp b/mert/TerScorer.cpp
index e5e670cc2..608b82cde 100644
--- a/mert/TerScorer.cpp
+++ b/mert/TerScorer.cpp
@@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
     string line;
     int sid = 0;
     while ( getline ( in, line ) ) {
+      line = this->applyFactors(line);
       vector<int> tokens;
       TokenizeAndEncode(line, tokens);
       m_references.push_back ( tokens );
@@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
 
 void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
 {
+  string sentence = this->applyFactors(text);
 
   terAlignment result;
   result.numEdits = 0.0 ;
@@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
       averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
     }
     averageLength=averageLength/( double ) m_multi_references.size();
-    TokenizeAndEncode(text, testtokens);
+    TokenizeAndEncode(sentence, testtokens);
     terCalc * evaluation=new terCalc();
     evaluation->setDebugMode ( false );
     terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );
diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp
index 21556e9ee..3b2e0d61f 100644
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@@ -131,6 +131,7 @@ void usage()
   cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
   cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
   cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
+  cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
   cerr << "[--reference|-R] comma separated list of reference files" << endl;
   cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
   cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
@@ -164,6 +165,7 @@ static struct option long_options[] = {
   {"candidate", required_argument, 0, 'C'},
   {"bootstrap", required_argument, 0, 'b'},
   {"rseed", required_argument, 0, 'r'},
+  {"factors", required_argument, 0, 'f'},
   {"help", no_argument, 0, 'h'},
   {0, 0, 0, 0}
 };
@@ -174,6 +176,7 @@ struct ProgramOption {
   vector<string> scorer_configs;
   string reference;
   string candidate;
+  vector<string> scorer_factors;
   int bootstrap;
   int seed;
   bool has_seed;
@@ -190,11 +193,12 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
   int c;
   int option_index;
   int last_scorer_index = -1;
-  while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
+  while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:h", long_options, &option_index)) != -1) {
     switch(c) {
       case 's':
         opt->scorer_types.push_back(string(optarg));
         opt->scorer_configs.push_back(string(""));
+        opt->scorer_factors.push_back(string(""));
         last_scorer_index++;
         break;
       case 'c':
@@ -213,6 +217,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
         opt->seed = strtol(optarg, NULL, 10);
         opt->has_seed = true;
         break;
+      case 'f':
+        opt->scorer_factors[last_scorer_index] = string(optarg);
+        break;
       default:
         usage();
     }
@@ -223,6 +230,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
   {
     opt->scorer_types.push_back(string("BLEU"));
     opt->scorer_configs.push_back(string(""));
+    opt->scorer_factors.push_back(string(""));
   }
 }
 
@@ -268,6 +276,7 @@ int main(int argc, char** argv)
         for (size_t i = 0; i < option.scorer_types.size(); i++)
         {
             g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
+            g_scorer->setFactors(option.scorer_factors[i]);
             g_scorer->setReferenceFiles(refFiles);
             EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
             delete g_scorer;
diff --git a/mert/extractor.cpp b/mert/extractor.cpp
index 1c80211b9..0d38c2e25 100644
--- a/mert/extractor.cpp
+++ b/mert/extractor.cpp
@@ -26,6 +26,7 @@ void usage()
   cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
   cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
   cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
+  cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
   cerr << "[--reference|-r] comma separated list of reference files" << endl;
   cerr << "[--binary|-b] use binary output format (default to text )" << endl;
   cerr << "[--nbest|-n] the nbest file" << endl;
@@ -41,6 +42,7 @@ void usage()
 static struct option long_options[] = {
   {"sctype", required_argument, 0, 's'},
   {"scconfig", required_argument,0, 'c'},
+  {"factors", required_argument,0, 'f'},
   {"reference", required_argument, 0, 'r'},
   {"binary", no_argument, 0, 'b'},
   {"nbest", required_argument, 0, 'n'},
@@ -57,6 +59,7 @@ static struct option long_options[] = {
 struct ProgramOption {
   string scorerType;
   string scorerConfig;
+  string scorerFactors;
   string referenceFile;
   string nbestFile;
   string scoreDataFile;
@@ -69,6 +72,7 @@ struct ProgramOption {
   ProgramOption()
       : scorerType("BLEU"),
         scorerConfig(""),
+        scorerFactors(""),
         referenceFile(""),
         nbestFile(""),
         scoreDataFile("statscore.data"),
@@ -83,7 +87,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
   int c;
   int option_index;
 
-  while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
+  while ((c = getopt_long(argc, argv, "s:r:f:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
     switch (c) {
       case 's':
         opt->scorerType = string(optarg);
@@ -91,6 +95,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
       case 'c':
         opt->scorerConfig = string(optarg);
         break;
+      case 'f':
+        opt->scorerFactors = string(optarg);
+        break;
       case 'r':
         opt->referenceFile = string(optarg);
         break;
@@ -180,6 +187,8 @@ int main(int argc, char** argv)
 
     Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig);
 
+    scorer->setFactors(option.scorerFactors);
+
     // load references
     if (referenceFiles.size() > 0)
       scorer->setReferenceFiles(referenceFiles);
author	Matous Machacek <machacekmatous@gmail.com>	2012-02-28 05:27:23 +0400
committer	Matous Machacek <machacekmatous@gmail.com>	2012-02-28 05:27:23 +0400
commit	ba987c94ba9be5e7c8eb9c3e7c83d8f971fbd3aa (patch)
tree	4140bd7b8fcef62012d2fe59956dcc81bb25c2bd /mert
parent	e38cd12ef3385304a4b363ca9b9ab16ed886a2ff (diff)