Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/mert
diff options
context:
space:
mode:
authorMatous Machacek <machacekmatous@gmail.com>2012-02-28 05:27:23 +0400
committerMatous Machacek <machacekmatous@gmail.com>2012-02-28 05:27:23 +0400
commitba987c94ba9be5e7c8eb9c3e7c83d8f971fbd3aa (patch)
tree4140bd7b8fcef62012d2fe59956dcc81bb25c2bd /mert
parente38cd12ef3385304a4b363ca9b9ab16ed886a2ff (diff)
Support for using factors in mert and evaluator
Example: use --factors "0|2" to use only the first and third factors from the n-best list and from the reference. If you use an interpolated scorer, separate the per-scorer factor specifications with commas (e.g. --factors "0|2,1").
Diffstat (limited to 'mert')
-rw-r--r--mert/BleuScorer.cpp6
-rw-r--r--mert/CderScorer.cpp5
-rw-r--r--mert/InterpolatedScorer.cpp21
-rw-r--r--mert/InterpolatedScorer.h5
-rw-r--r--mert/PerScorer.cpp6
-rw-r--r--mert/Scorer.cpp50
-rw-r--r--mert/Scorer.h11
-rw-r--r--mert/TerScorer.cpp4
-rw-r--r--mert/evaluator.cpp11
-rw-r--r--mert/extractor.cpp11
10 files changed, 123 insertions, 7 deletions
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index cf2b84242..baf8a0f8b 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -139,6 +139,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
size_t sid = 0; //sentence counter
while (getline(refin,line)) {
+ line = this->applyFactors(line);
if (i == 0) {
NgramCounts *counts = new NgramCounts; //these get leaked
m_ref_counts.push_back(counts);
@@ -183,8 +184,9 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
}
NgramCounts testcounts;
// stats for this line
- vector<ScoreStatsType> stats(kLENGTH * 2);;
- const size_t length = countNgrams(text, testcounts, kLENGTH);
+ vector<ScoreStatsType> stats(kLENGTH * 2);
+ string sentence = this->applyFactors(text);
+ const size_t length = countNgrams(sentence, testcounts, kLENGTH);
// Calculate effective reference length.
switch (m_ref_length_type) {
diff --git a/mert/CderScorer.cpp b/mert/CderScorer.cpp
index 2105820e9..424c210b3 100644
--- a/mert/CderScorer.cpp
+++ b/mert/CderScorer.cpp
@@ -31,6 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
m_ref_sentences.push_back(vector<sent_t>());
string line;
while (getline(refin,line)) {
+ line = this->applyFactors(line);
sent_t encoded;
TokenizeAndEncode(line, encoded);
m_ref_sentences[rid].push_back(encoded);
@@ -40,8 +41,10 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
+ string sentence = this->applyFactors(text);
+
vector<int> stats;
- prepareStatsVector(sid, text, stats);
+ prepareStatsVector(sid, sentence, stats);
entry.set(stats);
}
diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
index 5ba410539..1951e4234 100644
--- a/mert/InterpolatedScorer.cpp
+++ b/mert/InterpolatedScorer.cpp
@@ -159,3 +159,24 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
string str = buff.str();
entry.set(str);
}
+
+void InterpolatedScorer::setFactors(const string& factors)
+{ // Hands each interpolated sub-scorer its own factor specification taken from `factors`.
+ if (factors.empty()) return; // empty specification: no sub-scorer is configured
+
+ vector<string> fsplit;
+ split(factors, ',', fsplit); // presumably yields one comma-separated entry per sub-scorer (split is defined elsewhere)
+
+ if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
+
+ for (size_t i = 0; i < _scorers.size(); ++i)
+ {
+ _scorers[i]->setFactors(fsplit[i]); // entry i configures scorer i
+ }
+}
+
+
+
+
+
+
diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h
index d8eb87e3f..2a538bc39 100644
--- a/mert/InterpolatedScorer.h
+++ b/mert/InterpolatedScorer.h
@@ -42,6 +42,11 @@ public:
virtual void setScoreData(ScoreData* data);
+ /**
+ * Set the factors, which should be used for this metric
+ */
+ virtual void setFactors(const string& factors);
+
protected:
ScopedVector<Scorer> _scorers;
diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp
index 76c2765dd..06a83bd2f 100644
--- a/mert/PerScorer.cpp
+++ b/mert/PerScorer.cpp
@@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
int sid = 0;
while (getline(in,line)) {
+ line = this->applyFactors(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_ref_tokens.push_back(multiset<int>());
@@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
+
+ string sentence = this->applyFactors(text);
+
// Calculate correct, output_length and ref_length for
// the line and store it in entry
vector<int> testtokens;
- TokenizeAndEncode(text, testtokens);
+ TokenizeAndEncode(sentence, testtokens);
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
int correct = 0;
diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp
index a2bb4720c..6b36c8f14 100644
--- a/mert/Scorer.cpp
+++ b/mert/Scorer.cpp
@@ -1,5 +1,6 @@
#include "Scorer.h"
#include <limits>
+#include "Util.h"
namespace {
@@ -95,6 +96,55 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
}
}
+/**
+ * Parse a '|'-separated list of factor indices (e.g. "0|2") and append each one to m_factors; an empty string is a no-op.
+ */
+void Scorer::setFactors(const string& factors)
+{
+ if (factors.empty()) return;
+ vector<string> factors_vec;
+ split(factors, '|', factors_vec);
+ for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it)
+ {
+ int factor = atoi(it->c_str()); // atoi returns 0 when no number can be parsed, so a malformed entry selects factor 0
+ m_factors.push_back(factor); // appended without clearing: a second call adds to the existing list
+ }
+}
+
+/**
+ * Rebuild a space-separated sentence so each token keeps only the factors listed in m_factors, joined by '|'; throws runtime_error when a token lacks a requested factor.
+ */
+string Scorer::applyFactors(const string& sentence)
+{
+ if (m_factors.size() == 0) return sentence; // no factors configured: the sentence is returned unchanged
+
+ vector<string> tokens;
+ split(sentence, ' ', tokens);
+
+ stringstream sstream;
+ for (size_t i = 0; i < tokens.size(); ++i)
+ {
+ if (tokens[i] == "") continue; // empty tokens are dropped from the output
+
+ vector<string> factors;
+ split(tokens[i], '|', factors);
+
+ int fsize = factors.size();
+
+ if (i>0) sstream << " "; // NOTE(review): tested on i, so if the first token was empty and skipped the output starts with a space
+
+ for (size_t j = 0; j < m_factors.size(); ++j)
+ {
+ int findex = m_factors[j];
+ if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
+
+ if (j>0) sstream << "|"; // selected factors are re-joined with the same '|' delimiter
+ sstream << factors[findex];
+ }
+ }
+ return sstream.str();
+}
+
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
: Scorer(name,config) {
//configure regularisation
diff --git a/mert/Scorer.h b/mert/Scorer.h
index ad44fab0e..880f4e228 100644
--- a/mert/Scorer.h
+++ b/mert/Scorer.h
@@ -97,6 +97,16 @@ class Scorer
m_score_data = data;
}
+ /**
+ * Set the factors, which should be used for this metric
+ */
+ virtual void setFactors(const string& factors);
+
+ /**
+ * Take the factored sentence and return the desired factors
+ */
+ virtual string applyFactors(const string& sentece);
+
private:
class Encoder {
public:
@@ -114,6 +124,7 @@ class Scorer
string m_name;
Encoder* m_encoder;
map<string, string> m_config;
+ vector<int> m_factors;
protected:
ScoreData* m_score_data;
diff --git a/mert/TerScorer.cpp b/mert/TerScorer.cpp
index e5e670cc2..608b82cde 100644
--- a/mert/TerScorer.cpp
+++ b/mert/TerScorer.cpp
@@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
string line;
int sid = 0;
while ( getline ( in, line ) ) {
+ line = this->applyFactors(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_references.push_back ( tokens );
@@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
{
+ string sentence = this->applyFactors(text);
terAlignment result;
result.numEdits = 0.0 ;
@@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
}
averageLength=averageLength/( double ) m_multi_references.size();
- TokenizeAndEncode(text, testtokens);
+ TokenizeAndEncode(sentence, testtokens);
terCalc * evaluation=new terCalc();
evaluation->setDebugMode ( false );
terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );
diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp
index 21556e9ee..3b2e0d61f 100644
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@@ -131,6 +131,7 @@ void usage()
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
+ cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
cerr << "[--reference|-R] comma separated list of reference files" << endl;
cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
@@ -164,6 +165,7 @@ static struct option long_options[] = {
{"candidate", required_argument, 0, 'C'},
{"bootstrap", required_argument, 0, 'b'},
{"rseed", required_argument, 0, 'r'},
+ {"factors", required_argument, 0, 'f'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
@@ -174,6 +176,7 @@ struct ProgramOption {
vector<string> scorer_configs;
string reference;
string candidate;
+ vector<string> scorer_factors;
int bootstrap;
int seed;
bool has_seed;
@@ -190,11 +193,12 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
int last_scorer_index = -1;
- while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
+ while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:h", long_options, &option_index)) != -1) {
switch(c) {
case 's':
opt->scorer_types.push_back(string(optarg));
opt->scorer_configs.push_back(string(""));
+ opt->scorer_factors.push_back(string(""));
last_scorer_index++;
break;
case 'c':
@@ -213,6 +217,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
opt->seed = strtol(optarg, NULL, 10);
opt->has_seed = true;
break;
+ case 'f':
+ opt->scorer_factors[last_scorer_index] = string(optarg);
+ break;
default:
usage();
}
@@ -223,6 +230,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
{
opt->scorer_types.push_back(string("BLEU"));
opt->scorer_configs.push_back(string(""));
+ opt->scorer_factors.push_back(string(""));
}
}
@@ -268,6 +276,7 @@ int main(int argc, char** argv)
for (size_t i = 0; i < option.scorer_types.size(); i++)
{
g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
+ g_scorer->setFactors(option.scorer_factors[i]);
g_scorer->setReferenceFiles(refFiles);
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
delete g_scorer;
diff --git a/mert/extractor.cpp b/mert/extractor.cpp
index 1c80211b9..0d38c2e25 100644
--- a/mert/extractor.cpp
+++ b/mert/extractor.cpp
@@ -26,6 +26,7 @@ void usage()
cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl;
cerr << "[--scconfig|-c] configuration string passed to scorer" << endl;
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
+ cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
cerr << "[--reference|-r] comma separated list of reference files" << endl;
cerr << "[--binary|-b] use binary output format (default to text )" << endl;
cerr << "[--nbest|-n] the nbest file" << endl;
@@ -41,6 +42,7 @@ void usage()
static struct option long_options[] = {
{"sctype", required_argument, 0, 's'},
{"scconfig", required_argument,0, 'c'},
+ {"factors", required_argument,0, 'f'},
{"reference", required_argument, 0, 'r'},
{"binary", no_argument, 0, 'b'},
{"nbest", required_argument, 0, 'n'},
@@ -57,6 +59,7 @@ static struct option long_options[] = {
struct ProgramOption {
string scorerType;
string scorerConfig;
+ string scorerFactors;
string referenceFile;
string nbestFile;
string scoreDataFile;
@@ -69,6 +72,7 @@ struct ProgramOption {
ProgramOption()
: scorerType("BLEU"),
scorerConfig(""),
+ scorerFactors(""),
referenceFile(""),
nbestFile(""),
scoreDataFile("statscore.data"),
@@ -83,7 +87,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
- while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
+ while ((c = getopt_long(argc, argv, "s:r:f:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
switch (c) {
case 's':
opt->scorerType = string(optarg);
@@ -91,6 +95,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
case 'c':
opt->scorerConfig = string(optarg);
break;
+ case 'f':
+ opt->scorerFactors = string(optarg);
+ break;
case 'r':
opt->referenceFile = string(optarg);
break;
@@ -180,6 +187,8 @@ int main(int argc, char** argv)
Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig);
+ scorer->setFactors(option.scorerFactors);
+
// load references
if (referenceFiles.size() > 0)
scorer->setReferenceFiles(referenceFiles);