diff options
author | Matous Machacek <machacekmatous@gmail.com> | 2012-02-28 05:27:23 +0400 |
---|---|---|
committer | Matous Machacek <machacekmatous@gmail.com> | 2012-02-28 05:27:23 +0400 |
commit | ba987c94ba9be5e7c8eb9c3e7c83d8f971fbd3aa (patch) | |
tree | 4140bd7b8fcef62012d2fe59956dcc81bb25c2bd /mert | |
parent | e38cd12ef3385304a4b363ca9b9ab16ed886a2ff (diff) |
Support for using factors in mert and evaluator
example:
Use --factors "0|2" to use only the first and third factors from the n-best list and from the references.
If you use an interpolated scorer, give one factor specification per interpolated scorer, separated by commas (e.g. --factors "0|2,1").
Diffstat (limited to 'mert')
-rw-r--r-- | mert/BleuScorer.cpp | 6 | ||||
-rw-r--r-- | mert/CderScorer.cpp | 5 | ||||
-rw-r--r-- | mert/InterpolatedScorer.cpp | 21 | ||||
-rw-r--r-- | mert/InterpolatedScorer.h | 5 | ||||
-rw-r--r-- | mert/PerScorer.cpp | 6 | ||||
-rw-r--r-- | mert/Scorer.cpp | 50 | ||||
-rw-r--r-- | mert/Scorer.h | 11 | ||||
-rw-r--r-- | mert/TerScorer.cpp | 4 | ||||
-rw-r--r-- | mert/evaluator.cpp | 11 | ||||
-rw-r--r-- | mert/extractor.cpp | 11 |
10 files changed, 123 insertions, 7 deletions
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp index cf2b84242..baf8a0f8b 100644 --- a/mert/BleuScorer.cpp +++ b/mert/BleuScorer.cpp @@ -139,6 +139,7 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles) string line; size_t sid = 0; //sentence counter while (getline(refin,line)) { + line = this->applyFactors(line); if (i == 0) { NgramCounts *counts = new NgramCounts; //these get leaked m_ref_counts.push_back(counts); @@ -183,8 +184,9 @@ void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) } NgramCounts testcounts; // stats for this line - vector<ScoreStatsType> stats(kLENGTH * 2);; - const size_t length = countNgrams(text, testcounts, kLENGTH); + vector<ScoreStatsType> stats(kLENGTH * 2); + string sentence = this->applyFactors(text); + const size_t length = countNgrams(sentence, testcounts, kLENGTH); // Calculate effective reference length. switch (m_ref_length_type) { diff --git a/mert/CderScorer.cpp b/mert/CderScorer.cpp index 2105820e9..424c210b3 100644 --- a/mert/CderScorer.cpp +++ b/mert/CderScorer.cpp @@ -31,6 +31,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles) m_ref_sentences.push_back(vector<sent_t>()); string line; while (getline(refin,line)) { + line = this->applyFactors(line); sent_t encoded; TokenizeAndEncode(line, encoded); m_ref_sentences[rid].push_back(encoded); @@ -40,8 +41,10 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles) void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) { + string sentence = this->applyFactors(text); + vector<int> stats; - prepareStatsVector(sid, text, stats); + prepareStatsVector(sid, sentence, stats); entry.set(stats); } diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp index 5ba410539..1951e4234 100644 --- a/mert/InterpolatedScorer.cpp +++ b/mert/InterpolatedScorer.cpp @@ -159,3 +159,24 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, 
ScoreStats string str = buff.str(); entry.set(str); } + +void InterpolatedScorer::setFactors(const string& factors) +{ + if (factors.empty()) return; + + vector<string> fsplit; + split(factors, ',', fsplit); + + if (fsplit.size() != _scorers.size()) throw runtime_error("Number of factor specifications does not equal number of interpolated scorers."); + + for (size_t i = 0; i < _scorers.size(); ++i) + { + _scorers[i]->setFactors(fsplit[i]); + } +} + + + + + + diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h index d8eb87e3f..2a538bc39 100644 --- a/mert/InterpolatedScorer.h +++ b/mert/InterpolatedScorer.h @@ -42,6 +42,11 @@ public: virtual void setScoreData(ScoreData* data); + /** + * Set the factors, which should be used for this metric + */ + virtual void setFactors(const string& factors); + protected: ScopedVector<Scorer> _scorers; diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp index 76c2765dd..06a83bd2f 100644 --- a/mert/PerScorer.cpp +++ b/mert/PerScorer.cpp @@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles) string line; int sid = 0; while (getline(in,line)) { + line = this->applyFactors(line); vector<int> tokens; TokenizeAndEncode(line, tokens); m_ref_tokens.push_back(multiset<int>()); @@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) msg << "Sentence id (" << sid << ") not found in reference set"; throw runtime_error(msg.str()); } + + string sentence = this->applyFactors(text); + // Calculate correct, output_length and ref_length for // the line and store it in entry vector<int> testtokens; - TokenizeAndEncode(text, testtokens); + TokenizeAndEncode(sentence, testtokens); multiset<int> testtokens_all(testtokens.begin(),testtokens.end()); set<int> testtokens_unique(testtokens.begin(),testtokens.end()); int correct = 0; diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp index a2bb4720c..6b36c8f14 100644 --- a/mert/Scorer.cpp +++ b/mert/Scorer.cpp @@ -1,5 
+1,6 @@ #include "Scorer.h" #include <limits> +#include "Util.h" namespace { @@ -95,6 +96,55 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) { } } +/** + * Set the factors, which should be used for this metric + */ +void Scorer::setFactors(const string& factors) +{ + if (factors.empty()) return; + vector<string> factors_vec; + split(factors, '|', factors_vec); + for(vector<string>::iterator it = factors_vec.begin(); it != factors_vec.end(); ++it) + { + int factor = atoi(it->c_str()); + m_factors.push_back(factor); + } +} + +/** + * Take the factored sentence and return the desired factors + */ +string Scorer::applyFactors(const string& sentence) +{ + if (m_factors.size() == 0) return sentence; + + vector<string> tokens; + split(sentence, ' ', tokens); + + stringstream sstream; + for (size_t i = 0; i < tokens.size(); ++i) + { + if (tokens[i] == "") continue; + + vector<string> factors; + split(tokens[i], '|', factors); + + int fsize = factors.size(); + + if (i>0) sstream << " "; + + for (size_t j = 0; j < m_factors.size(); ++j) + { + int findex = m_factors[j]; + if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range."); + + if (j>0) sstream << "|"; + sstream << factors[findex]; + } + } + return sstream.str(); +} + StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config) : Scorer(name,config) { //configure regularisation diff --git a/mert/Scorer.h b/mert/Scorer.h index ad44fab0e..880f4e228 100644 --- a/mert/Scorer.h +++ b/mert/Scorer.h @@ -97,6 +97,16 @@ class Scorer m_score_data = data; } + /** + * Set the factors, which should be used for this metric + */ + virtual void setFactors(const string& factors); + + /** + * Take the factored sentence and return the desired factors + */ + virtual string applyFactors(const string& sentece); + private: class Encoder { public: @@ -114,6 +124,7 @@ class Scorer string m_name; Encoder* m_encoder; map<string, string> m_config; + vector<int> 
m_factors; protected: ScoreData* m_score_data; diff --git a/mert/TerScorer.cpp b/mert/TerScorer.cpp index e5e670cc2..608b82cde 100644 --- a/mert/TerScorer.cpp +++ b/mert/TerScorer.cpp @@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles ) string line; int sid = 0; while ( getline ( in, line ) ) { + line = this->applyFactors(line); vector<int> tokens; TokenizeAndEncode(line, tokens); m_references.push_back ( tokens ); @@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles ) void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry ) { + string sentence = this->applyFactors(text); terAlignment result; result.numEdits = 0.0 ; @@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size(); } averageLength=averageLength/( double ) m_multi_references.size(); - TokenizeAndEncode(text, testtokens); + TokenizeAndEncode(sentence, testtokens); terCalc * evaluation=new terCalc(); evaluation->setDebugMode ( false ); terAlignment tmp_result = evaluation->TER ( reftokens, testtokens ); diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp index 21556e9ee..3b2e0d61f 100644 --- a/mert/evaluator.cpp +++ b/mert/evaluator.cpp @@ -131,6 +131,7 @@ void usage() cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl; cerr << "[--scconfig|-c] configuration string passed to scorer" << endl; cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl; + cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 
0|2)" << endl; cerr << "[--reference|-R] comma separated list of reference files" << endl; cerr << "[--candidate|-C] comma separated list of candidate files" << endl; cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl; @@ -164,6 +165,7 @@ static struct option long_options[] = { {"candidate", required_argument, 0, 'C'}, {"bootstrap", required_argument, 0, 'b'}, {"rseed", required_argument, 0, 'r'}, + {"factors", required_argument, 0, 'f'}, {"help", no_argument, 0, 'h'}, {0, 0, 0, 0} }; @@ -174,6 +176,7 @@ struct ProgramOption { vector<string> scorer_configs; string reference; string candidate; + vector<string> scorer_factors; int bootstrap; int seed; bool has_seed; @@ -190,11 +193,12 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { int c; int option_index; int last_scorer_index = -1; - while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) { + while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:h", long_options, &option_index)) != -1) { switch(c) { case 's': opt->scorer_types.push_back(string(optarg)); opt->scorer_configs.push_back(string("")); + opt->scorer_factors.push_back(string("")); last_scorer_index++; break; case 'c': @@ -213,6 +217,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { opt->seed = strtol(optarg, NULL, 10); opt->has_seed = true; break; + case 'f': + opt->scorer_factors[last_scorer_index] = string(optarg); + break; default: usage(); } @@ -223,6 +230,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { { opt->scorer_types.push_back(string("BLEU")); opt->scorer_configs.push_back(string("")); + opt->scorer_factors.push_back(string("")); } } @@ -268,6 +276,7 @@ int main(int argc, char** argv) for (size_t i = 0; i < option.scorer_types.size(); i++) { g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]); + g_scorer->setFactors(option.scorer_factors[i]); 
g_scorer->setReferenceFiles(refFiles); EvaluatorUtil::evaluate(*fileIt, option.bootstrap); delete g_scorer; diff --git a/mert/extractor.cpp b/mert/extractor.cpp index 1c80211b9..0d38c2e25 100644 --- a/mert/extractor.cpp +++ b/mert/extractor.cpp @@ -26,6 +26,7 @@ void usage() cerr << "[--sctype|-s] the scorer type (default BLEU)" << endl; cerr << "[--scconfig|-c] configuration string passed to scorer" << endl; cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl; + cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl; cerr << "[--reference|-r] comma separated list of reference files" << endl; cerr << "[--binary|-b] use binary output format (default to text )" << endl; cerr << "[--nbest|-n] the nbest file" << endl; @@ -41,6 +42,7 @@ void usage() static struct option long_options[] = { {"sctype", required_argument, 0, 's'}, {"scconfig", required_argument,0, 'c'}, + {"factors", required_argument,0, 'f'}, {"reference", required_argument, 0, 'r'}, {"binary", no_argument, 0, 'b'}, {"nbest", required_argument, 0, 'n'}, @@ -57,6 +59,7 @@ static struct option long_options[] = { struct ProgramOption { string scorerType; string scorerConfig; + string scorerFactors; string referenceFile; string nbestFile; string scoreDataFile; @@ -69,6 +72,7 @@ struct ProgramOption { ProgramOption() : scorerType("BLEU"), scorerConfig(""), + scorerFactors(""), referenceFile(""), nbestFile(""), scoreDataFile("statscore.data"), @@ -83,7 +87,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { int c; int option_index; - while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) { + while ((c = getopt_long(argc, argv, "s:r:f:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) { switch (c) { case 's': opt->scorerType = string(optarg); @@ -91,6 +95,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) { case 'c': opt->scorerConfig = string(optarg); break; + case 'f': + 
opt->scorerFactors = string(optarg); + break; case 'r': opt->referenceFile = string(optarg); break; @@ -180,6 +187,8 @@ int main(int argc, char** argv) Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig); + scorer->setFactors(option.scorerFactors); + // load references if (referenceFiles.size() > 0) scorer->setReferenceFiles(referenceFiles); |