Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/mert
diff options
context:
space:
mode:
authorChristian Federmann <cfedermann@gmail.com>2012-05-09 23:10:52 +0400
committerChristian Federmann <cfedermann@gmail.com>2012-05-09 23:10:52 +0400
commit25f43d13b8f3cc6cc0be19028605efe15eaa416b (patch)
tree5e2a7e22a17f88e0dc53032a473bb20ff368375a /mert
parentd9e77ed5b1676956978e221427c8c39f372e7612 (diff)
parent440650bd6e03cdea9aa2d3f11b32697bb9340ca0 (diff)
Merged in upstream.
Diffstat (limited to 'mert')
-rw-r--r--mert/BleuScorer.cpp283
-rw-r--r--mert/BleuScorer.h78
-rw-r--r--mert/BleuScorerTest.cpp272
-rw-r--r--mert/CderScorer.cpp48
-rw-r--r--mert/CderScorer.h29
-rw-r--r--mert/Data.cpp297
-rw-r--r--mert/Data.h115
-rw-r--r--mert/DataTest.cpp16
-rw-r--r--mert/Fdstream.h167
-rw-r--r--mert/FeatureArray.cpp135
-rw-r--r--mert/FeatureArray.h94
-rw-r--r--mert/FeatureData.cpp134
-rw-r--r--mert/FeatureData.h133
-rw-r--r--mert/FeatureDataIterator.h6
-rw-r--r--mert/FeatureDataTest.cpp40
-rw-r--r--mert/FeatureStats.cpp151
-rw-r--r--mert/FeatureStats.h80
-rw-r--r--mert/FileStream.cpp12
-rw-r--r--mert/FileStream.h21
-rw-r--r--mert/GzFileBuf.cpp80
-rw-r--r--mert/InterpolatedScorer.cpp189
-rw-r--r--mert/InterpolatedScorer.h55
-rw-r--r--mert/Jamfile47
-rw-r--r--mert/MergeScorer.cpp7
-rw-r--r--mert/MergeScorer.h15
-rw-r--r--mert/Ngram.h98
-rw-r--r--mert/NgramTest.cpp83
-rw-r--r--mert/Optimizer.cpp140
-rw-r--r--mert/Optimizer.h69
-rw-r--r--mert/OptimizerFactory.cpp67
-rw-r--r--mert/OptimizerFactory.h42
-rw-r--r--mert/OptimizerFactoryTest.cpp49
-rw-r--r--mert/PerScorer.cpp6
-rw-r--r--mert/PerScorer.h19
-rw-r--r--mert/Point.cpp119
-rw-r--r--mert/Point.h68
-rw-r--r--mert/PointTest.cpp60
-rw-r--r--mert/PreProcessFilter.cpp135
-rw-r--r--mert/PreProcessFilter.h24
-rw-r--r--mert/Reference.h82
-rw-r--r--mert/ReferenceTest.cpp116
-rw-r--r--mert/ScopedVector.h42
-rw-r--r--mert/ScoreArray.cpp127
-rw-r--r--mert/ScoreArray.h97
-rw-r--r--mert/ScoreData.cpp99
-rw-r--r--mert/ScoreData.h83
-rw-r--r--mert/ScoreDataIterator.h6
-rw-r--r--mert/ScoreStats.cpp111
-rw-r--r--mert/ScoreStats.h73
-rw-r--r--mert/Scorer.cpp105
-rw-r--r--mert/Scorer.h74
-rw-r--r--mert/ScorerFactory.cpp26
-rw-r--r--mert/ScorerFactory.h6
-rw-r--r--mert/SemposOverlapping.cpp109
-rw-r--r--mert/SemposOverlapping.h90
-rw-r--r--mert/SemposScorer.cpp179
-rw-r--r--mert/SemposScorer.h64
-rw-r--r--mert/Singleton.h33
-rw-r--r--mert/SingletonTest.cpp27
-rw-r--r--mert/TER/alignmentStruct.cpp (renamed from mert/TERsrc/alignmentStruct.cpp)0
-rw-r--r--mert/TER/alignmentStruct.h (renamed from mert/TERsrc/alignmentStruct.h)8
-rw-r--r--mert/TER/bestShiftStruct.h (renamed from mert/TERsrc/bestShiftStruct.h)6
-rw-r--r--mert/TER/hashMap.cpp (renamed from mert/TERsrc/hashMap.cpp)0
-rw-r--r--mert/TER/hashMap.h (renamed from mert/TERsrc/hashMap.h)6
-rw-r--r--mert/TER/hashMapInfos.cpp (renamed from mert/TERsrc/hashMapInfos.cpp)0
-rw-r--r--mert/TER/hashMapInfos.h (renamed from mert/TERsrc/hashMapInfos.h)6
-rw-r--r--mert/TER/hashMapStringInfos.cpp (renamed from mert/TERsrc/hashMapStringInfos.cpp)0
-rw-r--r--mert/TER/hashMapStringInfos.h (renamed from mert/TERsrc/hashMapStringInfos.h)6
-rw-r--r--mert/TER/infosHasher.cpp (renamed from mert/TERsrc/infosHasher.cpp)0
-rw-r--r--mert/TER/infosHasher.h (renamed from mert/TERsrc/infosHasher.h)6
-rw-r--r--mert/TER/stringHasher.cpp (renamed from mert/TERsrc/stringHasher.cpp)0
-rw-r--r--mert/TER/stringHasher.h (renamed from mert/TERsrc/stringHasher.h)6
-rw-r--r--mert/TER/stringInfosHasher.cpp (renamed from mert/TERsrc/stringInfosHasher.cpp)0
-rw-r--r--mert/TER/stringInfosHasher.h (renamed from mert/TERsrc/stringInfosHasher.h)6
-rw-r--r--mert/TER/terAlignment.cpp (renamed from mert/TERsrc/terAlignment.cpp)0
-rw-r--r--mert/TER/terAlignment.h (renamed from mert/TERsrc/terAlignment.h)6
-rw-r--r--mert/TER/terShift.cpp (renamed from mert/TERsrc/terShift.cpp)0
-rw-r--r--mert/TER/terShift.h (renamed from mert/TERsrc/terShift.h)6
-rw-r--r--mert/TER/tercalc.cpp (renamed from mert/TERsrc/tercalc.cpp)0
-rw-r--r--mert/TER/tercalc.h (renamed from mert/TERsrc/tercalc.h)6
-rw-r--r--mert/TER/tools.cpp (renamed from mert/TERsrc/tools.cpp)0
-rw-r--r--mert/TER/tools.h (renamed from mert/TERsrc/tools.h)6
-rw-r--r--mert/TODO18
-rw-r--r--mert/TerScorer.cpp8
-rw-r--r--mert/TerScorer.h6
-rw-r--r--mert/Timer.cpp127
-rw-r--r--mert/Timer.h114
-rw-r--r--mert/TimerTest.cpp27
-rw-r--r--mert/Types.h6
-rw-r--r--mert/Util.cpp38
-rw-r--r--mert/Util.h40
-rw-r--r--mert/UtilTest.cpp76
-rw-r--r--mert/Vocabulary.cpp21
-rw-r--r--mert/Vocabulary.h79
-rw-r--r--mert/VocabularyTest.cpp52
-rw-r--r--mert/evaluator.cpp71
-rw-r--r--mert/example/README26
-rwxr-xr-xmert/example/gzipped_test.sh41
-rwxr-xr-xmert/example/normal_test.sh30
-rwxr-xr-xmert/example/smoke_test.sh39
-rw-r--r--mert/extractor.cpp39
-rw-r--r--mert/gzfilebuf.h85
-rw-r--r--[-rwxr-xr-x]mert/mert.cpp80
-rw-r--r--mert/pro.cpp101
104 files changed, 4351 insertions, 1879 deletions
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index 09b0d292f..32c192a5c 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -1,24 +1,33 @@
#include "BleuScorer.h"
#include <algorithm>
+#include <cassert>
#include <cmath>
#include <climits>
#include <fstream>
-#include <iterator>
+#include <iostream>
#include <stdexcept>
+
+#include "util/check.hh"
+#include "Ngram.h"
+#include "Reference.h"
#include "Util.h"
+#include "Vocabulary.h"
+
+namespace {
+
+// configure regularisation
+const char KEY_REFLEN[] = "reflen";
+const char REFLEN_AVERAGE[] = "average";
+const char REFLEN_SHORTEST[] = "shortest";
+const char REFLEN_CLOSEST[] = "closest";
+
+} // namespace
BleuScorer::BleuScorer(const string& config)
- : StatisticsBasedScorer("BLEU",config),
- kLENGTH(4),
+ : StatisticsBasedScorer("BLEU", config),
m_ref_length_type(CLOSEST) {
- //configure regularisation
- static string KEY_REFLEN = "reflen";
- static string REFLEN_AVERAGE = "average";
- static string REFLEN_SHORTEST = "shortest";
- static string REFLEN_CLOSEST = "closest";
-
- string reflen = getConfig(KEY_REFLEN,REFLEN_CLOSEST);
+ const string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
if (reflen == REFLEN_AVERAGE) {
m_ref_length_type = AVERAGE;
} else if (reflen == REFLEN_SHORTEST) {
@@ -28,18 +37,16 @@ BleuScorer::BleuScorer(const string& config)
} else {
throw runtime_error("Unknown reference length strategy: " + reflen);
}
- // cerr << "Using reference length strategy: " << reflen << endl;
}
BleuScorer::~BleuScorer() {}
-size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned int n)
+size_t BleuScorer::CountNgrams(const string& line, NgramCounts& counts,
+ unsigned int n)
{
+ assert(n > 0);
vector<int> encoded_tokens;
- //cerr << line << endl;
TokenizeAndEncode(line, encoded_tokens);
- //copy(encoded_tokens.begin(), encoded_tokens.end(), ostream_iterator<int>(cerr," "));
- //cerr << endl;
for (size_t k = 1; k <= n; ++k) {
//ngram order longer than sentence - no point
if (k > encoded_tokens.size()) {
@@ -50,168 +57,176 @@ size_t BleuScorer::countNgrams(const string& line, counts_t& counts, unsigned in
for (size_t j = i; j < i+k && j < encoded_tokens.size(); ++j) {
ngram.push_back(encoded_tokens[j]);
}
- int count = 1;
- counts_iterator oldcount = counts.find(ngram);
- if (oldcount != counts.end()) {
- count = (oldcount->second) + 1;
- }
- //cerr << count << endl;
- counts[ngram] = count;
- //cerr << endl;
+ counts.Add(ngram);
}
}
- //cerr << "counted ngrams" << endl;
- //dump_counts(counts);
return encoded_tokens.size();
}
void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
{
- //make sure reference data is clear
- m_ref_counts.reset();
- m_ref_lengths.clear();
- ClearEncoder();
+ // Make sure reference data is clear
+ m_references.reset();
+ mert::VocabularyFactory::GetVocabulary()->clear();
//load reference data
for (size_t i = 0; i < referenceFiles.size(); ++i) {
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
- ifstream refin(referenceFiles[i].c_str());
- if (!refin) {
- throw runtime_error("Unable to open: " + referenceFiles[i]);
+
+ if (!OpenReference(referenceFiles[i].c_str(), i)) {
+ throw runtime_error("Unable to open " + referenceFiles[i]);
}
- string line;
- size_t sid = 0; //sentence counter
- while (getline(refin,line)) {
- //cerr << line << endl;
- if (i == 0) {
- counts_t *counts = new counts_t; //these get leaked
- m_ref_counts.push_back(counts);
- vector<size_t> lengths;
- m_ref_lengths.push_back(lengths);
- }
- if (m_ref_counts.size() <= sid) {
- throw runtime_error("File " + referenceFiles[i] + " has too many sentences");
- }
- counts_t counts;
- size_t length = countNgrams(line,counts,kLENGTH);
- //for any counts larger than those already there, merge them in
- for (counts_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
- counts_iterator oldcount_it = m_ref_counts[sid]->find(ci->first);
- int oldcount = 0;
- if (oldcount_it != m_ref_counts[sid]->end()) {
- oldcount = oldcount_it->second;
- }
- int newcount = ci->second;
- if (newcount > oldcount) {
- m_ref_counts[sid]->operator[](ci->first) = newcount;
- }
- }
- //add in the length
- m_ref_lengths[sid].push_back(length);
- if (sid > 0 && sid % 100 == 0) {
- TRACE_ERR(".");
+ }
+}
+
+bool BleuScorer::OpenReference(const char* filename, size_t file_id) {
+ ifstream ifs(filename);
+ if (!ifs) {
+ cerr << "Cannot open " << filename << endl;
+ return false;
+ }
+ return OpenReferenceStream(&ifs, file_id);
+}
+
+bool BleuScorer::OpenReferenceStream(istream* is, size_t file_id) {
+ if (is == NULL) return false;
+
+ string line;
+ size_t sid = 0;
+ while (getline(*is, line)) {
+ line = preprocessSentence(line);
+ if (file_id == 0) {
+ Reference* ref = new Reference;
+ m_references.push_back(ref); // Take ownership of the Reference object.
+ }
+ if (m_references.size() <= sid) {
+ cerr << "Reference " << file_id << "has too many sentences." << endl;
+ return false;
+ }
+ NgramCounts counts;
+ size_t length = CountNgrams(line, counts, kBleuNgramOrder);
+
+ //for any counts larger than those already there, merge them in
+ for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
+ const NgramCounts::Key& ngram = ci->first;
+ const NgramCounts::Value newcount = ci->second;
+
+ NgramCounts::Value oldcount = 0;
+ m_references[sid]->get_counts()->Lookup(ngram, &oldcount);
+ if (newcount > oldcount) {
+ m_references[sid]->get_counts()->operator[](ngram) = newcount;
}
- ++sid;
}
- TRACE_ERR(endl);
+ //add in the length
+ m_references[sid]->push_back(length);
+ if (sid > 0 && sid % 100 == 0) {
+ TRACE_ERR(".");
+ }
+ ++sid;
}
+ return true;
}
-
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
-// cerr << text << endl;
-// cerr << sid << endl;
- //dump_counts(*m_ref_counts[sid]);
- if (sid >= m_ref_counts.size()) {
+ if (sid >= m_references.size()) {
stringstream msg;
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
- counts_t testcounts;
- //stats for this line
- vector<float> stats(kLENGTH*2);;
- size_t length = countNgrams(text,testcounts,kLENGTH);
- //dump_counts(testcounts);
- if (m_ref_length_type == SHORTEST) {
- //cerr << reflengths.size() << " " << sid << endl;
- int shortest = *min_element(m_ref_lengths[sid].begin(), m_ref_lengths[sid].end());
- stats.push_back(shortest);
- } else if (m_ref_length_type == AVERAGE) {
- int total = 0;
- for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
- total += m_ref_lengths[sid][i];
- }
- const float mean = static_cast<float>(total) / m_ref_lengths[sid].size();
- stats.push_back(mean);
- } else if (m_ref_length_type == CLOSEST) {
- int min_diff = INT_MAX;
- int min_idx = 0;
- for (size_t i = 0; i < m_ref_lengths[sid].size(); ++i) {
- const int reflength = m_ref_lengths[sid][i];
- const int diff = reflength - static_cast<int>(length);
- const int absolute_diff = abs(diff) - abs(min_diff);
-
- if (absolute_diff < 0) { //look for the closest reference
- min_diff = diff;
- min_idx = i;
- } else if (absolute_diff == 0) { // if two references has the same closest length, take the shortest
- if (reflength < static_cast<int>(m_ref_lengths[sid][min_idx])) {
- min_idx = i;
- }
- }
- }
- stats.push_back(m_ref_lengths[sid][min_idx]);
- } else {
- throw runtime_error("Unsupported reflength strategy");
- }
- //cerr << "computed length" << endl;
+ NgramCounts testcounts;
+ // stats for this line
+ vector<ScoreStatsType> stats(kBleuNgramOrder * 2);
+ string sentence = preprocessSentence(text);
+ const size_t length = CountNgrams(sentence, testcounts, kBleuNgramOrder);
+
+ const int reference_len = CalcReferenceLength(sid, length);
+ stats.push_back(reference_len);
+
//precision on each ngram type
- for (counts_iterator testcounts_it = testcounts.begin();
+ for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
testcounts_it != testcounts.end(); ++testcounts_it) {
- counts_iterator refcounts_it = m_ref_counts[sid]->find(testcounts_it->first);
- int correct = 0;
- int guess = testcounts_it->second;
- if (refcounts_it != m_ref_counts[sid]->end()) {
- correct = min(refcounts_it->second,guess);
+ const NgramCounts::Value guess = testcounts_it->second;
+ const size_t len = testcounts_it->first.size();
+ NgramCounts::Value correct = 0;
+
+ NgramCounts::Value v = 0;
+ if (m_references[sid]->get_counts()->Lookup(testcounts_it->first, &v)) {
+ correct = min(v, guess);
}
- size_t len = testcounts_it->first.size();
- stats[len*2-2] += correct;
- stats[len*2-1] += guess;
+ stats[len * 2 - 2] += correct;
+ stats[len * 2 - 1] += guess;
}
- stringstream sout;
- copy(stats.begin(),stats.end(),ostream_iterator<float>(sout," "));
- //TRACE_ERR(sout.str() << endl);
- string stats_str = sout.str();
- entry.set(stats_str);
+ entry.set(stats);
}
float BleuScorer::calculateScore(const vector<int>& comps) const
{
- //cerr << "BLEU: ";
- //copy(comps.begin(),comps.end(), ostream_iterator<int>(cerr," "));
+ CHECK(comps.size() == kBleuNgramOrder * 2 + 1);
+
float logbleu = 0.0;
- for (int i = 0; i < kLENGTH; ++i) {
+ for (int i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
logbleu += log(comps[2*i]) - log(comps[2*i+1]);
}
- logbleu /= kLENGTH;
- const float brevity = 1.0 - static_cast<float>(comps[kLENGTH*2]) / comps[1];//reflength divided by test length
+ logbleu /= kBleuNgramOrder;
+ // reflength divided by test length
+ const float brevity = 1.0 - static_cast<float>(comps[kBleuNgramOrder * 2]) / comps[1];
if (brevity < 0.0) {
logbleu += brevity;
}
- //cerr << " " << exp(logbleu) << endl;
return exp(logbleu);
}
-void BleuScorer::dump_counts(counts_t& counts) const {
- for (counts_const_iterator i = counts.begin(); i != counts.end(); ++i) {
- cerr << "(";
- copy(i->first.begin(), i->first.end(), ostream_iterator<int>(cerr," "));
- cerr << ") " << i->second << ", ";
+int BleuScorer::CalcReferenceLength(size_t sentence_id, size_t length) {
+ switch (m_ref_length_type) {
+ case AVERAGE:
+ return m_references[sentence_id]->CalcAverage();
+ break;
+ case CLOSEST:
+ return m_references[sentence_id]->CalcClosest(length);
+ break;
+ case SHORTEST:
+ return m_references[sentence_id]->CalcShortest();
+ break;
+ default:
+ cerr << "unknown reference types." << endl;
+ exit(1);
}
- cerr << endl;
+}
+
+void BleuScorer::DumpCounts(ostream* os,
+ const NgramCounts& counts) const {
+ for (NgramCounts::const_iterator it = counts.begin();
+ it != counts.end(); ++it) {
+ *os << "(";
+ const NgramCounts::Key& keys = it->first;
+ for (size_t i = 0; i < keys.size(); ++i) {
+ if (i != 0) {
+ *os << " ";
+ }
+ *os << keys[i];
+ }
+ *os << ") : " << it->second << ", ";
+ }
+ *os << endl;
+}
+
+float sentenceLevelBleuPlusOne(const vector<float>& stats) {
+ CHECK(stats.size() == kBleuNgramOrder * 2 + 1);
+
+ float logbleu = 0.0;
+ for (int j = 0; j < kBleuNgramOrder; j++) {
+ logbleu += log(stats[2 * j] + 1.0) - log(stats[2 * j + 1] + 1.0);
+ }
+ logbleu /= kBleuNgramOrder;
+ const float brevity = 1.0 - stats[(kBleuNgramOrder * 2)] / stats[1];
+
+ if (brevity < 0.0) {
+ logbleu += brevity;
+ }
+ return exp(logbleu);
}
diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h
index 5f105add2..b6503ba9b 100644
--- a/mert/BleuScorer.h
+++ b/mert/BleuScorer.h
@@ -1,7 +1,7 @@
-#ifndef __BLEUSCORER_H__
-#define __BLEUSCORER_H__
+#ifndef MERT_BLEU_SCORER_H_
+#define MERT_BLEU_SCORER_H_
-#include <iostream>
+#include <ostream>
#include <string>
#include <vector>
@@ -12,72 +12,64 @@
using namespace std;
+const int kBleuNgramOrder = 4;
+
+class NgramCounts;
+class Reference;
+
/**
* Bleu scoring
*/
class BleuScorer: public StatisticsBasedScorer
{
public:
+ enum ReferenceLengthType {
+ AVERAGE,
+ CLOSEST,
+ SHORTEST
+ };
+
explicit BleuScorer(const string& config = "");
~BleuScorer();
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
virtual float calculateScore(const vector<int>& comps) const;
+ virtual size_t NumberOfScores() const { return 2 * kBleuNgramOrder + 1; }
- virtual size_t NumberOfScores() const {
- return 2 * kLENGTH + 1;
- }
+ int CalcReferenceLength(size_t sentence_id, size_t length);
-private:
- enum ReferenceLengthType {
- AVERAGE,
- SHORTEST,
- CLOSEST,
- };
+ ReferenceLengthType GetReferenceLengthType() const { return m_ref_length_type; }
+ void SetReferenceLengthType(ReferenceLengthType type) { m_ref_length_type = type; }
- //Used to construct the ngram map
- struct CompareNgrams {
- bool operator()(const vector<int>& a, const vector<int>& b) const {
- size_t i;
- const size_t as = a.size();
- const size_t bs = b.size();
- for (i = 0; i < as && i < bs; ++i) {
- if (a[i] < b[i]) {
- //cerr << "true" << endl;
- return true;
- }
- if (a[i] > b[i]) {
- //cerr << "false" << endl;
- return false;
- }
- }
- //entries are equal, shortest wins
- return as < bs;;
- }
- };
-
- typedef map<vector<int>,int,CompareNgrams> counts_t;
- typedef map<vector<int>,int,CompareNgrams>::iterator counts_iterator;
- typedef map<vector<int>,int,CompareNgrams>::const_iterator counts_const_iterator;
+ const std::vector<Reference*>& GetReferences() const { return m_references.get(); }
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
- size_t countNgrams(const string& line, counts_t& counts, unsigned int n);
+ size_t CountNgrams(const string& line, NgramCounts& counts, unsigned int n);
+
+ void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
- void dump_counts(counts_t& counts) const;
+ bool OpenReference(const char* filename, size_t file_id);
- const int kLENGTH;
+ // NOTE: this function is used for unit testing.
+ bool OpenReferenceStream(std::istream* is, size_t file_id);
+
+private:
ReferenceLengthType m_ref_length_type;
- // data extracted from reference files
- ScopedVector<counts_t> m_ref_counts;
- vector<vector<size_t> > m_ref_lengths;
+ // reference translations.
+ ScopedVector<Reference> m_references;
// no copying allowed
BleuScorer(const BleuScorer&);
BleuScorer& operator=(const BleuScorer&);
};
-#endif // __BLEUSCORER_H__
+/** Computes sentence-level BLEU+1 score.
+ * This function is used in PRO.
+ */
+float sentenceLevelBleuPlusOne(const vector<float>& stats);
+
+#endif // MERT_BLEU_SCORER_H_
diff --git a/mert/BleuScorerTest.cpp b/mert/BleuScorerTest.cpp
new file mode 100644
index 000000000..5da4cfc6c
--- /dev/null
+++ b/mert/BleuScorerTest.cpp
@@ -0,0 +1,272 @@
+#include "BleuScorer.h"
+
+#define BOOST_TEST_MODULE MertBleuScorer
+#include <boost/test/unit_test.hpp>
+
+#include <cmath>
+#include "Ngram.h"
+#include "Vocabulary.h"
+#include "Util.h"
+
+namespace {
+
+NgramCounts* g_counts = NULL;
+
+NgramCounts* GetNgramCounts() {
+ assert(g_counts);
+ return g_counts;
+}
+
+void SetNgramCounts(NgramCounts* counts) {
+ g_counts = counts;
+}
+
+struct Unigram {
+ Unigram(const std::string& a) {
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
+ }
+ NgramCounts::Key instance;
+};
+
+struct Bigram {
+ Bigram(const std::string& a, const std::string& b) {
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(b));
+ }
+ NgramCounts::Key instance;
+};
+
+struct Trigram {
+ Trigram(const std::string& a, const std::string& b, const std::string& c) {
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(b));
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(c));
+ }
+ NgramCounts::Key instance;
+};
+
+struct Fourgram {
+ Fourgram(const std::string& a, const std::string& b,
+ const std::string& c, const std::string& d) {
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(a));
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(b));
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(c));
+ instance.push_back(mert::VocabularyFactory::GetVocabulary()->Encode(d));
+ }
+ NgramCounts::Key instance;
+};
+
+bool CheckUnigram(const std::string& str) {
+ Unigram unigram(str);
+ NgramCounts::Value v;
+ return GetNgramCounts()->Lookup(unigram.instance, &v);
+}
+
+bool CheckBigram(const std::string& a, const std::string& b) {
+ Bigram bigram(a, b);
+ NgramCounts::Value v;
+ return GetNgramCounts()->Lookup(bigram.instance, &v);
+}
+
+bool CheckTrigram(const std::string& a, const std::string& b,
+ const std::string& c) {
+ Trigram trigram(a, b, c);
+ NgramCounts::Value v;
+ return GetNgramCounts()->Lookup(trigram.instance, &v);
+}
+
+bool CheckFourgram(const std::string& a, const std::string& b,
+ const std::string& c, const std::string& d) {
+ Fourgram fourgram(a, b, c, d);
+ NgramCounts::Value v;
+ return GetNgramCounts()->Lookup(fourgram.instance, &v);
+}
+
+void SetUpReferences(BleuScorer& scorer) {
+ // The following examples are taken from Koehn, "Statistical Machine Translation",
+ // Cambridge University Press, 2010.
+ {
+ std::stringstream ref1;
+ ref1 << "israeli officials are responsible for airport security" << std::endl;
+ BOOST_CHECK(scorer.OpenReferenceStream(&ref1, 0));
+ }
+
+ {
+ std::stringstream ref2;
+ ref2 << "israel is in charge of the security at this airport" << std::endl;
+ BOOST_CHECK(scorer.OpenReferenceStream(&ref2, 1));
+ }
+
+ {
+ std::stringstream ref3;
+ ref3 << "the security work for this airport is the responsibility of the israel government"
+ << std::endl;
+ BOOST_CHECK(scorer.OpenReferenceStream(&ref3, 2));
+ }
+
+ {
+ std::stringstream ref4;
+ ref4 << "israli side was in charge of the security of this airport" << std::endl;
+ BOOST_CHECK(scorer.OpenReferenceStream(&ref4, 3));
+ }
+}
+
+} // namespace
+
+BOOST_AUTO_TEST_CASE(bleu_reference_type) {
+ BleuScorer scorer;
+ // BleuScorer will use "closest" by default.
+ BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::CLOSEST);
+
+ scorer.SetReferenceLengthType(BleuScorer::AVERAGE);
+ BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
+
+ scorer.SetReferenceLengthType(BleuScorer::SHORTEST);
+ BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
+}
+
+BOOST_AUTO_TEST_CASE(bleu_reference_type_with_config) {
+ {
+ BleuScorer scorer("reflen:average");
+ BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::AVERAGE);
+ }
+
+ {
+ BleuScorer scorer("reflen:shortest");
+ BOOST_CHECK_EQUAL(scorer.GetReferenceLengthType(), BleuScorer::SHORTEST);
+ }
+}
+
+BOOST_AUTO_TEST_CASE(bleu_count_ngrams) {
+ BleuScorer scorer;
+
+ std::string line = "I saw a girl with a telescope .";
+ // In the above string, we will get the 25 ngrams.
+ //
+ // unigram: "I", "saw", "a", "girl", "with", "telescope", "."
+ // bigram: "I saw", "saw a", "a girl", "girl with", "with a", "a telescope"
+ // "telescope ."
+ // trigram: "I saw a", "saw a girl", "a girl with", "girl with a",
+ // "with a telescope", "a telescope ."
+ // 4-gram: "I saw a girl", "saw a girl with", "a girl with a",
+ // "girl with a telescope", "with a telescope ."
+ NgramCounts counts;
+ BOOST_REQUIRE(scorer.CountNgrams(line, counts, kBleuNgramOrder) == 8);
+ BOOST_CHECK_EQUAL(25, counts.size());
+
+ mert::Vocabulary* vocab = scorer.GetVocab();
+ BOOST_CHECK_EQUAL(7, vocab->size());
+
+ std::vector<std::string> res;
+ Tokenize(line.c_str(), ' ', &res);
+ std::vector<int> ids(res.size());
+ for (std::size_t i = 0; i < res.size(); ++i) {
+ BOOST_CHECK(vocab->Lookup(res[i], &ids[i]));
+ }
+
+ SetNgramCounts(&counts);
+
+ // unigram
+ for (std::size_t i = 0; i < res.size(); ++i) {
+ BOOST_CHECK(CheckUnigram(res[i]));
+ }
+
+ // bigram
+ BOOST_CHECK(CheckBigram("I", "saw"));
+ BOOST_CHECK(CheckBigram("saw", "a"));
+ BOOST_CHECK(CheckBigram("a", "girl"));
+ BOOST_CHECK(CheckBigram("girl", "with"));
+ BOOST_CHECK(CheckBigram("with", "a"));
+ BOOST_CHECK(CheckBigram("a", "telescope"));
+ BOOST_CHECK(CheckBigram("telescope", "."));
+
+ // trigram
+ BOOST_CHECK(CheckTrigram("I", "saw", "a"));
+ BOOST_CHECK(CheckTrigram("saw", "a", "girl"));
+ BOOST_CHECK(CheckTrigram("a", "girl", "with"));
+ BOOST_CHECK(CheckTrigram("girl", "with", "a"));
+ BOOST_CHECK(CheckTrigram("with", "a", "telescope"));
+ BOOST_CHECK(CheckTrigram("a", "telescope", "."));
+
+ // 4-gram
+ BOOST_CHECK(CheckFourgram("I", "saw", "a", "girl"));
+ BOOST_CHECK(CheckFourgram("saw", "a", "girl", "with"));
+ BOOST_CHECK(CheckFourgram("a", "girl", "with", "a"));
+ BOOST_CHECK(CheckFourgram("girl", "with", "a", "telescope"));
+ BOOST_CHECK(CheckFourgram("with", "a", "telescope", "."));
+}
+
+BOOST_AUTO_TEST_CASE(bleu_clipped_counts) {
+ BleuScorer scorer;
+ SetUpReferences(scorer);
+ std::string line("israeli officials responsibility of airport safety");
+ ScoreStats entry;
+ scorer.prepareStats(0, line, entry);
+
+ BOOST_CHECK_EQUAL(entry.size(), 2 * kBleuNgramOrder + 1);
+
+ // Test hypothesis ngram counts
+ BOOST_CHECK_EQUAL(entry.get(0), 5); // unigram
+ BOOST_CHECK_EQUAL(entry.get(2), 2); // bigram
+ BOOST_CHECK_EQUAL(entry.get(4), 0); // trigram
+ BOOST_CHECK_EQUAL(entry.get(6), 0); // fourgram
+
+ // Test reference ngram counts.
+ BOOST_CHECK_EQUAL(entry.get(1), 6); // unigram
+ BOOST_CHECK_EQUAL(entry.get(3), 5); // bigram
+ BOOST_CHECK_EQUAL(entry.get(5), 4); // trigram
+ BOOST_CHECK_EQUAL(entry.get(7), 3); // fourgram
+}
+
+BOOST_AUTO_TEST_CASE(calculate_actual_score) {
+ BOOST_REQUIRE(4 == kBleuNgramOrder);
+ vector<int> stats(2 * kBleuNgramOrder + 1);
+ BleuScorer scorer;
+
+ // unigram
+ stats[0] = 6;
+ stats[1] = 6;
+
+ // bigram
+ stats[2] = 4;
+ stats[3] = 5;
+
+ // trigram
+ stats[4] = 2;
+ stats[5] = 4;
+
+ // fourgram
+ stats[6] = 1;
+ stats[7] = 3;
+
+ // reference-length
+ stats[8] = 7;
+
+ BOOST_CHECK(IsAlmostEqual(0.5115f, scorer.calculateScore(stats)));
+}
+
+BOOST_AUTO_TEST_CASE(sentence_level_bleu) {
+ BOOST_REQUIRE(4 == kBleuNgramOrder);
+ vector<float> stats(2 * kBleuNgramOrder + 1);
+
+ // unigram
+ stats[0] = 6.0;
+ stats[1] = 6.0;
+
+ // bigram
+ stats[2] = 4.0;
+ stats[3] = 5.0;
+
+ // trigram
+ stats[4] = 2.0;
+ stats[5] = 4.0;
+
+ // fourgram
+ stats[6] = 1.0;
+ stats[7] = 3.0;
+
+ // reference-length
+ stats[8] = 7.0;
+
+ BOOST_CHECK(IsAlmostEqual(0.5985f, sentenceLevelBleuPlusOne(stats)));
+}
diff --git a/mert/CderScorer.cpp b/mert/CderScorer.cpp
index ef1f6195f..896017056 100644
--- a/mert/CderScorer.cpp
+++ b/mert/CderScorer.cpp
@@ -1,6 +1,6 @@
#include "CderScorer.h"
-#include <iterator>
+#include <algorithm>
#include <fstream>
#include <stdexcept>
@@ -12,8 +12,9 @@ inline int CalcDistance(int word1, int word2) {
} // namespace
-CderScorer::CderScorer(const string& config)
- : StatisticsBasedScorer("CDER",config) {}
+CderScorer::CderScorer(const string& config, bool allowed_long_jumps)
+ : StatisticsBasedScorer("CDER", config),
+ m_allowed_long_jumps(allowed_long_jumps) {}
CderScorer::~CderScorer() {}
@@ -31,6 +32,7 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
m_ref_sentences.push_back(vector<sent_t>());
string line;
while (getline(refin,line)) {
+ line = this->preprocessSentence(line);
sent_t encoded;
TokenizeAndEncode(line, encoded);
m_ref_sentences[rid].push_back(encoded);
@@ -40,13 +42,11 @@ void CderScorer::setReferenceFiles(const vector<string>& referenceFiles)
void CderScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
- vector<int> stats;
- prepareStatsVector(sid, text, stats);
+ string sentence = this->preprocessSentence(text);
- stringstream sout;
- copy(stats.begin(), stats.end(), ostream_iterator<float>(sout," "));
- string stats_str = sout.str();
- entry.set(stats_str);
+ vector<int> stats;
+ prepareStatsVector(sid, sentence, stats);
+ entry.set(stats);
}
void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>& stats)
@@ -55,9 +55,11 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
TokenizeAndEncode(text, cand);
float max = -2;
+ vector<int> tmp;
for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
- sent_t& ref = m_ref_sentences[rid][sid];
- vector<int> tmp = computeCD(cand, ref);
+ const sent_t& ref = m_ref_sentences[rid][sid];
+ tmp.clear();
+ computeCD(cand, ref, tmp);
if (calculateScore(tmp) > max) {
stats = tmp;
}
@@ -66,16 +68,15 @@ void CderScorer::prepareStatsVector(size_t sid, const string& text, vector<int>&
float CderScorer::calculateScore(const vector<int>& comps) const
{
- if (comps.size() != 2)
- {
+ if (comps.size() != 2) {
throw runtime_error("Size of stat vector for CDER is not 2");
}
-
- return 1 - (comps[0] / static_cast<float>(comps[1]));
+ if (comps[1] == 0) return 1.0f;
+ return 1.0f - (comps[0] / static_cast<float>(comps[1]));
}
-vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
-{
+void CderScorer::computeCD(const sent_t& cand, const sent_t& ref,
+ vector<int>& stats) const {
int I = cand.size() + 1; // Number of inter-words positions in candidate sentence
int L = ref.size() + 1; // Number of inter-words positions in reference sentence
@@ -102,21 +103,22 @@ vector<int> CderScorer::computeCD(const sent_t& cand, const sent_t& ref) const
(*nextRow)[i] = *min_element(possibleCosts.begin(), possibleCosts.end());
}
- // Cost of LongJumps is the same for all in the row
- int LJ = 1 + *min_element(nextRow->begin(), nextRow->end());
+ if (m_allowed_long_jumps) {
+ // Cost of LongJumps is the same for all in the row
+ int LJ = 1 + *min_element(nextRow->begin(), nextRow->end());
- for (int i = 0; i < I; ++i) {
- (*nextRow)[i] = min((*nextRow)[i], LJ); // LongJumps
+ for (int i = 0; i < I; ++i) {
+ (*nextRow)[i] = min((*nextRow)[i], LJ); // LongJumps
+ }
}
delete row;
row = nextRow;
}
- vector<int> stats(2);
+ stats.resize(2);
stats[0] = *(row->rbegin()); // CD distance is the cost of path from (0,0) to (I,L)
stats[1] = ref.size();
delete row;
- return stats;
}
diff --git a/mert/CderScorer.h b/mert/CderScorer.h
index bcc4946dc..dc6714115 100644
--- a/mert/CderScorer.h
+++ b/mert/CderScorer.h
@@ -1,8 +1,6 @@
-#ifndef __CDERSCORER_H__
-#define __CDERSCORER_H__
+#ifndef MERT_CDER_SCORER_H_
+#define MERT_CDER_SCORER_H_
-#include <algorithm>
-#include <iostream>
#include <string>
#include <vector>
#include "Types.h"
@@ -10,10 +8,12 @@
using namespace std;
-class CderScorer: public StatisticsBasedScorer
-{
-public:
- explicit CderScorer(const string& config);
+/**
+ * CderScorer class can compute both CDER and WER metric.
+ */
+class CderScorer: public StatisticsBasedScorer {
+ public:
+ explicit CderScorer(const string& config, bool allowed_long_jumps = true);
~CderScorer();
virtual void setReferenceFiles(const vector<string>& referenceFiles);
@@ -22,21 +22,22 @@ public:
virtual void prepareStatsVector(size_t sid, const string& text, vector<int>& stats);
- virtual size_t NumberOfScores() const {
- return 2;
- }
+ virtual size_t NumberOfScores() const { return 2; }
virtual float calculateScore(const vector<int>& comps) const;
-private:
+ private:
+ bool m_allowed_long_jumps;
+
typedef vector<int> sent_t;
vector<vector<sent_t> > m_ref_sentences;
- vector<int> computeCD(const sent_t& cand, const sent_t& ref) const;
+ void computeCD(const sent_t& cand, const sent_t& ref,
+ vector<int>& stats) const;
// no copying allowed
CderScorer(const CderScorer&);
CderScorer& operator=(const CderScorer&);
};
-#endif // __CDERSCORER_H__
+#endif // MERT_CDER_SCORER_H_
diff --git a/mert/Data.cpp b/mert/Data.cpp
index a4e6c2b24..19a89f754 100644
--- a/mert/Data.cpp
+++ b/mert/Data.cpp
@@ -1,13 +1,12 @@
/*
* Data.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#include <algorithm>
-#include "util/check.hh"
#include <cmath>
#include <fstream>
@@ -16,87 +15,84 @@
#include "Scorer.h"
#include "ScorerFactory.h"
#include "Util.h"
+#include "util/check.hh"
+
+using namespace std;
Data::Data()
- : theScorer(NULL),
- number_of_scores(0),
- _sparse_flag(false),
- scoredata(),
- featdata() {}
-
-Data::Data(Scorer& ptr)
- : theScorer(&ptr),
- score_type(theScorer->getName()),
- number_of_scores(0),
- _sparse_flag(false),
- scoredata(new ScoreData(*theScorer)),
- featdata(new FeatureData)
+ : m_scorer(NULL),
+ m_num_scores(0),
+ m_sparse_flag(false),
+ m_score_data(),
+ m_feature_data() {}
+
+Data::Data(Scorer* scorer)
+ : m_scorer(scorer),
+ m_score_type(m_scorer->getName()),
+ m_num_scores(0),
+ m_sparse_flag(false),
+ m_score_data(new ScoreData(m_scorer)),
+ m_feature_data(new FeatureData)
{
- TRACE_ERR("Data::score_type " << score_type << std::endl);
- TRACE_ERR("Data::Scorer type from Scorer: " << theScorer->getName() << endl);
+ TRACE_ERR("Data::m_score_type " << m_score_type << endl);
+ TRACE_ERR("Data::Scorer type from Scorer: " << m_scorer->getName() << endl);
}
//ADDED BY TS
-void Data::remove_duplicates() {
-
- size_t nSentences = featdata->size();
- assert(scoredata->size() == nSentences);
-
- for (size_t s=0; s < nSentences; s++) {
+// TODO: This is too long; consider creating additional functions to
+// reduce the lines of this function.
+void Data::removeDuplicates() {
+ size_t nSentences = m_feature_data->size();
+ assert(m_score_data->size() == nSentences);
- FeatureArray& feat_array = featdata->get(s);
- ScoreArray& score_array = scoredata->get(s);
+ for (size_t s = 0; s < nSentences; s++) {
+ FeatureArray& feat_array = m_feature_data->get(s);
+ ScoreArray& score_array = m_score_data->get(s);
assert(feat_array.size() == score_array.size());
//serves as a hash-map:
- std::map<double, std::vector<size_t> > lookup;
+ map<double, vector<size_t> > lookup;
size_t end_pos = feat_array.size() - 1;
size_t nRemoved = 0;
- for (size_t k=0; k <= end_pos; k++) {
+ for (size_t k = 0; k <= end_pos; k++) {
const FeatureStats& cur_feats = feat_array.get(k);
-
double sum = 0.0;
- for (size_t l=0; l < cur_feats.size(); l++)
- sum += cur_feats.get(l);
+ for (size_t l = 0; l < cur_feats.size(); l++)
+ sum += cur_feats.get(l);
if (lookup.find(sum) != lookup.end()) {
- //std::cerr << "hit" << std::endl;
-
- std::vector<size_t>& cur_list = lookup[sum];
-
- size_t l=0;
- for (l=0; l < cur_list.size(); l++) {
-
- size_t j=cur_list[l];
-
- if (cur_feats == feat_array.get(j)
- && score_array.get(k) == score_array.get(j)) {
-
- if (k < end_pos) {
-
- feat_array.swap(k,end_pos);
- score_array.swap(k,end_pos);
-
- k--;
- }
-
- end_pos--;
- nRemoved++;
- break;
- }
- }
-
- if (l == lookup[sum].size())
- cur_list.push_back(k);
+ //cerr << "hit" << endl;
+ vector<size_t>& cur_list = lookup[sum];
+
+ // TODO: Make sure this is correct, because 'l' has already been used as
+ // the loop index above. If reusing the name does not affect the removal
+ // of duplicates, it would still be better to rename this variable.
+ size_t l = 0;
+ for (l = 0; l < cur_list.size(); l++) {
+ size_t j = cur_list[l];
+
+ if (cur_feats == feat_array.get(j)
+ && score_array.get(k) == score_array.get(j)) {
+ if (k < end_pos) {
+ feat_array.swap(k,end_pos);
+ score_array.swap(k,end_pos);
+ k--;
+ }
+ end_pos--;
+ nRemoved++;
+ break;
+ }
+ }
+ if (l == lookup[sum].size())
+ cur_list.push_back(k);
+ } else {
+ lookup[sum].push_back(k);
}
- else
- lookup[sum].push_back(k);
-
// for (size_t j=0; j < k; j++) {
// if (feat_array.get(k) == feat_array.get(j)
@@ -115,11 +111,9 @@ void Data::remove_duplicates() {
// break;
// }
// }
- }
-
+ } // end for k
if (nRemoved > 0) {
-
feat_array.resize(end_pos+1);
score_array.resize(end_pos+1);
}
@@ -127,124 +121,131 @@ void Data::remove_duplicates() {
}
//END_ADDED
+// Loads the feature file and the score file into the two data holders, then
+// raises the sparse flag if the loaded feature data reports sparse features.
+void Data::load(const std::string &featfile, const std::string &scorefile) {
+  m_feature_data->load(featfile);
+  m_score_data->load(scorefile);
+
+  // The flag is only ever raised here; it is never cleared by a load.
+  const bool loaded_sparse = m_feature_data->hasSparseFeatures();
+  if (loaded_sparse) {
+    m_sparse_flag = true;
+  }
+}
-void Data::loadnbest(const std::string &file)
+void Data::loadNBest(const string &file)
{
- TRACE_ERR("loading nbest from " << file << std::endl);
-
- FeatureStats featentry;
- ScoreStats scoreentry;
- std::string sentence_index;
-
+ TRACE_ERR("loading nbest from " << file << endl);
inputfilestream inp(file); // matches a stream with a file. Opens the file
-
if (!inp.good())
throw runtime_error("Unable to open: " + file);
- std::string substring, subsubstring, stringBuf;
- std::string theSentence;
- std::string::size_type loc;
-
- while (getline(inp,stringBuf,'\n')) {
- if (stringBuf.empty()) continue;
-
-// TRACE_ERR("stringBuf: " << stringBuf << std::endl);
-
- getNextPound(stringBuf, substring, "|||"); //first field
- sentence_index = substring;
-
- getNextPound(stringBuf, substring, "|||"); //second field
- theSentence = substring;
+ ScoreStats scoreentry;
+ string line, sentence_index, sentence, feature_str;
+ while (getline(inp, line, '\n')) {
+ if (line.empty()) continue;
// adding statistics for error measures
- featentry.reset();
scoreentry.clear();
- theScorer->prepareStats(sentence_index, theSentence, scoreentry);
-
- scoredata->add(scoreentry, sentence_index);
+ getNextPound(line, sentence_index, "|||"); // first field
+ getNextPound(line, sentence, "|||"); // second field
+ getNextPound(line, feature_str, "|||"); // third field
- getNextPound(stringBuf, substring, "|||"); //third field
+ m_scorer->prepareStats(sentence_index, sentence, scoreentry);
+ m_score_data->add(scoreentry, sentence_index);
// examine first line for name of features
if (!existsFeatureNames()) {
- std::string stringsupport=substring;
- std::string features="";
- std::string tmpname="";
-
- size_t tmpidx=0;
- while (!stringsupport.empty()) {
- // TRACE_ERR("Decompounding: " << substring << std::endl);
- getNextPound(stringsupport, subsubstring);
-
- // string ending with ":" are skipped, because they are the names of the features
- if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
- features+=tmpname+"_"+stringify(tmpidx)+" ";
- tmpidx++;
- }
- // ignore sparse feature name
- else if (subsubstring.find("_") != string::npos) {
- // also ignore its value
- getNextPound(stringsupport, subsubstring);
- }
- // update current feature name
- else {
- tmpidx=0;
- tmpname=subsubstring.substr(0,subsubstring.size() - 1);
- }
- }
+ InitFeatureMap(feature_str);
+ }
+ AddFeatures(feature_str, sentence_index);
+ }
+ inp.close();
+}
- featdata->setFeatureMap(features);
+// Writes the feature data to featfile and the score data to scorefile,
+// after reporting on stderr whether binary mode was requested.
+void Data::save(const std::string &featfile, const std::string &scorefile, bool bin) {
+  const char* const mode_message = bin ? "Binary write mode is selected"
+                                       : "Binary write mode is NOT selected";
+  cerr << mode_message << endl;
+
+  m_feature_data->save(featfile, bin);
+  m_score_data->save(scorefile, bin);
+}
+
+void Data::InitFeatureMap(const string& str) {
+ string buf = str;
+ string substr;
+ string features = "";
+ string tmp_name = "";
+ size_t tmp_index = 0;
+
+ while (!buf.empty()) {
+ getNextPound(buf, substr);
+
+ // string ending with ":" are skipped, because they are the names of the features
+ if (!EndsWith(substr, ":")) {
+ stringstream ss;
+ ss << tmp_name << "_" << tmp_index << " ";
+ features.append(ss.str());
+
+ tmp_index++;
+ } else if (substr.find("_") != string::npos) {
+ // ignore sparse feature name and its value
+ getNextPound(buf, substr);
+ } else { // update current feature name
+ tmp_index = 0;
+ tmp_name = substr.substr(0, substr.size() - 1);
}
+ }
+ m_feature_data->setFeatureMap(features);
+}
- // adding features
- while (!substring.empty()) {
-// TRACE_ERR("Decompounding: " << substring << std::endl);
- getNextPound(substring, subsubstring);
+void Data::AddFeatures(const string& str,
+ const string& sentence_index) {
+ string buf = str;
+ string substr;
+ FeatureStats feature_entry;
+ feature_entry.reset();
- // no ':' -> feature value that needs to be stored
- if ((loc = subsubstring.find_last_of(":")) != subsubstring.length()-1) {
- featentry.add(ConvertStringToFeatureStatsType(subsubstring));
- }
+ while (!buf.empty()) {
+ getNextPound(buf, substr);
+
+ // no ':' -> feature value that needs to be stored
+ if (!EndsWith(substr, ":")) {
+ feature_entry.add(ConvertStringToFeatureStatsType(substr));
+ } else if (substr.find("_") != string::npos) {
// sparse feature name? store as well
- else if (subsubstring.find("_") != string::npos) {
- std::string name = subsubstring;
- getNextPound(substring, subsubstring);
- featentry.addSparse( name, atof(subsubstring.c_str()) );
- _sparse_flag = true;
- }
+ string name = substr;
+ getNextPound(buf, substr);
+ feature_entry.addSparse(name, atof(substr.c_str()));
+ m_sparse_flag = true;
}
- //cerr << "number of sparse features: " << featentry.getSparse().size() << endl;
- featdata->add(featentry,sentence_index);
}
-
- inp.close();
+ m_feature_data->add(feature_entry, sentence_index);
}
// TODO
void Data::mergeSparseFeatures() {
- std::cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
+ cerr << "ERROR: sparse features can only be trained with pairwise ranked optimizer (PRO), not traditional MERT\n";
exit(1);
}
void Data::createShards(size_t shard_count, float shard_size, const string& scorerconfig,
- std::vector<Data>& shards)
+ vector<Data>& shards)
{
CHECK(shard_count);
CHECK(shard_size >= 0);
CHECK(shard_size <= 1);
- size_t data_size = scoredata->size();
- CHECK(data_size == featdata->size());
+ size_t data_size = m_score_data->size();
+ CHECK(data_size == m_feature_data->size());
shard_size *= data_size;
+ const float coeff = static_cast<float>(data_size) / shard_count;
for (size_t shard_id = 0; shard_id < shard_count; ++shard_id) {
vector<size_t> shard_contents;
if (shard_size == 0) {
//split into roughly equal size shards
- const size_t shard_start = floor(0.5 + shard_id * static_cast<float>(data_size) / shard_count);
- const size_t shard_end = floor(0.5 + (shard_id + 1) * static_cast<float>(data_size) / shard_count);
+ const size_t shard_start = floor(0.5 + shard_id * coeff);
+ const size_t shard_end = floor(0.5 + (shard_id + 1) * coeff);
for (size_t i = shard_start; i < shard_end; ++i) {
shard_contents.push_back(i);
}
@@ -255,15 +256,15 @@ void Data::createShards(size_t shard_count, float shard_size, const string& scor
}
}
- Scorer* scorer = ScorerFactory::getScorer(score_type, scorerconfig);
+ Scorer* scorer = ScorerFactory::getScorer(m_score_type, scorerconfig);
- shards.push_back(Data(*scorer));
- shards.back().score_type = score_type;
- shards.back().number_of_scores = number_of_scores;
- shards.back()._sparse_flag = _sparse_flag;
+ shards.push_back(Data(scorer));
+ shards.back().m_score_type = m_score_type;
+ shards.back().m_num_scores = m_num_scores;
+ shards.back().m_sparse_flag = m_sparse_flag;
for (size_t i = 0; i < shard_contents.size(); ++i) {
- shards.back().featdata->add(featdata->get(shard_contents[i]));
- shards.back().scoredata->add(scoredata->get(shard_contents[i]));
+ shards.back().m_feature_data->add(m_feature_data->get(shard_contents[i]));
+ shards.back().m_score_data->add(m_score_data->get(shard_contents[i]));
}
//cerr << endl;
}
diff --git a/mert/Data.h b/mert/Data.h
index 171c6db41..37d4b5473 100644
--- a/mert/Data.h
+++ b/mert/Data.h
@@ -1,21 +1,16 @@
/*
* Data.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef DATA_H
-#define DATA_H
+#ifndef MERT_DATA_H_
+#define MERT_DATA_H_
-using namespace std;
-
-#include <limits>
#include <vector>
-#include <iostream>
-
-#include<boost/shared_ptr.hpp>
+#include <boost/shared_ptr.hpp>
#include "Util.h"
#include "FeatureData.h"
@@ -26,90 +21,65 @@ class Scorer;
typedef boost::shared_ptr<ScoreData> ScoreDataHandle;
typedef boost::shared_ptr<FeatureData> FeatureDataHandle;
+// NOTE: there is no copy constructor implemented, so only the
+// compiler synthesised shallow copy is available.
class Data
{
private:
- Scorer* theScorer;
- std::string score_type;
- size_t number_of_scores;
- bool _sparse_flag;
-
-protected:
- ScoreDataHandle scoredata;
- FeatureDataHandle featdata;
+ Scorer* m_scorer;
+ std::string m_score_type;
+ std::size_t m_num_scores;
+ bool m_sparse_flag;
+ ScoreDataHandle m_score_data;
+ FeatureDataHandle m_feature_data;
public:
- explicit Data(Scorer& sc);
+ explicit Data(Scorer* scorer);
Data();
- //Note that there is no copy constructor implemented, so only the
- //compiler synthesised shallow copy is available
-
- inline void clear() {
- scoredata->clear();
- featdata->clear();
+ void clear() {
+ m_score_data->clear();
+ m_feature_data->clear();
}
- ScoreDataHandle getScoreData() {
- return scoredata;
- }
+ ScoreDataHandle getScoreData() { return m_score_data; }
- FeatureDataHandle getFeatureData() {
- return featdata;
- }
+ FeatureDataHandle getFeatureData() { return m_feature_data; }
- Scorer* getScorer() {
- return theScorer;
- }
+ Scorer* getScorer() { return m_scorer; }
- inline size_t NumberOfFeatures() const {
- return featdata->NumberOfFeatures();
- }
- inline void NumberOfFeatures(size_t v) {
- featdata->NumberOfFeatures(v);
- }
- inline std::string Features() const {
- return featdata->Features();
- }
- inline void Features(const std::string &f) {
- featdata->Features(f);
+ std::size_t NumberOfFeatures() const {
+ return m_feature_data->NumberOfFeatures();
}
- inline bool hasSparseFeatures() const { return _sparse_flag; }
- void mergeSparseFeatures();
+ void NumberOfFeatures(std::size_t v) { m_feature_data->NumberOfFeatures(v); }
- void loadnbest(const std::string &file);
-
- void load(const std::string &featfile,const std::string &scorefile) {
- featdata->load(featfile);
- scoredata->load(scorefile);
- if (featdata->hasSparseFeatures())
- _sparse_flag = true;
- }
+ std::string Features() const { return m_feature_data->Features(); }
+ void Features(const std::string &f) { m_feature_data->Features(f); }
- //ADDED BY TS
- void remove_duplicates();
- //END_ADDED
+ bool hasSparseFeatures() const { return m_sparse_flag; }
+ void mergeSparseFeatures();
- void save(const std::string &featfile,const std::string &scorefile, bool bin=false) {
+ void loadNBest(const std::string &file);
- if (bin) cerr << "Binary write mode is selected" << endl;
- else cerr << "Binary write mode is NOT selected" << endl;
+ void load(const std::string &featfile, const std::string &scorefile);
- featdata->save(featfile, bin);
- scoredata->save(scorefile, bin);
- }
+ void save(const std::string &featfile, const std::string &scorefile, bool bin=false);
+
+ //ADDED BY TS
+ void removeDuplicates();
+ //END_ADDED
inline bool existsFeatureNames() const {
- return featdata->existsFeatureNames();
+ return m_feature_data->existsFeatureNames();
}
- inline std::string getFeatureName(size_t idx) const {
- return featdata->getFeatureName(idx);
+ inline std::string getFeatureName(std::size_t idx) const {
+ return m_feature_data->getFeatureName(idx);
}
- inline size_t getFeatureIndex(const std::string& name) const {
- return featdata->getFeatureIndex(name);
+ inline std::size_t getFeatureIndex(const std::string& name) const {
+ return m_feature_data->getFeatureIndex(name);
}
/**
@@ -118,8 +88,13 @@ public:
* the data (with replacement) and shard_size is interpreted as the proportion
* of the total size.
*/
- void createShards(size_t shard_count, float shard_size, const std::string& scorerconfig,
+ void createShards(std::size_t shard_count, float shard_size, const std::string& scorerconfig,
std::vector<Data>& shards);
+
+ // Helper functions for loadNBest().
+ void InitFeatureMap(const std::string& str);
+ void AddFeatures(const std::string& str,
+ const std::string& sentence_index);
};
-#endif // DATA_H
+#endif // MERT_DATA_H_
diff --git a/mert/DataTest.cpp b/mert/DataTest.cpp
index 0f02d64a0..43cf9bb24 100644
--- a/mert/DataTest.cpp
+++ b/mert/DataTest.cpp
@@ -5,12 +5,12 @@
#define BOOST_TEST_MODULE MertData
#include <boost/test/unit_test.hpp>
-
+#include <boost/scoped_ptr.hpp>
//very basic test of sharding
BOOST_AUTO_TEST_CASE(shard_basic) {
- Scorer* scorer = ScorerFactory::getScorer("BLEU", "");
- Data data(*scorer);
+ boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
+ Data data(scorer.get());
FeatureArray fa1, fa2, fa3, fa4;
ScoreArray sa1, sa2, sa3, sa4;
fa1.setIndex("1");
@@ -36,3 +36,13 @@ BOOST_AUTO_TEST_CASE(shard_basic) {
BOOST_CHECK_EQUAL(shards.size(),2);
BOOST_CHECK_EQUAL(shards[1].getFeatureData()->size(),2);
}
+
+// Checks that InitFeatureMap() names every dense value "<feature>_<position>",
+// restarting the position at 0 for each "<feature>:" label in the input.
+BOOST_AUTO_TEST_CASE(init_feature_map_test) {
+  // scoped_ptr releases the scorer when the test case ends.
+  boost::scoped_ptr<Scorer> scorer(ScorerFactory::getScorer("BLEU", ""));
+  Data data(scorer.get());
+
+  const std::string feature_field = " d: 0 -7.66174 0 0 -3.51621 0 0 lm: -41.3435 -40.3647 tm: -67.6349 -100.438 -27.6817 -23.4685 8.99907 w: -9 ";
+  const std::string expected_names = "d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ";
+
+  data.InitFeatureMap(feature_field);
+  BOOST_CHECK_EQUAL(expected_names, data.Features());
+}
diff --git a/mert/Fdstream.h b/mert/Fdstream.h
new file mode 100644
index 000000000..92ccd355f
--- /dev/null
+++ b/mert/Fdstream.h
@@ -0,0 +1,167 @@
+/*
+ * These classes create C++-style streams from a file descriptor.
+ */
+
+#ifndef _FDSTREAM_
+#define _FDSTREAM_
+
+#include <iostream>
+#if defined(__GLIBCXX__) || defined(__GLIBCPP__)
+#include <ext/stdio_filebuf.h>
+
+#define BUFFER_SIZE (1024)
+
+using namespace std;
+
+// Common base of ifdstream and ofdstream: owns the libstdc++ stdio_filebuf
+// that wraps a POSIX file descriptor.
+//
+// Ownership: the descriptor constructor of stdio_filebuf takes the descriptor
+// over and closes it when the buffer is closed or destroyed, so this class
+// must not close the descriptor itself.
+class _fdstream
+{
+protected:
+  // Creates an unattached object; call open() before use.
+  _fdstream() :
+    _file_descriptor(-1), _filebuf(NULL), _openmode(ios_base::openmode())
+  { }
+
+  // Wraps an already-open descriptor with the given open mode.
+  _fdstream(int file_descriptor, ios_base::openmode openmode) :
+    _file_descriptor(-1), _filebuf(NULL), _openmode(ios_base::openmode())
+  {
+    open(file_descriptor, openmode);
+  }
+
+  // Mode recorded by the last successful open().
+  ios_base::openmode openmode() const { return _openmode; }
+
+  // Attaches the descriptor. Does nothing if a buffer already exists.
+  void open(int file_descriptor, ios_base::openmode openmode)
+  {
+    if (_filebuf)
+      return;
+
+    // We create a C++ stream from a file descriptor.
+    // stdio_filebuf is not synced with stdio.
+    // From GCC 3.4.0 on there is also stdio_sync_filebuf.
+    // The filebuf can also be created from a FILE* obtained with
+    // FILE* f = fdopen(file_descriptor, mode);
+    _filebuf = new __gnu_cxx::stdio_filebuf<char> (file_descriptor,
+                                                   openmode);
+
+    // Record what was attached; previously an object that was default
+    // constructed and then opened kept -1 and an uninitialised mode.
+    _file_descriptor = file_descriptor;
+    _openmode = openmode;
+  }
+
+  ~_fdstream()
+  {
+    // Destroying the buffer flushes pending output and closes the descriptor.
+    // Calling close() on the descriptor first would make that flush fail and
+    // close the descriptor a second time.
+    delete _filebuf;
+    _filebuf = NULL;
+  }
+
+  int _file_descriptor;
+  __gnu_cxx::stdio_filebuf<char>* _filebuf;
+  ios_base::openmode _openmode;
+
+private:
+  // no copying allowed: a copy would delete _filebuf a second time
+  _fdstream(const _fdstream&);
+  _fdstream& operator=(const _fdstream&);
+};
+
+// Input stream that reads from a POSIX file descriptor.
+//
+// Every read operation is a no-op on an object that has not been opened.
+class ifdstream : public _fdstream
+{
+public:
+  // Creates an unopened stream; call open() before reading.
+  ifdstream() :
+    _fdstream(), _stream(NULL)
+  { }
+
+  // Wraps an already-open descriptor for reading.
+  ifdstream(int file_descriptor) :
+    _fdstream(file_descriptor, ios_base::in), _stream(NULL)
+  {
+    _stream = new istream (_filebuf);
+  }
+
+  // Attaches the descriptor. Does nothing if a stream already exists.
+  void open(int file_descriptor)
+  {
+    if (!_stream)
+    {
+      _fdstream::open(file_descriptor, ios_base::in);
+      _stream = new istream (_filebuf);
+    }
+  }
+
+  // Extracts one whitespace-delimited token into str.
+  ifdstream& operator>> (string& str)
+  {
+    if (_stream)
+      (*_stream) >> str;
+
+    return *this;
+  }
+
+  // Reads one line of any length into str, without the trailing '\n'.
+  // Returns the number of characters stored. Lines are no longer cut off
+  // at BUFFER_SIZE - 1 characters.
+  size_t getline(string& str)
+  {
+    str.clear();
+    char c;
+    while (next_char('\n', c))
+      str += c;
+    return str.size();
+  }
+
+  // Same as getline(s, n, '\n').
+  size_t getline (char* s, streamsize n)
+  {
+    return (getline(s, n, '\n'));
+  }
+
+  // Reads characters into s until delim, a NUL, the end of input, or until
+  // n - 1 characters have been stored; s is always NUL-terminated when n > 0.
+  // Returns the number of characters stored (terminator excluded).
+  // A full buffer no longer drops a character: reading stops before the next
+  // character is consumed, so the following call continues where this one
+  // stopped.
+  size_t getline (char* s, streamsize n, char delim)
+  {
+    if (!s || n <= 0)
+      return 0;  // no room even for the terminator
+
+    size_t count = 0;
+    char c;
+    while (static_cast<streamsize>(count) < n - 1 && next_char(delim, c))
+      s[count++] = c;
+
+    s[count] = '\0';
+    return count;
+  }
+
+  ~ifdstream()
+  {
+    // Only the istream is released here; the base class destructor runs
+    // afterwards and releases the buffer.
+    delete _stream;
+  }
+
+private:
+  // Fetches the next character into c. Returns false at the end of input,
+  // on an unopened stream, or when the character read is delim or a NUL
+  // (the terminating character is consumed).
+  bool next_char(char delim, char& c)
+  {
+    if (!_stream)
+      return false;
+
+    const istream::int_type got = _stream->get();
+    // get() signals end of input with eof(), which is not a valid character.
+    if (got == istream::traits_type::eof())
+      return false;
+
+    c = istream::traits_type::to_char_type(got);
+    return c != delim && c != '\0';
+  }
+
+  // no copying allowed: a copy would delete _stream a second time
+  ifdstream(const ifdstream&);
+  ifdstream& operator=(const ifdstream&);
+
+  istream* _stream;
+};
+
+// Output stream that writes to a POSIX file descriptor.
+class ofdstream : public _fdstream
+{
+public:
+  // Creates an unopened stream; call open() before writing.
+  ofdstream() :
+    _fdstream(), _stream(NULL)
+  { }
+
+  // Wraps an already-open descriptor for writing.
+  ofdstream(int file_descriptor) :
+    _fdstream(file_descriptor, ios_base::out), _stream(NULL)
+  {
+    _stream = new ostream (_filebuf);
+  }
+
+  // Attaches the descriptor. Does nothing if a stream already exists.
+  void open(int file_descriptor)
+  {
+    if (!_stream)
+    {
+      _fdstream::open(file_descriptor, ios_base::out);
+      _stream = new ostream (_filebuf);
+    }
+  }
+
+  // Writes str if the stream is in a good state, then flushes.
+  // Does nothing on a stream that was never opened (previously this
+  // dereferenced a null pointer).
+  ofdstream& operator<< (const string& str)
+  {
+    if (!_stream)
+      return *this;
+
+    if (_stream->good())
+      (*_stream) << str;
+
+    _stream->flush();
+    return *this;
+  }
+
+  ~ofdstream()
+  {
+    // Only the ostream is released here; the base class destructor runs
+    // afterwards and releases the buffer.
+    delete _stream;
+  }
+
+private:
+  // no copying allowed: a copy would delete _stream a second time
+  ofdstream(const ofdstream&);
+  ofdstream& operator=(const ofdstream&);
+
+  ostream* _stream;
+};
+
+#else
+#error "Not supported"
+#endif
+
+#endif // _FDSTREAM_
diff --git a/mert/FeatureArray.cpp b/mert/FeatureArray.cpp
index 854bcef79..62f9ceda5 100644
--- a/mert/FeatureArray.cpp
+++ b/mert/FeatureArray.cpp
@@ -1,140 +1,152 @@
/*
* FeatureArray.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
+#include <fstream>
#include "FeatureArray.h"
#include "FileStream.h"
#include "Util.h"
-
FeatureArray::FeatureArray()
- : idx(""), number_of_features(0), _sparse_flag(false) {}
+ : m_index(""), m_num_features(0), m_sparse_flag(false) {}
FeatureArray::~FeatureArray() {}
-void FeatureArray::savetxt(std::ofstream& outFile)
+void FeatureArray::savetxt(ostream* os)
{
- outFile << FEATURES_TXT_BEGIN << " " << idx << " " << array_.size()
- << " " << number_of_features << " " << features << std::endl;
- for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
- i->savetxt(outFile);
- outFile << std::endl;
+ *os << FEATURES_TXT_BEGIN << " " << m_index << " " << m_array.size()
+ << " " << m_num_features << " " << m_features << endl;
+ for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
+ i->savetxt(os);
+ *os << endl;
}
- outFile << FEATURES_TXT_END << std::endl;
+ *os << FEATURES_TXT_END << endl;
}
-void FeatureArray::savebin(std::ofstream& outFile)
+void FeatureArray::savebin(ostream* os)
{
- outFile << FEATURES_BIN_BEGIN << " " << idx << " " << array_.size()
- << " " << number_of_features << " " << features << std::endl;
- for (featarray_t::iterator i = array_.begin(); i !=array_.end(); i++)
- i->savebin(outFile);
+ *os << FEATURES_BIN_BEGIN << " " << m_index << " " << m_array.size()
+ << " " << m_num_features << " " << m_features << endl;
+ for (featarray_t::iterator i = m_array.begin(); i != m_array.end(); ++i)
+ i->savebin(os);
- outFile << FEATURES_BIN_END << std::endl;
+ *os << FEATURES_BIN_END << endl;
}
-void FeatureArray::save(std::ofstream& inFile, bool bin)
+void FeatureArray::save(ostream* os, bool bin)
{
- if (size()>0)
- (bin)?savebin(inFile):savetxt(inFile);
+ if (size() <= 0) return;
+ if (bin) {
+ savebin(os);
+ } else {
+ savetxt(os);
+ }
}
-void FeatureArray::save(const std::string &file, bool bin)
+void FeatureArray::save(const string &file, bool bin)
{
-
- std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
-
- save(outFile);
-
- outFile.close();
+ ofstream ofs(file.c_str(), ios::out);
+ if (!ofs) {
+ cerr << "Failed to open " << file << endl;
+ exit(1);
+ }
+ ostream *os = &ofs;
+ save(os, bin);
+ ofs.close();
}
-void FeatureArray::loadbin(ifstream& inFile, size_t n)
+void FeatureArray::save(bool bin)
{
- FeatureStats entry(number_of_features);
+ save(&cout, bin);
+}
- for (size_t i=0 ; i < n; i++) {
- entry.loadbin(inFile);
+void FeatureArray::loadbin(istream* is, size_t n)
+{
+ FeatureStats entry(m_num_features);
+ for (size_t i = 0 ; i < n; i++) {
+ entry.loadbin(is);
add(entry);
}
}
-void FeatureArray::loadtxt(ifstream& inFile, size_t n)
+void FeatureArray::loadtxt(istream* is, size_t n)
{
- FeatureStats entry(number_of_features);
+ FeatureStats entry(m_num_features);
- for (size_t i=0 ; i < n; i++) {
- entry.loadtxt(inFile);
+ for (size_t i = 0; i < n; i++) {
+ entry.loadtxt(is);
add(entry);
if (entry.getSparse().size()>0)
- _sparse_flag = true;
+ m_sparse_flag = true;
}
}
-void FeatureArray::load(ifstream& inFile)
+void FeatureArray::load(istream* is)
{
- size_t number_of_entries=0;
- bool binmode=false;
+ size_t number_of_entries = 0;
+ bool binmode = false;
- std::string substring, stringBuf;
- std::string::size_type loc;
+ string substring, stringBuf;
+ string::size_type loc;
- std::getline(inFile, stringBuf);
- if (!inFile.good()) {
+ getline(*is, stringBuf);
+ if (!is->good()) {
return;
}
if (!stringBuf.empty()) {
if ((loc = stringBuf.find(FEATURES_TXT_BEGIN)) == 0) {
- binmode=false;
+ binmode = false;
} else if ((loc = stringBuf.find(FEATURES_BIN_BEGIN)) == 0) {
- binmode=true;
+ binmode = true;
} else {
TRACE_ERR("ERROR: FeatureArray::load(): Wrong header");
return;
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
- idx = substring;
+ m_index = substring;
getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring);
- number_of_features = atoi(substring.c_str());
- features = stringBuf;
+ m_num_features = atoi(substring.c_str());
+ m_features = stringBuf;
}
- (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
+ if (binmode) {
+ loadbin(is, number_of_entries);
+ } else {
+ loadtxt(is, number_of_entries);
+ }
- std::getline(inFile, stringBuf);
+ getline(*is, stringBuf);
if (!stringBuf.empty()) {
- if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 && (loc = stringBuf.find(FEATURES_BIN_END)) != 0) {
+ if ((loc = stringBuf.find(FEATURES_TXT_END)) != 0 &&
+ (loc = stringBuf.find(FEATURES_BIN_END)) != 0) {
TRACE_ERR("ERROR: FeatureArray::load(): Wrong footer");
return;
}
}
}
-void FeatureArray::load(const std::string &file)
+void FeatureArray::load(const string &file)
{
- TRACE_ERR("loading data from " << file << std::endl);
-
- inputfilestream inFile(file); // matches a stream with a file. Opens the file
-
- load((ifstream&) inFile);
-
- inFile.close();
-
+ TRACE_ERR("loading data from " << file << endl);
+ inputfilestream input_stream(file); // matches a stream with a file. Opens the file
+ istream* is = &input_stream;
+ load(is);
+ input_stream.close();
}
void FeatureArray::merge(FeatureArray& e)
{
//dummy implementation
- for (size_t i=0; i<e.size(); i++)
+ for (size_t i = 0; i < e.size(); i++)
add(e.get(i));
}
@@ -144,10 +156,9 @@ bool FeatureArray::check_consistency() const
if (sz == 0)
return true;
- for (featarray_t::const_iterator i = array_.begin(); i != array_.end(); i++) {
+ for (featarray_t::const_iterator i = m_array.begin(); i != m_array.end(); i++) {
if (i->size() != sz)
return false;
}
return true;
}
-
diff --git a/mert/FeatureArray.h b/mert/FeatureArray.h
index ee8ee1354..25ebbe866 100644
--- a/mert/FeatureArray.h
+++ b/mert/FeatureArray.h
@@ -1,17 +1,16 @@
/*
* FeatureArray.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef FEATURE_ARRAY_H
-#define FEATURE_ARRAY_H
+#ifndef MERT_FEATURE_ARRAY_H_
+#define MERT_FEATURE_ARRAY_H_
#include <vector>
#include <iostream>
-#include <fstream>
#include "FeatureStats.h"
using namespace std;
@@ -26,85 +25,60 @@ class FeatureArray
private:
// idx to identify the utterance. It can differ from
// the index inside the vector.
- std::string idx;
-
-protected:
- featarray_t array_;
- size_t number_of_features;
- std::string features;
- bool _sparse_flag;
+ std::string m_index;
+ featarray_t m_array;
+ size_t m_num_features;
+ std::string m_features;
+ bool m_sparse_flag;
public:
FeatureArray();
~FeatureArray();
- inline void clear() {
- array_.clear();
- }
+ void clear() { m_array.clear(); }
- inline bool hasSparseFeatures() const {
- return _sparse_flag;
- }
+ bool hasSparseFeatures() const { return m_sparse_flag; }
- inline std::string getIndex() const {
- return idx;
- }
- inline void setIndex(const std::string& value) {
- idx = value;
- }
+ std::string getIndex() const { return m_index; }
+ void setIndex(const std::string& value) { m_index = value; }
- inline FeatureStats& get(size_t i) {
- return array_.at(i);
- }
- inline const FeatureStats& get(size_t i)const {
- return array_.at(i);
- }
- void add(FeatureStats& e) {
- array_.push_back(e);
- }
+ FeatureStats& get(size_t i) { return m_array.at(i); }
+ const FeatureStats& get(size_t i) const { return m_array.at(i); }
+
+ void add(FeatureStats& e) { m_array.push_back(e); }
//ADDED BY TS
void swap(size_t i, size_t j) {
- std::swap(array_[i],array_[j]);
+ std::swap(m_array[i], m_array[j]);
}
-
+
void resize(size_t new_size) {
- array_.resize(std::min(new_size,array_.size()));
+ m_array.resize(std::min(new_size, m_array.size()));
}
//END_ADDED
void merge(FeatureArray& e);
- inline size_t size() const {
- return array_.size();
- }
- inline size_t NumberOfFeatures() const {
- return number_of_features;
- }
- inline void NumberOfFeatures(size_t v) {
- number_of_features = v;
- }
- inline std::string Features() const {
- return features;
- }
- inline void Features(const std::string& f) {
- features = f;
- }
+ size_t size() const { return m_array.size(); }
+
+ size_t NumberOfFeatures() const { return m_num_features; }
+ void NumberOfFeatures(size_t v) { m_num_features = v; }
- void savetxt(ofstream& outFile);
- void savebin(ofstream& outFile);
- void save(ofstream& outFile, bool bin=false);
+ std::string Features() const { return m_features; }
+ void Features(const std::string& f) { m_features = f; }
+
+ void savetxt(std::ostream* os);
+ void savebin(std::ostream* os);
+ void save(std::ostream* os, bool bin=false);
void save(const std::string &file, bool bin=false);
- inline void save(bool bin=false) {
- save("/dev/stdout",bin);
- }
+ void save(bool bin=false);
- void loadtxt(ifstream& inFile, size_t n);
- void loadbin(ifstream& inFile, size_t n);
- void load(ifstream& inFile);
+ void loadtxt(std::istream* is, size_t n);
+ void loadbin(std::istream* is, size_t n);
+ void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;
};
-#endif // FEATURE_ARRAY_H
+#endif // MERT_FEATURE_ARRAY_H_
diff --git a/mert/FeatureData.cpp b/mert/FeatureData.cpp
index ed76bca3b..acc144d1a 100644
--- a/mert/FeatureData.cpp
+++ b/mert/FeatureData.cpp
@@ -1,6 +1,6 @@
/*
* FeatureData.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
@@ -12,44 +12,47 @@
#include "FileStream.h"
#include "Util.h"
-static const float MIN_FLOAT=-1.0*numeric_limits<float>::max();
-static const float MAX_FLOAT=numeric_limits<float>::max();
+using namespace std;
+
+static const float MIN_FLOAT = -1.0 * numeric_limits<float>::max();
+static const float MAX_FLOAT = numeric_limits<float>::max();
FeatureData::FeatureData()
- : number_of_features(0),
- _sparse_flag(false) {}
+ : m_num_features(0),
+ m_sparse_flag(false) {}
-void FeatureData::save(std::ofstream& outFile, bool bin)
+void FeatureData::save(ostream* os, bool bin)
{
- for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++)
- i->save(outFile, bin);
+ for (featdata_t::iterator i = m_array.begin(); i != m_array.end(); i++)
+ i->save(os, bin);
}
-void FeatureData::save(const std::string &file, bool bin)
+void FeatureData::save(const string &file, bool bin)
{
if (file.empty()) return;
+ TRACE_ERR("saving the array into " << file << endl);
+ ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file
+ ostream* os = &ofs;
+ save(os, bin);
+ ofs.close();
+}
- TRACE_ERR("saving the array into " << file << std::endl);
-
- std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
-
- save(outFile, bin);
-
- outFile.close();
+void FeatureData::save(bool bin) {
+ save(&cout, bin);
}
-void FeatureData::load(ifstream& inFile)
+void FeatureData::load(istream* is)
{
FeatureArray entry;
- while (!inFile.eof()) {
+ while (!is->eof()) {
- if (!inFile.good()) {
- std::cerr << "ERROR FeatureData::load inFile.good()" << std::endl;
+ if (!is->good()) {
+ cerr << "ERROR FeatureData::load inFile.good()" << endl;
}
entry.clear();
- entry.load(inFile);
+ entry.load(is);
if (entry.size() == 0)
break;
@@ -58,26 +61,23 @@ void FeatureData::load(ifstream& inFile)
setFeatureMap(entry.Features());
if (entry.hasSparseFeatures())
- _sparse_flag = true;
+ m_sparse_flag = true;
add(entry);
}
}
-void FeatureData::load(const std::string &file)
+void FeatureData::load(const string &file)
{
- TRACE_ERR("loading feature data from " << file << std::endl);
-
- inputfilestream inFile(file); // matches a stream with a file. Opens the file
-
- if (!inFile) {
+ TRACE_ERR("loading feature data from " << file << endl);
+ inputfilestream input_stream(file); // matches a stream with a file. Opens the file
+ if (!input_stream) {
throw runtime_error("Unable to open feature file: " + file);
}
-
- load((ifstream&) inFile);
-
- inFile.close();
+ istream* is = &input_stream;
+ load(is);
+ input_stream.close();
}
void FeatureData::add(FeatureArray& e)
@@ -85,25 +85,25 @@ void FeatureData::add(FeatureArray& e)
if (exists(e.getIndex())) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(e.getIndex());
- array_.at(pos).merge(e);
+ m_array.at(pos).merge(e);
} else {
- array_.push_back(e);
+ m_array.push_back(e);
setIndex();
}
}
-void FeatureData::add(FeatureStats& e, const std::string& sent_idx)
+void FeatureData::add(FeatureStats& e, const string& sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting " << e << " in array " << sent_idx << std::endl);
- array_.at(pos).add(e);
+ m_array.at(pos).add(e);
} else {
// TRACE_ERR("Creating a new entry in the array and inserting " << e << std::endl);
FeatureArray a;
- a.NumberOfFeatures(number_of_features);
- a.Features(features);
+ a.NumberOfFeatures(m_num_features);
+ a.Features(m_features);
a.setIndex(sent_idx);
a.add(e);
add(a);
@@ -112,10 +112,10 @@ void FeatureData::add(FeatureStats& e, const std::string& sent_idx)
bool FeatureData::check_consistency() const
{
- if (array_.size() == 0)
+ if (m_array.size() == 0)
return true;
- for (featdata_t::const_iterator i = array_.begin(); i != array_.end(); i++)
+ for (featdata_t::const_iterator i = m_array.begin(); i != m_array.end(); i++)
if (!i->check_consistency()) return false;
return true;
@@ -124,25 +124,53 @@ bool FeatureData::check_consistency() const
void FeatureData::setIndex()
{
size_t j=0;
- for (featdata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
- idx2arrayname_[j]=(*i).getIndex();
- arrayname2idx_[(*i).getIndex()] = j;
+ for (featdata_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
+ m_index_to_array_name[j]=(*i).getIndex();
+ m_array_name_to_index[(*i).getIndex()] = j;
j++;
}
}
-void FeatureData::setFeatureMap(const std::string& feat)
+void FeatureData::setFeatureMap(const string& feat)
{
- number_of_features = 0;
- features = feat;
+ m_num_features = 0;
+ m_features = feat;
+
+ vector<string> buf;
+ Tokenize(feat.c_str(), ' ', &buf);
+ for (vector<string>::const_iterator it = buf.begin();
+ it != buf.end(); ++it) {
+ const size_t size = m_index_to_feature_name.size();
+ m_feature_name_to_index[*it] = size;
+ m_index_to_feature_name[size] = *it;
+ ++m_num_features;
+ }
+}
- std::string substring, stringBuf;
- stringBuf = features;
- while (!stringBuf.empty()) {
- getNextPound(stringBuf, substring);
+string FeatureData::ToString() const {
+ string res;
+
+ {
+ stringstream ss;
+ ss << "number of features: " << m_num_features
+ << ", features: " << m_features
+ << ", sparse flag: ";
+ if (m_sparse_flag) {
+ ss << "yes, ";
+ } else {
+ ss << "no, ";
+ }
+ res.append(ss.str());
+ }
- featname2idx_[substring] = idx2featname_.size();
- idx2featname_[idx2featname_.size()] = substring;
- number_of_features++;
+ res.append("feature_id_map = { ");
+ for (map<string, size_t>::const_iterator it = m_feature_name_to_index.begin();
+ it != m_feature_name_to_index.end(); ++it) {
+ stringstream ss;
+ ss << it->first << " => " << it->second << ", ";
+ res.append(ss.str());
}
+ res.append("}");
+
+ return res;
}
diff --git a/mert/FeatureData.h b/mert/FeatureData.h
index 8331fe2d2..aef1ef250 100644
--- a/mert/FeatureData.h
+++ b/mert/FeatureData.h
@@ -1,15 +1,13 @@
/*
* FeatureData.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef FEATURE_DATA_H
-#define FEATURE_DATA_H
-
-using namespace std;
+#ifndef MERT_FEATURE_DATA_H_
+#define MERT_FEATURE_DATA_H_
#include <vector>
#include <iostream>
@@ -19,123 +17,116 @@ using namespace std;
class FeatureData
{
private:
- size_t number_of_features;
- std::string features;
- bool _sparse_flag;
-
- map<std::string, size_t> featname2idx_; // map from name to index of features
- map<size_t, std::string> idx2featname_; // map from index to name of features
-
-protected:
- featdata_t array_;
- idx2name idx2arrayname_; // map from index to name of array
- name2idx arrayname2idx_; // map from name to index of array
+ std::size_t m_num_features;
+ std::string m_features;
+ bool m_sparse_flag;
+ std::map<std::string, std::size_t> m_feature_name_to_index; // map from name to index of features
+ std::map<std::size_t, std::string> m_index_to_feature_name; // map from index to name of features
+ featdata_t m_array;
+ idx2name m_index_to_array_name; // map from index to name of array
+ name2idx m_array_name_to_index; // map from name to index of array
public:
FeatureData();
~FeatureData() {}
- inline void clear() {
- array_.clear();
- }
+ void clear() { m_array.clear(); }
- inline bool hasSparseFeatures() const {
- return _sparse_flag;
- }
- inline FeatureArray get(const std::string& idx) {
- return array_.at(getIndex(idx));
- }
- inline FeatureArray& get(size_t idx) {
- return array_.at(idx);
- }
- inline const FeatureArray& get(size_t idx) const {
- return array_.at(idx);
+ bool hasSparseFeatures() const { return m_sparse_flag; }
+
+ FeatureArray get(const std::string& idx) {
+ return m_array.at(getIndex(idx));
}
+ FeatureArray& get(std::size_t idx) { return m_array.at(idx); }
+ const FeatureArray& get(std::size_t idx) const { return m_array.at(idx); }
+
inline bool exists(const std::string& sent_idx) const {
return exists(getIndex(sent_idx));
}
inline bool exists(int sent_idx) const {
- return (sent_idx > -1 && sent_idx < static_cast<int>(array_.size())) ? true : false;
+ return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
}
- inline FeatureStats& get(size_t i, size_t j) {
- return array_.at(i).get(j);
+ inline FeatureStats& get(std::size_t i, std::size_t j) {
+ return m_array.at(i).get(j);
}
- inline const FeatureStats& get(size_t i, size_t j) const {
- return array_.at(i).get(j);
+
+ inline const FeatureStats& get(std::size_t i, std::size_t j) const {
+ return m_array.at(i).get(j);
}
void add(FeatureArray& e);
void add(FeatureStats& e, const std::string& sent_idx);
- inline size_t size() const {
- return array_.size();
- }
- inline size_t NumberOfFeatures() const {
- return number_of_features;
- }
- inline void NumberOfFeatures(size_t v) {
- number_of_features = v;
- }
- inline std::string Features() const {
- return features;
- }
- inline void Features(const std::string& f) {
- features = f;
- }
+ std::size_t size() const { return m_array.size(); }
+
+ std::size_t NumberOfFeatures() const { return m_num_features; }
+ void NumberOfFeatures(std::size_t v) { m_num_features = v; }
+
+ std::string Features() const { return m_features; }
+ void Features(const std::string& f) { m_features = f; }
void save(const std::string &file, bool bin=false);
- void save(ofstream& outFile, bool bin=false);
- inline void save(bool bin=false) {
- save("/dev/stdout", bin);
- }
+ void save(std::ostream* os, bool bin=false);
+ void save(bool bin=false);
- void load(ifstream& inFile);
+ void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;
+
void setIndex();
inline int getIndex(const std::string& idx) const {
- name2idx::const_iterator i = arrayname2idx_.find(idx);
- if (i != arrayname2idx_.end())
+ name2idx::const_iterator i = m_array_name_to_index.find(idx);
+ if (i != m_array_name_to_index.end())
return i->second;
else
return -1;
}
- inline std::string getIndex(size_t idx) const {
- idx2name::const_iterator i = idx2arrayname_.find(idx);
- if (i != idx2arrayname_.end())
- throw runtime_error("there is no entry at index " + idx);
+ /**
+ * Returns the array name registered at position idx.
+ * Throws std::runtime_error when no name is registered for idx.
+ */
+ inline std::string getIndex(std::size_t idx) const {
+ idx2name::const_iterator i = m_index_to_array_name.find(idx);
+ // A failed lookup yields end(), which must never be dereferenced.
+ // The index is not appended to the message: "literal" + idx would be
+ // pointer arithmetic on the literal, not string concatenation.
+ if (i == m_index_to_array_name.end())
+ throw std::runtime_error("there is no entry at the requested index");
return i->second;
}
bool existsFeatureNames() const {
- return (idx2featname_.size() > 0) ? true : false;
+ return (m_index_to_feature_name.size() > 0) ? true : false;
}
- std::string getFeatureName(size_t idx) const {
- if (idx >= idx2featname_.size())
+ /**
+ * Returns the feature name registered under index idx.
+ * Throws std::runtime_error when idx is not smaller than the number of
+ * known feature names, or when no name is stored under idx.
+ */
+ std::string getFeatureName(std::size_t idx) const {
+ if (idx >= m_index_to_feature_name.size())
- throw runtime_error("Error: you required an too big index");
- map<size_t, std::string>::const_iterator it = idx2featname_.find(idx);
- if (it == idx2featname_.end()) {
- throw runtime_error("Error: specified id is unknown: " + idx);
+ throw std::runtime_error("Error: you required an too big index");
+ std::map<std::size_t, std::string>::const_iterator it = m_index_to_feature_name.find(idx);
+ if (it == m_index_to_feature_name.end()) {
+ // The index is not appended: "literal" + idx is pointer arithmetic on
+ // the literal, which truncates the message or reads out of bounds.
+ throw std::runtime_error("Error: specified id is unknown");
} else {
return it->second;
}
}
- size_t getFeatureIndex(const std::string& name) const {
- map<std::string, size_t>::const_iterator it = featname2idx_.find(name);
- if (it == featname2idx_.end())
- throw runtime_error("Error: feature " + name + " is unknown");
+ std::size_t getFeatureIndex(const std::string& name) const {
+ std::map<std::string, std::size_t>::const_iterator it = m_feature_name_to_index.find(name);
+ if (it == m_feature_name_to_index.end()) {
+ std::string msg = "Error: feature " + name + " is unknown. Known features: ";
+ for (std::map<std::string, std::size_t>::const_iterator it = m_feature_name_to_index.begin(); it != m_feature_name_to_index.end(); it++) {
+ msg += it->first;
+ msg += ", ";
+ }
+
+ throw std::runtime_error(msg);
+ }
return it->second;
}
void setFeatureMap(const std::string& feat);
+
+ /* For debugging */
+ std::string ToString() const;
};
-#endif // FEATURE_DATA_H
+#endif // MERT_FEATURE_DATA_H_
diff --git a/mert/FeatureDataIterator.h b/mert/FeatureDataIterator.h
index 81f072970..58345829c 100644
--- a/mert/FeatureDataIterator.h
+++ b/mert/FeatureDataIterator.h
@@ -17,8 +17,8 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#ifndef _FEATURE_DATA_ITERATOR_
-#define _FEATURE_DATA_ITERATOR_
+#ifndef MERT_FEATURE_DATA_ITERATOR_H_
+#define MERT_FEATURE_DATA_ITERATOR_H_
/**
* For loading from the feature data file.
@@ -88,4 +88,4 @@ class FeatureDataIterator :
std::vector<FeatureDataItem> m_next;
};
-#endif
+#endif // MERT_FEATURE_DATA_ITERATOR_H_
diff --git a/mert/FeatureDataTest.cpp b/mert/FeatureDataTest.cpp
new file mode 100644
index 000000000..42ac5996c
--- /dev/null
+++ b/mert/FeatureDataTest.cpp
@@ -0,0 +1,40 @@
+#include "FeatureData.h"
+
+#define BOOST_TEST_MODULE FeatureData
+#include <boost/test/unit_test.hpp>
+
+#include <sstream>
+
+namespace {
+
+// Checks that the features "<str>_0" ... "<str>_<num_feature - 1>" are mapped,
+// in that order, to consecutive indices starting at *cnt, and that the reverse
+// lookup returns the same names. *cnt is advanced past the checked features so
+// that successive calls verify adjacent index ranges.
+void CheckFeatureMap(const FeatureData* feature_data,
+                     const char* str, int num_feature, int* cnt) {
+  for (int i = 0; i < num_feature; ++i) {
+    std::stringstream ss;
+    ss << str << "_" << i;
+    // std:: is spelled out: FeatureData.h no longer has a using-directive.
+    const std::string s = ss.str();
+    const std::size_t expected_index = static_cast<std::size_t>(*cnt);
+    // Compare like with like: size_t against size_t, string against string.
+    BOOST_CHECK_EQUAL(feature_data->getFeatureIndex(s), expected_index);
+    BOOST_CHECK_EQUAL(feature_data->getFeatureName(expected_index), s);
+    ++(*cnt);
+  }
+}
+
+} // namespace
+
+// setFeatureMap() must keep the raw feature string, count its 15
+// space-separated names, and number them in order of appearance.
+BOOST_AUTO_TEST_CASE(set_feature_map) {
+  const std::string feature_names(
+      "d_0 d_1 d_2 d_3 d_4 d_5 d_6 lm_0 lm_1 tm_0 tm_1 tm_2 tm_3 tm_4 w_0 ");
+  FeatureData data;
+  data.setFeatureMap(feature_names);
+
+  BOOST_REQUIRE(data.Features() == feature_names);
+  BOOST_REQUIRE(data.NumberOfFeatures() == 15);
+
+  // Index expected for the next feature group; advanced by CheckFeatureMap.
+  int next_index = 0;
+  CheckFeatureMap(&data, "d", 7, &next_index);
+  CheckFeatureMap(&data, "lm", 2, &next_index);
+  CheckFeatureMap(&data, "tm", 5, &next_index);
+
+  // The single "w" feature comes last.
+  BOOST_CHECK_EQUAL(data.getFeatureIndex("w_0"), next_index);
+  BOOST_CHECK_EQUAL(data.getFeatureName(next_index).c_str(), "w_0");
+}
diff --git a/mert/FeatureStats.cpp b/mert/FeatureStats.cpp
index 0fe003158..38aa31328 100644
--- a/mert/FeatureStats.cpp
+++ b/mert/FeatureStats.cpp
@@ -1,6 +1,6 @@
/*
* FeatureStats.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
@@ -8,6 +8,7 @@
#include "FeatureStats.h"
+#include <fstream>
#include <cmath>
#include "Util.h"
@@ -15,58 +16,58 @@ namespace {
const int kAvailableSize = 8;
} // namespace
-SparseVector::name2id_t SparseVector::name2id_;
-SparseVector::id2name_t SparseVector::id2name_;
+SparseVector::name2id_t SparseVector::m_name_to_id;
+SparseVector::id2name_t SparseVector::m_id_to_name;
FeatureStatsType SparseVector::get(const string& name) const {
- name2id_t::const_iterator name2id_iter = name2id_.find(name);
- if (name2id_iter == name2id_.end()) return 0;
+ name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
+ if (name2id_iter == m_name_to_id.end()) return 0;
size_t id = name2id_iter->second;
return get(id);
}
FeatureStatsType SparseVector::get(size_t id) const {
- fvector_t::const_iterator fvector_iter = fvector_.find(id);
- if (fvector_iter == fvector_.end()) return 0;
+ fvector_t::const_iterator fvector_iter = m_fvector.find(id);
+ if (fvector_iter == m_fvector.end()) return 0;
return fvector_iter->second;
}
void SparseVector::set(const string& name, FeatureStatsType value) {
- name2id_t::const_iterator name2id_iter = name2id_.find(name);
+ name2id_t::const_iterator name2id_iter = m_name_to_id.find(name);
size_t id = 0;
- if (name2id_iter == name2id_.end()) {
- id = id2name_.size();
- id2name_.push_back(name);
- name2id_[name] = id;
+ if (name2id_iter == m_name_to_id.end()) {
+ id = m_id_to_name.size();
+ m_id_to_name.push_back(name);
+ m_name_to_id[name] = id;
} else {
id = name2id_iter->second;
}
- fvector_[id] = value;
+ m_fvector[id] = value;
}
void SparseVector::write(ostream& out, const string& sep) const {
- for (fvector_t::const_iterator i = fvector_.begin(); i != fvector_.end(); ++i) {
+ for (fvector_t::const_iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
if (abs(i->second) < 0.00001) continue;
- string name = id2name_[i->first];
+ string name = m_id_to_name[i->first];
out << name << sep << i->second << " ";
}
}
void SparseVector::clear() {
- fvector_.clear();
+ m_fvector.clear();
}
SparseVector& SparseVector::operator-=(const SparseVector& rhs) {
//All the elements that have values in *this
- for (fvector_t::iterator i = fvector_.begin(); i != fvector_.end(); ++i) {
- fvector_[i->first] = i->second - rhs.get(i->first);
+ for (fvector_t::iterator i = m_fvector.begin(); i != m_fvector.end(); ++i) {
+ m_fvector[i->first] = i->second - rhs.get(i->first);
}
//Any elements in rhs, that have no value in *this
- for (fvector_t::const_iterator i = rhs.fvector_.begin();
- i != rhs.fvector_.end(); ++i) {
- if (fvector_.find(i->first) == fvector_.end()) {
- fvector_[i->first] = -(i->second);
+ for (fvector_t::const_iterator i = rhs.m_fvector.begin();
+ i != rhs.m_fvector.end(); ++i) {
+ if (m_fvector.find(i->first) == m_fvector.end()) {
+ m_fvector[i->first] = -(i->second);
}
}
return *this;
@@ -79,37 +80,37 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs) {
}
FeatureStats::FeatureStats()
- : available_(kAvailableSize), entries_(0),
- array_(new FeatureStatsType[available_]) {}
+ : m_available_size(kAvailableSize), m_entries(0),
+ m_array(new FeatureStatsType[m_available_size]) {}
FeatureStats::FeatureStats(const size_t size)
- : available_(size), entries_(size),
- array_(new FeatureStatsType[available_])
+ : m_available_size(size), m_entries(size),
+ m_array(new FeatureStatsType[m_available_size])
{
- memset(array_, 0, GetArraySizeWithBytes());
+ memset(m_array, 0, GetArraySizeWithBytes());
}
-FeatureStats::FeatureStats(std::string &theString)
- : available_(0), entries_(0), array_(NULL)
+FeatureStats::FeatureStats(string &theString)
+ : m_available_size(0), m_entries(0), m_array(NULL)
{
set(theString);
}
FeatureStats::~FeatureStats()
{
- if (array_) {
- delete [] array_;
- array_ = NULL;
+ if (m_array) {
+ delete [] m_array;
+ m_array = NULL;
}
}
void FeatureStats::Copy(const FeatureStats &stats)
{
- available_ = stats.available();
- entries_ = stats.size();
- array_ = new FeatureStatsType[available_];
- memcpy(array_, stats.getArray(), GetArraySizeWithBytes());
- map_ = stats.getSparse();
+ m_available_size = stats.available();
+ m_entries = stats.size();
+ m_array = new FeatureStatsType[m_available_size];
+ memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
+ m_map = stats.getSparse();
}
FeatureStats::FeatureStats(const FeatureStats &stats)
@@ -119,34 +120,34 @@ FeatureStats::FeatureStats(const FeatureStats &stats)
+// Copy assignment: releases the current array and deep-copies stats.
FeatureStats& FeatureStats::operator=(const FeatureStats &stats)
{
- delete [] array_;
+ // Self-assignment guard: without it the array is freed first and Copy()
+ // then reads from the freed memory.
+ if (this == &stats) return *this;
+ delete [] m_array;
Copy(stats);
return *this;
}
void FeatureStats::expand()
{
- available_ *= 2;
- featstats_t t_ = new FeatureStatsType[available_];
- memcpy(t_, array_, GetArraySizeWithBytes());
- delete [] array_;
- array_ = t_;
+ m_available_size *= 2;
+ featstats_t t_ = new FeatureStatsType[m_available_size];
+ memcpy(t_, m_array, GetArraySizeWithBytes());
+ delete [] m_array;
+ m_array = t_;
}
void FeatureStats::add(FeatureStatsType v)
{
if (isfull()) expand();
- array_[entries_++]=v;
+ m_array[m_entries++]=v;
}
void FeatureStats::addSparse(const string& name, FeatureStatsType v)
{
- map_.set(name,v);
+ m_map.set(name,v);
}
-void FeatureStats::set(std::string &theString)
+void FeatureStats::set(string &theString)
{
- std::string substring, stringBuf;
+ string substring, stringBuf;
reset();
while (!theString.empty()) {
@@ -163,48 +164,50 @@ void FeatureStats::set(std::string &theString)
}
}
-
-void FeatureStats::loadbin(std::ifstream& inFile)
+void FeatureStats::loadbin(istream* is)
{
- inFile.read((char*) array_, GetArraySizeWithBytes());
+ is->read(reinterpret_cast<char*>(m_array),
+ static_cast<streamsize>(GetArraySizeWithBytes()));
}
-void FeatureStats::loadtxt(std::ifstream& inFile)
+void FeatureStats::loadtxt(istream* is)
{
- std::string theString;
- std::getline(inFile, theString);
- set(theString);
+ string line;
+ getline(*is, line);
+ set(line);
}
-void FeatureStats::loadtxt(const std::string &file)
+void FeatureStats::loadtxt(const string &file)
{
- // TRACE_ERR("loading the stats from " << file << std::endl);
-
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
-
- loadtxt(inFile);
+ ifstream ifs(file.c_str(), ios::in);
+ if (!ifs) {
+ cerr << "Failed to open " << file << endl;
+ exit(1);
+ }
+ istream* is = &ifs;
+ loadtxt(is);
}
-
-void FeatureStats::savetxt(const std::string &file)
+void FeatureStats::savetxt(const string &file)
{
-// TRACE_ERR("saving the stats into " << file << std::endl);
-
- std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
-
- savetxt(outFile);
+ ofstream ofs(file.c_str(), ios::out);
+ ostream* os = &ofs;
+ savetxt(os);
}
-
-void FeatureStats::savetxt(std::ofstream& outFile)
+void FeatureStats::savetxt(ostream* os)
{
-// TRACE_ERR("saving the stats" << std::endl);
- outFile << *this;
+ *os << *this;
}
-void FeatureStats::savebin(std::ofstream& outFile)
+void FeatureStats::savetxt() {
+ savetxt(&cout);
+}
+
+void FeatureStats::savebin(ostream* os)
{
- outFile.write((char*) array_, GetArraySizeWithBytes());
+ os->write(reinterpret_cast<char*>(m_array),
+ static_cast<streamsize>(GetArraySizeWithBytes()));
}
ostream& operator<<(ostream& o, const FeatureStats& e)
@@ -230,7 +233,7 @@ bool operator==(const FeatureStats& f1, const FeatureStats& f2) {
if (f1.get(k) != f2.get(k))
return false;
}
-
+
return true;
}
//END_ADDED
diff --git a/mert/FeatureStats.h b/mert/FeatureStats.h
index 10ff31992..e2e63a714 100644
--- a/mert/FeatureStats.h
+++ b/mert/FeatureStats.h
@@ -1,16 +1,15 @@
/*
* FeatureStats.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef FEATURE_STATS_H
-#define FEATURE_STATS_H
+#ifndef MERT_FEATURE_STATS_H_
+#define MERT_FEATURE_STATS_H_
#include <cstring>
-#include <fstream>
#include <iostream>
#include <map>
#include <string>
@@ -30,18 +29,16 @@ public:
FeatureStatsType get(size_t id) const;
void set(const std::string& name, FeatureStatsType value);
void clear();
- size_t size() const {
- return fvector_.size();
- }
+ size_t size() const { return m_fvector.size(); }
void write(std::ostream& out, const std::string& sep = " ") const;
SparseVector& operator-=(const SparseVector& rhs);
private:
- static name2id_t name2id_;
- static id2name_t id2name_;
- fvector_t fvector_;
+ static name2id_t m_name_to_id;
+ static id2name_t m_id_to_name;
+ fvector_t m_fvector;
};
SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs);
@@ -49,12 +46,12 @@ SparseVector operator-(const SparseVector& lhs, const SparseVector& rhs);
class FeatureStats
{
private:
- size_t available_;
- size_t entries_;
+ size_t m_available_size;
+ size_t m_entries;
// TODO: Use smart pointer for exceptional-safety.
- featstats_t array_;
- SparseVector map_;
+ featstats_t m_array;
+ SparseVector m_map;
public:
FeatureStats();
@@ -69,64 +66,47 @@ public:
void Copy(const FeatureStats &stats);
- bool isfull() const {
- return (entries_ < available_) ? 0 : 1;
- }
+ bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
void expand();
void add(FeatureStatsType v);
void addSparse(const string& name, FeatureStatsType v);
void clear() {
- memset((void*)array_, 0, GetArraySizeWithBytes());
- map_.clear();
+ memset((void*)m_array, 0, GetArraySizeWithBytes());
+ m_map.clear();
}
void reset() {
- entries_ = 0;
+ m_entries = 0;
clear();
}
- inline FeatureStatsType get(size_t i) {
- return array_[i];
- }
- inline FeatureStatsType get(size_t i)const {
- return array_[i];
- }
- inline featstats_t getArray() const {
- return array_;
- }
- inline const SparseVector& getSparse() const {
- return map_;
- }
+ FeatureStatsType get(size_t i) { return m_array[i]; }
+ FeatureStatsType get(size_t i)const { return m_array[i]; }
+ featstats_t getArray() const { return m_array; }
+
+ const SparseVector& getSparse() const { return m_map; }
void set(std::string &theString);
- inline size_t bytes() const {
- return GetArraySizeWithBytes();
- }
+ inline size_t bytes() const { return GetArraySizeWithBytes(); }
size_t GetArraySizeWithBytes() const {
- return entries_ * sizeof(FeatureStatsType);
+ return m_entries * sizeof(FeatureStatsType);
}
- inline size_t size() const {
- return entries_;
- }
+ size_t size() const { return m_entries; }
- inline size_t available() const {
- return available_;
- }
+ size_t available() const { return m_available_size; }
void savetxt(const std::string &file);
- void savetxt(ofstream& outFile);
- void savebin(ofstream& outFile);
- inline void savetxt() {
- savetxt("/dev/stdout");
- }
+ void savetxt(std::ostream* os);
+ void savebin(std::ostream* os);
+ void savetxt();
void loadtxt(const std::string &file);
- void loadtxt(ifstream& inFile);
- void loadbin(ifstream& inFile);
+ void loadtxt(std::istream* is);
+ void loadbin(std::istream* is);
/**
* Write the whole object to a stream.
@@ -138,4 +118,4 @@ public:
bool operator==(const FeatureStats& f1, const FeatureStats& f2);
//END_ADDED
-#endif // FEATURE_STATS_H
+#endif // MERT_FEATURE_STATS_H_
diff --git a/mert/FileStream.cpp b/mert/FileStream.cpp
index 11fd58e26..1a52e53fa 100644
--- a/mert/FileStream.cpp
+++ b/mert/FileStream.cpp
@@ -1,7 +1,7 @@
#include "FileStream.h"
#include <stdexcept>
-#include "gzfilebuf.h"
+#include "GzFileBuf.h"
using namespace std;
@@ -13,16 +13,16 @@ bool IsGzipFile(const std::string &filename) {
} // namespace
inputfilestream::inputfilestream(const std::string &filePath)
- : std::istream(0), m_streambuf(0), is_good(false)
+ : std::istream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
- is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL);
+ m_is_good = (fb->open(filePath.c_str(), std::ios::in) != NULL);
if (IsGzipFile(filePath)) {
fb->close();
delete fb;
- m_streambuf = new gzfilebuf(filePath.c_str());
+ m_streambuf = new GzFileBuf(filePath.c_str());
} else {
m_streambuf = fb;
}
@@ -40,11 +40,11 @@ void inputfilestream::close()
}
outputfilestream::outputfilestream(const std::string &filePath)
- : std::ostream(0), m_streambuf(0), is_good(false)
+ : std::ostream(0), m_streambuf(0), m_is_good(false)
{
// check if file is readable
std::filebuf* fb = new std::filebuf();
- is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
+ m_is_good = (fb->open(filePath.c_str(), std::ios::out) != NULL);
if (IsGzipFile(filePath)) {
throw runtime_error("Output to a zipped file not supported!");
diff --git a/mert/FileStream.h b/mert/FileStream.h
index afa8d9a29..3fd489cd7 100644
--- a/mert/FileStream.h
+++ b/mert/FileStream.h
@@ -1,7 +1,8 @@
-#ifndef FILESTREAM_H_
-#define FILESTREAM_H_
+#ifndef MERT_FILE_STREAM_H_
+#define MERT_FILE_STREAM_H_
#include <fstream>
+#include <iostream>
#include <streambuf>
#include <string>
@@ -9,12 +10,13 @@ class inputfilestream : public std::istream
{
protected:
std::streambuf *m_streambuf;
- bool is_good;
+ bool m_is_good;
public:
explicit inputfilestream(const std::string &filePath);
- ~inputfilestream();
- bool good() const { return is_good; }
+ virtual ~inputfilestream();
+
+ bool good() const { return m_is_good; }
void close();
};
@@ -22,13 +24,14 @@ class outputfilestream : public std::ostream
{
protected:
std::streambuf *m_streambuf;
- bool is_good;
+ bool m_is_good;
public:
explicit outputfilestream(const std::string &filePath);
- ~outputfilestream();
- bool good() const { return is_good; }
+ virtual ~outputfilestream();
+
+ bool good() const { return m_is_good; }
void close();
};
-#endif // FILESTREAM_H_
+#endif // MERT_FILE_STREAM_H_
diff --git a/mert/GzFileBuf.cpp b/mert/GzFileBuf.cpp
new file mode 100644
index 000000000..9d3ccb588
--- /dev/null
+++ b/mert/GzFileBuf.cpp
@@ -0,0 +1,80 @@
+#include "GzFileBuf.h"
+
+#include <cstring>
+#include <cstdlib>
+#include <cstdio>
+#include <iostream>
+
+// Opens filename for reading with gzopen(). If the file cannot be opened, an
+// error is written to std::cerr and the process exits with status 1.
+GzFileBuf::GzFileBuf(const char* filename) {
+ m_gz_file = gzopen(filename, "rb");
+ if (m_gz_file == NULL) {
+ std::cerr << "ERROR: Failed to open " << filename << std::endl;
+ std::exit(1);
+ }
+ // Start with an empty get area: all three pointers coincide, sizeof(int)
+ // bytes into m_buf, which leaves room in front for the putback characters
+ // that underflow() copies there.
+ setg(m_buf + sizeof(int), // beginning of putback area
+ m_buf + sizeof(int), // read position
+ m_buf + sizeof(int)); // end position
+}
+
+// Closes the gzip handle opened by the constructor.
+GzFileBuf::~GzFileBuf() {
+ gzclose(m_gz_file);
+}
+
+// This buffer is read-only, so a character can never be stored. Failure is
+// reported the way std::streambuf::overflow() specifies, by returning eof,
+// which makes the owning stream set its error state. A bare `throw;` with no
+// exception in flight would call std::terminate() instead.
+int GzFileBuf::overflow(int_type /* c */) {
+  return traits_type::eof();
+}
+
+// read one character
+//
+// Refills the get area from the gzip file when it is exhausted and returns
+// the character at the read position without consuming it. Returns EOF when
+// gzread() delivers no more data or reports an error.
+int GzFileBuf::underflow() {
+ // is read position before end of m_buf?
+ if (gptr() < egptr()) {
+ return traits_type::to_int_type(*gptr());
+ }
+
+ /* process size of putback area
+ * - use number of characters read
+ * - but at most sizeof(int)
+ */
+ unsigned int num_put_back = static_cast<unsigned int>(gptr() - eback());
+ if (num_put_back > sizeof(int)) {
+ num_put_back = sizeof(int);
+ }
+
+ /* copy up to sizeof(int) characters previously read into
+ * the putback m_buf (area of the first sizeof(int) characters)
+ */
+ std::memmove(m_buf + (sizeof(int) - num_put_back),
+ gptr() - num_put_back, num_put_back);
+
+ // read new characters
+ const int num = gzread(m_gz_file, m_buf + sizeof(int),
+ kBufSize - sizeof(int));
+ if (num <= 0) {
+ // ERROR or EOF
+ return EOF; // NOTE: the macro EOF defined in stdio.h
+ }
+
+ // reset m_buf pointers
+ setg(m_buf + (sizeof(int) - num_put_back), // beginning of putback area
+ m_buf + sizeof(int), // read position
+ m_buf + sizeof(int) + num); // end of buffer
+
+ // return next character
+ return traits_type::to_int_type(*gptr());
+}
+
+// Seeking is not supported. The invalid position is returned, as
+// std::streambuf::seekpos() specifies for failure, so seekg() on the owning
+// stream fails cleanly. A bare `throw;` with no exception in flight would
+// call std::terminate() instead.
+std::streampos GzFileBuf::seekpos(
+    std::streampos /* sp */,
+    std::ios_base::openmode /* which */) {
+  return std::streampos(std::streamoff(-1));
+}
+
+// Reads up to num characters into s and returns how many were stored.
+//
+// Characters that underflow() has already buffered are delivered first; only
+// the remainder is read straight from the gzip file. Going to gzread()
+// directly would skip the buffered bytes whenever character-wise reads
+// (e.g. getline) are followed by a block read. A gzread() error counts as
+// "no further characters" instead of leaking -1 into the returned count.
+std::streamsize GzFileBuf::xsgetn(char* s,
+                                  std::streamsize num) {
+  if (num <= 0) {
+    return 0;
+  }
+
+  std::streamsize copied = 0;
+
+  // 1. Drain what is left in the get area.
+  const std::streamsize buffered = egptr() - gptr();
+  if (buffered > 0) {
+    copied = (buffered < num) ? buffered : num;
+    std::memcpy(s, gptr(), static_cast<std::size_t>(copied));
+    gbump(static_cast<int>(copied));  // bounded by the size of m_buf
+  }
+
+  // 2. Fetch the rest directly, bypassing the internal buffer.
+  if (copied < num) {
+    const int fetched = gzread(m_gz_file, s + copied,
+                               static_cast<unsigned int>(num - copied));
+    if (fetched > 0) {
+      copied += fetched;
+    }
+  }
+  return copied;
+}
+
+// This buffer is read-only: nothing is ever written, so the number of
+// characters stored is always 0. A bare `throw;` with no exception in flight
+// would call std::terminate() instead.
+std::streamsize GzFileBuf::xsputn(const char* /* s */,
+                                  std::streamsize /* num */) {
+  return 0;
+}
diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
new file mode 100644
index 000000000..822cdbb78
--- /dev/null
+++ b/mert/InterpolatedScorer.cpp
@@ -0,0 +1,189 @@
+#include "InterpolatedScorer.h"
+#include "ScorerFactory.h"
+#include "Util.h"
+
+using namespace std;
+
+/**
+ * Creates one sub-scorer for each comma-separated entry of name (for example
+ * "HAMMING,BLEU") and determines the interpolation weights.
+ *
+ * The weights come from the "weights" config entry, presumably a
+ * '+'-separated list such as "0.3+0.7" (it is split with getNextPound on "+").
+ * Without that entry every scorer gets the weight 1 / number-of-scorers.
+ * Configured weights are rescaled so that they sum to 1.
+ *
+ * Throws runtime_error when no scorer is named, when the number of weights
+ * differs from the number of scorers, or when the weights sum to zero.
+ */
+// TODO: This is too long. Consider creating a function for
+// initialization such as Init().
+InterpolatedScorer::InterpolatedScorer(const string& name, const string& config)
+  : Scorer(name,config)
+{
+  // name would be: HAMMING,BLEU or similar
+  string scorers = name;
+  while (scorers.length() > 0) {
+    string scorertype = "";
+    getNextPound(scorers, scorertype,",");
+    Scorer *scorer = ScorerFactory::getScorer(scorertype,config);
+    m_scorers.push_back(scorer);
+  }
+  if (m_scorers.size() == 0) {
+    throw runtime_error("There are no scorers");
+  }
+  cerr << "Number of scorers: " << m_scorers.size() << endl;
+
+  //TODO debug this
+  string wtype = getConfig("weights","");
+  if (wtype.length() == 0) {
+    // Default: uniform weights, i.e. 0.5 each for two scorers.
+    const float weight = 1.0 / m_scorers.size();
+    m_scorer_weights.assign(m_scorers.size(), weight);
+  } else {
+    float tot = 0;
+    while (wtype.length() > 0) {
+      string scoreweight = "";
+      getNextPound(wtype,scoreweight,"+");
+      const float weight = atof(scoreweight.c_str());
+      m_scorer_weights.push_back(weight);
+      tot += weight;
+    }
+
+    if (m_scorers.size() != m_scorer_weights.size()) {
+      throw runtime_error("The number of weights does not equal the number of scorers!");
+    }
+    // Dividing by a zero total would turn every weight into inf or NaN.
+    if (tot == 0.0f) {
+      throw runtime_error("The sum of the scorer weights must not be zero");
+    }
+    // Rescale so the weights add up to 1. When they already do, the division
+    // by 1 leaves them unchanged, so no separate equality test is needed.
+    for (vector<float>::iterator it = m_scorer_weights.begin();
+         it != m_scorer_weights.end(); ++it) {
+      *it /= tot;
+    }
+  }
+  cerr << "The weights for the interpolated scorers are: " << endl;
+  for (vector<float>::const_iterator it = m_scorer_weights.begin();
+       it != m_scorer_weights.end(); ++it) {
+    cerr << *it << " " ;
+  }
+  cerr <<endl;
+}
+
+// Stores data as this scorer's score data and gives every sub-scorer its own
+// ScoreData holding only the statistics that belong to it: sub-scorer n
+// receives, for each entry of each array, the NumberOfScores() consecutive
+// values that follow those of sub-scorers 0..n-1.
+void InterpolatedScorer::setScoreData(ScoreData* data)
+{
+  m_score_data = data;
+
+  // Offset of the current sub-scorer's first value inside a combined entry.
+  size_t offset = 0;
+  for (ScopedVector<Scorer>::iterator sub = m_scorers.begin();
+       sub != m_scorers.end(); ++sub) {
+    const int width = (*sub)->NumberOfScores();
+    ScoreData* subData = new ScoreData(*sub);
+
+    for (size_t sent = 0; sent < data->size(); sent++) {
+      ScoreArray combined = data->get(sent);
+      ScoreArray sliced;
+
+      // The array index is the sentence position rendered as text.
+      std::stringstream indexStream;
+      indexStream << sent;
+      std::string indexText;
+      indexText = indexStream.str();
+
+      const size_t nbestSize = combined.size();
+      for (size_t cand = 0; cand < nbestSize; cand++) {
+        ScoreStats allStats = data->get(sent, cand);
+        ScoreStats ownStats;
+        for (size_t pos = offset; pos < size_t(width + offset); pos++) {
+          ScoreStatsType value = allStats.get(pos);
+          ownStats.add(value);
+        }
+        sliced.add(ownStats);
+      }
+      sliced.setIndex(indexText);
+      subData->add(sliced);
+    }
+
+    // NOTE: This class takes the ownership of the heap allocated
+    // ScoreData objects to avoid the memory leak issues.
+    m_scorers_score_data.push_back(subData);
+
+    (*sub)->setScoreData(subData);
+    offset += width;
+  }
+}
+
+
+/** The interpolated scorer calls a vector of scorers and combines them with
+ weights.
+
+ Every sub-scorer scores the same candidates and diffs. The first
+ sub-scorer's weighted results are appended to scores; the weighted results
+ of each later sub-scorer are added onto the entry at the same position.
+ Throws runtime_error when a sub-scorer that produced at least one score
+ has the weight 0. **/
+void InterpolatedScorer::score(const candidates_t& candidates, const diffs_t& diffs,
+ statscores_t& scores) const
+{
+ //cout << "*******InterpolatedScorer::score" << endl;
+ size_t scorerNum = 0;
+ for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
+ itsc != m_scorers.end(); ++itsc) {
+ //int numScores = (*itsc)->NumberOfScores();
+ statscores_t tscores;
+ (*itsc)->score(candidates,diffs,tscores);
+ // inc is the position of the current score within tscores and scores.
+ size_t inc = 0;
+ for (statscores_t::iterator itstatsc = tscores.begin();
+ itstatsc != tscores.end(); ++itstatsc) {
+ //cout << "Scores " << (*itstatsc) << endl;
+ float weight = m_scorer_weights[scorerNum];
+ if (weight == 0) {
+ stringstream msg;
+ msg << "No weights for scorer" << scorerNum ;
+ throw runtime_error(msg.str());
+ }
+ if (scorerNum == 0) {
+ scores.push_back(weight * (*itstatsc));
+ } else {
+ // NOTE(review): assumes every sub-scorer returns no more scores than
+ // the first one did; otherwise this index is out of range - confirm.
+ scores[inc] += weight * (*itstatsc);
+ }
+ //cout << "Scorer:" << scorerNum << " scoreNum:" << inc << " score: " << (*itstatsc) << " weight:" << weight << endl;
+ inc++;
+
+ }
+ scorerNum++;
+ }
+
+}
+
+// Passes the same list of reference files on to every sub-scorer, in order.
+void InterpolatedScorer::setReferenceFiles(const vector<string>& referenceFiles)
+{
+  ScopedVector<Scorer>::iterator sub = m_scorers.begin();
+  while (sub != m_scorers.end()) {
+    (*sub)->setReferenceFiles(referenceFiles);
+    ++sub;
+  }
+}
+
+// Lets every sub-scorer compute its statistics for sentence sid and the given
+// text, joins their textual forms with single spaces in scorer order, and
+// loads the joined string into entry.
+void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
+{
+  stringstream joined;
+  for (ScopedVector<Scorer>::iterator sub = m_scorers.begin();
+       sub != m_scorers.end(); ++sub) {
+    ScoreStats partial;
+    (*sub)->prepareStats(sid, text, partial);
+    // Separator goes in front of every block except the first.
+    if (sub != m_scorers.begin()) {
+      joined << " ";
+    }
+    joined << partial;
+  }
+  string combined = joined.str();
+  entry.set(combined);
+}
+
+// Splits factors on ',' and hands the i-th piece to the i-th sub-scorer.
+// An empty string leaves all sub-scorers untouched. Throws runtime_error when
+// the number of pieces differs from the number of sub-scorers.
+void InterpolatedScorer::setFactors(const string& factors)
+{
+  if (!factors.empty()) {
+    vector<string> perScorer;
+    split(factors, ',', perScorer);
+
+    if (perScorer.size() != m_scorers.size()) {
+      throw runtime_error("Number of factor specifications does not equal number of interpolated scorers.");
+    }
+
+    size_t idx = 0;
+    while (idx < m_scorers.size()) {
+      m_scorers[idx]->setFactors(perScorer[idx]);
+      ++idx;
+    }
+  }
+}
+
+// Installs the same filter command on every sub-scorer, in order.
+void InterpolatedScorer::setFilter(const string& filterCommand)
+{
+  const size_t count = m_scorers.size();
+  size_t idx = 0;
+  while (idx < count) {
+    m_scorers[idx]->setFilter(filterCommand);
+    ++idx;
+  }
+}
diff --git a/mert/InterpolatedScorer.h b/mert/InterpolatedScorer.h
new file mode 100644
index 000000000..7ee7e5eba
--- /dev/null
+++ b/mert/InterpolatedScorer.h
@@ -0,0 +1,55 @@
+#ifndef MERT_INTERPOLATED_SCORER_H_
+#define MERT_INTERPOLATED_SCORER_H_
+
+#include <string>
+#include <vector>
+#include "Types.h"
+#include "ScoreData.h"
+#include "Scorer.h"
+#include "ScopedVector.h"
+
+/**
+ * Class that combines several other scorers, e.g. an
+ * interpolated HAMMING and BLEU scorer **/
+class InterpolatedScorer : public Scorer
+{
+public:
+ // name would be: "HAMMING,BLEU" or similar
+ InterpolatedScorer(const string& name, const string& config);
+ virtual ~InterpolatedScorer() {}
+
+ virtual void score(const candidates_t& candidates, const diffs_t& diffs,
+ statscores_t& scores) const;
+
+ virtual void setReferenceFiles(const vector<string>& referenceFiles);
+ virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
+
+ virtual size_t NumberOfScores() const {
+ size_t sz = 0;
+ for (ScopedVector<Scorer>::const_iterator itsc = m_scorers.begin();
+ itsc != m_scorers.end(); ++itsc) {
+ sz += (*itsc)->NumberOfScores();
+ }
+ return sz;
+ }
+
+ virtual void setScoreData(ScoreData* data);
+
+ /**
+ * Set the factors, which should be used for this metric
+ */
+ virtual void setFactors(const string& factors);
+
+ virtual void setFilter(const string& filterCommand);
+
+protected:
+ ScopedVector<Scorer> m_scorers;
+
+ // Takes ownership of the heap-allocated ScoreData objects
+ // associated with the Scorer objects.
+ ScopedVector<ScoreData> m_scorers_score_data;
+
+ vector<float> m_scorer_weights;
+};
+
+#endif // MERT_INTERPOLATED_SCORER_H_
diff --git a/mert/Jamfile b/mert/Jamfile
index b23078fbe..2eaa7143c 100644
--- a/mert/Jamfile
+++ b/mert/Jamfile
@@ -4,33 +4,44 @@ lib m ;
lib mert_lib :
Util.cpp
+GzFileBuf.cpp
FileStream.cpp
Timer.cpp
-ScoreStats.cpp ScoreArray.cpp ScoreData.cpp
+ScoreStats.cpp
+ScoreArray.cpp
+ScoreData.cpp
ScoreDataIterator.cpp
-FeatureStats.cpp FeatureArray.cpp FeatureData.cpp
+FeatureStats.cpp
+FeatureArray.cpp
+FeatureData.cpp
FeatureDataIterator.cpp
Data.cpp
BleuScorer.cpp
+SemposScorer.cpp
+SemposOverlapping.cpp
+InterpolatedScorer.cpp
Point.cpp
PerScorer.cpp
Scorer.cpp
ScorerFactory.cpp
Optimizer.cpp
-TERsrc/alignmentStruct.cpp
-TERsrc/hashMap.cpp
-TERsrc/hashMapStringInfos.cpp
-TERsrc/stringHasher.cpp
-TERsrc/terAlignment.cpp
-TERsrc/terShift.cpp
-TERsrc/hashMapInfos.cpp
-TERsrc/infosHasher.cpp
-TERsrc/stringInfosHasher.cpp
-TERsrc/tercalc.cpp
-TERsrc/tools.cpp
+OptimizerFactory.cpp
+TER/alignmentStruct.cpp
+TER/hashMap.cpp
+TER/hashMapStringInfos.cpp
+TER/stringHasher.cpp
+TER/terAlignment.cpp
+TER/terShift.cpp
+TER/hashMapInfos.cpp
+TER/infosHasher.cpp
+TER/stringInfosHasher.cpp
+TER/tercalc.cpp
+TER/tools.cpp
TerScorer.cpp
CderScorer.cpp
MergeScorer.cpp
+Vocabulary.cpp
+PreProcessFilter.cpp
../util//kenutil m ..//z ;
exe mert : mert.cpp mert_lib ../moses/src//ThreadPool ;
@@ -43,6 +54,16 @@ exe pro : pro.cpp mert_lib ..//boost_program_options ;
alias programs : mert extractor evaluator pro ;
+unit-test bleu_scorer_test : BleuScorerTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test feature_data_test : FeatureDataTest.cpp mert_lib ..//boost_unit_test_framework ;
unit-test data_test : DataTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test ngram_test : NgramTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test optimizer_factory_test : OptimizerFactoryTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test point_test : PointTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test reference_test : ReferenceTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test singleton_test : SingletonTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test timer_test : TimerTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test util_test : UtilTest.cpp mert_lib ..//boost_unit_test_framework ;
+unit-test vocabulary_test : VocabularyTest.cpp mert_lib ..//boost_unit_test_framework ;
install legacy : programs : <location>. ;
diff --git a/mert/MergeScorer.cpp b/mert/MergeScorer.cpp
index 0f0da39c3..7a80f1477 100644
--- a/mert/MergeScorer.cpp
+++ b/mert/MergeScorer.cpp
@@ -8,13 +8,14 @@
#include "PerScorer.h"
#include "CderScorer.h"
-#include "TERsrc/tercalc.h"
-#include "TERsrc/terAlignment.h"
+#include "TER/tercalc.h"
+#include "TER/terAlignment.h"
using namespace TERCpp;
MergeScorer::MergeScorer(const string& config)
- : StatisticsBasedScorer("MERGE",config), kLENGTH(4) {}
+ : StatisticsBasedScorer("MERGE", config) {}
+
MergeScorer::~MergeScorer() {}
void MergeScorer::setReferenceFiles(const vector<string>& referenceFiles)
diff --git a/mert/MergeScorer.h b/mert/MergeScorer.h
index cc657b718..2d7030421 100644
--- a/mert/MergeScorer.h
+++ b/mert/MergeScorer.h
@@ -1,5 +1,5 @@
-#ifndef __MERGESCORER_H__
-#define __MERGESCORER_H__
+#ifndef MERT_MERGE_SCORER_H_
+#define MERT_MERGE_SCORER_H_
#include <iostream>
#include <set>
@@ -13,6 +13,8 @@ using namespace std;
class PerScorer;
class ScoreStats;
+const int kMergeScorerLength = 4;
+
/**
* Merge scoring.
*/
@@ -23,21 +25,16 @@ public:
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
-
- void whoami() const {
- cerr << "I AM MergeScorer" << endl;
- }
+ virtual size_t NumberOfScores() const { return 0; }
protected:
friend class PerScorer;
virtual float calculateScore(const vector<int>& comps) const;
private:
- const int kLENGTH;
-
// no copying allowed
MergeScorer(const MergeScorer&);
MergeScorer& operator=(const MergeScorer&);
};
-#endif //__TERSCORER_H
+#endif // MERT_MERGE_SCORER_H_
diff --git a/mert/Ngram.h b/mert/Ngram.h
new file mode 100644
index 000000000..846604f3f
--- /dev/null
+++ b/mert/Ngram.h
@@ -0,0 +1,98 @@
+#ifndef MERT_NGRAM_H_
+#define MERT_NGRAM_H_
+
+#include <vector>
+#include <map>
+#include <string>
+
+/** A simple std::map-based container of n-gram counts. Basically, we provide
+ * typical accessors and mutators, but we intentionally do not allow
+ * erasing individual elements.
+ */
+class NgramCounts {
+ public:
+ // Used to construct the ngram map
+ struct NgramComparator {
+ bool operator()(const std::vector<int>& a, const std::vector<int>& b) const {
+ std::size_t i;
+ const std::size_t as = a.size();
+ const std::size_t bs = b.size();
+ for (i = 0; i < as && i < bs; ++i) {
+ if (a[i] < b[i]) {
+ return true;
+ }
+ if (a[i] > b[i]) {
+ return false;
+ }
+ }
+ // entries are equal, shortest wins
+ return as < bs;
+ }
+ };
+
+ typedef std::vector<int> Key;
+ typedef int Value;
+ typedef std::map<Key, Value, NgramComparator>::iterator iterator;
+ typedef std::map<Key, Value, NgramComparator>::const_iterator const_iterator;
+
+ NgramCounts() : kDefaultCount(1) { }
+ virtual ~NgramCounts() { }
+
+ /**
+ * If the specified "ngram" is found, we increment its count by one.
+ * If not, we insert it into the container with the default count. */
+ void Add(const Key& ngram) {
+ const_iterator it = find(ngram);
+ if (it != end()) {
+ m_counts[ngram] = it->second + 1;
+ } else {
+ m_counts[ngram] = kDefaultCount;
+ }
+ }
+
+ /**
+ * Return true iff the specified "ngram" is found in the container.
+ */
+ bool Lookup(const Key& ngram, Value* v) const {
+ const_iterator it = m_counts.find(ngram);
+ if (it == m_counts.end()) return false;
+ *v = it->second;
+ return true;
+ }
+
+ /**
+ * Clear all elements in the container.
+ */
+ void clear() { m_counts.clear(); }
+
+ /**
+ * Return true iff the container is empty.
+ */
+ bool empty() const { return m_counts.empty(); }
+
+ /**
+ * Return the number of elements in the container.
+ */
+ std::size_t size() const { return m_counts.size(); }
+
+ std::size_t max_size() const { return m_counts.max_size(); }
+
+ // Note: This is mainly used by unit tests.
+ int get_default_count() const { return kDefaultCount; }
+
+ iterator find(const Key& ngram) { return m_counts.find(ngram); }
+ const_iterator find(const Key& ngram) const { return m_counts.find(ngram); }
+
+ Value& operator[](const Key& ngram) { return m_counts[ngram]; }
+
+ iterator begin() { return m_counts.begin(); }
+ const_iterator begin() const { return m_counts.begin(); }
+ iterator end() { return m_counts.end(); }
+ const_iterator end() const { return m_counts.end(); }
+
+ private:
+ const int kDefaultCount;
+ std::map<Key, Value, NgramComparator> m_counts;
+};
+
+#endif // MERT_NGRAM_H_
diff --git a/mert/NgramTest.cpp b/mert/NgramTest.cpp
new file mode 100644
index 000000000..f2a8eb58b
--- /dev/null
+++ b/mert/NgramTest.cpp
@@ -0,0 +1,83 @@
+#include "Ngram.h"
+
+#define BOOST_TEST_MODULE MertNgram
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_CASE(ngram_basic) {
+ NgramCounts counts;
+ NgramCounts::Key key;
+ key.push_back(1);
+ key.push_back(2);
+ key.push_back(4);
+ counts.Add(key);
+
+ BOOST_REQUIRE(!counts.empty());
+ BOOST_CHECK_EQUAL(counts.size(), 1);
+
+ NgramCounts::const_iterator it = counts.find(key);
+ BOOST_CHECK(it != counts.end());
+ BOOST_CHECK_EQUAL(it->first.size(), key.size());
+ for (size_t i = 0; i < key.size(); ++i) {
+ BOOST_CHECK_EQUAL(it->first[i], key[i]);
+ }
+ BOOST_CHECK_EQUAL(it->second, 1);
+}
+
+BOOST_AUTO_TEST_CASE(ngram_Add) {
+ NgramCounts counts;
+ NgramCounts::Key key;
+ key.push_back(1);
+ key.push_back(2);
+ counts.Add(key);
+ BOOST_REQUIRE(!counts.empty());
+ BOOST_CHECK_EQUAL(counts[key], counts.get_default_count());
+
+ NgramCounts::Key key2;
+ key2.push_back(1);
+ key2.push_back(2);
+ counts.Add(key2);
+ BOOST_CHECK_EQUAL(counts.size(), 1);
+ BOOST_CHECK_EQUAL(counts[key], counts.get_default_count() + 1);
+ BOOST_CHECK_EQUAL(counts[key2], counts.get_default_count() + 1);
+
+ NgramCounts::Key key3;
+ key3.push_back(10);
+ counts.Add(key3);
+ BOOST_CHECK_EQUAL(counts.size(), 2);
+ BOOST_CHECK_EQUAL(counts[key3], counts.get_default_count());
+}
+
+BOOST_AUTO_TEST_CASE(ngram_lookup) {
+ NgramCounts counts;
+ NgramCounts::Key key;
+ key.push_back(1);
+ key.push_back(2);
+ key.push_back(4);
+ counts.Add(key);
+
+ {
+ NgramCounts::Value v;
+ BOOST_REQUIRE(counts.Lookup(key, &v));
+ BOOST_CHECK_EQUAL(v, 1);
+ }
+
+ // the case the key is not found.
+ {
+ NgramCounts::Key key2;
+ key2.push_back(0);
+ key2.push_back(4);
+ NgramCounts::Value v;
+ // We only check the return value;
+ // we don't check the value of "v" because it only makes sense
+ // to check the value when the specified ngram is found.
+ BOOST_REQUIRE(!counts.Lookup(key2, &v));
+ }
+
+ // test after clear
+ counts.clear();
+ BOOST_CHECK(counts.empty());
+ {
+ NgramCounts::Value v;
+ BOOST_CHECK(!counts.Lookup(key, &v));
+ }
+}
diff --git a/mert/Optimizer.cpp b/mert/Optimizer.cpp
index 093c9ac1b..39e9aac1b 100644
--- a/mert/Optimizer.cpp
+++ b/mert/Optimizer.cpp
@@ -7,6 +7,7 @@
#include <map>
#include <cfloat>
#include <iostream>
+#include <stdint.h>
#include "Point.h"
#include "Util.h"
@@ -32,36 +33,25 @@ inline float intersect(float m1, float b1, float m2, float b2)
} // namespace
-
-void Optimizer::SetScorer(Scorer *_scorer)
-{
- scorer = _scorer;
-}
-
-void Optimizer::SetFData(FeatureDataHandle _FData)
-{
- FData = _FData;
-}
-
-Optimizer::Optimizer(unsigned Pd, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
- : scorer(NULL), FData(), number_of_random_directions(nrandom)
+Optimizer::Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<bool>& pos, const vector<parameter_t>& start, unsigned int nrandom)
+ : m_scorer(NULL), m_feature_data(), m_num_random_directions(nrandom), m_positive(pos)
{
- // Warning: the init vector is a full set of parameters, of dimension pdim!
- Point::pdim = Pd;
+ // Warning: the init vector is a full set of parameters, of dimension m_pdim!
+ Point::m_pdim = Pd;
CHECK(start.size() == Pd);
- Point::dim = i2O.size();
- Point::optindices = i2O;
- if (Point::pdim > Point::dim) {
- for (unsigned int i = 0; i < Point::pdim; i++) {
+ Point::m_dim = i2O.size();
+ Point::m_opt_indices = i2O;
+ if (Point::m_pdim > Point::m_dim) {
+ for (unsigned int i = 0; i < Point::m_pdim; i++) {
unsigned int j = 0;
- while (j < Point::dim && i != i2O[j])
+ while (j < Point::m_dim && i != i2O[j])
j++;
- // The index i wasnt found on optindices, it is a fixed index,
+ // The index i wasn't found in m_opt_indices, so it is a fixed index,
// we use the value of the start vector.
- if (j == Point::dim)
- Point::fixedweights[i] = start[i];
+ if (j == Point::m_dim)
+ Point::m_fixed_weights[i] = start[i];
}
}
}
@@ -72,12 +62,11 @@ statscore_t Optimizer::GetStatScore(const Point& param) const
{
vector<unsigned> bests;
Get1bests(param, bests);
- //copy(bests.begin(),bests.end(),ostream_iterator<unsigned>(cerr," "));
statscore_t score = GetStatScore(bests);
return score;
}
-map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap, float newt, pair<unsigned,unsigned> newdiff)
+map<float,diff_t >::iterator AddThreshold(map<float,diff_t >& thresholdmap, float newt, const pair<unsigned,unsigned>& newdiff)
{
map<float,diff_t>::iterator it = thresholdmap.find(newt);
if (it != thresholdmap.end()) {
@@ -113,12 +102,12 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
//cerr << "Sentence " << S << endl;
multimap<float, unsigned> gradient;
vector<float> f0;
- f0.resize(FData->get(S).size());
- for (unsigned j = 0; j < FData->get(S).size(); j++) {
+ f0.resize(m_feature_data->get(S).size());
+ for (unsigned j = 0; j < m_feature_data->get(S).size(); j++) {
// gradient of the feature function for this particular target sentence
- gradient.insert(pair<float, unsigned>(direction * (FData->get(S,j)), j));
+ gradient.insert(pair<float, unsigned>(direction * (m_feature_data->get(S,j)), j));
// compute the feature function at the origin point
- f0[j] = origin * FData->get(S, j);
+ f0[j] = origin * m_feature_data->get(S, j);
}
// Now let's compute the 1best for each value of x.
@@ -255,7 +244,16 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
CHECK(scores.size() == thresholdmap.size());
for (unsigned int sc = 0; sc != scores.size(); sc++) {
//cerr << "x=" << thrit->first << " => " << scores[sc] << endl;
- if (scores[sc] > bestscore) {
+
+ //enforce positivity
+ Point respoint = origin + direction * thrit->first;
+ bool is_valid = true;
+ for (unsigned int k=0; k < respoint.getdim(); k++) {
+ if (m_positive[k] && respoint[k] <= 0.0)
+ is_valid = false;
+ }
+
+ if (is_valid && scores[sc] > bestscore) {
// This is the score for the interval [lit2->first, (lit2+1)->first]
// unless we're at the last score, when it's the score
// for the interval [lit2->first,+inf].
@@ -309,7 +307,7 @@ statscore_t Optimizer::LineOptimize(const Point& origin, const Point& direction,
void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
{
- CHECK(FData);
+ CHECK(m_feature_data);
bests.clear();
bests.resize(size());
@@ -317,8 +315,8 @@ void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
float bestfs = MIN_FLOAT;
unsigned idx = 0;
unsigned j;
- for (j = 0; j < FData->get(i).size(); j++) {
- float curfs = P * FData->get(i, j);
+ for (j = 0; j < m_feature_data->get(i).size(); j++) {
+ float curfs = P * m_feature_data->get(i, j);
if (curfs > bestfs) {
bestfs = curfs;
idx = j;
@@ -331,15 +329,15 @@ void Optimizer::Get1bests(const Point& P, vector<unsigned>& bests) const
statscore_t Optimizer::Run(Point& P) const
{
- if (!FData) {
+ if (!m_feature_data) {
cerr << "error trying to optimize without Features loaded" << endl;
exit(2);
}
- if (!scorer) {
+ if (!m_scorer) {
cerr << "error trying to optimize without a Scorer loaded" << endl;
exit(2);
}
- if (scorer->getReferenceSize() != FData->size()) {
+ if (m_scorer->getReferenceSize() != m_feature_data->size()) {
cerr << "error length mismatch between feature file and score file" << endl;
exit(2);
}
@@ -360,13 +358,13 @@ statscore_t Optimizer::Run(Point& P) const
}
-vector<statscore_t> Optimizer::GetIncStatScore(vector<unsigned> thefirst, vector<vector <pair<unsigned,unsigned> > > thediffs) const
+vector<statscore_t> Optimizer::GetIncStatScore(const vector<unsigned>& thefirst, const vector<vector <pair<unsigned,unsigned> > >& thediffs) const
{
- CHECK(scorer);
+ CHECK(m_scorer);
vector<statscore_t> theres;
- scorer->score(thefirst, thediffs, theres);
+ m_scorer->score(thefirst, thediffs, theres);
return theres;
}
@@ -393,7 +391,7 @@ statscore_t SimpleOptimizer::TrueRun(Point& P) const
Point linebest;
- for (unsigned int d = 0; d < Point::getdim()+number_of_random_directions; d++) {
+ for (unsigned int d = 0; d < Point::getdim() + m_num_random_directions; d++) {
if (verboselevel() > 4) {
// cerr<<"minimizing along direction "<<d<<endl;
cerr << "starting point: " << P << " => " << prevscore << endl;
@@ -441,7 +439,7 @@ statscore_t RandomDirectionOptimizer::TrueRun(Point& P) const
// do specified number of random direction optimizations
unsigned int nrun = 0;
unsigned int nrun_no_change = 0;
- for (; nrun_no_change < number_of_random_directions; nrun++, nrun_no_change++)
+ for (; nrun_no_change < m_num_random_directions; nrun++, nrun_no_change++)
{
// choose a random direction in which to optimize
Point direction;
@@ -474,63 +472,3 @@ statscore_t RandomOptimizer::TrueRun(Point& P) const
P.SetScore(score);
return score;
}
-
-//--------------------------------------
-
-vector<string> OptimizerFactory::typenames;
-
-void OptimizerFactory::SetTypeNames()
-{
- if (typenames.empty()) {
- typenames.resize(NOPTIMIZER);
- typenames[POWELL]="powell";
- typenames[RANDOM_DIRECTION]="random-direction";
- typenames[RANDOM]="random";
- // Add new type there
- }
-}
-vector<string> OptimizerFactory::GetTypeNames()
-{
- if (typenames.empty())
- SetTypeNames();
- return typenames;
-}
-
-OptimizerFactory::OptType OptimizerFactory::GetOType(const string& type)
-{
- unsigned int thetype;
- if (typenames.empty())
- SetTypeNames();
- for (thetype = 0; thetype < typenames.size(); thetype++)
- if (typenames[thetype] == type)
- break;
- return((OptType)thetype);
-}
-
-Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim, vector<unsigned> i2o, vector<parameter_t> start, const string& type, unsigned int nrandom)
-{
- OptType T = GetOType(type);
- if (T == NOPTIMIZER) {
- cerr << "Error: unknown Optimizer type " << type << endl;
- cerr << "Known Algorithm are:" << endl;
- unsigned int thetype;
- for (thetype = 0; thetype < typenames.size(); thetype++)
- cerr << typenames[thetype] << endl;
- throw ("unknown Optimizer Type");
- }
-
- switch ((OptType)T) {
- case POWELL:
- return new SimpleOptimizer(dim, i2o, start, nrandom);
- break;
- case RANDOM_DIRECTION:
- return new RandomDirectionOptimizer(dim, i2o, start, nrandom);
- break;
- case RANDOM:
- return new RandomOptimizer(dim, i2o, start, nrandom);
- break;
- default:
- cerr << "Error: unknown optimizer" << type << endl;
- return NULL;
- }
-}
diff --git a/mert/Optimizer.h b/mert/Optimizer.h
index 69c7a7641..218a7b7e6 100644
--- a/mert/Optimizer.h
+++ b/mert/Optimizer.h
@@ -1,5 +1,5 @@
-#ifndef OPTIMIZER_H
-#define OPTIMIZER_H
+#ifndef MERT_OPTIMIZER_H_
+#define MERT_OPTIMIZER_H_
#include <vector>
#include <string>
@@ -10,7 +10,7 @@
using namespace std;
-typedef float featurescore;
+static const float kMaxFloat = numeric_limits<float>::max();
class Point;
@@ -20,18 +20,21 @@ class Point;
class Optimizer
{
protected:
- Scorer *scorer; // no accessor for them only child can use them
- FeatureDataHandle FData; // no accessor for them only child can use them
- unsigned int number_of_random_directions;
+ Scorer *m_scorer; // no accessor for them only child can use them
+ FeatureDataHandle m_feature_data; // no accessor for them only child can use them
+ unsigned int m_num_random_directions;
+
+ const vector<bool>& m_positive;
public:
- Optimizer(unsigned Pd, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom);
- void SetScorer(Scorer *_scorer);
- void SetFData(FeatureDataHandle _FData);
+ Optimizer(unsigned Pd, const vector<unsigned>& i2O, const vector<bool>& positive, const vector<parameter_t>& start, unsigned int nrandom);
+
+ void SetScorer(Scorer *scorer) { m_scorer = scorer; }
+ void SetFeatureData(FeatureDataHandle feature_data) { m_feature_data = feature_data; }
virtual ~Optimizer();
unsigned size() const {
- return FData ? FData->size() : 0;
+ return m_feature_data ? m_feature_data->size() : 0;
}
/**
@@ -53,12 +56,12 @@ public:
* Given a set of nbests, get the Statistical score.
*/
statscore_t GetStatScore(const vector<unsigned>& nbests) const {
- return scorer->score(nbests);
+ return m_scorer->score(nbests);
}
statscore_t GetStatScore(const Point& param) const;
- vector<statscore_t> GetIncStatScore(vector<unsigned> ref, vector<vector<pair<unsigned,unsigned> > >) const;
+ vector<statscore_t> GetIncStatScore(const vector<unsigned>& ref, const vector<vector<pair<unsigned,unsigned> > >& diffs) const;
/**
* Get the optimal Lambda and the best score in a particular direction from a given Point.
@@ -76,8 +79,9 @@ class SimpleOptimizer : public Optimizer
private:
const float kEPS;
public:
- SimpleOptimizer(unsigned dim, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
- : Optimizer(dim, i2O, start,nrandom), kEPS(0.0001) {}
+ SimpleOptimizer(unsigned dim, const vector<unsigned>& i2O, const vector<bool>& positive,
+ const vector<parameter_t>& start, unsigned int nrandom)
+ : Optimizer(dim, i2O, positive, start,nrandom), kEPS(0.0001) {}
virtual statscore_t TrueRun(Point&) const;
};
@@ -89,8 +93,9 @@ class RandomDirectionOptimizer : public Optimizer
private:
const float kEPS;
public:
- RandomDirectionOptimizer(unsigned dim, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
- : Optimizer(dim, i2O, start, nrandom), kEPS(0.0001) {}
+ RandomDirectionOptimizer(unsigned dim, const vector<unsigned>& i2O, const vector<bool>& positive,
+ const vector<parameter_t>& start, unsigned int nrandom)
+ : Optimizer(dim, i2O, positive, start, nrandom), kEPS(0.0001) {}
virtual statscore_t TrueRun(Point&) const;
};
@@ -100,36 +105,10 @@ public:
class RandomOptimizer : public Optimizer
{
public:
- RandomOptimizer(unsigned dim, vector<unsigned> i2O, vector<parameter_t> start, unsigned int nrandom)
- : Optimizer(dim, i2O, start, nrandom) {}
+ RandomOptimizer(unsigned dim, const vector<unsigned>& i2O, const vector<bool>& positive,
+ const vector<parameter_t>& start, unsigned int nrandom)
+ : Optimizer(dim, i2O, positive, start, nrandom) {}
virtual statscore_t TrueRun(Point&) const;
};
-class OptimizerFactory
-{
-public:
- static vector<string> GetTypeNames();
- static Optimizer* BuildOptimizer(unsigned dim, vector<unsigned> tooptimize, vector<parameter_t> start, const string& type, unsigned int nrandom);
-
-private:
- OptimizerFactory() {}
- ~OptimizerFactory() {}
-
- // Add new optimizer here BEFORE NOPTIMZER
- enum OptType {
- POWELL = 0,
- RANDOM_DIRECTION = 1,
- RANDOM,
- NOPTIMIZER
- };
-
- // Get optimizer type.
- static OptType GetOType(const string& type);
-
- // Setup optimization types.
- static void SetTypeNames();
-
- static vector<string> typenames;
-};
-
#endif // OPTIMIZER_H
diff --git a/mert/OptimizerFactory.cpp b/mert/OptimizerFactory.cpp
new file mode 100644
index 000000000..6cafd15b0
--- /dev/null
+++ b/mert/OptimizerFactory.cpp
@@ -0,0 +1,67 @@
+#include "OptimizerFactory.h"
+#include "Optimizer.h"
+
+using namespace std;
+
+vector<string> OptimizerFactory::m_type_names;
+
+void OptimizerFactory::SetTypeNames()
+{
+ if (m_type_names.empty()) {
+ m_type_names.resize(NOPTIMIZER);
+ m_type_names[POWELL] = "powell";
+ m_type_names[RANDOM_DIRECTION] = "random-direction";
+ m_type_names[RANDOM] = "random";
+ // Add new types here
+ }
+}
+vector<string> OptimizerFactory::GetTypeNames()
+{
+ if (m_type_names.empty())
+ SetTypeNames();
+ return m_type_names;
+}
+
+OptimizerFactory::OptimizerType OptimizerFactory::GetOptimizerType(const string& type)
+{
+ unsigned int t;
+ if (m_type_names.empty())
+ SetTypeNames();
+ for (t = 0; t < m_type_names.size(); t++)
+ if (m_type_names[t] == type)
+ break;
+ return((OptimizerType)t);
+}
+
+Optimizer* OptimizerFactory::BuildOptimizer(unsigned dim,
+ const vector<unsigned>& i2o,
+ const std::vector<bool>& positive,
+ const vector<parameter_t>& start,
+ const string& type,
+ unsigned int nrandom)
+{
+ OptimizerType opt_type = GetOptimizerType(type);
+ if (opt_type == NOPTIMIZER) {
+ cerr << "Error: unknown Optimizer type " << type << endl;
+ cerr << "Known Algorithm are:" << endl;
+ unsigned int t;
+ for (t = 0; t < m_type_names.size(); t++)
+ cerr << m_type_names[t] << endl;
+ throw ("unknown Optimizer Type");
+ }
+
+ switch (opt_type) {
+ case POWELL:
+ return new SimpleOptimizer(dim, i2o, positive, start, nrandom);
+ break;
+ case RANDOM_DIRECTION:
+ return new RandomDirectionOptimizer(dim, i2o, positive, start, nrandom);
+ break;
+ case RANDOM:
+ return new RandomOptimizer(dim, i2o, positive, start, nrandom);
+ break;
+ default:
+ cerr << "Error: unknown optimizer" << type << endl;
+ return NULL;
+ }
+}
diff --git a/mert/OptimizerFactory.h b/mert/OptimizerFactory.h
new file mode 100644
index 000000000..3d8716115
--- /dev/null
+++ b/mert/OptimizerFactory.h
@@ -0,0 +1,42 @@
+#ifndef MERT_OPTIMIZER_FACTORY_H_
+#define MERT_OPTIMIZER_FACTORY_H_
+
+#include <vector>
+#include "Types.h"
+
+class Optimizer;
+
+class OptimizerFactory
+{
+ public:
+ // NOTE: Add new optimizer here BEFORE NOPTIMIZER
+ enum OptimizerType {
+ POWELL = 0,
+ RANDOM_DIRECTION = 1,
+ RANDOM,
+ NOPTIMIZER
+ };
+
+ static std::vector<string> GetTypeNames();
+
+ // Setup optimization types.
+ static void SetTypeNames();
+
+ // Get optimizer type.
+ static OptimizerType GetOptimizerType(const std::string& type);
+
+ static Optimizer* BuildOptimizer(unsigned dim,
+ const std::vector<unsigned>& to_optimize,
+ const std::vector<bool>& positive,
+ const std::vector<parameter_t>& start,
+ const std::string& type,
+ unsigned int nrandom);
+
+ private:
+ OptimizerFactory() {}
+ ~OptimizerFactory() {}
+
+ static vector<string> m_type_names;
+};
+
+#endif // MERT_OPTIMIZER_FACTORY_H_
diff --git a/mert/OptimizerFactoryTest.cpp b/mert/OptimizerFactoryTest.cpp
new file mode 100644
index 000000000..53c2d252a
--- /dev/null
+++ b/mert/OptimizerFactoryTest.cpp
@@ -0,0 +1,49 @@
+#include "OptimizerFactory.h"
+#include "Optimizer.h"
+
+#define BOOST_TEST_MODULE MertOptimizerFactory
+#include <boost/test/unit_test.hpp>
+#include <boost/scoped_ptr.hpp>
+
+namespace {
+
+inline bool CheckBuildOptimizer(unsigned dim,
+ const vector<unsigned>& to_optimize,
+ const vector<bool>& positive,
+ const vector<parameter_t>& start,
+ const string& type,
+ unsigned int num_random) {
+ boost::scoped_ptr<Optimizer> optimizer(OptimizerFactory::BuildOptimizer(dim, to_optimize, positive, start, type, num_random));
+ return optimizer.get() != NULL;
+}
+
+} // namespace
+
+BOOST_AUTO_TEST_CASE(optimizer_type) {
+ BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("powell"),
+ OptimizerFactory::POWELL);
+ BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random"),
+ OptimizerFactory::RANDOM);
+ BOOST_CHECK_EQUAL(OptimizerFactory::GetOptimizerType("random-direction"),
+ OptimizerFactory::RANDOM_DIRECTION);
+}
+
+BOOST_AUTO_TEST_CASE(optimizer_build) {
+ const unsigned dim = 3;
+ std::vector<unsigned> to_optimize;
+ to_optimize.push_back(1);
+ to_optimize.push_back(2);
+ to_optimize.push_back(3);
+ std::vector<parameter_t> start;
+ start.push_back(0.3);
+ start.push_back(0.1);
+ start.push_back(0.2);
+ const unsigned int num_random = 1;
+ std::vector<bool> positive(dim);
+ for (unsigned int k = 0; k < dim; k++)
+ positive[k] = false;
+
+ BOOST_CHECK(CheckBuildOptimizer(dim, to_optimize, positive, start, "powell", num_random));
+ BOOST_CHECK(CheckBuildOptimizer(dim, to_optimize, positive, start, "random", num_random));
+ BOOST_CHECK(CheckBuildOptimizer(dim, to_optimize, positive, start, "random-direction", num_random));
+}
diff --git a/mert/PerScorer.cpp b/mert/PerScorer.cpp
index 76c2765dd..67b633872 100644
--- a/mert/PerScorer.cpp
+++ b/mert/PerScorer.cpp
@@ -29,6 +29,7 @@ void PerScorer::setReferenceFiles(const vector<string>& referenceFiles)
string line;
int sid = 0;
while (getline(in,line)) {
+ line = this->preprocessSentence(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_ref_tokens.push_back(multiset<int>());
@@ -52,10 +53,13 @@ void PerScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
msg << "Sentence id (" << sid << ") not found in reference set";
throw runtime_error(msg.str());
}
+
+ string sentence = this->preprocessSentence(text);
+
// Calculate correct, output_length and ref_length for
// the line and store it in entry
vector<int> testtokens;
- TokenizeAndEncode(text, testtokens);
+ TokenizeAndEncode(sentence, testtokens);
multiset<int> testtokens_all(testtokens.begin(),testtokens.end());
set<int> testtokens_unique(testtokens.begin(),testtokens.end());
int correct = 0;
diff --git a/mert/PerScorer.h b/mert/PerScorer.h
index f06e2955a..d32e14029 100644
--- a/mert/PerScorer.h
+++ b/mert/PerScorer.h
@@ -1,9 +1,7 @@
-#ifndef __PERSCORER_H__
-#define __PERSCORER_H__
+#ifndef MERT_PER_SCORER_H_
+#define MERT_PER_SCORER_H_
-#include <iostream>
#include <set>
-#include <sstream>
#include <string>
#include <vector>
#include "Types.h"
@@ -27,18 +25,9 @@ public:
virtual void setReferenceFiles(const vector<string>& referenceFiles);
virtual void prepareStats(size_t sid, const string& text, ScoreStats& entry);
-
- virtual size_t NumberOfScores() const {
- // cerr << "PerScorer: 3" << endl;
- return 3;
- }
-
+ virtual size_t NumberOfScores() const { return 3; }
virtual float calculateScore(const vector<int>& comps) const;
- void whoami() const {
- cerr << "I AM PerScorer" << std::endl;
- }
-
private:
// no copying allowed
PerScorer(const PerScorer&);
@@ -49,4 +38,4 @@ private:
vector<multiset<int> > m_ref_tokens;
};
-#endif // __PERSCORER_H__
+#endif // MERT_PER_SCORER_H_
diff --git a/mert/Point.cpp b/mert/Point.cpp
index f298647dd..299e2b4d0 100644
--- a/mert/Point.cpp
+++ b/mert/Point.cpp
@@ -3,45 +3,46 @@
#include <cmath>
#include <cstdlib>
#include "util/check.hh"
-#include <limits>
#include "FeatureStats.h"
+#include "Optimizer.h"
using namespace std;
-vector<unsigned> Point::optindices;
+vector<unsigned> Point::m_opt_indices;
-unsigned Point::dim = 0;
+unsigned Point::m_dim = 0;
-map<unsigned,statscore_t> Point::fixedweights;
+map<unsigned,statscore_t> Point::m_fixed_weights;
-unsigned Point::pdim = 0;
-unsigned Point::ncall = 0;
+unsigned Point::m_pdim = 0;
+unsigned Point::m_ncall = 0;
vector<parameter_t> Point::m_min;
vector<parameter_t> Point::m_max;
-Point::Point() : vector<parameter_t>(dim), score_(0.0) {}
+Point::Point() : vector<parameter_t>(m_dim), m_score(0.0) {}
+// Can initialize from a vector of size m_dim or m_pdim
+//Can initialize from a vector of dim or m_pdim
Point::Point(const vector<parameter_t>& init,
const vector<parameter_t>& min,
const vector<parameter_t>& max)
- : vector<parameter_t>(Point::dim), score_(0.0)
+ : vector<parameter_t>(Point::m_dim), m_score(0.0)
{
- m_min.resize(Point::dim);
- m_max.resize(Point::dim);
- if(init.size()==dim) {
- for (unsigned int i=0; i<Point::dim; i++) {
- operator[](i)=init[i];
+ m_min.resize(Point::m_dim);
+ m_max.resize(Point::m_dim);
+ if (init.size() == m_dim) {
+ for (unsigned int i = 0; i < Point::m_dim; i++) {
+ operator[](i) = init[i];
m_min[i] = min[i];
m_max[i] = max[i];
}
} else {
- CHECK(init.size()==pdim);
- for (unsigned int i=0; i<Point::dim; i++) {
- operator[](i)=init[optindices[i]];
- m_min[i] = min[optindices[i]];
- m_max[i] = max[optindices[i]];
+ CHECK(init.size() == m_pdim);
+ CHECK(m_opt_indices.size() == Point::m_dim);
+ for (unsigned int i = 0; i < Point::m_dim; i++) {
+ operator[](i) = init[m_opt_indices[i]];
+ m_min[i] = min[m_opt_indices[i]];
+ m_max[i] = max[m_opt_indices[i]];
}
}
}
@@ -50,9 +51,9 @@ Point::~Point() {}
void Point::Randomize()
{
- CHECK(m_min.size()==Point::dim);
- CHECK(m_max.size()==Point::dim);
- for (unsigned int i=0; i<size(); i++) {
+ CHECK(m_min.size() == Point::m_dim);
+ CHECK(m_max.size() == Point::m_dim);
+ for (unsigned int i = 0; i < size(); i++) {
operator[](i) = m_min[i] +
static_cast<float>(random()) / static_cast<float>(RAND_MAX) * (m_max[i] - m_min[i]);
}
@@ -60,21 +61,22 @@ void Point::Randomize()
double Point::operator*(const FeatureStats& F) const
{
- ncall++; // to track performance
- double prod=0.0;
- if(OptimizeAll())
+ m_ncall++; // to track performance
+ double prod = 0.0;
+ if (OptimizeAll())
for (unsigned i=0; i<size(); i++)
- prod+= operator[](i)*F.get(i);
+ prod += operator[](i) * F.get(i);
else {
- for (unsigned i=0; i<size(); i++)
- prod+= operator[](i)*F.get(optindices[i]);
- for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
- prod+=it->second*F.get(it->first);
+ for (unsigned i = 0; i < size(); i++)
+ prod += operator[](i) * F.get(m_opt_indices[i]);
+ for(map<unsigned, float>::iterator it = m_fixed_weights.begin();
+ it != m_fixed_weights.end(); ++it)
+ prod += it->second * F.get(it->first);
}
return prod;
}
-Point Point::operator+(const Point& p2) const
+const Point Point::operator+(const Point& p2) const
{
CHECK(p2.size() == size());
Point Res(*this);
@@ -82,7 +84,7 @@ Point Point::operator+(const Point& p2) const
Res[i] += p2[i];
}
- Res.score_ = numeric_limits<statscore_t>::max();
+ Res.m_score = kMaxFloat;
return Res;
}
@@ -92,23 +94,24 @@ void Point::operator+=(const Point& p2)
for (unsigned i = 0; i < size(); i++) {
operator[](i) += p2[i];
}
- score_ = numeric_limits<statscore_t>::max();
+ m_score = kMaxFloat;
}
-Point Point::operator*(float l) const
+const Point Point::operator*(float l) const
{
Point Res(*this);
for (unsigned i = 0; i < size(); i++) {
Res[i] *= l;
}
- Res.score_ = numeric_limits<statscore_t>::max();
+ Res.m_score = kMaxFloat;
return Res;
}
ostream& operator<<(ostream& o, const Point& P)
{
- vector<parameter_t> w = P.GetAllWeights();
- for (unsigned int i = 0; i < Point::pdim; i++) {
+ vector<parameter_t> w;
+ P.GetAllWeights(w);
+ for (unsigned int i = 0; i < Point::m_pdim; i++) {
o << w[i] << " ";
}
return o;
@@ -117,39 +120,39 @@ ostream& operator<<(ostream& o, const Point& P)
void Point::NormalizeL2()
{
parameter_t norm=0.0;
- for (unsigned int i=0; i<size(); i++)
- norm+= operator[](i)*operator[](i);
- if(norm!=0.0) {
- norm=sqrt(norm);
- for (unsigned int i=0; i<size(); i++)
- operator[](i)/=norm;
+ for (unsigned int i = 0; i < size(); i++)
+ norm += operator[](i) * operator[](i);
+ if (norm != 0.0) {
+ norm = sqrt(norm);
+ for (unsigned int i = 0; i < size(); i++)
+ operator[](i) /= norm;
}
}
void Point::NormalizeL1()
{
- parameter_t norm=0.0;
- for (unsigned int i=0; i<size(); i++)
- norm+= abs(operator[](i));
- if(norm!=0.0) {
- for (unsigned int i=0; i<size(); i++)
- operator[](i)/=norm;
+ parameter_t norm = 0.0;
+ for (unsigned int i = 0; i < size(); i++)
+ norm += abs(operator[](i));
+ if (norm != 0.0) {
+ for (unsigned int i = 0; i < size(); i++)
+ operator[](i) /= norm;
}
}
-vector<parameter_t> Point::GetAllWeights()const
+void Point::GetAllWeights(vector<parameter_t>& w) const
{
- vector<parameter_t> w;
- if(OptimizeAll()) {
- w=*this;
+ if (OptimizeAll()) {
+ w = *this;
} else {
- w.resize(pdim);
- for (unsigned int i=0; i<size(); i++)
- w[optindices[i]]=operator[](i);
- for(map<unsigned,float >::iterator it=fixedweights.begin(); it!=fixedweights.end(); it++)
+ w.resize(m_pdim);
+ for (unsigned int i = 0; i < size(); i++)
+ w[m_opt_indices[i]] = operator[](i);
+ for (map<unsigned,float>::const_iterator it = m_fixed_weights.begin();
+ it != m_fixed_weights.end(); ++it) {
w[it->first]=it->second;
+ }
}
- return w;
}
diff --git a/mert/Point.h b/mert/Point.h
index 55d173215..46b23c9d9 100644
--- a/mert/Point.h
+++ b/mert/Point.h
@@ -1,7 +1,7 @@
-#ifndef POINT_H
-#define POINT_H
+#ifndef MERT_POINT_H_
+#define MERT_POINT_H_
-#include <fstream>
+#include <ostream>
#include <map>
#include <vector>
#include "Types.h"
@@ -16,52 +16,55 @@ class Optimizer;
class Point : public vector<parameter_t>
{
friend class Optimizer;
+
private:
/**
* The indices over which we optimize.
*/
- static vector<unsigned int> optindices;
+ static vector<unsigned int> m_opt_indices;
/**
- * Dimension of optindices and of the parent vector.
+ * Dimension of m_opt_indices and of the parent vector.
*/
- static unsigned int dim;
+ static unsigned int m_dim;
/**
* Fixed weights in case of partial optimization.
*/
- static map<unsigned int,parameter_t> fixedweights;
+ static map<unsigned int,parameter_t> m_fixed_weights;
/**
* Total size of the parameter space; we have
- * pdim = FixedWeight.size() + optinidices.size().
+ * m_pdim = m_fixed_weights.size() + m_opt_indices.size().
*/
- static unsigned int pdim;
- static unsigned int ncall;
+ static unsigned int m_pdim;
+ static unsigned int m_ncall;
/**
- * The limits for randomization, both vectors are of full length, pdim.
+ * The limits for randomization; the constructor resizes both vectors to m_dim.
*/
static vector<parameter_t> m_min;
static vector<parameter_t> m_max;
- statscore_t score_;
+ statscore_t m_score;
public:
- static unsigned int getdim() {
- return dim;
- }
- static unsigned int getpdim() {
- return pdim;
- }
- static void setpdim(size_t pd) {
- pdim = pd;
+ static unsigned int getdim() { return m_dim; }
+ static void setdim(size_t d) { m_dim = d; }
+
+ static unsigned int getpdim() { return m_pdim; }
+ static void setpdim(size_t pd) { m_pdim = pd; }
+
+ static void set_optindices(const vector<unsigned int>& indices) {
+ m_opt_indices = indices;
}
- static void setdim(size_t d) {
- dim = d;
+
+ static const vector<unsigned int>& get_optindices() {
+ return m_opt_indices;
}
+
static bool OptimizeAll() {
- return fixedweights.empty();
+ return m_fixed_weights.empty();
}
Point();
@@ -74,12 +77,12 @@ public:
// Compute the feature function
double operator*(const FeatureStats&) const;
- Point operator+(const Point&) const;
+ const Point operator+(const Point&) const;
void operator+=(const Point&);
- Point operator*(float) const;
+ const Point operator*(float) const;
/**
- * Write the Whole featureweight to a stream (ie pdim float).
+ * Write the whole feature weight vector to a stream (i.e. m_pdim floats).
*/
friend ostream& operator<<(ostream& o,const Point& P);
@@ -88,16 +91,13 @@ public:
void NormalizeL1();
/**
- * Return a vector of size pdim where all weights have been
+ * Fill w with a vector of size m_pdim where all weights have been
* put (including fixed ones).
*/
- vector<parameter_t> GetAllWeights() const;
-
- statscore_t GetScore() const {
- return score_;
- }
+ void GetAllWeights(vector<parameter_t>& w) const;
- void SetScore(statscore_t score) { score_ = score; }
+ statscore_t GetScore() const { return m_score; }
+ void SetScore(statscore_t score) { m_score = score; }
};
-#endif // POINT_H
+#endif // MERT_POINT_H_
diff --git a/mert/PointTest.cpp b/mert/PointTest.cpp
new file mode 100644
index 000000000..d7d6b031c
--- /dev/null
+++ b/mert/PointTest.cpp
@@ -0,0 +1,60 @@
+#include "Point.h"
+
+#define BOOST_TEST_MODULE MertPoint
+#include <boost/test/unit_test.hpp>
+
+#include "Optimizer.h"
+#include "Util.h"
+
+using namespace std;
+
+BOOST_AUTO_TEST_CASE(point_operators) {
+ const unsigned int dim = 5;
+ vector<float> init(dim);
+ init[0] = 1.0f;
+ init[1] = 1.0f;
+ init[2] = 0.3f;
+ init[3] = 0.2f;
+ init[4] = 0.3f;
+
+ vector<float> min(dim, 0.0f);
+ vector<float> max(dim, 0.0f);
+
+ Point::setdim(dim);
+ BOOST_REQUIRE(dim == Point::getdim());
+
+ // Test operator '+'
+ {
+ Point p1(init, min, max);
+ Point p2(init, min, max);
+ Point p3 = p1 + p2;
+ for (size_t i = 0; i < p3.size(); ++i) {
+ BOOST_CHECK(IsAlmostEqual(init[i] * 2.0f, p3[i]));
+ }
+ BOOST_CHECK_EQUAL(p3.GetScore(), kMaxFloat);
+ }
+
+ // Test operator '+='
+ {
+ Point p1(init, min, max);
+ Point p2(init, min, max);
+ p1 += p2;
+
+ for (size_t i = 0; i < p1.size(); ++i) {
+ BOOST_CHECK(IsAlmostEqual(init[i] * 2.0f, p1[i]));
+ }
+ BOOST_CHECK_EQUAL(p1.GetScore(), kMaxFloat);
+ }
+
+ // Test operator '*'
+ {
+ Point p1(init, min, max);
+ const Point p2 = p1 * 2.0;
+
+ BOOST_REQUIRE(p1.size() == p2.size());
+ for (size_t i = 0; i < p2.size(); ++i) {
+ BOOST_CHECK(IsAlmostEqual(init[i] * 2.0f, p2[i]));
+ }
+ BOOST_CHECK_EQUAL(p2.GetScore(), kMaxFloat);
+ }
+}
diff --git a/mert/PreProcessFilter.cpp b/mert/PreProcessFilter.cpp
new file mode 100644
index 000000000..d72907713
--- /dev/null
+++ b/mert/PreProcessFilter.cpp
@@ -0,0 +1,135 @@
+#include "PreProcessFilter.h"
+
+#include <iostream>
+#include <cstdlib>
+#include <unistd.h>
+#include <csignal>
+
+using namespace std;
+
+#define CHILD_STDIN_READ pipefds_input[0]
+#define CHILD_STDIN_WRITE pipefds_input[1]
+#define CHILD_STDOUT_READ pipefds_output[0]
+#define CHILD_STDOUT_WRITE pipefds_output[1]
+#define CHILD_STDERR_READ pipefds_error[0]
+#define CHILD_STDERR_WRITE pipefds_error[1]
+
+// Child exec error signal
+void exec_failed (int sig)
+{
+ cerr << "Exec failed. Child process couldn't be launched." << endl;
+ exit (EXIT_FAILURE);
+}
+
+PreProcessFilter::PreProcessFilter(const string& filterCommand)
+ : m_toFilter(NULL),
+ m_fromFilter(NULL)
+{
+ // Child error signal install
+ // sigaction is the replacement for the traditional signal() method
+ struct sigaction action;
+ action.sa_handler = exec_failed;
+ sigemptyset(&action.sa_mask);
+ action.sa_flags = 0;
+ if (sigaction(SIGUSR1, &action, NULL) < 0)
+ {
+ perror("SIGUSR1 install error");
+ exit(EXIT_FAILURE);
+ }
+
+ int pipe_status;
+ int pipefds_input[2];
+ int pipefds_output[2];
+ // int pipefds_error[2];
+
+ // Create the pipes
+ // We do this before the fork so both processes will know about
+ // the same pipe and they can communicate.
+
+ pipe_status = pipe(pipefds_input);
+ if (pipe_status == -1)
+ {
+ perror("Error creating the pipe");
+ exit(EXIT_FAILURE);
+ }
+
+ pipe_status = pipe(pipefds_output);
+ if (pipe_status == -1)
+ {
+ perror("Error creating the pipe");
+ exit(EXIT_FAILURE);
+ }
+
+ /*
+ pipe_status = pipe(pipefds_error);
+ if (pipe_status == -1)
+ {
+ perror("Error creating the pipe");
+ exit(EXIT_FAILURE);
+ }
+ */
+
+ pid_t pid;
+ // Create child process; both processes continue from here
+ pid = fork();
+
+ if (pid == pid_t(0))
+ {
+ // Child process
+
+ // When the child process finishes, a SIGCHLD signal is sent
+ // to the parent
+
+ // Tie the standard input, output and error streams to the
+ // appropriate pipe ends
+ // The file descriptor 0 is the standard input
+ // We tie it to the read end of the pipe as we will use
+ // this end of the pipe to read from it
+ dup2 (CHILD_STDIN_READ,0);
+ dup2 (CHILD_STDOUT_WRITE,1);
+ // dup2 (CHILD_STDERR_WRITE,2);
+ // Close in the child the unused ends of the pipes
+ close(CHILD_STDIN_WRITE);
+ close(CHILD_STDOUT_READ);
+ //close(CHILD_STDERR_READ);
+
+ // Execute the program
+ execl("/bin/bash", "bash", "-c", filterCommand.c_str() , (char*)NULL);
+
+ // We should never reach this point
+ // Tell the parent the exec failed
+ kill(getppid(), SIGUSR1);
+ exit(EXIT_FAILURE);
+ }
+ else if (pid > pid_t(0))
+ {
+ // Parent
+
+ // Close in the parent the unused ends of the pipes
+ close(CHILD_STDIN_READ);
+ close(CHILD_STDOUT_WRITE);
+ // close(CHILD_STDERR_WRITE);
+
+ m_toFilter = new ofdstream(CHILD_STDIN_WRITE);
+ m_fromFilter = new ifdstream(CHILD_STDOUT_READ);
+ }
+ else
+ {
+ perror("Error: fork failed");
+ exit(EXIT_FAILURE);
+ }
+}
+
+string PreProcessFilter::ProcessSentence(const string& sentence)
+{
+ *m_toFilter << sentence << "\n";
+ string processedSentence;
+ m_fromFilter->getline(processedSentence);
+ return processedSentence;
+}
+
+PreProcessFilter::~PreProcessFilter()
+{
+ delete m_toFilter;
+ delete m_fromFilter;
+}
diff --git a/mert/PreProcessFilter.h b/mert/PreProcessFilter.h
new file mode 100644
index 000000000..c65c060a4
--- /dev/null
+++ b/mert/PreProcessFilter.h
@@ -0,0 +1,24 @@
+#ifndef MERT_PREPROCESSFILTER_H_
+#define MERT_PREPROCESSFILTER_H_
+
+#include <string>
+
+#include "Fdstream.h"
+
+/*
+ * This class runs the filter command in a child process and
+ * then uses this filter to process the given sentences.
+ */
+class PreProcessFilter
+{
+public:
+ PreProcessFilter(const string& filterCommand);
+ string ProcessSentence(const string& sentence);
+ ~PreProcessFilter();
+
+private:
+ ofdstream* m_toFilter;
+ ifdstream* m_fromFilter;
+};
+
+#endif // MERT_PREPROCESSFILTER_H_
diff --git a/mert/Reference.h b/mert/Reference.h
new file mode 100644
index 000000000..353a3311b
--- /dev/null
+++ b/mert/Reference.h
@@ -0,0 +1,82 @@
+#ifndef MERT_REFERENCE_H_
+#define MERT_REFERENCE_H_
+
+#include <algorithm>
+#include <climits>
+#include <vector>
+
+#include "Ngram.h"
+
+/**
+ * Reference class represents reference translations for an output
+ * translation used in calculating BLEU score.
+ */
+class Reference {
+ public:
+ // for m_length
+ typedef std::vector<size_t>::iterator iterator;
+ typedef std::vector<size_t>::const_iterator const_iterator;
+
+ Reference() : m_counts(new NgramCounts) { }
+ ~Reference() { delete m_counts; }
+
+ NgramCounts* get_counts() { return m_counts; }
+ const NgramCounts* get_counts() const { return m_counts; }
+
+ iterator begin() { return m_length.begin(); }
+ const_iterator begin() const { return m_length.begin(); }
+ iterator end() { return m_length.end(); }
+ const_iterator end() const { return m_length.end(); }
+
+ void push_back(size_t len) { m_length.push_back(len); }
+
+ size_t num_references() const { return m_length.size(); }
+
+ int CalcAverage() const;
+ int CalcClosest(size_t length) const;
+ int CalcShortest() const;
+
+ private:
+ NgramCounts* m_counts;
+
+ // multiple reference lengths
+ std::vector<size_t> m_length;
+};
+
+// TODO(tetsuok): fix this function and related stuff.
+// "average" reference length should not be calculated at sentence-level unlike "closest".
+inline int Reference::CalcAverage() const {
+ int total = 0;
+ for (size_t i = 0; i < m_length.size(); ++i) {
+ total += m_length[i];
+ }
+ return static_cast<int>(
+ static_cast<float>(total) / m_length.size());
+}
+
+inline int Reference::CalcClosest(size_t length) const {
+ int min_diff = INT_MAX;
+ int closest_ref_id = 0; // an index of the closest reference translation
+ for (size_t i = 0; i < m_length.size(); ++i) {
+ const int ref_length = m_length[i];
+ const int length_diff = abs(ref_length - static_cast<int>(length));
+ const int abs_min_diff = abs(min_diff);
+ // Look for the closest reference
+ if (length_diff < abs_min_diff) {
+ min_diff = ref_length - length;
+ closest_ref_id = i;
+ // if two references have the same closest length, take the shortest
+ } else if (length_diff == abs_min_diff) {
+ if (ref_length < static_cast<int>(m_length[closest_ref_id])) {
+ closest_ref_id = i;
+ }
+ }
+ }
+ return static_cast<int>(m_length[closest_ref_id]);
+}
+
+inline int Reference::CalcShortest() const {
+ return *std::min_element(m_length.begin(), m_length.end());
+}
+
+#endif // MERT_REFERENCE_H_
diff --git a/mert/ReferenceTest.cpp b/mert/ReferenceTest.cpp
new file mode 100644
index 000000000..454768195
--- /dev/null
+++ b/mert/ReferenceTest.cpp
@@ -0,0 +1,116 @@
+#include "Reference.h"
+
+#define BOOST_TEST_MODULE MertReference
+#include <boost/test/unit_test.hpp>
+
+BOOST_AUTO_TEST_CASE(refernece_count) {
+ Reference ref;
+ BOOST_CHECK(ref.get_counts() != NULL);
+}
+
+BOOST_AUTO_TEST_CASE(refernece_length_iterator) {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(2);
+ BOOST_REQUIRE(ref.num_references() == 2);
+
+ Reference::iterator it = ref.begin();
+ BOOST_CHECK_EQUAL(*it, 4);
+ ++it;
+ BOOST_CHECK_EQUAL(*it, 2);
+ ++it;
+ BOOST_CHECK(it == ref.end());
+}
+
+BOOST_AUTO_TEST_CASE(refernece_length_average) {
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(1);
+ BOOST_CHECK_EQUAL(2, ref.CalcAverage());
+ }
+
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(3);
+ BOOST_CHECK_EQUAL(3, ref.CalcAverage());
+ }
+
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(3);
+ ref.push_back(4);
+ ref.push_back(5);
+ BOOST_CHECK_EQUAL(4, ref.CalcAverage());
+ }
+}
+
+BOOST_AUTO_TEST_CASE(refernece_length_closest) {
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(1);
+ BOOST_REQUIRE(ref.num_references() == 2);
+
+ BOOST_CHECK_EQUAL(1, ref.CalcClosest(2));
+ BOOST_CHECK_EQUAL(1, ref.CalcClosest(1));
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(3));
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
+ }
+
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(3);
+ BOOST_REQUIRE(ref.num_references() == 2);
+
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(5));
+ }
+
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(3);
+ ref.push_back(4);
+ ref.push_back(5);
+ BOOST_REQUIRE(ref.num_references() == 4);
+
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(1));
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(2));
+ BOOST_CHECK_EQUAL(3, ref.CalcClosest(3));
+ BOOST_CHECK_EQUAL(4, ref.CalcClosest(4));
+ BOOST_CHECK_EQUAL(5, ref.CalcClosest(5));
+ }
+}
+
+BOOST_AUTO_TEST_CASE(refernece_length_shortest) {
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(1);
+ BOOST_CHECK_EQUAL(1, ref.CalcShortest());
+ }
+
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(3);
+ BOOST_CHECK_EQUAL(3, ref.CalcShortest());
+ }
+
+ {
+ Reference ref;
+ ref.push_back(4);
+ ref.push_back(3);
+ ref.push_back(4);
+ ref.push_back(5);
+ BOOST_CHECK_EQUAL(3, ref.CalcShortest());
+ }
+}
diff --git a/mert/ScopedVector.h b/mert/ScopedVector.h
index 1fbce88b7..a2f0e7066 100644
--- a/mert/ScopedVector.h
+++ b/mert/ScopedVector.h
@@ -1,5 +1,5 @@
-#ifndef SCOPEDVECTOR_H_
-#define SCOPEDVECTOR_H_
+#ifndef MERT_SCOPED_VECTOR_H_
+#define MERT_SCOPED_VECTOR_H_
#include <vector>
@@ -12,43 +12,43 @@ class ScopedVector {
ScopedVector() {}
virtual ~ScopedVector() { reset(); }
- bool empty() const { return vec_.empty(); }
+ bool empty() const { return m_vec.empty(); }
- void push_back(T *e) { vec_.push_back(e); }
+ void push_back(T *e) { m_vec.push_back(e); }
void reset() {
- for (iterator it = vec_.begin(); it != vec_.end(); ++it) {
+ for (iterator it = m_vec.begin(); it != m_vec.end(); ++it) {
delete *it;
}
- vec_.clear();
+ m_vec.clear();
}
- void reserve(size_t capacity) { vec_.reserve(capacity); }
- void resize(size_t size) { vec_.resize(size); }
+ void reserve(size_t capacity) { m_vec.reserve(capacity); }
+ void resize(size_t size) { m_vec.resize(size); }
- size_t size() const {return vec_.size(); }
+ size_t size() const {return m_vec.size(); }
- iterator begin() { return vec_.begin(); }
- const_iterator begin() const { return vec_.begin(); }
+ iterator begin() { return m_vec.begin(); }
+ const_iterator begin() const { return m_vec.begin(); }
- iterator end() { return vec_.end(); }
- const_iterator end() const { return vec_.end(); }
+ iterator end() { return m_vec.end(); }
+ const_iterator end() const { return m_vec.end(); }
- std::vector<T*>& get() { return vec_; }
- const std::vector<T*>& get() const { return vec_; }
+ std::vector<T*>& get() { return m_vec; }
+ const std::vector<T*>& get() const { return m_vec; }
- std::vector<T*>* operator->() { return &vec_; }
- const std::vector<T*>* operator->() const { return &vec_; }
+ std::vector<T*>* operator->() { return &m_vec; }
+ const std::vector<T*>* operator->() const { return &m_vec; }
- T*& operator[](size_t i) { return vec_[i]; }
- const T* operator[](size_t i) const { return vec_[i]; }
+ T*& operator[](size_t i) { return m_vec[i]; }
+ const T* operator[](size_t i) const { return m_vec[i]; }
private:
- std::vector<T*> vec_;
+ std::vector<T*> m_vec;
// no copying allowed.
ScopedVector<T>(const ScopedVector<T>&);
void operator=(const ScopedVector<T>&);
};
-#endif // SCOPEDVECTOR_H_
+#endif // MERT_SCOPED_VECTOR_H_
diff --git a/mert/ScoreArray.cpp b/mert/ScoreArray.cpp
index b26b93114..83fa96ef0 100644
--- a/mert/ScoreArray.cpp
+++ b/mert/ScoreArray.cpp
@@ -1,6 +1,6 @@
/*
* ScoreArray.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
@@ -10,76 +10,87 @@
#include "Util.h"
#include "FileStream.h"
+using namespace std;
ScoreArray::ScoreArray()
- : number_of_scores(0), idx("") {}
+ : m_num_scores(0), m_index("") {}
-void ScoreArray::savetxt(std::ofstream& outFile, const std::string& sctype)
+void ScoreArray::savetxt(ostream* os, const string& sctype)
{
- outFile << SCORES_TXT_BEGIN << " " << idx << " " << array_.size()
- << " " << number_of_scores << " " << sctype << std::endl;
- for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++) {
- i->savetxt(outFile);
- outFile << std::endl;
+ *os << SCORES_TXT_BEGIN << " " << m_index << " " << m_array.size()
+ << " " << m_num_scores << " " << sctype << endl;
+ for (scorearray_t::iterator i = m_array.begin(); i !=m_array.end(); i++) {
+ i->savetxt(os);
+ *os << endl;
}
- outFile << SCORES_TXT_END << std::endl;
+ *os << SCORES_TXT_END << endl;
}
-void ScoreArray::savebin(std::ofstream& outFile, const std::string& sctype)
+void ScoreArray::savebin(ostream* os, const string& score_type)
{
- outFile << SCORES_BIN_BEGIN << " " << idx << " " << array_.size()
- << " " << number_of_scores << " " << sctype << std::endl;
- for (scorearray_t::iterator i = array_.begin(); i !=array_.end(); i++)
- i->savebin(outFile);
-
- outFile << SCORES_BIN_END << std::endl;
+ *os << SCORES_BIN_BEGIN << " " << m_index << " " << m_array.size()
+ << " " << m_num_scores << " " << score_type << endl;
+ for (scorearray_t::iterator i = m_array.begin();
+ i != m_array.end(); i++) {
+ i->savebin(os);
+ }
+ *os << SCORES_BIN_END << endl;
}
-void ScoreArray::save(std::ofstream& inFile, const std::string& sctype, bool bin)
+void ScoreArray::save(ostream* os, const string& score_type, bool bin)
{
- if (size()>0)
- (bin)?savebin(inFile, sctype):savetxt(inFile, sctype);
+ if (size() <= 0) return;
+ if (bin) {
+ savebin(os, score_type);
+ } else {
+ savetxt(os, score_type);
+ }
}
-void ScoreArray::save(const std::string &file, const std::string& sctype, bool bin)
+void ScoreArray::save(const string &file, const string& score_type, bool bin)
{
- std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
-
- save(outFile, sctype, bin);
+ ofstream ofs(file.c_str(), ios::out);
+ if (!ofs) {
+ cerr << "Failed to open " << file << endl;
+ exit(1);
+ }
+ ostream* os = &ofs;
+ save(os, score_type, bin);
+ ofs.close();
+}
- outFile.close();
+void ScoreArray::save(const string& score_type, bool bin) {
+ save(&cout, score_type, bin);
}
-void ScoreArray::loadbin(ifstream& inFile, size_t n)
+void ScoreArray::loadbin(istream* is, size_t n)
{
- ScoreStats entry(number_of_scores);
-
- for (size_t i=0 ; i < n; i++) {
- entry.loadbin(inFile);
+ ScoreStats entry(m_num_scores);
+ for (size_t i = 0; i < n; i++) {
+ entry.loadbin(is);
add(entry);
}
}
-void ScoreArray::loadtxt(ifstream& inFile, size_t n)
+void ScoreArray::loadtxt(istream* is, size_t n)
{
- ScoreStats entry(number_of_scores);
-
- for (size_t i=0 ; i < n; i++) {
- entry.loadtxt(inFile);
+ ScoreStats entry(m_num_scores);
+ for (size_t i = 0; i < n; i++) {
+ entry.loadtxt(is);
add(entry);
}
}
-void ScoreArray::load(ifstream& inFile)
+void ScoreArray::load(istream* is)
{
- size_t number_of_entries=0;
- bool binmode=false;
+ size_t number_of_entries = 0;
+ bool binmode = false;
- std::string substring, stringBuf;
- std::string::size_type loc;
+ string substring, stringBuf;
+ string::size_type loc;
- std::getline(inFile, stringBuf);
- if (!inFile.good()) {
+ getline(*is, stringBuf);
+ if (!is->good()) {
return;
}
@@ -94,35 +105,38 @@ void ScoreArray::load(ifstream& inFile)
}
getNextPound(stringBuf, substring);
getNextPound(stringBuf, substring);
- idx = substring;
+ m_index = substring;
getNextPound(stringBuf, substring);
number_of_entries = atoi(substring.c_str());
getNextPound(stringBuf, substring);
- number_of_scores = atoi(substring.c_str());
+ m_num_scores = atoi(substring.c_str());
getNextPound(stringBuf, substring);
- score_type = substring;
+ m_score_type = substring;
}
- (binmode)?loadbin(inFile, number_of_entries):loadtxt(inFile, number_of_entries);
+ if (binmode) {
+ loadbin(is, number_of_entries);
+ } else {
+ loadtxt(is, number_of_entries);
+ }
- std::getline(inFile, stringBuf);
+ getline(*is, stringBuf);
if (!stringBuf.empty()) {
- if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 && (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
+ if ((loc = stringBuf.find(SCORES_TXT_END)) != 0 &&
+ (loc = stringBuf.find(SCORES_BIN_END)) != 0) {
TRACE_ERR("ERROR: ScoreArray::load(): Wrong footer");
return;
}
}
}
-void ScoreArray::load(const std::string &file)
+void ScoreArray::load(const string &file)
{
- TRACE_ERR("loading data from " << file << std::endl);
-
- inputfilestream inFile(file); // matches a stream with a file. Opens the file
-
- load((ifstream&) inFile);
-
- inFile.close();
+ TRACE_ERR("loading data from " << file << endl);
+ inputfilestream input_stream(file); // matches a stream with a file. Opens the file
+ istream* is = &input_stream;
+ load(is);
+ input_stream.close();
}
@@ -139,7 +153,8 @@ bool ScoreArray::check_consistency() const
if (sz == 0)
return true;
- for (scorearray_t::const_iterator i = array_.begin(); i != array_.end(); ++i) {
+ for (scorearray_t::const_iterator i = m_array.begin();
+ i != m_array.end(); ++i) {
if (i->size() != sz)
return false;
}
diff --git a/mert/ScoreArray.h b/mert/ScoreArray.h
index 0a0ddbdc0..64d019daf 100644
--- a/mert/ScoreArray.h
+++ b/mert/ScoreArray.h
@@ -1,15 +1,13 @@
/*
* ScoreArray.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef SCORE_ARRAY_H
-#define SCORE_ARRAY_H
-
-using namespace std;
+#ifndef MERT_SCORE_ARRAY_H_
+#define MERT_SCORE_ARRAY_H_
#include <vector>
#include <iostream>
@@ -24,88 +22,65 @@ const char SCORES_BIN_END[] = "SCORES_BIN_END_0";
class ScoreArray
{
-protected:
- scorearray_t array_;
- std::string score_type;
- size_t number_of_scores;
+ private:
+ scorearray_t m_array;
+ std::string m_score_type;
+ std::size_t m_num_scores;
-private:
- // idx to identify the utterance.
+ // index to identify the utterance.
// It can differ from the index inside the vector.
- std::string idx;
+ std::string m_index;
public:
ScoreArray();
~ScoreArray() {}
- inline void clear() {
- array_.clear();
- }
+ void clear() { m_array.clear(); }
- inline std::string getIndex() const {
- return idx;
- }
- inline void setIndex(const std::string& value) {
- idx=value;
- }
+ std::string getIndex() const { return m_index; }
-// inline ScoreStats get(size_t i){ return array_.at(i); }
+ void setIndex(const std::string& value) { m_index = value; }
- inline ScoreStats& get(size_t i) {
- return array_.at(i);
- }
- inline const ScoreStats& get(size_t i)const {
- return array_.at(i);
- }
+ ScoreStats& get(std::size_t i) { return m_array.at(i); }
- void add(const ScoreStats& e) {
- array_.push_back(e);
- }
+ const ScoreStats& get(std::size_t i) const { return m_array.at(i); }
+
+ void add(const ScoreStats& e) { m_array.push_back(e); }
//ADDED BY TS
- void swap(size_t i, size_t j) {
- std::swap(array_[i],array_[j]);
+ void swap(std::size_t i, std::size_t j) {
+ std::swap(m_array[i], m_array[j]);
}
- void resize(size_t new_size) {
- array_.resize(std::min(new_size,array_.size()));
+ void resize(std::size_t new_size) {
+ m_array.resize(std::min(new_size, m_array.size()));
}
//END_ADDED
void merge(ScoreArray& e);
- inline std::string name() const {
- return score_type;
- }
+ std::string name() const { return m_score_type; }
- inline void name(std::string &sctype) {
- score_type = sctype;
- }
+ void name(std::string &score_type) { m_score_type = score_type; }
- inline size_t size() const {
- return array_.size();
- }
- inline size_t NumberOfScores() const {
- return number_of_scores;
- }
- inline void NumberOfScores(size_t v) {
- number_of_scores = v;
- }
+ std::size_t size() const { return m_array.size(); }
- void savetxt(ofstream& outFile, const std::string& sctype);
- void savebin(ofstream& outFile, const std::string& sctype);
- void save(ofstream& outFile, const std::string& sctype, bool bin=false);
- void save(const std::string &file, const std::string& sctype, bool bin=false);
- inline void save(const std::string& sctype, bool bin=false) {
- save("/dev/stdout", sctype, bin);
- }
+ std::size_t NumberOfScores() const { return m_num_scores; }
+
+ void NumberOfScores(std::size_t v) { m_num_scores = v; }
+
+ void savetxt(std::ostream* os, const std::string& score_type);
+ void savebin(std::ostream* os, const std::string& score_type);
+ void save(std::ostream* os, const std::string& score_type, bool bin=false);
+ void save(const std::string &file, const std::string& score_type, bool bin=false);
+ void save(const std::string& score_type, bool bin=false);
- void loadtxt(ifstream& inFile, size_t n);
- void loadbin(ifstream& inFile, size_t n);
- void load(ifstream& inFile);
+ void loadtxt(std::istream* is, std::size_t n);
+ void loadbin(std::istream* is, std::size_t n);
+ void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;
};
-#endif // SCORE_ARRAY_H
+#endif // MERT_SCORE_ARRAY_H_
diff --git a/mert/ScoreData.cpp b/mert/ScoreData.cpp
index e79595d06..b4454dc4e 100644
--- a/mert/ScoreData.cpp
+++ b/mert/ScoreData.cpp
@@ -1,61 +1,62 @@
/*
* ScoreData.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
#include "ScoreData.h"
+
+#include <fstream>
#include "Scorer.h"
#include "Util.h"
#include "FileStream.h"
-ScoreData::ScoreData(Scorer& ptr):
- theScorer(&ptr)
+ScoreData::ScoreData(Scorer* scorer) :
+ m_scorer(scorer)
{
- score_type = theScorer->getName();
+ m_score_type = m_scorer->getName();
// This is not dangerous: we don't use the this pointer in SetScoreData.
- theScorer->setScoreData(this);
- number_of_scores = theScorer->NumberOfScores();
- // TRACE_ERR("ScoreData: number_of_scores: " << number_of_scores << std::endl);
+ m_scorer->setScoreData(this);
+ m_num_scores = m_scorer->NumberOfScores();
+ // TRACE_ERR("ScoreData: m_num_scores: " << m_num_scores << std::endl);
}
-void ScoreData::save(std::ofstream& outFile, bool bin)
+void ScoreData::save(ostream* os, bool bin)
{
- for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
- i->save(outFile, score_type, bin);
+ for (scoredata_t::iterator i = m_array.begin();
+ i != m_array.end(); ++i) {
+ i->save(os, m_score_type, bin);
}
}
-void ScoreData::save(const std::string &file, bool bin)
+void ScoreData::save(const string &file, bool bin)
{
if (file.empty()) return;
- TRACE_ERR("saving the array into " << file << std::endl);
+ TRACE_ERR("saving the array into " << file << endl);
// matches a stream with a file. Opens the file.
- std::ofstream outFile(file.c_str(), std::ios::out);
-
- ScoreStats entry;
-
- save(outFile, bin);
+ ofstream ofs(file.c_str(), ios::out);
+ ostream* os = &ofs;
+ save(os, bin);
+ ofs.close();
+}
- outFile.close();
+void ScoreData::save(bool bin) {
+ save(&cout, bin);
}
-void ScoreData::load(ifstream& inFile)
+void ScoreData::load(istream* is)
{
ScoreArray entry;
- while (!inFile.eof()) {
-
- if (!inFile.good()) {
- std::cerr << "ERROR ScoreData::load inFile.good()" << std::endl;
+ while (!is->eof()) {
+ if (!is->good()) {
+ cerr << "ERROR ScoreData::load inFile.good()" << endl;
}
-
entry.clear();
- entry.load(inFile);
-
+ entry.load(is);
if (entry.size() == 0) {
break;
}
@@ -63,60 +64,58 @@ void ScoreData::load(ifstream& inFile)
}
}
-
-void ScoreData::load(const std::string &file)
+void ScoreData::load(const string &file)
{
- TRACE_ERR("loading score data from " << file << std::endl);
-
- inputfilestream inFile(file); // matches a stream with a file. Opens the file
-
- if (!inFile) {
+ TRACE_ERR("loading score data from " << file << endl);
+ inputfilestream input_stream(file); // matches a stream with a file. Opens the file
+ if (!input_stream) {
throw runtime_error("Unable to open score file: " + file);
}
-
- load((ifstream&) inFile);
-
- inFile.close();
+ istream* is = &input_stream;
+ load(is);
+ input_stream.close();
}
-
void ScoreData::add(ScoreArray& e)
{
if (exists(e.getIndex())) { // array at position e.getIndex() already exists
//enlarge array at position e.getIndex()
size_t pos = getIndex(e.getIndex());
- array_.at(pos).merge(e);
+ m_array.at(pos).merge(e);
} else {
- array_.push_back(e);
+ m_array.push_back(e);
setIndex();
}
}
-void ScoreData::add(const ScoreStats& e, const std::string& sent_idx)
+void ScoreData::add(const ScoreStats& e, const string& sent_idx)
{
if (exists(sent_idx)) { // array at position e.getIndex() already exists
// Enlarge array at position e.getIndex()
size_t pos = getIndex(sent_idx);
// TRACE_ERR("Inserting in array " << sent_idx << std::endl);
- array_.at(pos).add(e);
+ m_array.at(pos).add(e);
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
} else {
// TRACE_ERR("Creating a new entry in the array" << std::endl);
ScoreArray a;
- a.NumberOfScores(number_of_scores);
+ a.NumberOfScores(m_num_scores);
a.add(e);
a.setIndex(sent_idx);
- add(a);
+ size_t idx = m_array.size();
+ m_array.push_back(a);
+ m_index_to_array_name[idx] = sent_idx;
+ m_array_name_to_index[sent_idx]=idx;
// TRACE_ERR("size: " << size() << " -> " << a.size() << std::endl);
}
}
bool ScoreData::check_consistency() const
{
- if (array_.size() == 0)
+ if (m_array.size() == 0)
return true;
- for (scoredata_t::const_iterator i = array_.begin(); i != array_.end(); ++i)
+ for (scoredata_t::const_iterator i = m_array.begin(); i != m_array.end(); ++i)
if (!i->check_consistency()) return false;
return true;
@@ -124,10 +123,10 @@ bool ScoreData::check_consistency() const
void ScoreData::setIndex()
{
- size_t j=0;
- for (scoredata_t::iterator i = array_.begin(); i !=array_.end(); i++) {
- idx2arrayname_[j]=i->getIndex();
- arrayname2idx_[i->getIndex()]=j;
+ size_t j = 0;
+ for (scoredata_t::iterator i = m_array.begin(); i != m_array.end(); ++i) {
+ m_index_to_array_name[j] = i->getIndex();
+ m_array_name_to_index[i->getIndex()]=j;
j++;
}
}
diff --git a/mert/ScoreData.h b/mert/ScoreData.h
index 765f74148..70d7b9ab1 100644
--- a/mert/ScoreData.h
+++ b/mert/ScoreData.h
@@ -1,17 +1,16 @@
/*
* ScoreData.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef SCORE_DATA_H
-#define SCORE_DATA_H
+#ifndef MERT_SCORE_DATA_H_
+#define MERT_SCORE_DATA_H_
-#include <fstream>
-#include <vector>
#include <iostream>
+#include <vector>
#include <stdexcept>
#include <string>
#include "ScoreArray.h"
@@ -23,35 +22,34 @@ class Scorer;
class ScoreData
{
-protected:
- scoredata_t array_;
- idx2name idx2arrayname_; // map from index to name of array
- name2idx arrayname2idx_; // map from name to index of array
-
private:
// Do not allow the user to instanciate without arguments.
ScoreData() {}
- Scorer* theScorer;
- std::string score_type;
- size_t number_of_scores;
+ scoredata_t m_array;
+ idx2name m_index_to_array_name; // map from index to name of array
+ name2idx m_array_name_to_index; // map from name to index of array
+
+ Scorer* m_scorer;
+ std::string m_score_type;
+ size_t m_num_scores;
public:
- ScoreData(Scorer& sc);
+ ScoreData(Scorer* scorer);
~ScoreData() {}
- inline void clear() {
- array_.clear();
- }
+ void clear() { m_array.clear(); }
inline ScoreArray get(const std::string& idx) {
- return array_.at(getIndex(idx));
+ return m_array.at(getIndex(idx));
}
+
inline ScoreArray& get(size_t idx) {
- return array_.at(idx);
+ return m_array.at(idx);
}
+
inline const ScoreArray& get(size_t idx) const {
- return array_.at(idx);
+ return m_array.at(idx);
}
inline bool exists(const std::string& sent_idx) const {
@@ -59,59 +57,54 @@ public:
}
inline bool exists(int sent_idx) const {
- return (sent_idx > -1 && sent_idx < static_cast<int>(array_.size())) ? true : false;
+ return (sent_idx > -1 && sent_idx < static_cast<int>(m_array.size())) ? true : false;
}
inline ScoreStats& get(size_t i, size_t j) {
- return array_.at(i).get(j);
- }
- inline const ScoreStats& get(size_t i, size_t j) const {
- return array_.at(i).get(j);
+ return m_array.at(i).get(j);
}
- inline std::string name() const {
- return score_type;
+ inline const ScoreStats& get(size_t i, size_t j) const {
+ return m_array.at(i).get(j);
}
- inline std::string name(const std::string &sctype) {
- return score_type = sctype;
+ std::string name() const { return m_score_type; }
+
+ std::string name(const std::string &score_type) {
+ return m_score_type = score_type;
}
void add(ScoreArray& e);
void add(const ScoreStats& e, const std::string& sent_idx);
- inline size_t NumberOfScores() const {
- return number_of_scores;
- }
- inline size_t size() const {
- return array_.size();
- }
+ size_t NumberOfScores() const { return m_num_scores; }
+ size_t size() const { return m_array.size(); }
void save(const std::string &file, bool bin=false);
- void save(ofstream& outFile, bool bin=false);
- inline void save(bool bin=false) {
- save("/dev/stdout", bin);
- }
+ void save(std::ostream* os, bool bin=false);
+ void save(bool bin=false);
- void load(ifstream& inFile);
+ void load(std::istream* is);
void load(const std::string &file);
bool check_consistency() const;
+
void setIndex();
inline int getIndex(const std::string& idx) const {
- name2idx::const_iterator i = arrayname2idx_.find(idx);
- if (i != arrayname2idx_.end())
+ name2idx::const_iterator i = m_array_name_to_index.find(idx);
+ if (i != m_array_name_to_index.end())
return i->second;
else
return -1;
}
+
inline std::string getIndex(size_t idx) const {
- idx2name::const_iterator i = idx2arrayname_.find(idx);
- if (i != idx2arrayname_.end())
+ idx2name::const_iterator i = m_index_to_array_name.find(idx);
+ if (i != m_index_to_array_name.end())
throw runtime_error("there is no entry at index " + idx);
return i->second;
}
};
-#endif // SCORE_DATA_H
+#endif // MERT_SCORE_DATA_H_
diff --git a/mert/ScoreDataIterator.h b/mert/ScoreDataIterator.h
index 4633b8651..910e92165 100644
--- a/mert/ScoreDataIterator.h
+++ b/mert/ScoreDataIterator.h
@@ -17,8 +17,8 @@ License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#ifndef _SCORE_DATA_ITERATOR_
-#define _SCORE_DATA_ITERATOR_
+#ifndef MERT_SCORE_DATA_ITERATOR_H_
+#define MERT_SCORE_DATA_ITERATOR_H_
/*
* For loading from the score data file.
@@ -62,4 +62,4 @@ class ScoreDataIterator :
std::vector<ScoreDataItem> m_next;
};
-#endif
+#endif // MERT_SCORE_DATA_ITERATOR_H_
diff --git a/mert/ScoreStats.cpp b/mert/ScoreStats.cpp
index 7efea99a9..e6c111d5d 100644
--- a/mert/ScoreStats.cpp
+++ b/mert/ScoreStats.cpp
@@ -1,6 +1,6 @@
/*
 * ScoreStats.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
@@ -14,36 +14,30 @@ const int kAvailableSize = 8;
} // namespace
ScoreStats::ScoreStats()
- : available_(kAvailableSize), entries_(0),
- array_(new ScoreStatsType[available_]) {}
+ : m_available_size(kAvailableSize), m_entries(0),
+ m_array(new ScoreStatsType[m_available_size]) {}
ScoreStats::ScoreStats(const size_t size)
- : available_(size), entries_(size),
- array_(new ScoreStatsType[available_])
+ : m_available_size(size), m_entries(size),
+ m_array(new ScoreStatsType[m_available_size])
{
- memset(array_, 0, GetArraySizeWithBytes());
-}
-
-ScoreStats::ScoreStats(std::string &theString)
- : available_(0), entries_(0), array_(NULL)
-{
- set(theString);
+ memset(m_array, 0, GetArraySizeWithBytes());
}
ScoreStats::~ScoreStats()
{
- if (array_) {
- delete [] array_;
- array_ = NULL;
+ if (m_array) {
+ delete [] m_array;
+ m_array = NULL;
}
}
void ScoreStats::Copy(const ScoreStats &stats)
{
- available_ = stats.available();
- entries_ = stats.size();
- array_ = new ScoreStatsType[available_];
- memcpy(array_, stats.getArray(), GetArraySizeWithBytes());
+ m_available_size = stats.available();
+ m_entries = stats.size();
+ m_array = new ScoreStatsType[m_available_size];
+ memcpy(m_array, stats.getArray(), GetArraySizeWithBytes());
}
ScoreStats::ScoreStats(const ScoreStats &stats)
@@ -53,77 +47,82 @@ ScoreStats::ScoreStats(const ScoreStats &stats)
ScoreStats& ScoreStats::operator=(const ScoreStats &stats)
{
- delete [] array_;
- Copy(stats);
+ // Copy-assign from stats, releasing the buffer currently owned.
+ // Self-assignment must be a no-op: freeing m_array first would leave
+ // Copy() reading from the buffer that was just released.
+ if (this != &stats) {
+   delete [] m_array;
+   Copy(stats);
+ }
 return *this;
}
void ScoreStats::expand()
{
- available_ *= 2;
- scorestats_t buf = new ScoreStatsType[available_];
- memcpy(buf, array_, GetArraySizeWithBytes());
- delete [] array_;
- array_ = buf;
+ m_available_size *= 2;
+ scorestats_t buf = new ScoreStatsType[m_available_size];
+ memcpy(buf, m_array, GetArraySizeWithBytes());
+ delete [] m_array;
+ m_array = buf;
}
void ScoreStats::add(ScoreStatsType v)
{
if (isfull()) expand();
- array_[entries_++]=v;
+ m_array[m_entries++]=v;
}
-void ScoreStats::set(std::string &theString)
+void ScoreStats::set(const string& str)
{
- std::string substring, stringBuf;
reset();
-
- while (!theString.empty()) {
- getNextPound(theString, substring);
- add(ConvertStringToScoreStatsType(substring));
+ vector<string> out;
+ Tokenize(str.c_str(), ' ', &out);
+ for (vector<string>::const_iterator it = out.begin();
+ it != out.end(); ++it) {
+ add(ConvertStringToScoreStatsType(*it));
}
}
-void ScoreStats::loadbin(std::ifstream& inFile)
+void ScoreStats::loadbin(istream* is)
{
- inFile.read((char*)array_, GetArraySizeWithBytes());
+ is->read(reinterpret_cast<char*>(m_array),
+ static_cast<streamsize>(GetArraySizeWithBytes()));
}
-void ScoreStats::loadtxt(std::ifstream& inFile)
+void ScoreStats::loadtxt(istream* is)
{
- std::string theString;
- std::getline(inFile, theString);
- set(theString);
+ string line;
+ getline(*is, line);
+ set(line);
}
-void ScoreStats::loadtxt(const std::string &file)
+void ScoreStats::loadtxt(const string &file)
{
-// TRACE_ERR("loading the stats from " << file << std::endl);
-
- std::ifstream inFile(file.c_str(), std::ios::in); // matches a stream with a file. Opens the file
-
- loadtxt(inFile);
+ ifstream ifs(file.c_str(), ios::in); // matches a stream with a file. Opens the file
+ if (!ifs) {
+ cerr << "Failed to open " << file << endl;
+ exit(1);
+ }
+ istream* is = &ifs;
+ loadtxt(is);
}
-void ScoreStats::savetxt(const std::string &file)
+void ScoreStats::savetxt(const string &file)
{
-// TRACE_ERR("saving the stats into " << file << std::endl);
-
- std::ofstream outFile(file.c_str(), std::ios::out); // matches a stream with a file. Opens the file
-
- savetxt(outFile);
+ ofstream ofs(file.c_str(), ios::out); // matches a stream with a file. Opens the file
+ ostream* os = &ofs;
+ savetxt(os);
}
-
-void ScoreStats::savetxt(std::ofstream& outFile)
+void ScoreStats::savetxt(ostream* os)
{
- outFile << *this;
+ *os << *this;
}
-void ScoreStats::savebin(std::ofstream& outFile)
+void ScoreStats::savetxt() {
+ savetxt(&cout);
+}
+
+void ScoreStats::savebin(ostream* os)
{
- outFile.write((char*)array_, GetArraySizeWithBytes());
+ os->write(reinterpret_cast<char*>(m_array),
+ static_cast<streamsize>(GetArraySizeWithBytes()));
}
ostream& operator<<(ostream& o, const ScoreStats& e)
@@ -144,7 +143,7 @@ bool operator==(const ScoreStats& s1, const ScoreStats& s2) {
if (s1.get(k) != s2.get(k))
return false;
}
-
+
return true;
}
//END_ADDED
diff --git a/mert/ScoreStats.h b/mert/ScoreStats.h
index 68df91195..e8d4543ce 100644
--- a/mert/ScoreStats.h
+++ b/mert/ScoreStats.h
@@ -1,13 +1,13 @@
/*
* ScoreStats.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef SCORE_STATS_H
-#define SCORE_STATS_H
+#ifndef MERT_SCORE_STATS_H_
+#define MERT_SCORE_STATS_H_
#include <vector>
#include <iostream>
@@ -22,16 +22,16 @@ using namespace std;
class ScoreStats
{
private:
- size_t available_;
- size_t entries_;
+ size_t m_available_size;
+ size_t m_entries;
// TODO: Use smart pointer for exceptional-safety.
- scorestats_t array_;
+ scorestats_t m_array;
public:
ScoreStats();
explicit ScoreStats(const size_t size);
- explicit ScoreStats(std::string &theString);
+
~ScoreStats();
// We intentionally allow copying.
@@ -40,59 +40,52 @@ public:
void Copy(const ScoreStats &stats);
- bool isfull() const {
- return (entries_ < available_) ? 0 : 1;
- }
+ bool isfull() const { return (m_entries < m_available_size) ? 0 : 1; }
void expand();
void add(ScoreStatsType v);
void clear() {
- memset((void*)array_, 0, GetArraySizeWithBytes());
+ memset((void*)m_array, 0, GetArraySizeWithBytes());
}
void reset() {
- entries_ = 0;
+ m_entries = 0;
clear();
}
- inline ScoreStatsType get(size_t i) {
- return array_[i];
- }
- inline ScoreStatsType get(size_t i)const {
- return array_[i];
- }
- inline scorestats_t getArray() const {
- return array_;
- }
+ ScoreStatsType get(size_t i) { return m_array[i]; }
+ ScoreStatsType get(size_t i) const { return m_array[i]; }
+ scorestats_t getArray() const { return m_array; }
- void set(std::string &theString);
+ void set(const std::string& str);
- inline size_t bytes() const {
- return GetArraySizeWithBytes();
+ // Much more efficient than the above.
+ void set(const std::vector<ScoreStatsType>& stats) {
+ reset();
+ for (size_t i = 0; i < stats.size(); ++i) {
+ add(stats[i]);
+ }
}
+ size_t bytes() const { return GetArraySizeWithBytes(); }
+
size_t GetArraySizeWithBytes() const {
- return entries_ * sizeof(ScoreStatsType);
+ return m_entries * sizeof(ScoreStatsType);
}
- inline size_t size() const {
- return entries_;
- }
- inline size_t available() const {
- return available_;
- }
+ size_t size() const { return m_entries; }
+
+ size_t available() const { return m_available_size; }
void savetxt(const std::string &file);
- void savetxt(ofstream& outFile);
- void savebin(ofstream& outFile);
- inline void savetxt() {
- savetxt("/dev/stdout");
- }
+ void savetxt(ostream* os);
+ void savebin(ostream* os);
+ void savetxt();
void loadtxt(const std::string &file);
- void loadtxt(ifstream& inFile);
- void loadbin(ifstream& inFile);
+ void loadtxt(istream* is);
+ void loadbin(istream* is);
/**
* Write the whole object to a stream.
@@ -101,7 +94,7 @@ public:
};
//ADDED_BY_TS
-bool operator==(const ScoreStats& s1, const ScoreStats& s2);
+bool operator==(const ScoreStats& s1, const ScoreStats& s2);
//END_ADDED
-#endif // SCORE_STATS_H
+#endif // MERT_SCORE_STATS_H_
diff --git a/mert/Scorer.cpp b/mert/Scorer.cpp
index a2bb4720c..70948c47f 100644
--- a/mert/Scorer.cpp
+++ b/mert/Scorer.cpp
@@ -1,5 +1,9 @@
#include "Scorer.h"
+
#include <limits>
+#include "Vocabulary.h"
+#include "Util.h"
+#include "Singleton.h"
namespace {
@@ -33,14 +37,16 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end)
Scorer::Scorer(const string& name, const string& config)
: m_name(name),
- m_encoder(new Encoder),
+ m_vocab(mert::VocabularyFactory::GetVocabulary()),
m_score_data(0),
- m_enable_preserve_case(true) {
+ m_enable_preserve_case(true),
+ m_filter(NULL) {
InitConfig(config);
}
Scorer::~Scorer() {
- delete m_encoder;
+ Singleton<mert::Vocabulary>::Delete();
+ delete m_filter;
}
void Scorer::InitConfig(const string& config) {
@@ -64,23 +70,6 @@ void Scorer::InitConfig(const string& config) {
}
}
-Scorer::Encoder::Encoder() {}
-
-Scorer::Encoder::~Encoder() {}
-
-int Scorer::Encoder::Encode(const string& token) {
- map<string, int>::iterator it = m_vocab.find(token);
- int encoded_token;
- if (it == m_vocab.end()) {
- // Add an new entry to the vocaburary.
- encoded_token = static_cast<int>(m_vocab.size());
- m_vocab[token] = encoded_token;
- } else {
- encoded_token = it->second;
- }
- return encoded_token;
-}
-
void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
std::istringstream in(line);
std::string token;
@@ -91,10 +80,84 @@ void Scorer::TokenizeAndEncode(const string& line, vector<int>& encoded) {
*it = tolower(*it);
}
}
- encoded.push_back(m_encoder->Encode(token));
+ encoded.push_back(m_vocab->Encode(token));
+ }
+}
+
+/**
+ * Register the factor indices this metric should use.
+ * The argument is a '|'-separated list of integers; each field is converted
+ * with atoi and appended to m_factors. An empty string changes nothing.
+ */
+void Scorer::setFactors(const string& factors)
+{
+  if (factors.empty()) return;
+
+  vector<string> fields;
+  split(factors, '|', fields);
+  for (size_t k = 0; k < fields.size(); ++k) {
+    m_factors.push_back(atoi(fields[k].c_str()));
+  }
+}
+
+/**
+ * Set unix filter, which will be used to preprocess the sentences.
+ * An empty command leaves the current filter (if any) untouched.
+ * A filter installed by an earlier call is released before being replaced.
+ */
+void Scorer::setFilter(const string& filterCommand)
+{
+  if (filterCommand.empty()) return;
+  // Build the replacement first so m_filter stays valid if construction throws.
+  PreProcessFilter* replacement = new PreProcessFilter(filterCommand);
+  delete m_filter;  // NULL on the first call; deleting NULL is a no-op
+  m_filter = replacement;
+}
+
+/**
+ * Take the factored sentence and return the desired factors.
+ *
+ * The sentence is split on ' ' and every non-empty token on '|'. For each
+ * token the factors listed in m_factors are emitted, joined by '|'; tokens
+ * are joined by a single space. When no factors are configured the sentence
+ * is returned unchanged.
+ *
+ * Throws runtime_error if a configured factor index does not exist in a token.
+ */
+string Scorer::applyFactors(const string& sentence) const
+{
+  if (m_factors.empty()) return sentence;
+
+  vector<string> tokens;
+  split(sentence, ' ', tokens);
+
+  stringstream sstream;
+  // Tracks whether a token has been written, so that skipped empty tokens at
+  // the start of the sentence do not produce a leading separator.
+  bool wroteToken = false;
+  for (size_t i = 0; i < tokens.size(); ++i)
+  {
+    if (tokens[i].empty()) continue;
+
+    vector<string> factors;
+    split(tokens[i], '|', factors);
+
+    const int fsize = static_cast<int>(factors.size());
+
+    if (wroteToken) sstream << " ";
+    wroteToken = true;
+
+    for (size_t j = 0; j < m_factors.size(); ++j)
+    {
+      const int findex = m_factors[j];
+      if (findex < 0 || findex >= fsize) throw runtime_error("Factor index is out of range.");
+
+      if (j > 0) sstream << "|";
+      sstream << factors[findex];
+    }
+  }
+  return sstream.str();
+}
+
+/**
+ * Preprocess the sentence with the filter (if given)
+ */
+string Scorer::applyFilter(const string& sentence) const
+{
+ if (m_filter)
+ {
+ return m_filter->ProcessSentence(sentence);
+ }
+ else
+ {
+ return sentence;
}
}
+
StatisticsBasedScorer::StatisticsBasedScorer(const string& name, const string& config)
: Scorer(name,config) {
//configure regularisation
diff --git a/mert/Scorer.h b/mert/Scorer.h
index f2f54670a..c441eca28 100644
--- a/mert/Scorer.h
+++ b/mert/Scorer.h
@@ -1,5 +1,5 @@
-#ifndef __SCORER_H__
-#define __SCORER_H__
+#ifndef MERT_SCORER_H_
+#define MERT_SCORER_H_
#include <iostream>
#include <sstream>
@@ -8,11 +8,18 @@
#include <vector>
#include "Types.h"
#include "ScoreData.h"
+#include "PreProcessFilter.h"
using namespace std;
class ScoreStats;
+namespace mert {
+
+class Vocabulary;
+
+} // namespace mert
+
/**
* Superclass of all scorers and dummy implementation.
*
@@ -28,10 +35,7 @@ class Scorer
/**
* Return the number of statistics needed for the computation of the score.
*/
- virtual size_t NumberOfScores() const {
- cerr << "Scorer: 0" << endl;
- return 0;
- }
+ virtual size_t NumberOfScores() const = 0;
/**
* Set the reference files. This must be called before prepareStats().
@@ -57,7 +61,9 @@ class Scorer
* applying each in turn, and calculating a new score each time.
*/
virtual void score(const candidates_t& candidates, const diffs_t& diffs,
- statscores_t& scores) const {
+ statscores_t& scores) const = 0;
+ /*
+ {
//dummy impl
if (!m_score_data) {
throw runtime_error("score data not loaded");
@@ -67,6 +73,7 @@ class Scorer
scores.push_back(0);
}
}
+ */
/**
* Calculate the score of the sentences corresponding to the list of candidate
@@ -93,27 +100,40 @@ class Scorer
/**
* Set the score data, prior to scoring.
*/
- void setScoreData(ScoreData* data) {
+ virtual void setScoreData(ScoreData* data) {
m_score_data = data;
}
- private:
- class Encoder {
- public:
- Encoder();
- virtual ~Encoder();
- int Encode(const std::string& token);
- void Clear() { m_vocab.clear(); }
-
- private:
- std::map<std::string, int> m_vocab;
- };
+ /**
+ * Set the factors, which should be used for this metric
+ */
+ virtual void setFactors(const string& factors);
+ mert::Vocabulary* GetVocab() const { return m_vocab; }
+
+ /**
+ * Set unix filter, which will be used to preprocess the sentences
+ */
+ virtual void setFilter(const string& filterCommand);
+
+ private:
void InitConfig(const string& config);
+ /**
+ * Take the factored sentence and return the desired factors
+ */
+ string applyFactors(const string& sentece) const;
+
+ /**
+ * Preprocess the sentence with the filter (if given)
+ */
+ string applyFilter(const string& sentence) const;
+
string m_name;
- Encoder* m_encoder;
+ mert::Vocabulary* m_vocab;
map<string, string> m_config;
+ vector<int> m_factors;
+ PreProcessFilter* m_filter;
protected:
ScoreData* m_score_data;
@@ -133,13 +153,19 @@ class Scorer
/**
* Tokenise line and encode.
- * Note: We assume that all tokens are separated by single spaces.
+ * Note: We assume that all tokens are separated by whitespaces.
*/
void TokenizeAndEncode(const string& line, vector<int>& encoded);
- void ClearEncoder() { m_encoder->Clear(); }
-};
+ /**
+ * Every inherited scorer should call this function for each sentence
+ */
+ string preprocessSentence(const string& sentence) const
+ {
+ return applyFactors(applyFilter(sentence));
+ }
+};
/**
* Abstract base class for Scorers that work by adding statistics across all
@@ -171,4 +197,4 @@ class StatisticsBasedScorer : public Scorer
size_t m_regularization_window;
};
-#endif // __SCORER_H__
+#endif // MERT_SCORER_H_
diff --git a/mert/ScorerFactory.cpp b/mert/ScorerFactory.cpp
index 2f47092ef..5da75273d 100644
--- a/mert/ScorerFactory.cpp
+++ b/mert/ScorerFactory.cpp
@@ -7,6 +7,8 @@
#include "TerScorer.h"
#include "CderScorer.h"
#include "MergeScorer.h"
+#include "InterpolatedScorer.h"
+#include "SemposScorer.h"
using namespace std;
@@ -16,22 +18,34 @@ vector<string> ScorerFactory::getTypes() {
types.push_back(string("PER"));
types.push_back(string("TER"));
types.push_back(string("CDER"));
+ types.push_back(string("WER"));
types.push_back(string("MERGE"));
+ types.push_back(string("SEMPOS"));
return types;
}
Scorer* ScorerFactory::getScorer(const string& type, const string& config) {
if (type == "BLEU") {
- return (BleuScorer*) new BleuScorer(config);
+ return new BleuScorer(config);
} else if (type == "PER") {
- return (PerScorer*) new PerScorer(config);
+ return new PerScorer(config);
} else if (type == "TER") {
- return (TerScorer*) new TerScorer(config);
+ return new TerScorer(config);
} else if (type == "CDER") {
- return (CderScorer*) new CderScorer(config);
+ return new CderScorer(config, true);
+ } else if (type == "WER") {
+ // CderScorer can compute both CDER and WER metric
+ return new CderScorer(config, false);
+ } else if (type == "SEMPOS") {
+ return new SemposScorer(config);
} else if (type == "MERGE") {
- return (MergeScorer*) new MergeScorer(config);
+ return new MergeScorer(config);
} else {
- throw runtime_error("Unknown scorer type: " + type);
+ if (type.find(',') != string::npos) {
+ return new InterpolatedScorer(type, config);
+ }
+ else {
+ throw runtime_error("Unknown scorer type: " + type);
+ }
}
}
diff --git a/mert/ScorerFactory.h b/mert/ScorerFactory.h
index f6054c770..6752817ef 100644
--- a/mert/ScorerFactory.h
+++ b/mert/ScorerFactory.h
@@ -1,5 +1,5 @@
-#ifndef __SCORER_FACTORY_H
-#define __SCORER_FACTORY_H
+#ifndef MERT_SCORER_FACTORY_H_
+#define MERT_SCORER_FACTORY_H_
#include <vector>
#include <string>
@@ -18,4 +18,4 @@ private:
~ScorerFactory() {}
};
-#endif // __SCORER_FACTORY_H
+#endif // MERT_SCORER_FACTORY_H_
diff --git a/mert/SemposOverlapping.cpp b/mert/SemposOverlapping.cpp
new file mode 100644
index 000000000..f27f188f7
--- /dev/null
+++ b/mert/SemposOverlapping.cpp
@@ -0,0 +1,109 @@
+#include "SemposOverlapping.h"
+#include "SemposScorer.h"
+
+#include <algorithm>
+#include <stdexcept>
+
+using namespace std;
+
+namespace {
+
+SemposOverlapping* g_overlapping = NULL;
+
+} // namespace
+
+// Create the overlapping formula selected by its configuration name.
+// "cap-micro" and "cap-macro" are recognised; any other name raises
+// runtime_error. The object is allocated with new and returned to the caller.
+SemposOverlapping* SemposOverlappingFactory::GetOverlapping(const string& str, const SemposScorer* sempos) {
+  if (str == "cap-micro") return new CapMicroOverlapping(sempos);
+  if (str == "cap-macro") return new CapMacroOverlapping(sempos);
+  throw runtime_error("Unknown overlapping: " + str);
+}
+
+// Store the caller-supplied overlapping instance in the file-local
+// g_overlapping slot.
+// NOTE(review): nothing in this file reads g_overlapping back, so the stored
+// pointer does not influence what GetOverlapping() returns - confirm intent.
+void SemposOverlappingFactory::SetOverlapping(SemposOverlapping* ovr) {
+  g_overlapping = ovr;
+}
+
+// Compute the two cap-micro statistics for one candidate/reference pair:
+//   stats[0] - summed weight of the items common to cand and ref,
+//   stats[1] - summed weight of every item of ref.
+// Both sums are multiplied by 1000 and truncated so they can be kept as ints.
+vector<int> CapMicroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
+{
+  const int kScale = 1000;
+
+  sentence_t common;
+  set_intersection(cand.begin(), cand.end(), ref.begin(), ref.end(),
+                   inserter(common, common.begin()));
+
+  float commonWeight = 0;
+  for (sentence_it item = common.begin(); item != common.end(); ++item) {
+    commonWeight += semposScorer->weight(item->first);
+  }
+
+  float refWeight = 0;
+  for (sentence_it item = ref.begin(); item != ref.end(); ++item) {
+    refWeight += semposScorer->weight(item->first);
+  }
+
+  vector<int> stats(2);
+  stats[0] = static_cast<int>(kScale * commonWeight);
+  stats[1] = static_cast<int>(kScale * refWeight);
+  return stats;
+}
+
+// Turn the two cap-micro statistics into a score: stats[0] / stats[1].
+// A zero denominator yields 1. Throws runtime_error unless stats has exactly
+// two elements.
+float CapMicroOverlapping::calculateScore(const vector<int>& stats) const
+{
+  if (stats.size() != 2) {
+    throw std::runtime_error("Size of stats vector has to be 2");
+  }
+  const int matched = stats[0];
+  const int total = stats[1];
+  return (total == 0) ? 1.0f : matched / static_cast<float>(total);
+}
+
+// Compute the cap-macro statistics for one candidate/reference pair.
+// The result has 2 * kMaxNOC slots; for class c, slot 2*c accumulates the
+// scaled weight of the items shared by cand and ref, and slot 2*c+1 the scaled
+// weight of the items of ref. Weights are multiplied by 1000 before being
+// added to the integer slots.
+vector<int> CapMacroOverlapping::prepareStats(const sentence_t& cand, const sentence_t& ref)
+{
+  const int kScale = 1000;
+
+  sentence_t common;
+  set_intersection(cand.begin(), cand.end(), ref.begin(), ref.end(),
+                   inserter(common, common.begin()));
+
+  // Every slot starts at zero.
+  vector<int> stats(2 * kMaxNOC, 0);
+
+  for (sentence_it item = common.begin(); item != common.end(); ++item) {
+    const int klass = item->second;
+    const float w = semposScorer->weight(item->first);
+    stats[2 * klass] += w * kScale;
+  }
+  for (sentence_it item = ref.begin(); item != ref.end(); ++item) {
+    const int klass = item->second;
+    const float w = semposScorer->weight(item->first);
+    stats[2 * klass + 1] += w * kScale;
+  }
+
+  return stats;
+}
+
+// Average, over all classes that occur in the reference, of the ratio
+// stats[2*c] / stats[2*c+1]. Classes whose reference slot is not positive are
+// ignored; if no class qualifies the score is 1.
+// Throws runtime_error unless stats has exactly 2 * kMaxNOC elements.
+float CapMacroOverlapping::calculateScore(const vector<int>& stats) const
+{
+  if (stats.size() != static_cast<size_t>(2 * kMaxNOC)) {
+    // Two slots per class; the message names the constant so it stays correct
+    // if kMaxNOC changes.
+    throw std::runtime_error("Size of stats vector has to be 2 * kMaxNOC");
+  }
+
+  int n = 0;
+  float sum = 0;
+  for (int i = 0; i < kMaxNOC; ++i) {
+    const int clipped = stats[2 * i];
+    const int refsize = stats[2 * i + 1];
+    if (refsize > 0) {
+      sum += clipped / static_cast<float>(refsize);
+      ++n;
+    }
+  }
+  if (n == 0) return 1;
+  return sum / n;
+}
diff --git a/mert/SemposOverlapping.h b/mert/SemposOverlapping.h
new file mode 100644
index 000000000..e16ffe7bb
--- /dev/null
+++ b/mert/SemposOverlapping.h
@@ -0,0 +1,90 @@
+#ifndef MERT_SEMPOSOVERLAPPING_H_
+#define MERT_SEMPOSOVERLAPPING_H_
+
+#include <map>
+#include <set>
+#include <string>
+#include <utility>
+#include <vector>
+
+class SemposScorer;
+
+// TODO: need comments about this number.
+const int kMaxNOC = 50;
+
+typedef std::pair<std::string, std::string> str_item_t;
+typedef std::vector<str_item_t> str_sentence_t;
+typedef str_sentence_t::const_iterator str_sentence_it;
+
+typedef std::pair<int,int> item_t;
+typedef std::multiset<item_t> sentence_t;
+typedef sentence_t::const_iterator sentence_it;
+
+/**
+ * Abstract interface shared by all overlapping formulas.
+ */
+class SemposOverlapping
+{
+public:
+  virtual ~SemposOverlapping() {}
+
+  // Produce the integer statistics for one candidate/reference pair.
+  virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref) = 0;
+
+  // Turn a vector of statistics into a score.
+  virtual float calculateScore(const std::vector<int>& stats) const = 0;
+
+  // Number of statistics this formula works with.
+  virtual std::size_t NumberOfScores() const = 0;
+};
+
+// Static-only factory for SemposOverlapping objects; it cannot be
+// instantiated because its constructor and destructor are private.
+class SemposOverlappingFactory {
+ public:
+  // Create the overlapping identified by str for the given scorer.
+  static SemposOverlapping* GetOverlapping(const std::string& str, const SemposScorer* sempos);
+
+  // Dependency injection for unit testing.
+  static void SetOverlapping(SemposOverlapping* ovr);
+
+ private:
+  SemposOverlappingFactory() {}
+  ~SemposOverlappingFactory() {}
+};
+
+/**
+ * Overlapping proposed by (Bojar and Machacek, WMT 2011)
+ *
+ * Please refer to the paper for details:
+ * http://aclweb.org/anthology-new/W/W11/W11-2108.pdf
+ *
+ * Works with two statistics (see NumberOfScores()). The scorer passed to the
+ * constructor is only borrowed; it is used to look up item weights.
+ */
+class CapMicroOverlapping : public SemposOverlapping
+{
+public:
+  // explicit: a SemposScorer pointer must not convert silently to an overlapping.
+  explicit CapMicroOverlapping(const SemposScorer* sempos) : semposScorer(sempos) {}
+  ~CapMicroOverlapping() {}
+
+  virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
+  virtual float calculateScore(const std::vector<int>& stats) const;
+  virtual std::size_t NumberOfScores() const { return 2; }
+
+ private:
+  // no copying allowed.
+  CapMicroOverlapping(const CapMicroOverlapping&);
+  CapMicroOverlapping& operator=(const CapMicroOverlapping&);
+  const SemposScorer* semposScorer;
+};
+
+/**
+ * Overlapping proposed by (Kos and Bojar, 2009)
+ *
+ * Works with two statistics per class, kMaxNOC * 2 in total (see
+ * NumberOfScores()). The scorer passed to the constructor is only borrowed;
+ * it is used to look up item weights.
+ */
+class CapMacroOverlapping : public SemposOverlapping
+{
+public:
+  // explicit: a SemposScorer pointer must not convert silently to an overlapping.
+  explicit CapMacroOverlapping(const SemposScorer* sempos) : semposScorer(sempos) {}
+  ~CapMacroOverlapping() {}
+
+  virtual std::vector<int> prepareStats(const sentence_t& cand, const sentence_t& ref);
+  virtual float calculateScore(const std::vector<int>& stats) const;
+  virtual std::size_t NumberOfScores() const { return kMaxNOC * 2; }
+
+ private:
+  // no copying allowed.
+  CapMacroOverlapping(const CapMacroOverlapping&);
+  CapMacroOverlapping& operator=(const CapMacroOverlapping&);
+  const SemposScorer* semposScorer;
+};
+
+#endif // MERT_SEMPOSOVERLAPPING_H_
diff --git a/mert/SemposScorer.cpp b/mert/SemposScorer.cpp
new file mode 100644
index 000000000..30105c01f
--- /dev/null
+++ b/mert/SemposScorer.cpp
@@ -0,0 +1,179 @@
+#include "SemposScorer.h"
+
+#include <algorithm>
+#include <vector>
+#include <stdexcept>
+#include <fstream>
+
+#include "Util.h"
+#include "SemposOverlapping.h"
+
+using namespace std;
+
+// Configure the scorer from its config string:
+//   overlapping - formula name handed to the factory (default "cap-micro"),
+//   debug       - "1" turns the debug flag on (default "0"),
+//   weightsfile - optional file of item weights, loaded when non-empty.
+SemposScorer::SemposScorer(const string& config)
+  : StatisticsBasedScorer("SEMPOS", config),
+    m_ovr(SemposOverlappingFactory::GetOverlapping(getConfig("overlapping", "cap-micro"),this)),
+    m_enable_debug(false)
+{
+  m_enable_debug = (getConfig("debug", "0") == "1");
+
+  m_semposMap.clear();
+
+  const string weightsfile = getConfig("weightsfile", "");
+  if (!weightsfile.empty()) {
+    loadWeights(weightsfile);
+  }
+}
+
+SemposScorer::~SemposScorer() {}
+
+// Replace the stored references with the contents of referenceFiles.
+// Each file contributes one list of encoded sentences, one per line, after
+// the line has gone through preprocessSentence(). Throws runtime_error when a
+// file cannot be opened.
+void SemposScorer::setReferenceFiles(const vector<string>& referenceFiles)
+{
+  // Start from an empty reference set.
+  m_ref_sentences.clear();
+
+  for (size_t rid = 0; rid < referenceFiles.size(); ++rid) {
+    const string& path = referenceFiles[rid];
+    ifstream refin(path.c_str());
+    if (!refin) {
+      throw runtime_error("Unable to open: " + path);
+    }
+
+    // The list just appended sits at index rid because the set was cleared above.
+    m_ref_sentences.push_back(vector<sentence_t>());
+    vector<sentence_t>& encodedRefs = m_ref_sentences.back();
+
+    string line;
+    while (getline(refin, line)) {
+      str_sentence_t tokens;
+      splitSentence(preprocessSentence(line), tokens);
+
+      sentence_t encoded;
+      encodeSentence(tokens, encoded);
+
+      encodedRefs.push_back(encoded);
+    }
+  }
+}
+
+// Compute the statistics of candidate sentence `text` against reference
+// sentence number `sid` and store them in `entry`.
+// With one reference its statistics are used directly. With several, the
+// statistics of the reference that gives the highest score are kept (the
+// first one wins on ties).
+// Throws runtime_error if `sid` is not a valid sentence index for a reference.
+void SemposScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
+{
+  vector<ScoreStatsType> stats;
+
+  const string& sentence = preprocessSentence(text);
+  str_sentence_t splitCandSentence;
+  splitSentence(sentence, splitCandSentence);
+
+  sentence_t encodedCandSentence;
+  encodeSentence(splitCandSentence, encodedCandSentence);
+
+  // Indexing a reference list past its end would be undefined behaviour.
+  for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
+    if (sid >= m_ref_sentences[rid].size()) {
+      throw runtime_error("Sentence id is out of range for the reference set");
+    }
+  }
+
+  if (m_ref_sentences.size() == 1) {
+    stats = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[0][sid]);
+  } else {
+    float best = -1.0f;
+    for (size_t rid = 0; rid < m_ref_sentences.size(); ++rid) {
+      const vector<ScoreStatsType>& tmp = m_ovr->prepareStats(encodedCandSentence, m_ref_sentences[rid][sid]);
+      const float score = m_ovr->calculateScore(tmp);
+      if (score > best) {
+        // Remember the new maximum; without this the last reference always won.
+        best = score;
+        stats = tmp;
+      }
+    }
+  }
+  entry.set(stats);
+}
+
+// Break a preprocessed sentence into (item, class) string pairs.
+// The sentence is split on ' ' and every token on '|'; each token must give
+// exactly two fields, otherwise runtime_error is thrown. The output vector is
+// emptied before it is filled.
+void SemposScorer::splitSentence(const string& sentence, str_sentence_t& splitSentence)
+{
+  splitSentence.clear();
+
+  vector<string> tokens;
+  split(sentence, ' ', tokens);
+
+  for (size_t t = 0; t < tokens.size(); ++t) {
+    vector<string> factors;
+    split(tokens[t], '|', factors);
+    if (factors.size() != 2) {
+      throw runtime_error("Sempos scorer accepts two factors (item|class)");
+    }
+    splitSentence.push_back(make_pair(factors[0], factors[1]));
+  }
+}
+
+// Map every (item, class) string pair to an integer pair and insert it into
+// encodedSentence. Pairs whose class encodes to a negative value are dropped.
+// encodedSentence is not cleared here; new pairs are added to what it holds.
+void SemposScorer::encodeSentence(const str_sentence_t& sentence, sentence_t& encodedSentence)
+{
+  for (str_sentence_it word = sentence.begin(); word != sentence.end(); ++word) {
+    // The item is encoded even when the pair ends up being dropped.
+    const int itemId = encodeString(word->first);
+    const int classId = encodeSempos(word->second);
+    if (classId < 0) continue;
+    encodedSentence.insert(make_pair(itemId, classId));
+  }
+}
+
+// Return the integer id of str, assigning the next free id (the current size
+// of m_stringMap) the first time a string is seen.
+int SemposScorer::encodeString(const string& str)
+{
+  const int nextId = static_cast<int>(m_stringMap.size());
+  // insert() leaves an existing entry untouched and returns an iterator to it,
+  // so known strings keep their id and new ones receive nextId.
+  return m_stringMap.insert(make_pair(str, nextId)).first->second;
+}
+
+// Return the integer id of a class label.
+// "-" yields -1. A known label returns its stored id; a new label receives the
+// next free id. Throws runtime_error when kMaxNOC labels are already
+// registered and another new one arrives.
+int SemposScorer::encodeSempos(const string& sempos)
+{
+  if (sempos == "-") return -1;
+
+  const encoding_it known = m_semposMap.find(sempos);
+  if (known != m_semposMap.end()) {
+    return known->second;
+  }
+
+  const int classNumber = static_cast<int>(m_semposMap.size());
+  if (classNumber == kMaxNOC) {
+    throw std::runtime_error("Number of classes is greater than kMaxNOC");
+  }
+  m_semposMap[sempos] = classNumber;
+  return classNumber;
+}
+
+// Weight of an encoded item: the value stored in weightsMap, or 1 when the
+// item has no entry there.
+float SemposScorer::weight(int item) const
+{
+  const std::map<int,float>::const_iterator entry = weightsMap.find(item);
+  return (entry == weightsMap.end()) ? 1.0f : entry->second;
+}
+
+// Load item weights from a text file with one "item<TAB>weight" row per line.
+// Empty lines are skipped. Each item is encoded with encodeString() and its
+// weight (parsed with atof) stored in weightsMap.
+// Throws runtime_error on a row that does not have exactly two fields.
+// Prints a message and terminates the process if the file cannot be opened.
+void SemposScorer::loadWeights(const string& weightsfile)
+{
+  ifstream myfile(weightsfile.c_str(), ifstream::in);
+  if (!myfile.is_open())
+  {
+    cerr << "Unable to open file "<< weightsfile << endl;
+    exit(1);
+  }
+
+  string line;
+  // Test the read itself so that a failed getline is never processed as a row.
+  while (getline(myfile, line))
+  {
+    if (line.empty()) continue;
+    vector<string> fields;
+    split(line, '\t', fields);
+    if (fields.size() != 2) throw std::runtime_error("Bad format of a row in weights file.");
+    const int encoded = encodeString(fields[0]);
+    const float weight = static_cast<float>(atof(fields[1].c_str()));
+    weightsMap[encoded] = weight;
+  }
+  myfile.close();
+}
diff --git a/mert/SemposScorer.h b/mert/SemposScorer.h
new file mode 100644
index 000000000..e0ab84768
--- /dev/null
+++ b/mert/SemposScorer.h
@@ -0,0 +1,64 @@
+#ifndef MERT_SEMPOSSCORER_H_
+#define MERT_SEMPOSSCORER_H_
+
+#include <map>
+#include <string>
+#include <utility>
+#include <vector>
+#include <boost/scoped_ptr.hpp>
+
+#include "Scorer.h"
+
+// NOTE: This header should be included in .cpp file
+// because SemposScorer wants to know what actual SemposOverlapping type is
+// when we implement the scorer in .cpp file.
+// However, currently SemposScorer uses a bunch of typedefs, which are
+// used in SemposScorer as well as inherited SemposOverlapping classes.
+#include "SemposOverlapping.h"
+
+/**
+ * This class represents sempos based metrics.
+ *
+ * Score computation is forwarded to the SemposOverlapping object held in m_ovr.
+ */
+class SemposScorer: public StatisticsBasedScorer
+{
+public:
+  explicit SemposScorer(const std::string& config);
+  ~SemposScorer();
+
+  virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
+  virtual void prepareStats(std::size_t sindex, const std::string& text, ScoreStats& entry);
+  // The two functions below simply forward to m_ovr.
+  virtual std::size_t NumberOfScores() const { return m_ovr->NumberOfScores(); }
+  virtual float calculateScore(const std::vector<int>& comps) const {
+    return m_ovr->calculateScore(comps);
+  }
+
+  bool EnableDebug() const { return m_enable_debug; }
+
+  // Returns the weight loaded for 'item', or 1.0f when none was loaded.
+  float weight(int item) const;
+
+private:
+  boost::scoped_ptr<SemposOverlapping> m_ovr;
+  std::vector<std::vector<sentence_t> > m_ref_sentences;
+
+  typedef std::map<std::string, int> encoding_t;
+  typedef encoding_t::iterator encoding_it;
+
+  encoding_t m_semposMap;  // sempos tag -> class number (see encodeSempos)
+  encoding_t m_stringMap;  // string -> integer id (see encodeString)
+  bool m_enable_debug;
+
+  void splitSentence(const std::string& sentence, str_sentence_t& splitSentence);
+  void encodeSentence(const str_sentence_t& sentence, sentence_t& encodedSentence);
+  int encodeString(const std::string& str);
+  int encodeSempos(const std::string& sempos);
+
+  std::map<int, float> weightsMap;  // encodeString() id -> weight
+
+  // Spelled std::string like every other declaration in this header, so it
+  // does not depend on a using-directive leaking in from an included header.
+  void loadWeights(const std::string& weightsfile);
+
+  // no copying allowed.
+  SemposScorer(const SemposScorer&);
+  SemposScorer& operator=(const SemposScorer&);
+};
+
+#endif // MERT_SEMPOSSCORER_H_
diff --git a/mert/Singleton.h b/mert/Singleton.h
new file mode 100644
index 000000000..9fef3e639
--- /dev/null
+++ b/mert/Singleton.h
@@ -0,0 +1,33 @@
+#ifndef MERT_SINGLETON_H_
+#define MERT_SINGLETON_H_
+
+#include <cstdlib>
+
+// thread *un*safe singleton.
+// TODO: replace this with thread-safe singleton.
+//
+// Lazily creates one heap-allocated T on the first GetInstance() call.
+// Delete() destroys it, after which GetInstance() creates a new object.
+template <typename T>
+class Singleton {
+ public:
+  // Returns the shared instance, default-constructing it on first use.
+  static T* GetInstance() {
+    if (!m_instance) {
+      m_instance = new T;
+    }
+    return m_instance;
+  }
+
+  // Destroys the shared instance (if any) and resets the pointer.
+  static void Delete() {
+    delete m_instance;  // deleting a null pointer is a no-op
+    m_instance = NULL;
+  }
+
+ private:
+  Singleton();  // private, and no definition appears in this header
+  static T* m_instance;
+};
+
+template <typename T>
+T* Singleton<T>::m_instance = NULL;
+
+#endif // MERT_SINGLETON_H_
diff --git a/mert/SingletonTest.cpp b/mert/SingletonTest.cpp
new file mode 100644
index 000000000..2c44bdc1f
--- /dev/null
+++ b/mert/SingletonTest.cpp
@@ -0,0 +1,27 @@
+#include "Singleton.h"
+
+#define BOOST_TEST_MODULE MertSingleton
+#include <boost/test/unit_test.hpp>
+
+namespace {
+
+// Number of Instance objects constructed so far.
+static int g_count = 0;
+
+// Helper type whose constructor bumps g_count.
+class Instance {
+ public:
+ Instance() { ++g_count; }
+ ~Instance() {}
+};
+
+} // namespace
+
+// Repeated GetInstance() calls must return the same pointer and must have
+// constructed Instance exactly once.
+BOOST_AUTO_TEST_CASE(singleton_basic) {
+ Instance* instance1 = Singleton<Instance>::GetInstance();
+ Instance* instance2 = Singleton<Instance>::GetInstance();
+ Instance* instance3 = Singleton<Instance>::GetInstance();
+ BOOST_REQUIRE(instance1 == instance2);
+ BOOST_REQUIRE(instance2 == instance3);
+ BOOST_CHECK_EQUAL(1, g_count);
+
+ // Release the instance created above.
+ Singleton<Instance>::Delete();
+}
diff --git a/mert/TERsrc/alignmentStruct.cpp b/mert/TER/alignmentStruct.cpp
index 15b4a8032..15b4a8032 100644
--- a/mert/TERsrc/alignmentStruct.cpp
+++ b/mert/TER/alignmentStruct.cpp
diff --git a/mert/TERsrc/alignmentStruct.h b/mert/TER/alignmentStruct.h
index 27e8c35d3..9e9a75468 100644
--- a/mert/TERsrc/alignmentStruct.h
+++ b/mert/TER/alignmentStruct.h
@@ -1,6 +1,5 @@
-#ifndef __TERCPPALIGNMENTSTRUCT_H__
-#define __TERCPPALIGNMENTSTRUCT_H__
-
+#ifndef MERT_TER_ALIGNMENT_STRUCT_H_
+#define MERT_TER_ALIGNMENT_STRUCT_H_
#include <vector>
#include <stdio.h>
@@ -8,7 +7,6 @@
#include <sstream>
#include "tools.h"
-
using namespace std;
using namespace Tools;
@@ -44,4 +42,4 @@ public:
}
-#endif // __TERCPPALIGNMENTSTRUCT_H__
+#endif // MERT_TER_ALIGNMENT_STRUCT_H_
diff --git a/mert/TERsrc/bestShiftStruct.h b/mert/TER/bestShiftStruct.h
index 141ebdeb8..bfebe3b1e 100644
--- a/mert/TERsrc/bestShiftStruct.h
+++ b/mert/TER/bestShiftStruct.h
@@ -1,5 +1,5 @@
-#ifndef __BESTSHIFTSTRUCT_H__
-#define __BESTSHIFTSTRUCT_H__
+#ifndef MERT_TER_BEST_SHIFT_STRUCT_H_
+#define MERT_TER_BEST_SHIFT_STRUCT_H_
#include <vector>
#include <stdio.h>
@@ -47,4 +47,4 @@ public:
}
-#endif // __BESTSHIFTSTRUCT_H__
+#endif // MERT_TER_BEST_SHIFT_STRUCT_H_
diff --git a/mert/TERsrc/hashMap.cpp b/mert/TER/hashMap.cpp
index 469167aaa..469167aaa 100644
--- a/mert/TERsrc/hashMap.cpp
+++ b/mert/TER/hashMap.cpp
diff --git a/mert/TERsrc/hashMap.h b/mert/TER/hashMap.h
index c3e4578e5..85020d041 100644
--- a/mert/TERsrc/hashMap.h
+++ b/mert/TER/hashMap.h
@@ -2,8 +2,8 @@
* Generic hashmap manipulation functions
*/
-#ifndef __HASHMAP_H__
-#define __HASHMAP_H__
+#ifndef MERT_TER_HASHMAP_H_
+#define MERT_TER_HASHMAP_H_
#include "stringHasher.h"
#include <vector>
@@ -40,4 +40,4 @@ public:
}
-#endif // __HASHMAP_H__
+#endif // MERT_TER_HASHMAP_H_
diff --git a/mert/TERsrc/hashMapInfos.cpp b/mert/TER/hashMapInfos.cpp
index 9cd431196..9cd431196 100644
--- a/mert/TERsrc/hashMapInfos.cpp
+++ b/mert/TER/hashMapInfos.cpp
diff --git a/mert/TERsrc/hashMapInfos.h b/mert/TER/hashMapInfos.h
index f4a46acf8..8b56e9d02 100644
--- a/mert/TERsrc/hashMapInfos.h
+++ b/mert/TER/hashMapInfos.h
@@ -1,8 +1,8 @@
/*
* Generic hashmap manipulation functions
*/
-#ifndef __HASHMAPINFOS_H__
-#define __HASHMAPINFOS_H__
+#ifndef MERT_TER_HASHMAP_INFOS_H_
+#define MERT_TER_HASHMAP_INFOS_H_
#include "infosHasher.h"
#include <vector>
@@ -39,4 +39,4 @@ public:
}
-#endif // __HASHMAPINFOS_H__
+#endif // MERT_TER_HASHMAP_INFOS_H_
diff --git a/mert/TERsrc/hashMapStringInfos.cpp b/mert/TER/hashMapStringInfos.cpp
index 0fbb0a98a..0fbb0a98a 100644
--- a/mert/TERsrc/hashMapStringInfos.cpp
+++ b/mert/TER/hashMapStringInfos.cpp
diff --git a/mert/TERsrc/hashMapStringInfos.h b/mert/TER/hashMapStringInfos.h
index 7912be0a2..870274f3d 100644
--- a/mert/TERsrc/hashMapStringInfos.h
+++ b/mert/TER/hashMapStringInfos.h
@@ -1,8 +1,8 @@
/*
* Generic hashmap manipulation functions
*/
-#ifndef __HASHMAPSTRINGINFOS_H__
-#define __HASHMAPSTRINGINFOS_H__
+#ifndef MERT_TER_HASHMAP_STRING_INFOS_H_
+#define MERT_TER_HASHMAP_STRING_INFOS_H_
#include "stringInfosHasher.h"
#include <vector>
@@ -39,4 +39,4 @@ public:
}
-#endif // __HASHMAPSTRINGINFOS_H__
+#endif // MERT_TER_HASHMAP_STRING_INFOS_H_
diff --git a/mert/TERsrc/infosHasher.cpp b/mert/TER/infosHasher.cpp
index 654b0b26f..654b0b26f 100644
--- a/mert/TERsrc/infosHasher.cpp
+++ b/mert/TER/infosHasher.cpp
diff --git a/mert/TERsrc/infosHasher.h b/mert/TER/infosHasher.h
index 8bc2ccd00..02a32280b 100644
--- a/mert/TERsrc/infosHasher.h
+++ b/mert/TER/infosHasher.h
@@ -1,5 +1,5 @@
-#ifndef __INFOSHASHER_H__
-#define __INFOSHASHER_H__
+#ifndef MERT_TER_INFO_SHASHER_H_
+#define MERT_TER_INFO_SHASHER_H_
#include <string>
#include <stdio.h>
@@ -28,4 +28,4 @@ public:
}
-#endif // __INFOSHASHER_H__
+#endif // MERT_TER_INFO_SHASHER_H_
diff --git a/mert/TERsrc/stringHasher.cpp b/mert/TER/stringHasher.cpp
index 24fde0e32..24fde0e32 100644
--- a/mert/TERsrc/stringHasher.cpp
+++ b/mert/TER/stringHasher.cpp
diff --git a/mert/TERsrc/stringHasher.h b/mert/TER/stringHasher.h
index 0894812f0..897bd9ff5 100644
--- a/mert/TERsrc/stringHasher.h
+++ b/mert/TER/stringHasher.h
@@ -1,5 +1,5 @@
-#ifndef __STRINGHASHER_H__
-#define __STRINGHASHER_H__
+#ifndef MERT_TER_STRING_HASHER_H_
+#define MERT_TER_STRING_HASHER_H_
#include <string>
#include <iostream>
@@ -25,4 +25,4 @@ public:
}
-#endif // __STRINGHASHER_H__
+#endif // MERT_TER_STRING_HASHER_H_
diff --git a/mert/TERsrc/stringInfosHasher.cpp b/mert/TER/stringInfosHasher.cpp
index 3e02e7a20..3e02e7a20 100644
--- a/mert/TERsrc/stringInfosHasher.cpp
+++ b/mert/TER/stringInfosHasher.cpp
diff --git a/mert/TERsrc/stringInfosHasher.h b/mert/TER/stringInfosHasher.h
index e9324cc47..c1b891662 100644
--- a/mert/TERsrc/stringInfosHasher.h
+++ b/mert/TER/stringInfosHasher.h
@@ -1,5 +1,5 @@
-#ifndef __STRINGINFOSHASHER_H__
-#define __STRINGINFOSHASHER_H__
+#ifndef MERT_TER_STRING_INFOS_HASHER_H_
+#define MERT_TER_STRING_INFOS_HASHER_H_
#include <string>
#include <iostream>
@@ -25,4 +25,4 @@ public:
}
-#endif // __STRINGINFOSHASHER_H__
+#endif // MERT_TER_STRING_INFOS_HASHER_H_
diff --git a/mert/TERsrc/terAlignment.cpp b/mert/TER/terAlignment.cpp
index 87be53b11..87be53b11 100644
--- a/mert/TERsrc/terAlignment.cpp
+++ b/mert/TER/terAlignment.cpp
diff --git a/mert/TERsrc/terAlignment.h b/mert/TER/terAlignment.h
index bca00ead3..c8c82eac8 100644
--- a/mert/TERsrc/terAlignment.h
+++ b/mert/TER/terAlignment.h
@@ -1,5 +1,5 @@
-#ifndef __TERCPPTERALIGNMENT_H__
-#define __TERCPPTERALIGNMENT_H__
+#ifndef MERT_TER_TER_ALIGNMENT_H_
+#define MERT_TER_TER_ALIGNMENT_H_
#include <vector>
#include <stdio.h>
@@ -48,4 +48,4 @@ public:
}
-#endif // __TERCPPTERALIGNMENT_H__
+#endif // MERT_TER_TER_ALIGNMENT_H_
diff --git a/mert/TERsrc/terShift.cpp b/mert/TER/terShift.cpp
index 428803849..428803849 100644
--- a/mert/TERsrc/terShift.cpp
+++ b/mert/TER/terShift.cpp
diff --git a/mert/TERsrc/terShift.h b/mert/TER/terShift.h
index a54ba633d..679a7c8bb 100644
--- a/mert/TERsrc/terShift.h
+++ b/mert/TER/terShift.h
@@ -1,5 +1,5 @@
-#ifndef __TERCPPTERSHIFT_H__
-#define __TERCPPTERSHIFT_H__
+#ifndef MERT_TER_TER_SHIFT_H_
+#define MERT_TER_TER_SHIFT_H_
#include <vector>
#include <stdio.h>
@@ -41,4 +41,4 @@ public:
}
-#endif // __TERCPPTERSHIFT_H__
+#endif // MERT_TER_TER_SHIFT_H_
diff --git a/mert/TERsrc/tercalc.cpp b/mert/TER/tercalc.cpp
index e16f692e8..e16f692e8 100644
--- a/mert/TERsrc/tercalc.cpp
+++ b/mert/TER/tercalc.cpp
diff --git a/mert/TERsrc/tercalc.h b/mert/TER/tercalc.h
index cf205ccbb..9e1a01f65 100644
--- a/mert/TERsrc/tercalc.h
+++ b/mert/TER/tercalc.h
@@ -1,5 +1,5 @@
-#ifndef _TERCPPTERCALC_H___
-#define _TERCPPTERCALC_H___
+#ifndef MERT_TER_TER_CALC_H_
+#define MERT_TER_TER_CALC_H_
#include <vector>
#include <stdio.h>
@@ -79,4 +79,4 @@ public:
}
-#endif // _TERCPPTERCALC_H___
+#endif // MERT_TER_TER_CALC_H_
diff --git a/mert/TERsrc/tools.cpp b/mert/TER/tools.cpp
index 2d910ec05..2d910ec05 100644
--- a/mert/TERsrc/tools.cpp
+++ b/mert/TER/tools.cpp
diff --git a/mert/TERsrc/tools.h b/mert/TER/tools.h
index df681a2b2..6f78b9a6a 100644
--- a/mert/TERsrc/tools.h
+++ b/mert/TER/tools.h
@@ -1,5 +1,5 @@
-#ifndef __TERCPPTOOLS_H__
-#define __TERCPPTOOLS_H__
+#ifndef MERT_TER_TOOLS_H_
+#define MERT_TER_TOOLS_H_
#include <vector>
#include <iostream>
@@ -62,4 +62,4 @@ param copyParam(param p);
}
-#endif // __TERCPPTOOLS_H__
+#endif // MERT_TER_TOOLS_H_
diff --git a/mert/TODO b/mert/TODO
index 2559e78b5..21b4ce04e 100644
--- a/mert/TODO
+++ b/mert/TODO
@@ -4,3 +4,21 @@
- this may make use of 'evaluator', soon to be added by Matous Machacek
- check that --pairwise-ranked is compatible with all optimization metrics
+
+- Replace the standard rand() currently used in MERT and PRO with better
+ random generators such as Boost's random generators (e.g., boost::mt19937).
+ - create a Random class to hide the details, i.e., how to generate
+ random numbers, which allows us to use custom random generators more
+ easily.
+
+ Pros:
+ - In MERT, you might want to use the random restarting technique to avoid
+ local optima.
+ - PRO uses a sampling technique to choose candidate translation pairs
+ from N-best lists, which means the choice of random generators seems to
+ be important.
+
+ Cons:
+ - This change will require us to re-create the truth results for regression
+ testing related to MERT and PRO because the new random generator will
+ generate different numbers than the current generator does.
diff --git a/mert/TerScorer.cpp b/mert/TerScorer.cpp
index ac029b027..2cfb19275 100644
--- a/mert/TerScorer.cpp
+++ b/mert/TerScorer.cpp
@@ -5,8 +5,8 @@
#include <stdexcept>
#include "ScoreStats.h"
-#include "TERsrc/tercalc.h"
-#include "TERsrc/terAlignment.h"
+#include "TER/tercalc.h"
+#include "TER/terAlignment.h"
#include "Util.h"
using namespace TERCpp;
@@ -33,6 +33,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
string line;
int sid = 0;
while ( getline ( in, line ) ) {
+ line = this->preprocessSentence(line);
vector<int> tokens;
TokenizeAndEncode(line, tokens);
m_references.push_back ( tokens );
@@ -48,6 +49,7 @@ void TerScorer::setReferenceFiles ( const vector<string>& referenceFiles )
void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry )
{
+ string sentence = this->preprocessSentence(text);
terAlignment result;
result.numEdits = 0.0 ;
@@ -74,7 +76,7 @@ void TerScorer::prepareStats ( size_t sid, const string& text, ScoreStats& entry
averageLength+=(double)m_multi_references.at ( incRefsBis ).at ( sid ).size();
}
averageLength=averageLength/( double ) m_multi_references.size();
- TokenizeAndEncode(text, testtokens);
+ TokenizeAndEncode(sentence, testtokens);
terCalc * evaluation=new terCalc();
evaluation->setDebugMode ( false );
terAlignment tmp_result = evaluation->TER ( reftokens, testtokens );
diff --git a/mert/TerScorer.h b/mert/TerScorer.h
index 7ffb4c741..46b02924e 100644
--- a/mert/TerScorer.h
+++ b/mert/TerScorer.h
@@ -1,5 +1,5 @@
-#ifndef __TERSCORER_H__
-#define __TERSCORER_H__
+#ifndef MERT_TER_SCORER_H_
+#define MERT_TER_SCORER_H_
#include <iostream>
#include <set>
@@ -54,4 +54,4 @@ private:
TerScorer& operator=(const TerScorer&);
};
-#endif // __TERSCORER_H__
+#endif // MERT_TER_SCORER_H_
diff --git a/mert/Timer.cpp b/mert/Timer.cpp
index 373eb4a2e..5235edb04 100644
--- a/mert/Timer.cpp
+++ b/mert/Timer.cpp
@@ -1,73 +1,104 @@
#include "Timer.h"
#include "Util.h"
-double Timer::elapsed_time()
-{
- time_t now;
- time(&now);
- return difftime(now, start_time);
-}
+#if !defined(_WIN32) && !defined(_WIN64)
+#include <sys/resource.h>
+#include <sys/time.h>
+#endif
-double Timer::get_elapsed_time()
-{
- return elapsed_time();
+namespace {
+
+#if !defined(_WIN32) && !defined(_WIN64)
+// Converts a timeval (seconds + microseconds) to a single microsecond count.
+uint64_t GetMicroSeconds(const struct timeval& tv) {
+ return static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec;
}
-void Timer::start(const char* msg)
-{
- // Print an optional message, something like "Starting timer t";
- if (msg) TRACE_ERR( msg << std::endl);
+// Returns the current time reported by gettimeofday(), in microseconds.
+uint64_t GetTimeOfDayMicroSeconds() {
+  struct timeval now;
+  gettimeofday(&now, NULL);
+  return GetMicroSeconds(now);
+}
+#endif
- // Return immediately if the timer is already running
- if (running) return;
+} // namespace
- // Change timer status to running
- running = true;
+// Stores the user and system CPU times (in microseconds) reported by
+// getrusage(RUSAGE_SELF) into 'cpu_time'. Exits the process if getrusage()
+// fails. On Windows nothing is written and 'cpu_time' is left untouched.
+void Timer::GetCPUTimeMicroSeconds(Timer::CPUTime* cpu_time) const {
+#if !defined(_WIN32) && !defined(_WIN64)
+  struct rusage ru;
+  const int rc = getrusage(RUSAGE_SELF, &ru);
+  if (rc != 0) {
+    TRACE_ERR("Error occurred: getrusage().\n");
+    exit(1);
+  }
+  cpu_time->user_time = GetMicroSeconds(ru.ru_utime);
+  cpu_time->sys_time = GetMicroSeconds(ru.ru_stime);
+#else // Windows
+  // Not implemented yet.
+  // TODO: implement the Windows version using native APIs.
+#endif
+}
- // Set the start time;
- time(&start_time);
+// Elapsed CPU time in seconds: the microsecond figure scaled by 1e-6.
+double Timer::get_elapsed_cpu_time() const {
+ return static_cast<double>(get_elapsed_cpu_time_microseconds()) * 1e-6;
}
-/***
- * Turn the timer off and start it again from 0. Print an optional message.
- */
-/*
-inline void Timer::restart(const char* msg)
-{
- // Print an optional message, something like "Restarting timer t";
- if (msg) TRACE_ERR( msg << std::endl;
+// Returns user + system CPU time, in microseconds, accumulated since the
+// values stored in m_start_time were recorded.
+uint64_t Timer::get_elapsed_cpu_time_microseconds() const {
+  CPUTime now;
+  GetCPUTimeMicroSeconds(&now);
+  const uint64_t user_delta = now.user_time - m_start_time.user_time;
+  const uint64_t sys_delta = now.sys_time - m_start_time.sys_time;
+  return user_delta + sys_delta;
+}
- // Set the timer status to running
- running = true;
+// Elapsed wall-clock time in seconds: the microsecond figure scaled by 1e-6.
+double Timer::get_elapsed_wall_time() const {
+ return static_cast<double>(get_elapsed_wall_time_microseconds()) * 1e-6;
+}
- // Set the accumulated time to 0 and the start time to now
- acc_time = 0;
- start_clock = clock();
- start_time = time(0);
+// Microseconds between the wall-clock time stored in m_wall and now.
+// NOTE(review): GetTimeOfDayMicroSeconds() is only defined in the non-Windows
+// branch of this file - confirm how this is meant to build on Windows.
+uint64_t Timer::get_elapsed_wall_time_microseconds() const {
+ return GetTimeOfDayMicroSeconds() - m_wall;
}
-*/
-/***
- * Stop the timer and print an optional message.
- */
-/*
-inline void Timer::stop(const char* msg)
+void Timer::start(const char* msg)
{
- // Print an optional message, something like "Stopping timer t";
- check(msg);
-
- // Recalculate and store the total accumulated time up until now
- if (running) acc_time += elapsed_time();
+ // Print an optional message, something like "Starting timer t";
+ if (msg) TRACE_ERR( msg << std::endl);
+ if (m_is_running) return;
+ m_is_running = true;
+ m_wall = GetTimeOfDayMicroSeconds();
+ GetCPUTimeMicroSeconds(&m_start_time);
+}
- running = false;
+// Resets the reference wall-clock and CPU times to "now" and marks the timer
+// as running, so that restart() on a timer that was never started behaves
+// like start(), as the declaration in Timer.h documents.
+// Prints 'msg' first when it is non-null.
+void Timer::restart(const char* msg)
+{
+  if (msg) {
+    TRACE_ERR(msg << std::endl);
+  }
+  // Without this, is_running() stays false after restart() on a fresh timer
+  // and check()/operator<< report that the timer is not running.
+  m_is_running = true;
+  m_wall = GetTimeOfDayMicroSeconds();
+  GetCPUTimeMicroSeconds(&m_start_time);
}
-*/
void Timer::check(const char* msg)
{
// Print an optional message, something like "Checking timer t";
if (msg) TRACE_ERR( msg << " : ");
-// TRACE_ERR( "[" << std::setiosflags(std::ios::fixed) << std::setprecision(2) << (running ? elapsed_time() : 0) << "] seconds\n");
- TRACE_ERR( "[" << (running ? elapsed_time() : 0) << "] seconds\n");
+ if (m_is_running) {
+ TRACE_ERR("[Wall " << get_elapsed_wall_time()
+ << " CPU " << get_elapsed_cpu_time() << "] seconds.\n");
+ } else {
+ TRACE_ERR("WARNING: the timer is not running.\n");
+ }
+}
+
+// Formats the elapsed wall-clock, user CPU, system CPU and total (user + sys)
+// CPU time, each in seconds, relative to the values recorded in m_wall and
+// m_start_time.
+std::string Timer::ToString() const {
+  const double wall_sec = get_elapsed_wall_time();
+
+  CPUTime now;
+  GetCPUTimeMicroSeconds(&now);
+  const double user_sec = (now.user_time - m_start_time.user_time) * 1e-6;
+  const double sys_sec = (now.sys_time - m_start_time.sys_time) * 1e-6;
+
+  std::stringstream out;
+  out << "wall " << wall_sec
+      << " sec. user " << user_sec
+      << " sec. sys " << sys_sec
+      << " sec. total " << user_sec + sys_sec
+      << " sec.";
+  return out.str();
}
diff --git a/mert/Timer.h b/mert/Timer.h
index 403547620..7b1101b50 100644
--- a/mert/Timer.h
+++ b/mert/Timer.h
@@ -1,46 +1,54 @@
-#ifndef TIMER_H
-#define TIMER_H
+#ifndef MERT_TIMER_H_
+#define MERT_TIMER_H_
-#include <ctime>
-#include <iostream>
-#include <iomanip>
+#include <ostream>
+#include <string>
+#include <stdint.h>
class Timer
{
- /**
- * Allow timers to be printed to ostreams using the syntax 'os << t'
- * for an ostream 'os' and a timer 't'. For example, "cout << t" will
- * print out the total amount of time 't' has been "running".
- */
- friend std::ostream& operator<<(std::ostream& os, Timer& t);
+ private:
+ // Time values are stored in microseconds.
+ struct CPUTime {
+ uint64_t user_time; // user CPU time
+ uint64_t sys_time; // system CPU time
-private:
- bool running;
- time_t start_time;
+ CPUTime() : user_time(0), sys_time(0) { }
+ };
- /**
- * Return the total time that the timer has been in the "running"
- * state since it was first "started" or last "restarted". For
- * "short" time periods (less than an hour), the actual cpu time
- * used is reported instead of the elapsed time.
- * TODO in seconds?
- */
- double elapsed_time();
+ void GetCPUTimeMicroSeconds(CPUTime* cpu_time) const;
+
+ bool m_is_running;
+ uint64_t m_wall; // wall-clock time in microseconds
+ CPUTime m_start_time;
+
+ // No copying allowed
+ Timer(const Timer&);
+ void operator=(const Timer&);
-public:
+ public:
/**
- * 'running' is initially false. A timer needs to be explicitly started
- * using 'start' or 'restart'.
+ * 'm_is_running' is initially false. A timer needs to be explicitly started
+ * using 'start'.
*/
- Timer() : running(false), start_time(0) { }
+ Timer()
+ : m_is_running(false),
+ m_wall(0),
+ m_start_time() {}
+
+ ~Timer() {}
/**
* Start a timer. If it is already running, let it continue running.
* Print an optional message.
*/
void start(const char* msg = 0);
-// void restart(const char* msg = 0);
-// void stop(const char* msg = 0);
+
+ /**
+ * Restart the timer iff the timer is already running.
+ * if the timer is not running, just start the timer.
+ */
+ void restart(const char* msg = 0);
/**
* Print out an optional message followed by the current timer timing.
@@ -48,20 +56,50 @@ public:
void check(const char* msg = 0);
/**
- * Return the total time that the timer has been in the "running"
- * state since it was first "started" or last "restarted". For
- * "short" time periods (less than an hour), the actual cpu time
+ * Return whether the timer is in the "running" state.
+ */
+ bool is_running() const { return m_is_running; }
+
+ /**
+ * Return the total time in seconds that the timer has been in the
+ * "running" state since it was first "started" or last "restarted".
+ * For "short" time periods (less than an hour), the actual cpu time
* used is reported instead of the elapsed time.
- * This function is the public version of elapsed_time()
*/
- double get_elapsed_time();
+ double get_elapsed_cpu_time() const;
+
+ /**
+ * Return the total time in microseconds.
+ */
+ uint64_t get_elapsed_cpu_time_microseconds() const;
+
+ /**
+ * Get elapsed wall-clock time in seconds.
+ */
+ double get_elapsed_wall_time() const;
+
+ /**
+ * Get elapsed wall-clock time in microseconds.
+ */
+ uint64_t get_elapsed_wall_time_microseconds() const;
+
+ /**
+ * Return a string that has the user CPU time, system time, and total time.
+ */
+ std::string ToString() const;
};
-inline std::ostream& operator<<(std::ostream& os, Timer& t)
-{
- //os << std::setprecision(2) << std::setiosflags(std::ios::fixed) << (t.running ? t.elapsed_time() : 0);
- os << (t.running ? t.elapsed_time() : 0);
+/**
+ * Allow timers to be printed to ostreams using the syntax 'os << t'
+ * for an ostream 'os' and a timer 't'. For example, "cout << t" will
+ * print out the total amount of time 't' has been "running".
+ */
+inline std::ostream& operator<<(std::ostream& os, const Timer& t) {
+ if (t.is_running()) {
+ os << t.ToString();
+ } else {
+ os << "timer is not running.";
+ }
return os;
}
-#endif // TIMER_H
+#endif // MERT_TIMER_H_
diff --git a/mert/TimerTest.cpp b/mert/TimerTest.cpp
new file mode 100644
index 000000000..d9562a3df
--- /dev/null
+++ b/mert/TimerTest.cpp
@@ -0,0 +1,27 @@
+#include "Timer.h"
+
+#define BOOST_TEST_MODULE TimerTest
+#include <boost/test/unit_test.hpp>
+
+#include <string>
+#include <unistd.h>
+
+// Starts and then restarts a timer, checking after a short sleep each time
+// that it reports itself as running and that some wall-clock time elapsed.
+BOOST_AUTO_TEST_CASE(timer_basic_test) {
+ Timer timer;
+ const int sleep_time_microsec = 40; // ad-hoc microseconds to pass unit tests.
+
+ // After start(): running, and elapsed wall time is positive.
+ timer.start();
+ BOOST_REQUIRE(timer.is_running());
+ BOOST_REQUIRE(usleep(sleep_time_microsec) == 0);
+ BOOST_CHECK(timer.get_elapsed_wall_time() > 0.0);
+ BOOST_CHECK(timer.get_elapsed_wall_time_microseconds() > 0);
+
+ // Same expectations after restart() on an already running timer.
+ timer.restart();
+ BOOST_REQUIRE(timer.is_running());
+ BOOST_REQUIRE(usleep(sleep_time_microsec) == 0);
+ BOOST_CHECK(timer.get_elapsed_wall_time() > 0.0);
+ BOOST_CHECK(timer.get_elapsed_wall_time_microseconds() > 0);
+
+ // The textual report must not be empty.
+ const std::string s = timer.ToString();
+ BOOST_CHECK(!s.empty());
+}
diff --git a/mert/Types.h b/mert/Types.h
index 1d0fd0dd0..c65c6ffc2 100644
--- a/mert/Types.h
+++ b/mert/Types.h
@@ -1,5 +1,5 @@
-#ifndef TYPE_H
-#define TYPE_H
+#ifndef MERT_TYPE_H_
+#define MERT_TYPE_H_
#include <vector>
#include <map>
@@ -40,4 +40,4 @@ typedef vector<ScoreArray> scoredata_t;
typedef map<size_t, std::string> idx2name;
typedef map<std::string, size_t> name2idx;
-#endif // TYPE_H
+#endif // MERT_TYPE_H_
diff --git a/mert/Util.cpp b/mert/Util.cpp
index 3769c71e7..952aaf9aa 100644
--- a/mert/Util.cpp
+++ b/mert/Util.cpp
@@ -1,6 +1,6 @@
/*
* Util.cpp
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
@@ -11,29 +11,28 @@
using namespace std;
-// global variables
-Timer g_timer;
-
-int verbose = 0;
-
namespace {
+Timer g_timer;
+int g_verbose = 0;
+
bool FindDelimiter(const std::string &str, const std::string &delim, size_t *pos)
{
*pos = str.find(delim);
return *pos != std::string::npos ? true : false;
}
+
} // namespace
int verboselevel()
{
- return verbose;
+ return g_verbose;
}
int setverboselevel(int v)
{
- verbose = v;
- return verbose;
+ g_verbose = v;
+ return g_verbose;
}
size_t getNextPound(std::string &str, std::string &substr,
@@ -67,27 +66,12 @@ void Tokenize(const char *str, const char delim,
while (1) {
const char *begin = str;
while (*str != delim && *str) str++;
- res->push_back(std::string(begin, str));
+ if (begin != str) // Don't create empty string objects.
+ res->push_back(std::string(begin, str));
if (*str++ == 0) break;
}
}
-int swapbytes(char *p, int sz, int n)
-{
- char c, *l, *h;
-
- if((n < 1) || (sz < 2)) return 0;
- for (; n--; p += sz) {
- for (h = (l = p) + sz; --h > l; l++) {
- c = *h;
- *h = *l;
- *l = c;
- }
- }
- return 0;
-
-}
-
void ResetUserTime()
{
g_timer.start();
@@ -100,5 +84,5 @@ void PrintUserTime(const std::string &message)
double GetUserTime()
{
- return g_timer.get_elapsed_time();
+ return g_timer.get_elapsed_cpu_time();
}
diff --git a/mert/Util.h b/mert/Util.h
index da68685c3..cf99cdf6e 100644
--- a/mert/Util.h
+++ b/mert/Util.h
@@ -1,14 +1,15 @@
/*
* Util.h
- * met - Minimum Error Training
+ * mert - Minimum Error Rate Training
*
* Created by Nicola Bertoldi on 13/05/08.
*
*/
-#ifndef UTIL_H
-#define UTIL_H
+#ifndef MERT_UTIL_H_
+#define MERT_UTIL_H_
+#include <cmath>
#include <cstdlib>
#include <stdexcept>
#include <limits>
@@ -23,9 +24,6 @@
using namespace std;
-#define US_NOSET (numeric_limits<unsigned short>::max())
-#define MAX_LINE 1024
-
#ifdef TRACE_ENABLE
#define TRACE_ERR(str) { std::cerr << str; }
#else
@@ -37,6 +35,20 @@ const char kDefaultDelimiterSymbol[] = " ";
int verboselevel();
int setverboselevel(int v);
+
+const float kEPS = 0.0001f;
+
+/**
+ * Returns true iff |expected - actual| < round.
+ * On a mismatch both values are written to cerr and false is returned.
+ */
+template <typename T>
+bool IsAlmostEqual(T expected, T actual, float round=kEPS) {
+  // Qualified std::abs: the overload matching T is selected, instead of
+  // depending on which unqualified abs() happens to be visible (the C
+  // int abs(int) would truncate a floating-point difference).
+  if (std::abs(expected - actual) < round) {
+    return true;
+  }
+  cerr << "Fail: expected = " << expected
+       << " (actual = " << actual << ")" << endl;
+  return false;
+}
+
/**
* Find the specified delimiter for the string 'str', and 'str' is assigned
* to a substring object that starts at the position of first occurrence of
@@ -52,6 +64,12 @@ size_t getNextPound(std::string &str, std::string &substr,
void split(const std::string &s, char delim, std::vector<std::string> &elems);
+/**
+ * Split the string 'str' with specified delimiter 'delim' into tokens.
+ * The resulting tokens are set to 'res'.
+ *
+ * ex. "a,b,c" => {"a", "b", "c"}.
+ */
void Tokenize(const char *str, const char delim, std::vector<std::string> *res);
template<typename T>
@@ -63,6 +81,14 @@ inline T Scan(const std::string &input)
return ret;
}
+/**
+ * Returns true iff "str" ends with "suffix".
+ * e.g., Given str = "abc:" and suffix = ":", this function returns true.
+ *
+ * The whole of "suffix" is compared, so multi-character suffixes work, and
+ * an empty "str" only matches an empty "suffix". An empty "suffix" matches
+ * every string. "suffix" must not be a null pointer.
+ */
+inline bool EndsWith(const std::string& str, const char* suffix) {
+  const std::string tail(suffix);
+  // The size check comes first so that the offset below cannot underflow.
+  return str.size() >= tail.size() &&
+         str.compare(str.size() - tail.size(), tail.size(), tail) == 0;
+}
+
template<typename T>
inline std::string stringify(T x)
{
@@ -97,4 +123,4 @@ void ResetUserTime();
void PrintUserTime(const std::string &message);
double GetUserTime();
-#endif // UTIL_H
+#endif // MERT_UTIL_H_
diff --git a/mert/UtilTest.cpp b/mert/UtilTest.cpp
new file mode 100644
index 000000000..2101f7c8d
--- /dev/null
+++ b/mert/UtilTest.cpp
@@ -0,0 +1,76 @@
+#include "Util.h"
+
+#define BOOST_TEST_MODULE UtilTest
+#include <boost/test/unit_test.hpp>
+
+// Calls getNextPound() repeatedly until the input string is empty and checks
+// the extracted pieces, first with the default delimiter and then with ",".
+BOOST_AUTO_TEST_CASE(util_get_next_pound_test) {
+ {
+ // Default delimiter; the input has a trailing space.
+ std::string str("9 9 7 ");
+ std::string substr;
+ std::vector<std::string> res;
+
+ while (!str.empty()) {
+ getNextPound(str, substr);
+ res.push_back(substr);
+ }
+ BOOST_REQUIRE(res.size() == 3);
+ BOOST_CHECK_EQUAL("9", res[0]);
+ BOOST_CHECK_EQUAL("9", res[1]);
+ BOOST_CHECK_EQUAL("7", res[2]);
+ }
+
+ {
+ // Explicit "," delimiter.
+ std::string str("ref.0,ref.1,ref.2");
+ std::string substr;
+ std::vector<std::string> res;
+ const std::string delim(",");
+
+ while (!str.empty()) {
+ getNextPound(str, substr, delim);
+ res.push_back(substr);
+ }
+ BOOST_REQUIRE(res.size() == 3);
+ BOOST_CHECK_EQUAL("ref.0", res[0]);
+ BOOST_CHECK_EQUAL("ref.1", res[1]);
+ BOOST_CHECK_EQUAL("ref.2", res[2]);
+ }
+}
+
+// Checks Tokenize() on plain input and on input with a trailing delimiter;
+// in both cases no empty token is expected in the result.
+BOOST_AUTO_TEST_CASE(util_tokenize_test) {
+ {
+ std::vector<std::string> res;
+ Tokenize("9 9 7", ' ', &res);
+ BOOST_REQUIRE(res.size() == 3);
+ BOOST_CHECK_EQUAL("9", res[0]);
+ BOOST_CHECK_EQUAL("9", res[1]);
+ BOOST_CHECK_EQUAL("7", res[2]);
+ }
+
+ {
+ // Trailing space must not produce a fourth (empty) token.
+ std::vector<std::string> res;
+ Tokenize("9 8 7 ", ' ', &res);
+ BOOST_REQUIRE(res.size() == 3);
+ BOOST_CHECK_EQUAL("9", res[0]);
+ BOOST_CHECK_EQUAL("8", res[1]);
+ BOOST_CHECK_EQUAL("7", res[2]);
+ }
+
+ {
+ // Same with ',' as the delimiter.
+ std::vector<std::string> res;
+ Tokenize("ref.0,ref.1,", ',', &res);
+ BOOST_REQUIRE(res.size() == 2);
+ BOOST_CHECK_EQUAL("ref.0", res[0]);
+ BOOST_CHECK_EQUAL("ref.1", res[1]);
+ }
+}
+
+// Checks EndsWith() with single-character suffixes: it must hold only when
+// the suffix character is the last character of the string.
+BOOST_AUTO_TEST_CASE(util_ends_with_test) {
+ BOOST_CHECK(EndsWith("abc:", ":"));
+ BOOST_CHECK(EndsWith("a b c:", ":"));
+ BOOST_CHECK(!EndsWith("a", ":"));
+ BOOST_CHECK(!EndsWith("a:b", ":"));
+
+ BOOST_CHECK(EndsWith("ab ", " "));
+ BOOST_CHECK(!EndsWith("ab", " "));
+ BOOST_CHECK(!EndsWith("a b", " "));
+}
diff --git a/mert/Vocabulary.cpp b/mert/Vocabulary.cpp
new file mode 100644
index 000000000..40b04bf99
--- /dev/null
+++ b/mert/Vocabulary.cpp
@@ -0,0 +1,21 @@
+#include "Vocabulary.h"
+#include "Singleton.h"
+
+namespace mert {
+namespace {
+Vocabulary* g_vocab = NULL;
+} // namespace
+
+// Returns the vocabulary installed through SetVocabulary() when there is one,
+// and otherwise the Singleton<Vocabulary> instance.
+Vocabulary* VocabularyFactory::GetVocabulary() {
+  if (g_vocab != NULL) {
+    return g_vocab;
+  }
+  return Singleton<Vocabulary>::GetInstance();
+}
+
+// Stores 'vocab' in the file-local g_vocab pointer; only the pointer is kept.
+void VocabularyFactory::SetVocabulary(Vocabulary* vocab) {
+ g_vocab = vocab;
+}
+
+} // namespace mert
diff --git a/mert/Vocabulary.h b/mert/Vocabulary.h
new file mode 100644
index 000000000..12c8c1727
--- /dev/null
+++ b/mert/Vocabulary.h
@@ -0,0 +1,79 @@
+#ifndef MERT_VOCABULARY_H_
+#define MERT_VOCABULARY_H_
+
+#include <map>
+#include <string>
+
+namespace mert {
+
+/**
+ * An embarrassingly simple map to handle vocabularies to calculate
+ * various scores such as BLEU.
+ *
+ * TODO: replace this with more efficient data structure.
+ */
+class Vocabulary {
+ public:
+ typedef std::map<std::string, int>::iterator iterator;
+ typedef std::map<std::string, int>::const_iterator const_iterator;
+
+ Vocabulary() {}
+ virtual ~Vocabulary() {}
+
+ /**
+ * Returns the assigned id for the given "token". A token that is not yet
+ * known is added with the current vocabulary size as its id.
+ */
+ int Encode(const std::string& token) {
+ iterator it = m_vocab.find(token);
+ int encoded_token;
+ if (it == m_vocab.end()) {
+ // Add a new entry to the vocabulary.
+ encoded_token = static_cast<int>(m_vocab.size());
+ m_vocab[token] = encoded_token;
+ } else {
+ encoded_token = it->second;
+ }
+ return encoded_token;
+ }
+
+ /**
+ * Return true iff the specified "str" is found in the container.
+ * On success the stored id is written to "*v"; otherwise "*v" is untouched.
+ */
+ bool Lookup(const std::string&str , int* v) const {
+ const_iterator it = m_vocab.find(str);
+ if (it == m_vocab.end()) return false;
+ *v = it->second;
+ return true;
+ }
+
+ // The members below forward directly to the underlying std::map.
+ void clear() { m_vocab.clear(); }
+
+ bool empty() const { return m_vocab.empty(); }
+
+ size_t size() const { return m_vocab.size(); }
+
+ iterator find(const std::string& str) { return m_vocab.find(str); }
+ const_iterator find(const std::string& str) const { return m_vocab.find(str); }
+
+ int& operator[](const std::string& str) { return m_vocab[str]; }
+
+ iterator begin() { return m_vocab.begin(); }
+ const_iterator begin() const { return m_vocab.begin(); }
+ iterator end() { return m_vocab.end(); }
+ const_iterator end() const { return m_vocab.end(); }
+
+ private:
+ // token -> id
+ std::map<std::string, int> m_vocab;
+};
+
+// Static-only access point for the Vocabulary in use; the constructor and
+// destructor are private.
+class VocabularyFactory {
+ public:
+ static Vocabulary* GetVocabulary();
+ static void SetVocabulary(Vocabulary* vocab);
+
+ private:
+ VocabularyFactory() {}
+ virtual ~VocabularyFactory() {}
+};
+
+} // namespace mert
+
+#endif // MERT_VOCABULARY_H_
diff --git a/mert/VocabularyTest.cpp b/mert/VocabularyTest.cpp
new file mode 100644
index 000000000..0e67ba62a
--- /dev/null
+++ b/mert/VocabularyTest.cpp
@@ -0,0 +1,52 @@
+#include "Vocabulary.h"
+
+#define BOOST_TEST_MODULE MertVocabulary
+#include <boost/test/unit_test.hpp>
+
+#include "Singleton.h"
+
+namespace mert {
+namespace {
+
+void TearDown() {  // Test cleanup helper; invoked at the end of vocab_factory_test.
+ Singleton<Vocabulary>::Delete();  // NOTE(review): presumably disposes of the instance handed out by VocabularyFactory - confirm in Singleton.h.
+}
+
+} // namespace
+
+BOOST_AUTO_TEST_CASE(vocab_basic) {  // Exercises Encode/Lookup/size/clear on a local Vocabulary.
+ Vocabulary vocab;
+ BOOST_REQUIRE(vocab.empty());
+ vocab.clear();
+
+ BOOST_CHECK_EQUAL(0, vocab.Encode("hello"));  // First unseen token gets id 0.
+ BOOST_CHECK_EQUAL(0, vocab.Encode("hello"));  // Encoding the same token again returns the same id.
+ BOOST_CHECK_EQUAL(1, vocab.Encode("world"));  // The next unseen token gets the next id.
+
+ BOOST_CHECK_EQUAL(2, vocab.size());
+
+ int v;
+ BOOST_CHECK(vocab.Lookup("hello", &v));
+ BOOST_CHECK_EQUAL(0, v);
+ BOOST_CHECK(vocab.Lookup("world", &v));
+ BOOST_CHECK_EQUAL(1, v);
+
+ BOOST_CHECK(!vocab.Lookup("java", &v));  // A token that was never encoded is not found.
+
+ vocab.clear();  // After clearing, previously encoded tokens must no longer be found.
+ BOOST_CHECK(!vocab.Lookup("hello", &v));
+ BOOST_CHECK(!vocab.Lookup("world", &v));
+}
+
+BOOST_AUTO_TEST_CASE(vocab_factory_test) {  // Repeated GetVocabulary() calls must yield one and the same non-null object.
+ Vocabulary* vocab1 = VocabularyFactory::GetVocabulary();
+ Vocabulary* vocab2 = VocabularyFactory::GetVocabulary();
+ Vocabulary* vocab3 = VocabularyFactory::GetVocabulary();
+
+ BOOST_REQUIRE(vocab1 != NULL);
+ BOOST_CHECK(vocab1 == vocab2);
+ BOOST_CHECK(vocab2 == vocab3);
+
+ TearDown();  // Clean up via Singleton<Vocabulary>::Delete().
+}
+} // namespace mert
diff --git a/mert/evaluator.cpp b/mert/evaluator.cpp
index 2fcda0140..a95cdfa1b 100644
--- a/mert/evaluator.cpp
+++ b/mert/evaluator.cpp
@@ -55,7 +55,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
for (int i = 0; i < bootstrap; ++i)
{
// TODO: Use smart pointer for exceptional-safety.
- ScoreData* scoredata = new ScoreData(*g_scorer);
+ ScoreData* scoredata = new ScoreData(g_scorer);
for (int j = 0; j < n; ++j)
{
int randomIndex = random() % n;
@@ -89,7 +89,7 @@ void EvaluatorUtil::evaluate(const string& candFile, int bootstrap)
else
{
// TODO: Use smart pointer for exceptional-safety.
- ScoreData* scoredata = new ScoreData(*g_scorer);
+ ScoreData* scoredata = new ScoreData(g_scorer);
for (int sid = 0; sid < n; ++sid)
{
string str_sid = int2string(sid);
@@ -133,15 +133,26 @@ void usage()
cerr << "\tThis is of the form NAME1:VAL1,NAME2:VAL2 etc " << endl;
cerr << "[--reference|-R] comma separated list of reference files" << endl;
cerr << "[--candidate|-C] comma separated list of candidate files" << endl;
+ cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
+ cerr << "[--filter|-l] filter command which will be used to preprocess the sentences" << endl;
cerr << "[--bootstrap|-b] number of booststraped samples (default 0 - no bootstraping)" << endl;
cerr << "[--rseed|-r] the random seed for bootstraping (defaults to system clock)" << endl;
cerr << "[--help|-h] print this message and exit" << endl;
cerr << endl;
cerr << "Evaluator is able to compute more metrics at once. To do this," << endl;
- cerr << "separate scorers with semicolon (note that comma is used to separate" << endl;
- cerr << "scorers in the interpolated scorer)." << endl;
+ cerr << "specify more --sctype arguments. You can also specify more --scconfig strings." << endl;
cerr << endl;
- cerr << "If you specify only one metric and one candidate file, only the final score" << endl;
+ cerr << "The example below prints BLEU score, PER score and interpolated" << endl;
+ cerr << "score of CDER and PER with the given weights." << endl;
+ cerr << endl;
+ cerr << "./evaluator \\" << endl;
+ cerr << "\t--sctype BLEU --scconfig reflen:closest \\" << endl;
+ cerr << "\t--sctype PER \\" << endl;
+ cerr << "\t--sctype CDER,PER --scconfig weights:0.25+0.75 \\" << endl;
+ cerr << "\t--candidate CANDIDATE \\" << endl;
+ cerr << "\t--reference REFERENCE" << endl;
+ cerr << endl;
+ cerr << "If you specify only one scorer and one candidate file, only the final score" << endl;
cerr << "will be printed to stdout. Otherwise each line will contain metric name" << endl;
cerr << "and/or filename and the final score. Since most of the metrics prints some" << endl;
cerr << "debuging info, consider redirecting stderr to /dev/null." << endl;
@@ -155,24 +166,26 @@ static struct option long_options[] = {
{"candidate", required_argument, 0, 'C'},
{"bootstrap", required_argument, 0, 'b'},
{"rseed", required_argument, 0, 'r'},
+ {"factors", required_argument, 0, 'f'},
+ {"filter", required_argument, 0, 'l'},
{"help", no_argument, 0, 'h'},
{0, 0, 0, 0}
};
// Options used in evaluator.
struct ProgramOption {
- string scorer_type;
- string scorer_config;
+ vector<string> scorer_types;
+ vector<string> scorer_configs;
string reference;
string candidate;
+ vector<string> scorer_factors;
+ vector<string> scorer_filter;
int bootstrap;
int seed;
bool has_seed;
ProgramOption()
- : scorer_type("BLEU"),
- scorer_config(""),
- reference(""),
+ : reference(""),
candidate(""),
bootstrap(0),
seed(0),
@@ -182,13 +195,18 @@ struct ProgramOption {
void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
- while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:h", long_options, &option_index)) != -1) {
+ int last_scorer_index = -1;
+ while ((c = getopt_long(argc, argv, "s:c:R:C:b:r:f:l:h", long_options, &option_index)) != -1) {
switch(c) {
case 's':
- opt->scorer_type = string(optarg);
+ opt->scorer_types.push_back(string(optarg));
+ opt->scorer_configs.push_back(string(""));
+ opt->scorer_factors.push_back(string(""));
+ opt->scorer_filter.push_back(string(""));
+ last_scorer_index++;
break;
case 'c':
- opt->scorer_config = string(optarg);
+ opt->scorer_configs[last_scorer_index] = string(optarg);
break;
case 'R':
opt->reference = string(optarg);
@@ -203,10 +221,25 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
opt->seed = strtol(optarg, NULL, 10);
opt->has_seed = true;
break;
+ case 'f':
+ opt->scorer_factors[last_scorer_index] = string(optarg);
+ break;
+ case 'l':
+ opt->scorer_filter[last_scorer_index] = string(optarg);
+ break;
default:
usage();
}
}
+
+ // Add default scorer if no scorer provided
+ if (opt->scorer_types.size() == 0)
+ {
+ opt->scorer_types.push_back(string("BLEU"));
+ opt->scorer_configs.push_back(string(""));
+ opt->scorer_factors.push_back(string(""));
+ opt->scorer_filter.push_back(string(""));
+ }
}
void InitSeed(const ProgramOption *opt) {
@@ -236,7 +269,6 @@ int main(int argc, char** argv)
try {
vector<string> refFiles;
vector<string> candFiles;
- vector<string> scorerTypes;
if (option.reference.length() == 0) throw runtime_error("You have to specify at least one reference file.");
split(option.reference, ',', refFiles);
@@ -244,17 +276,16 @@ int main(int argc, char** argv)
if (option.candidate.length() == 0) throw runtime_error("You have to specify at least one candidate file.");
split(option.candidate, ',', candFiles);
- if (option.scorer_type.length() == 0) throw runtime_error("You have to specify at least one scorer.");
- split(option.scorer_type, ';', scorerTypes);
-
if (candFiles.size() > 1) g_has_more_files = true;
- if (scorerTypes.size() > 1) g_has_more_scorers = true;
+ if (option.scorer_types.size() > 1) g_has_more_scorers = true;
for (vector<string>::const_iterator fileIt = candFiles.begin(); fileIt != candFiles.end(); ++fileIt)
{
- for (vector<string>::const_iterator scorerIt = scorerTypes.begin(); scorerIt != scorerTypes.end(); ++scorerIt)
+ for (size_t i = 0; i < option.scorer_types.size(); i++)
{
- g_scorer = ScorerFactory::getScorer(*scorerIt, option.scorer_config);
+ g_scorer = ScorerFactory::getScorer(option.scorer_types[i], option.scorer_configs[i]);
+ g_scorer->setFactors(option.scorer_factors[i]);
+ g_scorer->setFilter(option.scorer_filter[i]);
g_scorer->setReferenceFiles(refFiles);
EvaluatorUtil::evaluate(*fileIt, option.bootstrap);
delete g_scorer;
diff --git a/mert/example/README b/mert/example/README
deleted file mode 100644
index 7ece55a53..000000000
--- a/mert/example/README
+++ /dev/null
@@ -1,26 +0,0 @@
-extractor=../extractor
-#extractor="../extractor --binary"
-mert=../mert
-size=15
-
-#to read an nbest file; output is in text format
-$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT --scfile SCORESTAT --sctype BLEU
-$extractor --ffile FEATSTAT.2 --scfile SCORESTAT.2 --sctype BLEU --prev-ffile FEATSTAT --prev-scfile SCORESTAT
-$extractor --binary --ffile FEATSTAT.3 --scfile SCORESTAT.3 --sctype BLEU --prev-ffile FEATSTAT,FEATSTAT.2 --prev-scfile SCORESTAT,SCORESTAT.2
-$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.4 --scfile SCORESTAT.4 --sctype BLEU --prev-ffile FEATSTAT,FEATSTAT.3 --prev-scfile SCORESTAT,SCORESTAT.3
-
-
-$mert -r 1234 --ifile init.opt --scfile SCORESTAT --ffile FEATSTAT -d $size --verbose 4 -n 5
-
-exit
-
-
-#to read a gzipped nbest file; output is in text format
-$extractor --nbest NBEST.gz --reference REF.0,REF.1,REF.2 --ffile FEATSTATgz --scfile SCORESTATgz --sctype BLEU
-gzip FEATSTATgz SCORESTATgz
-
-$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --prev-ffile FEATSTAT --prev-scfile SCORESTAT --ffile FEATSTAT2 --scfile SCORESTAT2 --sctype BLEU
-
-$extractor --nbest NBEST.gz --reference REF.0,REF.1,REF.2 --prev-ffile FEATSTATgz.gz --prev-scfile SCORESTATgz.gz --ffile FEATSTAT2gz --scfile SCORESTAT2gz --sctype BLEU
-
-exit
diff --git a/mert/example/gzipped_test.sh b/mert/example/gzipped_test.sh
new file mode 100755
index 000000000..f52613da1
--- /dev/null
+++ b/mert/example/gzipped_test.sh
@@ -0,0 +1,41 @@
+#!/bin/sh
+extractor=$1   # path to the extractor binary
+mert=$2        # accepted but not used by this script
+size=$3        # accepted but not used by this script
+
+if [ $# -ne 3 ]; then
+ echo "Usage: ./gzipped_test.sh extractor mert size"
+ exit 1
+fi
+
+if ! [ -f NBEST.gz ]; then
+ gzip NBEST
+fi
+# Run 1: read a gzipped n-best list.
+$extractor --nbest NBEST.gz --reference REF.0,REF.1,REF.2 \
+ --ffile FEATSTAT_gz --scfile SCORESTAT_gz \
+ --sctype BLEU 2> extractor_gz1.log
+
+gzip -d NBEST.gz   # restore the plain-text NBEST for the remaining runs
+# Run 2: plain-text n-best list plus existing FEATSTAT/SCORESTAT files (written by normal_test.sh).
+$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 \
+ --prev-ffile FEATSTAT --prev-scfile SCORESTAT \
+ --ffile FEATSTAT2 --scfile SCORESTAT2 \
+ --sctype BLEU 2> extractor_gz2.log
+
+# Now we want to test reading gzipped files.
+# First compress the output written by the first extractor run above.
+
+for f in FEATSTAT_gz SCORESTAT_gz; do
+ printf "Compressing %s " $f
+ gzip $f
+ echo "done."
+done
+# Run 3: feed the gzipped feature/score files back in as previous data.
+$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 \
+ --prev-ffile FEATSTAT_gz.gz --prev-scfile SCORESTAT_gz.gz \
+ --ffile FEATSTAT2_gz --scfile SCORESTAT2_gz \
+ --sctype BLEU 2> extractor_gz3.log
+
+gzip -d FEATSTAT_gz.gz SCORESTAT_gz.gz
+echo "Done."
diff --git a/mert/example/normal_test.sh b/mert/example/normal_test.sh
new file mode 100755
index 000000000..8b5bf3eb9
--- /dev/null
+++ b/mert/example/normal_test.sh
@@ -0,0 +1,30 @@
+#!/bin/sh
+extractor=$1   # path to the extractor binary
+mert=$2        # path to the mert binary
+size=$3        # value passed to mert's -d option
+
+if [ $# -ne 3 ]; then
+ echo "Usage: ./normal_test.sh extractor mert size"
+ exit 1
+fi
+
+echo "Running extractor ..."
+$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT \
+ --scfile SCORESTAT --sctype BLEU 2> extractor1.log
+# Re-read the files just written as previous data (no n-best input).
+$extractor --ffile FEATSTAT.2 --scfile SCORESTAT.2 --sctype BLEU \
+ --prev-ffile FEATSTAT --prev-scfile SCORESTAT 2> extractor2.log
+# Same idea with two previous files each, written with --binary.
+$extractor --binary --ffile FEATSTAT.3 --scfile SCORESTAT.3 --sctype BLEU \
+ --prev-ffile FEATSTAT,FEATSTAT.2 \
+ --prev-scfile SCORESTAT,SCORESTAT.2 2> extractor3.log
+# Combine a fresh n-best list with previous text and binary files.
+$extractor --nbest NBEST --reference REF.0,REF.1,REF.2 --ffile FEATSTAT.4 \
+ --scfile SCORESTAT.4 --sctype BLEU --prev-ffile FEATSTAT,FEATSTAT.3 \
+ --prev-scfile SCORESTAT,SCORESTAT.3 2> extractor4.log
+
+echo "Running mert ..."
+$mert -r 1234 --ifile init.opt --scfile SCORESTAT --ffile FEATSTAT \
+ -d $size --verbose 4 -n 5 2>mert.log
+
+echo "Done."
diff --git a/mert/example/smoke_test.sh b/mert/example/smoke_test.sh
new file mode 100755
index 000000000..193d481ae
--- /dev/null
+++ b/mert/example/smoke_test.sh
@@ -0,0 +1,39 @@
+#!/bin/sh
+# A sample script for smoke testing.
+# This is not a tuning script.
+# Please see: mosesdecoder/scripts/training/mert-moses.pl
+
+extractor=../extractor
+mert=../mert
+
+# Default dimension handed to the test scripts as their third argument.
+size=15
+
+# Make sure you have already compiled the mert-related binaries.
+for f in $extractor $mert; do
+ if ! [ -f $f ]; then
+ echo "Error: no such file or directory: $f"
+ echo 'You should run `bjam` first!'   # single-quoted so the backticks are printed, not executed
+ exit 1
+ fi
+done
+
+# Make sure you have the sample data and the init file used in these tests.
+for f in NBEST REF.0 REF.1 REF.2 init.opt; do
+ if ! [ -f $f ]; then
+ echo "Error: no such file or directory: $f"
+ exit 1
+ fi
+done
+
+# Read an n-best file and print the output in text format.
+# We will save stderr to disk. Please see each log file.
+echo "Running tests for reading text files ..."
+./normal_test.sh $extractor $mert $size
+
+# Run the tests that read gzipped files.
+# We will save stderr to disk. Please see each log file.
+echo "Running tests for reading gzipped files ..."
+./gzipped_test.sh $extractor $mert $size
+
+echo "Smoke tests done."
diff --git a/mert/extractor.cpp b/mert/extractor.cpp
index cb3e4c8ef..1119dfa57 100644
--- a/mert/extractor.cpp
+++ b/mert/extractor.cpp
@@ -9,6 +9,7 @@
#include <vector>
#include <getopt.h>
+#include <boost/scoped_ptr.hpp>
#include "Data.h"
#include "Scorer.h"
@@ -33,6 +34,8 @@ void usage()
cerr << "[--ffile|-F] the feature data output file" << endl;
cerr << "[--prev-ffile|-E] comma separated list of previous feature data" << endl;
cerr << "[--prev-scfile|-R] comma separated list of previous scorer data" << endl;
+ cerr << "[--factors|-f] list of factors passed to the scorer (e.g. 0|2)" << endl;
+ cerr << "[--filter|-l] filter command used to preprocess the sentences" << endl;
cerr << "[-v] verbose level" << endl;
cerr << "[--help|-h] print this message and exit" << endl;
exit(1);
@@ -41,6 +44,8 @@ void usage()
static struct option long_options[] = {
{"sctype", required_argument, 0, 's'},
{"scconfig", required_argument,0, 'c'},
+ {"factors", required_argument,0, 'f'},
+ {"filter", required_argument,0, 'l'},
{"reference", required_argument, 0, 'r'},
{"binary", no_argument, 0, 'b'},
{"nbest", required_argument, 0, 'n'},
@@ -57,6 +62,8 @@ static struct option long_options[] = {
struct ProgramOption {
string scorerType;
string scorerConfig;
+ string scorerFactors;
+ string scorerFilter;
string referenceFile;
string nbestFile;
string scoreDataFile;
@@ -69,6 +76,8 @@ struct ProgramOption {
ProgramOption()
: scorerType("BLEU"),
scorerConfig(""),
+ scorerFactors(""),
+ scorerFilter(""),
referenceFile(""),
nbestFile(""),
scoreDataFile("statscore.data"),
@@ -83,7 +92,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
- while ((c = getopt_long(argc, argv, "s:r:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
+ while ((c = getopt_long(argc, argv, "s:r:f:l:n:S:F:R:E:v:hb", long_options, &option_index)) != -1) {
switch (c) {
case 's':
opt->scorerType = string(optarg);
@@ -91,6 +100,12 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
case 'c':
opt->scorerConfig = string(optarg);
break;
+ case 'f':
+ opt->scorerFactors = string(optarg);
+ break;
+ case 'l':
+ opt->scorerFilter = string(optarg);
+ break;
case 'r':
opt->referenceFile = string(optarg);
break;
@@ -178,7 +193,12 @@ int main(int argc, char** argv)
TRACE_ERR("Scorer type: " << option.scorerType << endl);
- Scorer* scorer = ScorerFactory::getScorer(option.scorerType, option.scorerConfig);
+ boost::scoped_ptr<Scorer> scorer(
+ ScorerFactory::getScorer(option.scorerType, option.scorerConfig));
+
+ // set Factors and Filter used to preprocess the sentences
+ scorer->setFactors(option.scorerFactors);
+ scorer->setFilter(option.scorerFilter);
// load references
if (referenceFiles.size() > 0)
@@ -186,7 +206,7 @@ int main(int argc, char** argv)
PrintUserTime("References loaded");
- Data data(*scorer);
+ Data data(scorer.get());
// load old data
for (size_t i = 0; i < prevScoreDataFiles.size(); i++) {
@@ -197,27 +217,18 @@ int main(int argc, char** argv)
// computing score statistics of each nbest file
for (size_t i = 0; i < nbestFiles.size(); i++) {
- data.loadnbest(nbestFiles.at(i));
+ data.loadNBest(nbestFiles.at(i));
}
PrintUserTime("Nbest entries loaded and scored");
//ADDED_BY_TS
- data.remove_duplicates();
+ data.removeDuplicates();
//END_ADDED
- if (option.binmode)
- cerr << "Binary write mode is selected" << endl;
- else
- cerr << "Binary write mode is NOT selected" << endl;
-
data.save(option.featureDataFile, option.scoreDataFile, option.binmode);
PrintUserTime("Stopping...");
- // timer.stop("Stopping...");
-
- delete scorer;
-
return EXIT_SUCCESS;
} catch (const exception& e) {
cerr << "Exception: " << e.what() << endl;
diff --git a/mert/gzfilebuf.h b/mert/gzfilebuf.h
deleted file mode 100644
index f9cd8a446..000000000
--- a/mert/gzfilebuf.h
+++ /dev/null
@@ -1,85 +0,0 @@
-#ifndef _GZFILEBUF_H_
-#define _GZFILEBUF_H_
-
-#include <streambuf>
-#include <zlib.h>
-#include <cstring>
-
-class gzfilebuf : public std::streambuf
-{
-public:
- explicit gzfilebuf(const char *filename) {
- _gzf = gzopen(filename, "rb");
- setg (_buff+sizeof(int), // beginning of putback area
- _buff+sizeof(int), // read position
- _buff+sizeof(int)); // end position
- }
- ~gzfilebuf() {
- gzclose(_gzf);
- }
-protected:
- virtual int_type overflow (int_type c) {
- throw;
- }
-
- // write multiple characters
- virtual
- std::streamsize xsputn (const char* s,
- std::streamsize num) {
- throw;
- }
-
- virtual std::streampos seekpos ( std::streampos sp, std::ios_base::openmode which = std::ios_base::in | std::ios_base::out ) {
- throw;
- }
-
- // read one character
- virtual int_type underflow () {
- // is read position before end of _buff?
- if (gptr() < egptr()) {
- return traits_type::to_int_type(*gptr());
- }
-
- /* process size of putback area
- * - use number of characters read
- * - but at most four
- */
- unsigned int numPutback = gptr() - eback();
- if (numPutback > sizeof(int)) {
- numPutback = sizeof(int);
- }
-
- /* copy up to four characters previously read into
- * the putback _buff (area of first four characters)
- */
- std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
- numPutback);
-
- // read new characters
- int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
- if (num <= 0) {
- // ERROR or EOF
- return EOF;
- }
-
- // reset _buff pointers
- setg (_buff+(sizeof(int)-numPutback), // beginning of putback area
- _buff+sizeof(int), // read position
- _buff+sizeof(int)+num); // end of buffer
-
- // return next character
- return traits_type::to_int_type(*gptr());
- }
-
- std::streamsize xsgetn (char* s,
- std::streamsize num) {
- return gzread(_gzf,s,num);
- }
-
-private:
- gzFile _gzf;
- static const unsigned int _buffsize = 1024;
- char _buff[_buffsize];
-};
-
-#endif // _GZFILEBUF_H_
diff --git a/mert/mert.cpp b/mert/mert.cpp
index 58214f30b..bbad8fe38 100755..100644
--- a/mert/mert.cpp
+++ b/mert/mert.cpp
@@ -11,6 +11,7 @@
#include <ctime>
#include <getopt.h>
+#include <boost/scoped_ptr.hpp>
#include "Data.h"
#include "Point.h"
@@ -19,6 +20,7 @@
#include "ScoreData.h"
#include "FeatureData.h"
#include "Optimizer.h"
+#include "OptimizerFactory.h"
#include "Types.h"
#include "Timer.h"
#include "Util.h"
@@ -34,6 +36,7 @@ const char kDefaultScorer[] = "BLEU";
const char kDefaultScorerFile[] = "statscore.data";
const char kDefaultFeatureFile[] = "features.data";
const char kDefaultInitFile[] = "init.opt";
+const char kDefaultPositiveString[] = "";
// Used when saving optimized weights.
const char kOutputFile[] = "weights.txt";
@@ -106,6 +109,7 @@ void usage(int ret)
cerr << "[--scfile|-S] comma separated list of scorer data files (default " << kDefaultScorerFile << ")" << endl;
cerr << "[--ffile|-F] comma separated list of feature data files (default " << kDefaultFeatureFile << ")" << endl;
cerr << "[--ifile|-i] the starting point data file (default " << kDefaultInitFile << ")" << endl;
+ cerr << "[--positive|-P] indexes with positive weights (default none)"<<endl;
#ifdef WITH_THREADS
cerr << "[--threads|-T] use multiple threads (default 1)" << endl;
#endif
@@ -123,6 +127,7 @@ static struct option long_options[] = {
{"rseed", required_argument, 0, 'r'},
{"optimize", 1, 0, 'o'},
{"pro", required_argument, 0, 'p'},
+ {"positive",1,0,'P'},
{"type", 1, 0, 't'},
{"sctype", 1, 0, 's'},
{"scconfig", required_argument, 0, 'c'},
@@ -152,6 +157,7 @@ struct ProgramOption {
string scorer_file;
string feature_file;
string init_file;
+ string positive_string;
size_t num_threads;
float shard_size;
size_t shard_count;
@@ -169,6 +175,7 @@ struct ProgramOption {
scorer_file(kDefaultScorerFile),
feature_file(kDefaultFeatureFile),
init_file(kDefaultInitFile),
+ positive_string(kDefaultPositiveString),
num_threads(1),
shard_size(0),
shard_count(0) { }
@@ -178,7 +185,7 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
int c;
int option_index;
- while ((c = getopt_long(argc, argv, "o:r:d:n:m:t:s:S:F:v:p:", long_options, &option_index)) != -1) {
+ while ((c = getopt_long(argc, argv, "o:r:d:n:m:t:s:S:F:v:p:P:", long_options, &option_index)) != -1) {
switch (c) {
case 'o':
opt->to_optimize_str = string(optarg);
@@ -232,6 +239,9 @@ void ParseCommandOptions(int argc, char** argv, ProgramOption* opt) {
case 'h':
usage(0);
break;
+ case 'P':
+ opt->positive_string = string(optarg);
+ break;
default:
usage(1);
}
@@ -251,6 +261,7 @@ int main(int argc, char **argv)
vector<vector<parameter_t> > start_list;
vector<parameter_t> min;
vector<parameter_t> max;
+ vector<bool> positive;
// NOTE: those mins and max are the bound for the starting points of the algorithm, not strict bound on the result!
if (option.pdim < 0)
@@ -333,19 +344,20 @@ int main(int argc, char **argv)
}
// it make sense to know what parameter set were used to generate the nbest
- Scorer *TheScorer = ScorerFactory::getScorer(option.scorer_type, option.scorer_config);
+ boost::scoped_ptr<Scorer> scorer(
+ ScorerFactory::getScorer(option.scorer_type, option.scorer_config));
//load data
- Data data(*TheScorer);
+ Data data(scorer.get());
for (size_t i = 0; i < ScoreDataFiles.size(); i++) {
cerr<<"Loading Data from: "<< ScoreDataFiles.at(i) << " and " << FeatureDataFiles.at(i) << endl;
data.load(FeatureDataFiles.at(i), ScoreDataFiles.at(i));
}
- //ADDED_BY_TS
- data.remove_duplicates();
- //END_ADDED
+ scorer->setScoreData(data.getScoreData().get());
+
+ data.removeDuplicates();
PrintUserTime("Data loaded");
@@ -358,19 +370,24 @@ int main(int argc, char **argv)
if (option.to_optimize_str.length() > 0) {
cerr << "Weights to optimize: " << option.to_optimize_str << endl;
- // Parse string to get weights to optimize, and set them as active
- string substring;
- int index;
- while (!option.to_optimize_str.empty()) {
- getNextPound(option.to_optimize_str, substring, ",");
- index = data.getFeatureIndex(substring);
- cerr << "FeatNameIndex:" << index << " to insert" << endl;
- //index = strtol(substring.c_str(), NULL, 10);
- if (index >= 0 && index < option.pdim) {
- to_optimize.push_back(index);
- } else {
- cerr << "Index " << index << " is out of bounds. Allowed indexes are [0," << option.pdim - 1 << "]." << endl;
+ // Parse the string to get weights to optimize, and set them as active.
+ vector<string> features;
+ Tokenize(option.to_optimize_str.c_str(), ',', &features);
+
+ for (vector<string>::const_iterator it = features.begin();
+ it != features.end(); ++it) {
+ const int feature_index = data.getFeatureIndex(*it);
+
+ // Note: the previous implementation checked whether
+ // feature_index is less than option.pdim.
+ // However, it does not make sense when we optimize 'discrete' features,
+ // given by '-o' option like -o "d_0,lm_0,tm_2,tm_3,tm_4,w_0".
+ if (feature_index < 0) {
+ cerr << "Error: invalid feature index = " << feature_index << endl;
+ exit(1);
}
+ cerr << "FeatNameIndex: " << feature_index << " to insert" << endl;
+ to_optimize.push_back(feature_index);
}
} else {
//set all weights as active
@@ -380,6 +397,27 @@ int main(int argc, char **argv)
}
}
+ positive.resize(option.pdim);
+ for (int i = 0; i < option.pdim; i++)
+ positive[i] = false;
+ if (option.positive_string.length() > 0) {
+ // Parse string to get weights that need to be positive
+ std::string substring;
+ int index;
+ while (!option.positive_string.empty()) {
+ getNextPound(option.positive_string, substring, ",");
+ index = data.getFeatureIndex(substring);
+ //index = strtol(substring.c_str(), NULL, 10);
+ if (index >= 0 && index < option.pdim) {
+ positive[index] = true;
+ } else {
+ cerr << "Index " << index
+ << " is out of bounds in positivity list. Allowed indexes are [0,"
+ << (option.pdim-1) << "]." << endl;
+ }
+ }
+ }
+
// treat sparse features just like regular features
if (data.hasSparseFeatures()) {
data.mergeSparseFeatures();
@@ -393,6 +431,7 @@ int main(int argc, char **argv)
Point::setpdim(option.pdim);
Point::setdim(to_optimize.size());
+ Point::set_optindices(to_optimize);
//starting points consist of specified points and random restarts
vector<Point> startingPoints;
@@ -422,9 +461,9 @@ int main(int argc, char **argv)
data_ref = shards[i]; //use the sharded data if it exists
vector<OptimizationTask*>& tasks = allTasks[i];
- Optimizer *optimizer = OptimizerFactory::BuildOptimizer(option.pdim, to_optimize, start_list[0], option.optimize_type, option.nrandom);
+ Optimizer *optimizer = OptimizerFactory::BuildOptimizer(option.pdim, to_optimize, positive, start_list[0], option.optimize_type, option.nrandom);
optimizer->SetScorer(data_ref.getScorer());
- optimizer->SetFData(data_ref.getFeatureData());
+ optimizer->SetFeatureData(data_ref.getFeatureData());
// A task for each start point
for (size_t j = 0; j < startingPoints.size(); ++j) {
OptimizationTask* task = new OptimizationTask(optimizer, startingPoints[j]);
@@ -498,7 +537,6 @@ int main(int argc, char **argv)
}
}
- delete TheScorer;
PrintUserTime("Stopping...");
return 0;
diff --git a/mert/pro.cpp b/mert/pro.cpp
index a18e7a117..e1d2ebcfd 100644
--- a/mert/pro.cpp
+++ b/mert/pro.cpp
@@ -21,8 +21,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-/**
- * This is part of the PRO implementation. It converts the features and scores
+/**
+ * This is part of the PRO implementation. It converts the features and scores
* files into a form suitable for input into the megam maxent trainer.
*
* For details of PRO, refer to Hopkins & May (EMNLP 2011)
@@ -34,9 +34,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <iostream>
#include <string>
#include <vector>
+#include <utility>
#include <boost/program_options.hpp>
+#include "BleuScorer.h"
#include "FeatureDataIterator.h"
#include "ScoreDataIterator.h"
@@ -46,49 +48,33 @@ namespace po = boost::program_options;
class SampledPair {
private:
- pair<size_t,size_t> translation1;
- pair<size_t,size_t> translation2;
- float scoreDiff;
-public:
- SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff ) {
- if (diff > 0) {
- translation1 = t1;
- translation2 = t2;
- scoreDiff = diff;
- }
- else {
- translation1 = t2;
- translation2 = t1;
- scoreDiff = -diff;
- }
- }
- float getDiff() const { return scoreDiff; }
- const pair<size_t,size_t>& getTranslation1() const { return translation1; }
- const pair<size_t,size_t>& getTranslation2() const { return translation2; }
-};
+ pair<size_t,size_t> m_translation1;
+ pair<size_t,size_t> m_translation2;
+ float m_score_diff;
+public:
+ SampledPair(const pair<size_t,size_t>& t1, const pair<size_t,size_t>& t2, float diff ) {
+ if (diff > 0) {
+ m_translation1 = t1;
+ m_translation2 = t2;
+ m_score_diff = diff;
+ } else {
+ m_translation1 = t2;
+ m_translation2 = t1;
+ m_score_diff = -diff;
+ }
+ }
-static float sentenceLevelBleuPlusOne(const vector<float>& stats) {
- float logbleu = 0.0;
- const unsigned int bleu_order = 4;
- for (unsigned int j=0; j<bleu_order; j++) {
- //cerr << (stats.get(2*j)+1) << "/" << (stats.get(2*j+1)+1) << " ";
- logbleu += log(stats[2*j]+1) - log(stats[2*j+1]+1);
- }
- logbleu /= bleu_order;
- const float brevity = 1.0 - static_cast<float>(stats[(bleu_order*2)]) / stats[1];
- if (brevity < 0.0) {
- logbleu += brevity;
- }
- //cerr << brevity << " -> " << exp(logbleu) << endl;
- return exp(logbleu);
-}
+ float getDiff() const { return m_score_diff; }
+ const pair<size_t,size_t>& getTranslation1() const { return m_translation1; }
+ const pair<size_t,size_t>& getTranslation2() const { return m_translation2; }
+};
static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureDataItem& f2) {
// difference in score in regular features
- for(unsigned int j=0; j<f1.dense.size(); j++)
- if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
- out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
+ for(unsigned int j=0; j<f1.dense.size(); j++)
+ if (abs(f1.dense[j]-f2.dense[j]) > 0.00001)
+ out << " F" << j << " " << (f1.dense[j]-f2.dense[j]);
if (f1.sparse.size() || f2.sparse.size()) {
out << " ";
@@ -101,27 +87,27 @@ static void outputSample(ostream& out, const FeatureDataItem& f1, const FeatureD
}
}
-
-int main(int argc, char** argv)
+
+int main(int argc, char** argv)
{
bool help;
vector<string> scoreFiles;
vector<string> featureFiles;
int seed;
string outputFile;
- //TODO: options
- const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
- const unsigned int n_samples = 50; // Xi, in Hopkins & May
- const float min_diff = 0.05;
+ // TODO: Add these constants to options
+ const unsigned int n_candidates = 5000; // Gamma, in Hopkins & May
+ const unsigned int n_samples = 50; // Xi, in Hopkins & May
+ const float min_diff = 0.05;
po::options_description desc("Allowed options");
desc.add_options()
- ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
- ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
- ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
- ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
- ("output-file,o", po::value<string>(&outputFile), "Output file")
- ;
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false), "Print this help message and exit")
+ ("scfile,S", po::value<vector<string> >(&scoreFiles), "Scorer data files")
+ ("ffile,F", po::value<vector<string> > (&featureFiles), "Feature data files")
+ ("random-seed,r", po::value<int>(&seed), "Seed for random number generation")
+ ("output-file,o", po::value<string>(&outputFile), "Output file")
+ ;
po::options_description cmdline_options;
cmdline_options.add(desc);
@@ -134,7 +120,7 @@ int main(int argc, char** argv)
cout << desc << endl;
exit(0);
}
-
+
if (vm.count("random-seed")) {
cerr << "Initialising random seed to " << seed << endl;
srand(seed);
@@ -167,7 +153,7 @@ int main(int argc, char** argv)
out = &cout;
}
-
+
vector<FeatureDataIterator> featureDataIters;
vector<ScoreDataIterator> scoreDataIters;
for (size_t i = 0; i < featureFiles.size(); ++i) {
@@ -179,7 +165,7 @@ int main(int argc, char** argv)
size_t sentenceId = 0;
while(1) {
vector<pair<size_t,size_t> > hypotheses;
- //TODO: de-deuping. Collect hashes of score,feature pairs and
+ //TODO: de-deuping. Collect hashes of score,feature pairs and
//only add index if it's unique.
if (featureDataIters[0] == FeatureDataIterator::end()) {
break;
@@ -214,7 +200,7 @@ int main(int argc, char** argv)
size_t rand2 = rand() % n_translations;
pair<size_t,size_t> translation2 = hypotheses[rand2];
float bleu2 = sentenceLevelBleuPlusOne(scoreDataIters[translation2.first]->operator[](translation2.second));
-
+
/*
cerr << "t(" << translation1.first << "," << translation1.second << ") = " << bleu1 <<
" t(" << translation2.first << "," << translation2.second << ") = " <<
@@ -222,7 +208,7 @@ int main(int argc, char** argv)
*/
if (abs(bleu1-bleu2) < min_diff)
continue;
-
+
samples.push_back(SampledPair(translation1, translation2, bleu1-bleu2));
scores.push_back(1.0-abs(bleu1-bleu2));
}
@@ -261,4 +247,3 @@ int main(int argc, char** argv)
outFile.close();
}
-