Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/mert
diff options
context:
space:
mode:
author: Matthias Huck <mhuck@inf.ed.ac.uk> 2015-05-01 00:26:30 +0300
committer: Matthias Huck <mhuck@inf.ed.ac.uk> 2015-05-01 00:26:30 +0300
commit: 4ee8f2dec1c200858c4ed82becc95ed1f7c1017f (patch)
tree: 07ecd1b03f6aa7926bfa3fdeaf176833b836ed7b /mert
parent: 1d86b8fde7f62dbd0897815479d44df981de5d84 (diff)
sentence-bleu less greedy regarding memory
Don't load all references, read them line by line. Corpora with millions of sentences can now be evaluated without consuming gigabytes of RAM.
Diffstat (limited to 'mert')
-rw-r--r-- mert/BleuDocScorer.cpp 2
-rw-r--r-- mert/BleuDocScorer.h 4
-rw-r--r-- mert/BleuScorer.cpp 30
-rw-r--r-- mert/BleuScorer.h 25
-rw-r--r-- mert/HopeFearDecoder.cpp 2
-rw-r--r-- mert/HopeFearDecoder.h 5
-rw-r--r-- mert/Ngram.h 4
-rw-r--r-- mert/Reference.h 5
-rw-r--r-- mert/Scorer.h 4
-rw-r--r-- mert/sentence-bleu-nbest.cpp 32
-rw-r--r-- mert/sentence-bleu.cpp 28
11 files changed, 98 insertions, 43 deletions
diff --git a/mert/BleuDocScorer.cpp b/mert/BleuDocScorer.cpp
index 48c17ee96..d71c5171d 100644
--- a/mert/BleuDocScorer.cpp
+++ b/mert/BleuDocScorer.cpp
@@ -174,7 +174,7 @@ statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0;
- for (int i = 0; i < kBleuNgramOrder; ++i) {
+ for (size_t i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
diff --git a/mert/BleuDocScorer.h b/mert/BleuDocScorer.h
index 9677410f8..d27088254 100644
--- a/mert/BleuDocScorer.h
+++ b/mert/BleuDocScorer.h
@@ -1,5 +1,4 @@
-#ifndef MERT_BLEU_DOC_SCORER_H_
-#define MERT_BLEU_DOC_SCORER_H_
+#pragma once
#include <ostream>
#include <string>
@@ -64,4 +63,3 @@ private:
}
-#endif // MERT_BLEU_DOC_SCORER_H_
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index 13a472447..dc926054f 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -99,9 +99,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
ifstream ifs(referenceFiles[i].c_str());
- UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
if (!OpenReferenceStream(&ifs, i)) {
- UTIL_THROW2("Unable to open " + referenceFiles[i]);
+ UTIL_THROW2("Cannot open " + referenceFiles[i]);
}
}
}
@@ -152,13 +151,26 @@ void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) c
ref->push_back(length);
}
+bool BleuScorer::GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const
+{
+ for (vector<boost::shared_ptr<ifstream> >::iterator ifs=referenceStreams.begin(); ifs!=referenceStreams.end(); ++ifs)
+ {
+ if (!(*ifs)) return false;
+ string line;
+ if (!getline(**ifs, line)) return false;
+ line = preprocessSentence(line);
+ ProcessReferenceLine(line, &ref);
+ }
+ return true;
+}
+
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
- CalcBleuStats(m_references[sid], text, entry);
+ CalcBleuStats(*(m_references[sid]), text, entry);
}
-void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
+void BleuScorer::CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
{
NgramCounts testcounts;
// stats for this line
@@ -177,7 +189,7 @@ void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, Sc
NgramCounts::Value correct = 0;
NgramCounts::Value v = 0;
- if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
+ if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
correct = min(v, guess);
}
stats[len * 2 - 2] += correct;
@@ -207,17 +219,17 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
return exp(logbleu);
}
-int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const
+int BleuScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
{
switch (m_ref_length_type) {
case AVERAGE:
- return ref->CalcAverage();
+ return ref.CalcAverage();
break;
case CLOSEST:
- return ref->CalcClosest(length);
+ return ref.CalcClosest(length);
break;
case SHORTEST:
- return ref->CalcShortest();
+ return ref.CalcShortest();
break;
default:
UTIL_THROW2("Unknown reference types");
diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h
index e90915822..d7ee8e4e7 100644
--- a/mert/BleuScorer.h
+++ b/mert/BleuScorer.h
@@ -1,23 +1,23 @@
-#ifndef MERT_BLEU_SCORER_H_
-#define MERT_BLEU_SCORER_H_
+#pragma once
-#include <ostream>
+#include <fstream>
#include <string>
#include <vector>
-#include "Types.h"
+#include <boost/shared_ptr.hpp>
+
+#include "Ngram.h"
+#include "Reference.h"
+#include "ScopedVector.h"
#include "ScoreData.h"
#include "StatisticsBasedScorer.h"
-#include "ScopedVector.h"
+#include "Types.h"
namespace MosesTuning
{
const size_t kBleuNgramOrder = 4;
-class NgramCounts;
-class Reference;
-
/**
* Bleu scoring
*/
@@ -42,9 +42,9 @@ public:
return 2 * kBleuNgramOrder + 1;
}
- void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
+ void CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const;
- int CalcReferenceLength(const Reference* ref, std::size_t length) const;
+ int CalcReferenceLength(const Reference& ref, std::size_t length) const;
ReferenceLengthType GetReferenceLengthType() const {
return m_ref_length_type;
@@ -65,7 +65,7 @@ public:
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
- std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
+ size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
@@ -74,6 +74,8 @@ public:
void ProcessReferenceLine(const std::string& line, Reference* ref) const;
+ bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const;
+
//private:
protected:
ReferenceLengthType m_ref_length_type;
@@ -102,4 +104,3 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
}
-#endif // MERT_BLEU_SCORER_H_
diff --git a/mert/HopeFearDecoder.cpp b/mert/HopeFearDecoder.cpp
index 5288116d6..be9d8f2c9 100644
--- a/mert/HopeFearDecoder.cpp
+++ b/mert/HopeFearDecoder.cpp
@@ -98,7 +98,7 @@ void NbestHopeFearDecoder::HopeFear(
size_t hope_index=0, fear_index=0, model_index=0;
ValType hope_score=0, fear_score=0, model_score=0;
for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
- ValType hope_bleu, hope_model;
+ ValType hope_bleu=0, hope_model=0;
for(size_t i=0; i< train_->cur_size(); i++) {
const MiraFeatureVector& vec=train_->featuresAt(i);
ValType score = wv.score(vec);
diff --git a/mert/HopeFearDecoder.h b/mert/HopeFearDecoder.h
index 53c0e935d..73f0e97d9 100644
--- a/mert/HopeFearDecoder.h
+++ b/mert/HopeFearDecoder.h
@@ -16,8 +16,7 @@ You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#ifndef MERT_HOPEFEARDECODER_H
-#define MERT_HOPEFEARDECODER_H
+#pragma once
#include <vector>
@@ -160,5 +159,3 @@ private:
};
-#endif
-
diff --git a/mert/Ngram.h b/mert/Ngram.h
index 521dc4928..de2703605 100644
--- a/mert/Ngram.h
+++ b/mert/Ngram.h
@@ -1,5 +1,4 @@
-#ifndef MERT_NGRAM_H_
-#define MERT_NGRAM_H_
+#pragma once
#include <vector>
#include <string>
@@ -121,4 +120,3 @@ private:
}
-#endif // MERT_NGRAM_H_
diff --git a/mert/Reference.h b/mert/Reference.h
index 2c12f2ed7..a7878f3e7 100644
--- a/mert/Reference.h
+++ b/mert/Reference.h
@@ -59,6 +59,11 @@ public:
int CalcClosest(std::size_t length) const;
int CalcShortest() const;
+ void clear() {
+ m_length.clear();
+ m_counts->clear();
+ }
+
private:
NgramCounts* m_counts;
diff --git a/mert/Scorer.h b/mert/Scorer.h
index 4383c68f2..a08fc436d 100644
--- a/mert/Scorer.h
+++ b/mert/Scorer.h
@@ -1,5 +1,4 @@
-#ifndef MERT_SCORER_H_
-#define MERT_SCORER_H_
+#pragma once
#include <iostream>
#include <sstream>
@@ -236,4 +235,3 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end)
}
-#endif // MERT_SCORER_H_
diff --git a/mert/sentence-bleu-nbest.cpp b/mert/sentence-bleu-nbest.cpp
index 023e9faae..f869386e3 100644
--- a/mert/sentence-bleu-nbest.cpp
+++ b/mert/sentence-bleu-nbest.cpp
@@ -1,9 +1,14 @@
+#include <fstream>
#include <iostream>
#include <vector>
#include <string>
+#include <boost/shared_ptr.hpp>
+
#include "BleuScorer.h"
+#include "Reference.h"
#include "moses/Util.h"
+#include "util/exception.hh"
using namespace MosesTuning;
@@ -24,21 +29,40 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
- scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
- // Loading sentences and preparing statistics
+ // initialize reference streams
+ std::vector<boost::shared_ptr<std::ifstream> > refStreams;
+ for (std::vector<std::string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
+ {
+ TRACE_ERR("Loading reference from " << *refFile << std::endl);
+ boost::shared_ptr<std::ifstream> ifs(new std::ifstream(refFile->c_str()));
+ UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile);
+ refStreams.push_back(ifs);
+ }
+
+ // load sentences, preparing statistics, score
std::string nbestLine;
+ int sid = -1;
+ Reference ref;
while ( getline(std::cin, nbestLine) )
{
std::vector<std::string> items;
Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
- size_t sid = Moses::Scan<size_t>(items[0]);
+ int sidCurrent = Moses::Scan<int>(items[0]);
+ if (sidCurrent != sid) {
+ ref.clear();
+ if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
+ UTIL_THROW2("Missing references");
+ }
+ sid = sidCurrent;
+ }
ScoreStats scoreStats;
- scorer.prepareStats(sid, items[1], scoreStats);
+ scorer.CalcBleuStats(ref, items[1], scoreStats);
std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
}
return 0;
}
+
diff --git a/mert/sentence-bleu.cpp b/mert/sentence-bleu.cpp
index 425122c05..9bdab30d2 100644
--- a/mert/sentence-bleu.cpp
+++ b/mert/sentence-bleu.cpp
@@ -1,18 +1,26 @@
+#include <fstream>
#include <iostream>
#include <vector>
#include <string>
+#include <boost/shared_ptr.hpp>
+
#include "BleuScorer.h"
+#include "Reference.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
using namespace std;
using namespace MosesTuning;
+
int main(int argc, char **argv)
{
if (argc == 1) {
cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl;
return 1;
}
+
vector<string> refFiles(argv + 1, argv + argc);
// TODO all of these are empty for now
@@ -23,15 +31,28 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
- scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
- // Loading sentences and preparing statistics
+ // initialize reference streams
+ vector<boost::shared_ptr<ifstream> > refStreams;
+ for (vector<string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
+ {
+ TRACE_ERR("Loading reference from " << *refFile << endl);
+ boost::shared_ptr<ifstream> ifs(new ifstream(refFile->c_str()));
+ UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile);
+ refStreams.push_back(ifs);
+ }
+
+ // load sentences, preparing statistics, score
string hypothesisLine;
size_t sid = 0;
while (getline(std::cin, hypothesisLine))
{
+ Reference ref;
+ if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
+ UTIL_THROW2("Missing references");
+ }
ScoreStats scoreStats;
- scorer.prepareStats(sid, hypothesisLine, scoreStats);
+ scorer.CalcBleuStats(ref, hypothesisLine, scoreStats);
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
++sid;
@@ -39,3 +60,4 @@ int main(int argc, char **argv)
return 0;
}
+