Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Matthias Huck <mhuck@inf.ed.ac.uk> 2015-05-01 00:26:30 +0300
committer Matthias Huck <mhuck@inf.ed.ac.uk> 2015-05-01 00:26:30 +0300
commit 4ee8f2dec1c200858c4ed82becc95ed1f7c1017f (patch)
tree 07ecd1b03f6aa7926bfa3fdeaf176833b836ed7b /mert/sentence-bleu-nbest.cpp
parent 1d86b8fde7f62dbd0897815479d44df981de5d84 (diff)
sentence-bleu less greedy regarding memory
Don't load all references; read them line by line instead. Corpora with millions of sentences can now be evaluated without consuming gigabytes of RAM.
Diffstat (limited to 'mert/sentence-bleu-nbest.cpp')
-rw-r--r-- mert/sentence-bleu-nbest.cpp 32
1 files changed, 28 insertions, 4 deletions
diff --git a/mert/sentence-bleu-nbest.cpp b/mert/sentence-bleu-nbest.cpp
index 023e9faae..f869386e3 100644
--- a/mert/sentence-bleu-nbest.cpp
+++ b/mert/sentence-bleu-nbest.cpp
@@ -1,9 +1,14 @@
+#include <fstream>
#include <iostream>
#include <vector>
#include <string>
+#include <boost/shared_ptr.hpp>
+
#include "BleuScorer.h"
+#include "Reference.h"
#include "moses/Util.h"
+#include "util/exception.hh"
using namespace MosesTuning;
@@ -24,21 +29,40 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
- scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
- // Loading sentences and preparing statistics
+ // initialize reference streams
+ std::vector<boost::shared_ptr<std::ifstream> > refStreams;
+ for (std::vector<std::string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
+ {
+ TRACE_ERR("Loading reference from " << *refFile << std::endl);
+ boost::shared_ptr<std::ifstream> ifs(new std::ifstream(refFile->c_str()));
+ UTIL_THROW_IF2(!ifs, "Cannot open " << *refFile);
+ refStreams.push_back(ifs);
+ }
+
+ // load sentences, preparing statistics, score
std::string nbestLine;
+ int sid = -1;
+ Reference ref;
while ( getline(std::cin, nbestLine) )
{
std::vector<std::string> items;
Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
- size_t sid = Moses::Scan<size_t>(items[0]);
+ int sidCurrent = Moses::Scan<int>(items[0]);
+ if (sidCurrent != sid) {
+ ref.clear();
+ if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
+ UTIL_THROW2("Missing references");
+ }
+ sid = sidCurrent;
+ }
ScoreStats scoreStats;
- scorer.prepareStats(sid, items[1], scoreStats);
+ scorer.CalcBleuStats(ref, items[1], scoreStats);
std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
}
return 0;
}
+