Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matthias Huck <mhuck@inf.ed.ac.uk>, 2015-05-01 00:26:30 +0300
committer: Matthias Huck <mhuck@inf.ed.ac.uk>, 2015-05-01 00:26:30 +0300
commit: 4ee8f2dec1c200858c4ed82becc95ed1f7c1017f (patch)
tree: 07ecd1b03f6aa7926bfa3fdeaf176833b836ed7b
parent: 1d86b8fde7f62dbd0897815479d44df981de5d84 (diff)
sentence-bleu less greedy regarding memory
Don't load all references, read them line by line. Corpora with millions of sentences can now be evaluated without consuming gigabytes of RAM.
-rw-r--r-- mert/BleuDocScorer.cpp | 2
-rw-r--r-- mert/BleuDocScorer.h | 4
-rw-r--r-- mert/BleuScorer.cpp | 30
-rw-r--r-- mert/BleuScorer.h | 25
-rw-r--r-- mert/HopeFearDecoder.cpp | 2
-rw-r--r-- mert/HopeFearDecoder.h | 5
-rw-r--r-- mert/Ngram.h | 4
-rw-r--r-- mert/Reference.h | 5
-rw-r--r-- mert/Scorer.h | 4
-rw-r--r-- mert/sentence-bleu-nbest.cpp | 32
-rw-r--r-- mert/sentence-bleu.cpp | 28
11 files changed, 98 insertions, 43 deletions
diff --git a/mert/BleuDocScorer.cpp b/mert/BleuDocScorer.cpp
index 48c17ee96..d71c5171d 100644
--- a/mert/BleuDocScorer.cpp
+++ b/mert/BleuDocScorer.cpp
@@ -174,7 +174,7 @@ statscore_t BleuDocScorer::calculateScore(const vector<int>& comps) const
UTIL_THROW_IF(comps.size() != kBleuNgramOrder * 2 + 1, util::Exception, "Error");
float logbleu = 0.0;
- for (int i = 0; i < kBleuNgramOrder; ++i) {
+ for (size_t i = 0; i < kBleuNgramOrder; ++i) {
if (comps[2*i] == 0) {
return 0.0;
}
diff --git a/mert/BleuDocScorer.h b/mert/BleuDocScorer.h
index 9677410f8..d27088254 100644
--- a/mert/BleuDocScorer.h
+++ b/mert/BleuDocScorer.h
@@ -1,5 +1,4 @@
-#ifndef MERT_BLEU_DOC_SCORER_H_
-#define MERT_BLEU_DOC_SCORER_H_
+#pragma once
#include <ostream>
#include <string>
@@ -64,4 +63,3 @@ private:
}
-#endif // MERT_BLEU_DOC_SCORER_H_
diff --git a/mert/BleuScorer.cpp b/mert/BleuScorer.cpp
index 13a472447..dc926054f 100644
--- a/mert/BleuScorer.cpp
+++ b/mert/BleuScorer.cpp
@@ -99,9 +99,8 @@ void BleuScorer::setReferenceFiles(const vector<string>& referenceFiles)
TRACE_ERR("Loading reference from " << referenceFiles[i] << endl);
ifstream ifs(referenceFiles[i].c_str());
- UTIL_THROW_IF2(!ifs, "Cannot open " << referenceFiles[i]);
if (!OpenReferenceStream(&ifs, i)) {
- UTIL_THROW2("Unable to open " + referenceFiles[i]);
+ UTIL_THROW2("Cannot open " + referenceFiles[i]);
}
}
}
@@ -152,13 +151,26 @@ void BleuScorer::ProcessReferenceLine(const std::string& line, Reference* ref) c
ref->push_back(length);
}
+bool BleuScorer::GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const // reads one line from every stream into ref; returns false if any stream pointer is null or has no further line
+{
+ for (vector<boost::shared_ptr<ifstream> >::iterator ifs=referenceStreams.begin(); ifs!=referenceStreams.end(); ++ifs)
+ {
+ if (!(*ifs)) return false; // *ifs is the shared_ptr, so this only rejects a null pointer, not a stream that failed to open
+ string line;
+ if (!getline(**ifs, line)) return false; // no line could be read; ref may already hold data from earlier streams
+ line = preprocessSentence(line);
+ ProcessReferenceLine(line, &ref); // ref is not cleared here; callers pass a fresh or cleared Reference
+ }
+ return true;
+}
+
void BleuScorer::prepareStats(size_t sid, const string& text, ScoreStats& entry)
{
UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
- CalcBleuStats(m_references[sid], text, entry);
+ CalcBleuStats(*(m_references[sid]), text, entry);
}
-void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const
+void BleuScorer::CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
{
NgramCounts testcounts;
// stats for this line
@@ -177,7 +189,7 @@ void BleuScorer::CalcBleuStats(const Reference* ref, const std::string& text, Sc
NgramCounts::Value correct = 0;
NgramCounts::Value v = 0;
- if (ref->get_counts()->Lookup(testcounts_it->first, &v)) {
+ if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
correct = min(v, guess);
}
stats[len * 2 - 2] += correct;
@@ -207,17 +219,17 @@ statscore_t BleuScorer::calculateScore(const vector<ScoreStatsType>& comps) cons
return exp(logbleu);
}
-int BleuScorer::CalcReferenceLength(const Reference* ref, std::size_t length) const
+int BleuScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
{
switch (m_ref_length_type) {
case AVERAGE:
- return ref->CalcAverage();
+ return ref.CalcAverage();
break;
case CLOSEST:
- return ref->CalcClosest(length);
+ return ref.CalcClosest(length);
break;
case SHORTEST:
- return ref->CalcShortest();
+ return ref.CalcShortest();
break;
default:
UTIL_THROW2("Unknown reference types");
diff --git a/mert/BleuScorer.h b/mert/BleuScorer.h
index e90915822..d7ee8e4e7 100644
--- a/mert/BleuScorer.h
+++ b/mert/BleuScorer.h
@@ -1,23 +1,23 @@
-#ifndef MERT_BLEU_SCORER_H_
-#define MERT_BLEU_SCORER_H_
+#pragma once
-#include <ostream>
+#include <fstream>
#include <string>
#include <vector>
-#include "Types.h"
+#include <boost/shared_ptr.hpp>
+
+#include "Ngram.h"
+#include "Reference.h"
+#include "ScopedVector.h"
#include "ScoreData.h"
#include "StatisticsBasedScorer.h"
-#include "ScopedVector.h"
+#include "Types.h"
namespace MosesTuning
{
const size_t kBleuNgramOrder = 4;
-class NgramCounts;
-class Reference;
-
/**
* Bleu scoring
*/
@@ -42,9 +42,9 @@ public:
return 2 * kBleuNgramOrder + 1;
}
- void CalcBleuStats(const Reference* ref, const std::string& text, ScoreStats& entry) const;
+ void CalcBleuStats(const Reference& ref, const std::string& text, ScoreStats& entry) const;
- int CalcReferenceLength(const Reference* ref, std::size_t length) const;
+ int CalcReferenceLength(const Reference& ref, std::size_t length) const;
ReferenceLengthType GetReferenceLengthType() const {
return m_ref_length_type;
@@ -65,7 +65,7 @@ public:
/**
* Count the ngrams of each type, up to the given length in the input line.
*/
- std::size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
+ size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
@@ -74,6 +74,8 @@ public:
void ProcessReferenceLine(const std::string& line, Reference* ref) const;
+ bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const;
+
//private:
protected:
ReferenceLengthType m_ref_length_type;
@@ -102,4 +104,3 @@ float sentenceLevelBackgroundBleu(const std::vector<float>& sent, const std::vec
}
-#endif // MERT_BLEU_SCORER_H_
diff --git a/mert/HopeFearDecoder.cpp b/mert/HopeFearDecoder.cpp
index 5288116d6..be9d8f2c9 100644
--- a/mert/HopeFearDecoder.cpp
+++ b/mert/HopeFearDecoder.cpp
@@ -98,7 +98,7 @@ void NbestHopeFearDecoder::HopeFear(
size_t hope_index=0, fear_index=0, model_index=0;
ValType hope_score=0, fear_score=0, model_score=0;
for(size_t safe_loop=0; safe_loop<2; safe_loop++) {
- ValType hope_bleu, hope_model;
+ ValType hope_bleu=0, hope_model=0;
for(size_t i=0; i< train_->cur_size(); i++) {
const MiraFeatureVector& vec=train_->featuresAt(i);
ValType score = wv.score(vec);
diff --git a/mert/HopeFearDecoder.h b/mert/HopeFearDecoder.h
index 53c0e935d..73f0e97d9 100644
--- a/mert/HopeFearDecoder.h
+++ b/mert/HopeFearDecoder.h
@@ -16,8 +16,7 @@ You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-#ifndef MERT_HOPEFEARDECODER_H
-#define MERT_HOPEFEARDECODER_H
+#pragma once
#include <vector>
@@ -160,5 +159,3 @@ private:
};
-#endif
-
diff --git a/mert/Ngram.h b/mert/Ngram.h
index 521dc4928..de2703605 100644
--- a/mert/Ngram.h
+++ b/mert/Ngram.h
@@ -1,5 +1,4 @@
-#ifndef MERT_NGRAM_H_
-#define MERT_NGRAM_H_
+#pragma once
#include <vector>
#include <string>
@@ -121,4 +120,3 @@ private:
}
-#endif // MERT_NGRAM_H_
diff --git a/mert/Reference.h b/mert/Reference.h
index 2c12f2ed7..a7878f3e7 100644
--- a/mert/Reference.h
+++ b/mert/Reference.h
@@ -59,6 +59,11 @@ public:
int CalcClosest(std::size_t length) const;
int CalcShortest() const;
+ void clear() { // empties this reference in place so the same object can be refilled
+ m_length.clear();
+ m_counts->clear(); // dereferences m_counts unconditionally; presumably allocated by the constructor — TODO confirm
+ }
+
private:
NgramCounts* m_counts;
diff --git a/mert/Scorer.h b/mert/Scorer.h
index 4383c68f2..a08fc436d 100644
--- a/mert/Scorer.h
+++ b/mert/Scorer.h
@@ -1,5 +1,4 @@
-#ifndef MERT_SCORER_H_
-#define MERT_SCORER_H_
+#pragma once
#include <iostream>
#include <sstream>
@@ -236,4 +235,3 @@ inline float score_average(const statscores_t& scores, size_t start, size_t end)
}
-#endif // MERT_SCORER_H_
diff --git a/mert/sentence-bleu-nbest.cpp b/mert/sentence-bleu-nbest.cpp
index 023e9faae..f869386e3 100644
--- a/mert/sentence-bleu-nbest.cpp
+++ b/mert/sentence-bleu-nbest.cpp
@@ -1,9 +1,14 @@
+#include <fstream>
#include <iostream>
#include <vector>
#include <string>
+#include <boost/shared_ptr.hpp>
+
#include "BleuScorer.h"
+#include "Reference.h"
#include "moses/Util.h"
+#include "util/exception.hh"
using namespace MosesTuning;
@@ -24,21 +29,40 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
- scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
- // Loading sentences and preparing statistics
+ // open one input stream per reference file; references are read from these line by line instead of being loaded up front
+ std::vector<boost::shared_ptr<std::ifstream> > refStreams;
+ for (std::vector<std::string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
+ {
+ TRACE_ERR("Loading reference from " << *refFile << std::endl);
+ boost::shared_ptr<std::ifstream> ifs(new std::ifstream(refFile->c_str()));
+ UTIL_THROW_IF2(!*ifs, "Cannot open " << *refFile); // test the stream state; the shared_ptr itself is never null after new
+ refStreams.push_back(ifs);
+ }
+
+ // load sentences, preparing statistics, score
std::string nbestLine;
+ int sid = -1;
+ Reference ref;
while ( getline(std::cin, nbestLine) )
{
std::vector<std::string> items;
Moses::TokenizeMultiCharSeparator(items, nbestLine, " ||| ");
- size_t sid = Moses::Scan<size_t>(items[0]);
+ int sidCurrent = Moses::Scan<int>(items[0]);
+ if (sidCurrent != sid) {
+ ref.clear();
+ if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
+ UTIL_THROW2("Missing references");
+ }
+ sid = sidCurrent;
+ }
ScoreStats scoreStats;
- scorer.prepareStats(sid, items[1], scoreStats);
+ scorer.CalcBleuStats(ref, items[1], scoreStats);
std::vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
}
return 0;
}
+
diff --git a/mert/sentence-bleu.cpp b/mert/sentence-bleu.cpp
index 425122c05..9bdab30d2 100644
--- a/mert/sentence-bleu.cpp
+++ b/mert/sentence-bleu.cpp
@@ -1,18 +1,26 @@
+#include <fstream>
#include <iostream>
#include <vector>
#include <string>
+#include <boost/shared_ptr.hpp>
+
#include "BleuScorer.h"
+#include "Reference.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
using namespace std;
using namespace MosesTuning;
+
int main(int argc, char **argv)
{
if (argc == 1) {
cerr << "Usage: ./sentence-bleu ref1 [ref2 ...] < candidate > bleu-scores" << endl;
return 1;
}
+
vector<string> refFiles(argv + 1, argv + argc);
// TODO all of these are empty for now
@@ -23,15 +31,28 @@ int main(int argc, char **argv)
BleuScorer scorer(config);
scorer.setFactors(factors);
scorer.setFilter(filter);
- scorer.setReferenceFiles(refFiles); // TODO: we don't need to load the whole reference corpus into memory (this can take gigabytes of RAM if done with millions of sentences)
- // Loading sentences and preparing statistics
+ // open one input stream per reference file; references are read from these line by line instead of being loaded up front
+ vector<boost::shared_ptr<ifstream> > refStreams;
+ for (vector<string>::const_iterator refFile=refFiles.begin(); refFile!=refFiles.end(); ++refFile)
+ {
+ TRACE_ERR("Loading reference from " << *refFile << endl);
+ boost::shared_ptr<ifstream> ifs(new ifstream(refFile->c_str()));
+ UTIL_THROW_IF2(!*ifs, "Cannot open " << *refFile); // test the stream state; the shared_ptr itself is never null after new
+ refStreams.push_back(ifs);
+ }
+
+ // load sentences, preparing statistics, score
string hypothesisLine;
size_t sid = 0;
while (getline(std::cin, hypothesisLine))
{
+ Reference ref;
+ if (!scorer.GetNextReferenceFromStreams(refStreams, ref)) {
+ UTIL_THROW2("Missing references");
+ }
ScoreStats scoreStats;
- scorer.prepareStats(sid, hypothesisLine, scoreStats);
+ scorer.CalcBleuStats(ref, hypothesisLine, scoreStats);
vector<float> stats(scoreStats.getArray(), scoreStats.getArray() + scoreStats.size());
std::cout << smoothedSentenceBleu(stats) << std::endl;
++sid;
@@ -39,3 +60,4 @@ int main(int argc, char **argv)
return 0;
}
+