Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/mert
diff options
context:
space:
mode:
authorPrashant Mathur <pramathur@ebay.com>2018-05-18 17:16:22 +0300
committerPrashant Mathur <pramathur@ebay.com>2018-05-18 17:16:22 +0300
commit8b5964494550f34dd5b807f090d64f55f040a2a8 (patch)
treef76937cb585b0bd6fa086270bccb9815905580ba /mert
parent999e83d12893ea5de67722a2596d8a0c46658833 (diff)
Adding chrf scorers
Diffstat (limited to 'mert')
-rw-r--r--mert/CHRFScorer.cpp288
-rw-r--r--mert/CHRFScorer.h102
2 files changed, 390 insertions, 0 deletions
diff --git a/mert/CHRFScorer.cpp b/mert/CHRFScorer.cpp
new file mode 100644
index 000000000..2fa2afc25
--- /dev/null
+++ b/mert/CHRFScorer.cpp
@@ -0,0 +1,288 @@
+/*
+ * CHRFScorer.cpp
+ *
+ * Created on: Dec 28, 2016
+ * Author: pramathur
+ */
+
+#include "CHRFScorer.h"
+#include <fstream>
+#include <stdexcept>
+
+
+#include "Util.h"
+#include "math.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <climits>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include "ScoreStats.h"
+#include "util/exception.hh"
+#include "Util.h"
+#include "ScoreDataIterator.h"
+#include "FeatureDataIterator.h"
+#include "Vocabulary.h"
+
+namespace {
+
+const char KEY_REFLEN[] = "reflen";
+const char REFLEN_AVERAGE[] = "average";
+const char REFLEN_SHORTEST[] = "shortest";
+const char REFLEN_CLOSEST[] = "closest";
+const char KEY_BETA[] = "beta";
+const char KEY_BETA_DEF[] = "3";
+const char KEY_SMOOTH[] = "smooth";
+const char KEY_SMOOTH_DEF[] = "0";
+float BETA=3;
+float SMOOTH=0;
+
+}
+
+namespace MosesTuning {
+
+CHRFScorer::CHRFScorer(const std::string& config)
+ : StatisticsBasedScorer("CHRF",config), m_ref_length_type(CLOSEST), m_beta(3), m_smooth(0) {
+ const std::string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
+ if (reflen == REFLEN_AVERAGE) {
+ m_ref_length_type = AVERAGE;
+ } else if (reflen == REFLEN_SHORTEST) {
+ m_ref_length_type = SHORTEST;
+ } else if (reflen == REFLEN_CLOSEST) {
+ m_ref_length_type = CLOSEST;
+ } else {
+ UTIL_THROW2("Unknown reference length strategy: " + reflen);
+ }
+ const std::string beta = getConfig(KEY_BETA, KEY_BETA_DEF);
+ const std::string smooth = getConfig(KEY_SMOOTH, KEY_SMOOTH_DEF);
+ if(beta == KEY_BETA_DEF){
+ m_beta=3.0;
+ } else{
+ m_beta = ::atof(beta.c_str());
+ }
+ if(smooth == KEY_SMOOTH_DEF){
+ m_smooth=0.0;
+ }else{
+ m_smooth = ::atof(smooth.c_str());
+ }
+ BETA= m_beta;
+ SMOOTH = m_smooth;
+}
+
+CHRFScorer::~CHRFScorer() {}
+
+void CHRFScorer::setReferenceFiles(const std::vector<std::string>& referenceFiles)
+{
+ // Make sure reference data is clear
+ m_references.reset();
+ mert::VocabularyFactory::GetVocabulary()->clear();
+
+ //load reference data
+ for (size_t i = 0; i < referenceFiles.size(); ++i) {
+ TRACE_ERR("Loading reference from " << referenceFiles[i] << std::endl);
+
+ std::ifstream ifs(referenceFiles[i].c_str());
+ if (!OpenReferenceStream(&ifs, i)) {
+ UTIL_THROW2("Cannot open " + referenceFiles[i]);
+ }
+ }
+
+}
+
+bool CHRFScorer::OpenReferenceStream(std::istream* is, size_t file_id)
+{
+ if (is == NULL) return false;
+
+ std::string line;
+ size_t sid = 0;
+ while (getline(*is, line)) {
+ // TODO: rather than loading the whole reference corpus into memory, can we stream it line by line?
+ // (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences)
+ line = preprocessSentence(line);
+
+ // chrf stuff here
+ // split line into characters
+ std::string temp_line;
+ for(size_t i=0; i<line.size(); i++){
+ if(line[i]!=' ')
+ temp_line.append(line[i]+" ");
+ }
+ temp_line.substr(0, temp_line.size()-1);
+ line = temp_line;
+// std::cerr<<line<<std::endl;
+
+ if (file_id == 0) {
+ Reference* ref = new Reference;
+ m_references.push_back(ref); // Take ownership of the Reference object.
+ }
+ UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences.");
+
+ ProcessReferenceLine(line, m_references[sid]);
+
+ if (sid > 0 && sid % 100 == 0) {
+ TRACE_ERR(".");
+ }
+ ++sid;
+ }
+ return true;
+}
+
+void CHRFScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
+{
+ NgramCounts counts;
+ size_t length = CountNgrams(line, counts, CHRFNgramOrder);
+
+ //for any counts larger than those already there, merge them in
+ for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
+ const NgramCounts::Key& ngram = ci->first;
+ const NgramCounts::Value newcount = ci->second;
+
+ NgramCounts::Value oldcount = 0;
+ ref->get_counts()->Lookup(ngram, &oldcount);
+ if (newcount > oldcount) {
+ ref->get_counts()->operator[](ngram) = newcount;
+ }
+ }
+ //add in the length
+ ref->push_back(length);
+}
+
+size_t CHRFScorer::CountNgrams(const std::string& line, NgramCounts& counts,
+ unsigned int n, bool is_testing) const
+{
+ assert(n > 0);
+ std::vector<int> encoded_tokens;
+
+ // When performing tokenization of a hypothesis translation, we don't have
+ // to update the Scorer's word vocabulary. However, the tokenization of
+ // reference translations requires modifying the vocabulary, which means
+ // this procedure might be slower than the tokenization the hypothesis
+ // translation.
+ if (is_testing) {
+ TokenizeAndEncodeTesting(line, encoded_tokens);
+ } else {
+ TokenizeAndEncode(line, encoded_tokens);
+ }
+ const size_t len = encoded_tokens.size();
+ std::vector<int> ngram;
+
+ for (size_t k = 1; k <= n; ++k) {
+ //ngram order longer than sentence - no point
+ if (k > len) {
+ continue;
+ }
+ for (size_t i = 0; i < len - k + 1; ++i) {
+ ngram.clear();
+ ngram.reserve(len);
+ for (size_t j = i; j < i+k && j < len; ++j) {
+ ngram.push_back(encoded_tokens[j]);
+ }
+ counts.Add(ngram);
+ }
+ }
+// DumpCounts(&std::cerr, counts);
+ return len;
+}
+
+void CHRFScorer::prepareStats(size_t sid, const std::string& text, ScoreStats& entry)
+{
+ UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
+ CalcCHRFStats(*(m_references[sid]), text, entry);
+}
+
+void CHRFScorer::CalcCHRFStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
+{
+ NgramCounts testcounts;
+ // stats for this line
+ std::vector<ScoreStatsType> stats(CHRFNgramOrder * 3);
+ std::string sentence = preprocessSentence(text);
+ // chrf stuff here
+ // split line into characters
+ std::string temp_line;
+ for(size_t i=0; i<sentence.size(); i++){
+ if(sentence[i]!=' ')
+ temp_line.append(sentence[i]+" ");
+ }
+ temp_line.substr(0, temp_line.size()-1);
+ sentence=temp_line;
+// std::cerr<<sentence<<std::endl;
+ stats.push_back(sentence.size());
+ const size_t length = CountNgrams(sentence, testcounts, CHRFNgramOrder, true);
+
+ const int reference_len = CalcReferenceLength(ref, length);
+ stats.push_back(reference_len);
+
+ //precision on each ngram type
+ for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
+ testcounts_it != testcounts.end(); ++testcounts_it) {
+ const NgramCounts::Value guess = testcounts_it->second;
+ const size_t len = testcounts_it->first.size();
+ NgramCounts::Value correct = 0;
+
+ NgramCounts::Value v = 0;
+ if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
+ correct = std::min(v, guess);
+ }
+ stats[len * 3 - 3] += correct;
+ stats[len * 3 - 2] += guess;
+ stats[len * 3 - 1] += v;
+ }
+ entry.set(stats);
+}
+
+statscore_t CHRFScorer::calculateScore(const std::vector<ScoreStatsType>& comps) const
+{
+ UTIL_THROW_IF(comps.size() != CHRFNgramOrder * 3 + 2, util::Exception, "Error");
+ float f1=0.0;
+ float precision = 0.0;
+ float recall = 0.0;
+ for (size_t i = 0; i < CHRFNgramOrder; i++){
+ precision += ((comps[3*i] + m_smooth)*1.0) / ((comps[3*i+1] + m_smooth)*1.0);
+ recall += ((comps[3*i] + m_smooth)*1.0) / ((comps[3*i+2] + m_smooth)*1.0);
+ }
+
+ precision /= CHRFNgramOrder;
+ recall /= CHRFNgramOrder;
+
+ f1 = ((1 + pow(m_beta, 2) ) * (precision * recall) ) / ( ( pow(m_beta, 2) * precision) + recall) ;
+ return f1;
+}
+
+int CHRFScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
+{
+ switch (m_ref_length_type) {
+ case AVERAGE:
+ return ref.CalcAverage();
+ break;
+ case CLOSEST:
+ return ref.CalcClosest(length);
+ break;
+ case SHORTEST:
+ return ref.CalcShortest();
+ break;
+ default:
+ UTIL_THROW2("Unknown reference types");
+ }
+}
+
+void CHRFScorer::DumpCounts(std::ostream* os,
+ const NgramCounts& counts) const
+{
+ for (NgramCounts::const_iterator it = counts.begin();
+ it != counts.end(); ++it) {
+ *os << "(";
+ const NgramCounts::Key& keys = it->first;
+ for (size_t i = 0; i < keys.size(); ++i) {
+ if (i != 0) {
+ *os << " ";
+ }
+ *os << keys[i];
+ }
+ *os << ") : " << it->second << ", ";
+ }
+ *os << std::endl;
+}
+
+} /* namespace MosesTuning */
diff --git a/mert/CHRFScorer.h b/mert/CHRFScorer.h
new file mode 100644
index 000000000..eb67ef0f9
--- /dev/null
+++ b/mert/CHRFScorer.h
@@ -0,0 +1,102 @@
+/*
+ * CHRFScorer.h
+ *
+ * Created on: Dec 28, 2016
+ * Author: pramathur
+ */
+#pragma once
+
+#ifndef MERT_CHRFSCORER_H_
+#define MERT_CHRFSCORER_H_
+
+#include <fstream>
+#include <string>
+#include <vector>
+#include <set>
+#include <boost/shared_ptr.hpp>
+
+#include "Ngram.h"
+#include "Reference.h"
+#include "ScopedVector.h"
+#include "ScoreData.h"
+#include "StatisticsBasedScorer.h"
+#include "Types.h"
+
+namespace MosesTuning {
+
+const size_t CHRFNgramOrder = 6;
+class CHRFScorer : public StatisticsBasedScorer{
+public:
+ enum ReferenceLengthType {
+ AVERAGE,
+ CLOSEST,
+ SHORTEST
+ };
+
+ explicit CHRFScorer(const std::string& config = "");
+ ~CHRFScorer();
+
+ virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
+ virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
+ virtual statscore_t calculateScore(const std::vector<ScoreStatsType>& comps) const;
+ virtual std::size_t NumberOfScores() const {
+ return 3*CHRFNgramOrder + 2;
+ }
+
+ void CalcCHRFStats(const Reference& ref, const std::string& text, ScoreStats& entry) const;
+
+ int CalcReferenceLength(const Reference& ref, std::size_t length) const;
+
+ ReferenceLengthType GetReferenceLengthType() const {
+ return m_ref_length_type;
+ }
+
+ void SetReferenceLengthType(ReferenceLengthType type) {
+ m_ref_length_type = type;
+ }
+
+ const std::vector<Reference*>& GetReferences() const {
+ return m_references.get();
+ }
+
+ virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const {
+ return totals[CHRFNgramOrder*3+2];
+ }
+
+ /**
+ * Count the ngrams of each type, up to the given length in the input line.
+ */
+ size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
+
+ void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
+
+ // NOTE: this function is also used for unit testing.
+ bool OpenReferenceStream(std::istream* is, std::size_t file_id);
+
+ void ProcessReferenceLine(const std::string& line, Reference* ref) const;
+
+ bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const;
+
+protected:
+ ReferenceLengthType m_ref_length_type;
+ // reference translations.
+ ScopedVector<Reference> m_references;
+
+ // no copying allowed
+ CHRFScorer(const CHRFScorer&);
+ CHRFScorer& operator=(const CHRFScorer&);
+
+
+private:
+ float m_beta;
+ float m_smooth;
+ // data extracted from reference files
+ std::vector<std::size_t> m_ref_lengths;
+ std::vector<std::multiset<int> > m_ref_tokens;
+
+
+};
+
+} /* namespace MosesTuning */
+
+#endif /* MERT_CHRFSCORER_H_ */