Adding chrf scorers

author: Prashant Mathur <pramathur@ebay.com> 2018-05-18 17:16:22 +0300
committer: Prashant Mathur <pramathur@ebay.com> 2018-05-18 17:16:22 +0300
commit: 8b5964494550f34dd5b807f090d64f55f040a2a8 (patch)
tree: f76937cb585b0bd6fa086270bccb9815905580ba /mert
parent: 999e83d12893ea5de67722a2596d8a0c46658833 (diff)
2 files changed, 390 insertions, 0 deletions
diff --git a/mert/CHRFScorer.cpp b/mert/CHRFScorer.cpp
new file mode 100644
index 000000000..2fa2afc25
--- /dev/null
+++ b/mert/CHRFScorer.cpp
@@ -0,0 +1,288 @@
+/*
+ * CHRFScorer.cpp
+ *
+ *  Created on: Dec 28, 2016
+ *      Author: pramathur
+ */
+
+#include "CHRFScorer.h"
+#include <fstream>
+#include <stdexcept>
+
+
+#include "Util.h"
+#include "math.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <climits>
+#include <fstream>
+#include <iostream>
+#include <stdexcept>
+#include "ScoreStats.h"
+#include "util/exception.hh"
+#include "Util.h"
+#include "ScoreDataIterator.h"
+#include "FeatureDataIterator.h"
+#include "Vocabulary.h"
+
+namespace {
+
+const char KEY_REFLEN[] = "reflen";
+const char REFLEN_AVERAGE[] = "average";
+const char REFLEN_SHORTEST[] = "shortest";
+const char REFLEN_CLOSEST[] = "closest";
+const char KEY_BETA[] = "beta";
+const char KEY_BETA_DEF[] = "3";
+const char KEY_SMOOTH[] = "smooth";
+const char KEY_SMOOTH_DEF[] = "0";
+float BETA=3;
+float SMOOTH=0;
+
+}
+
+namespace MosesTuning {
+
+CHRFScorer::CHRFScorer(const std::string& config)
+		  : StatisticsBasedScorer("CHRF",config), m_ref_length_type(CLOSEST), m_beta(3), m_smooth(0) {
+	const std::string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST);
+	  if (reflen == REFLEN_AVERAGE) {
+	    m_ref_length_type = AVERAGE;
+	  } else if (reflen == REFLEN_SHORTEST) {
+	    m_ref_length_type = SHORTEST;
+	  } else if (reflen == REFLEN_CLOSEST) {
+	    m_ref_length_type = CLOSEST;
+	  } else {
+	    UTIL_THROW2("Unknown reference length strategy: " + reflen);
+	  }
+	  const std::string beta = getConfig(KEY_BETA, KEY_BETA_DEF);
+	  const std::string smooth = getConfig(KEY_SMOOTH, KEY_SMOOTH_DEF);
+	  if(beta == KEY_BETA_DEF){
+		  m_beta=3.0;
+	  } else{
+		  m_beta = ::atof(beta.c_str());
+	  }
+	  if(smooth == KEY_SMOOTH_DEF){
+		  m_smooth=0.0;
+	  }else{
+		  m_smooth = ::atof(smooth.c_str());
+	  }
+	  BETA= m_beta;
+	  SMOOTH = m_smooth;
+}
+
+CHRFScorer::~CHRFScorer() {}
+
+void CHRFScorer::setReferenceFiles(const std::vector<std::string>& referenceFiles)
+{
+	// Make sure reference data is clear
+	  m_references.reset();
+	  mert::VocabularyFactory::GetVocabulary()->clear();
+
+	  //load reference data
+	  for (size_t i = 0; i < referenceFiles.size(); ++i) {
+	    TRACE_ERR("Loading reference from " << referenceFiles[i] << std::endl);
+
+	    std::ifstream ifs(referenceFiles[i].c_str());
+	    if (!OpenReferenceStream(&ifs, i)) {
+	      UTIL_THROW2("Cannot open " + referenceFiles[i]);
+	    }
+	  }
+
+}
+
+bool CHRFScorer::OpenReferenceStream(std::istream* is, size_t file_id)
+{
+  if (is == NULL) return false;
+
+  std::string line;
+  size_t sid = 0;
+  while (getline(*is, line)) {
+    // TODO: rather than loading the whole reference corpus into memory, can we stream it line by line?
+    //  (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences)
+    line = preprocessSentence(line);
+
+    // chrf stuff here
+    // split line into characters
+    std::string temp_line;
+    for(size_t i=0; i<line.size(); i++){
+    if(line[i]!=' ')
+        temp_line.append(line[i]+" ");
+    }
+    temp_line.substr(0, temp_line.size()-1);
+    line = temp_line;
+//    std::cerr<<line<<std::endl;
+
+    if (file_id == 0) {
+      Reference* ref = new Reference;
+      m_references.push_back(ref);    // Take ownership of the Reference object.
+    }
+    UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences.");
+
+    ProcessReferenceLine(line, m_references[sid]);
+
+    if (sid > 0 && sid % 100 == 0) {
+      TRACE_ERR(".");
+    }
+    ++sid;
+  }
+  return true;
+}
+
+void CHRFScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const
+{
+  NgramCounts counts;
+  size_t length = CountNgrams(line, counts, CHRFNgramOrder);
+
+  //for any counts larger than those already there, merge them in
+  for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) {
+    const NgramCounts::Key& ngram = ci->first;
+    const NgramCounts::Value newcount = ci->second;
+
+    NgramCounts::Value oldcount = 0;
+    ref->get_counts()->Lookup(ngram, &oldcount);
+    if (newcount > oldcount) {
+      ref->get_counts()->operator[](ngram) = newcount;
+    }
+  }
+  //add in the length
+  ref->push_back(length);
+}
+
+size_t CHRFScorer::CountNgrams(const std::string& line, NgramCounts& counts,
+                               unsigned int n, bool is_testing) const
+{
+  assert(n > 0);
+  std::vector<int> encoded_tokens;
+
+  // When performing tokenization of a hypothesis translation, we don't have
+  // to update the Scorer's word vocabulary. However, the tokenization of
+  // reference translations requires modifying the vocabulary, which means
+  // this procedure might be slower than the tokenization the hypothesis
+  // translation.
+  if (is_testing) {
+    TokenizeAndEncodeTesting(line, encoded_tokens);
+  } else {
+    TokenizeAndEncode(line, encoded_tokens);
+  }
+  const size_t len = encoded_tokens.size();
+  std::vector<int> ngram;
+
+  for (size_t k = 1; k <= n; ++k) {
+    //ngram order longer than sentence - no point
+    if (k > len) {
+      continue;
+    }
+    for (size_t i = 0; i < len - k + 1; ++i) {
+      ngram.clear();
+      ngram.reserve(len);
+      for (size_t j = i; j < i+k && j < len; ++j) {
+        ngram.push_back(encoded_tokens[j]);
+      }
+      counts.Add(ngram);
+    }
+  }
+//  DumpCounts(&std::cerr, counts);
+  return len;
+}
+
+void CHRFScorer::prepareStats(size_t sid, const std::string& text, ScoreStats& entry)
+{
+  UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set");
+  CalcCHRFStats(*(m_references[sid]), text, entry);
+}
+
+void CHRFScorer::CalcCHRFStats(const Reference& ref, const std::string& text, ScoreStats& entry) const
+{
+  NgramCounts testcounts;
+  // stats for this line
+  std::vector<ScoreStatsType> stats(CHRFNgramOrder * 3);
+  std::string sentence = preprocessSentence(text);
+  // chrf stuff here
+  // split line into characters
+  std::string temp_line;
+  for(size_t i=0; i<sentence.size(); i++){
+	if(sentence[i]!=' ')
+		temp_line.append(sentence[i]+" ");
+  }
+  temp_line.substr(0, temp_line.size()-1);
+  sentence=temp_line;
+//  std::cerr<<sentence<<std::endl;
+  stats.push_back(sentence.size());
+  const size_t length = CountNgrams(sentence, testcounts, CHRFNgramOrder, true);
+
+  const int reference_len = CalcReferenceLength(ref, length);
+  stats.push_back(reference_len);
+
+  //precision on each ngram type
+  for (NgramCounts::const_iterator testcounts_it = testcounts.begin();
+       testcounts_it != testcounts.end(); ++testcounts_it) {
+    const NgramCounts::Value guess = testcounts_it->second;
+    const size_t len = testcounts_it->first.size();
+    NgramCounts::Value correct = 0;
+
+    NgramCounts::Value v = 0;
+    if (ref.get_counts()->Lookup(testcounts_it->first, &v)) {
+      correct = std::min(v, guess);
+    }
+    stats[len * 3 - 3] += correct;
+    stats[len * 3 - 2] += guess;
+    stats[len * 3 - 1] += v;
+  }
+  entry.set(stats);
+}
+
+statscore_t CHRFScorer::calculateScore(const std::vector<ScoreStatsType>& comps) const
+{
+  UTIL_THROW_IF(comps.size() != CHRFNgramOrder * 3 + 2, util::Exception, "Error");
+  float f1=0.0;
+  float precision = 0.0;
+  float recall = 0.0;
+  for (size_t i = 0; i < CHRFNgramOrder; i++){
+	  precision += ((comps[3*i] + m_smooth)*1.0) / ((comps[3*i+1] + m_smooth)*1.0);
+	  recall += ((comps[3*i] + m_smooth)*1.0) / ((comps[3*i+2] + m_smooth)*1.0);
+  }
+
+  precision /= CHRFNgramOrder;
+  recall /= CHRFNgramOrder;
+
+  f1 = ((1 + pow(m_beta, 2) ) * (precision * recall) ) / ( ( pow(m_beta, 2) * precision) + recall) ;
+  return f1;
+}
+
+int CHRFScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const
+{
+  switch (m_ref_length_type) {
+  case AVERAGE:
+    return ref.CalcAverage();
+    break;
+  case CLOSEST:
+    return ref.CalcClosest(length);
+    break;
+  case SHORTEST:
+    return ref.CalcShortest();
+    break;
+  default:
+    UTIL_THROW2("Unknown reference types");
+  }
+}
+
+void CHRFScorer::DumpCounts(std::ostream* os,
+                            const NgramCounts& counts) const
+{
+  for (NgramCounts::const_iterator it = counts.begin();
+       it != counts.end(); ++it) {
+    *os << "(";
+    const NgramCounts::Key& keys = it->first;
+    for (size_t i = 0; i < keys.size(); ++i) {
+      if (i != 0) {
+        *os << " ";
+      }
+      *os << keys[i];
+    }
+    *os << ") : " << it->second << ", ";
+  }
+  *os << std::endl;
+}
+
+} /* namespace MosesTuning */
diff --git a/mert/CHRFScorer.h b/mert/CHRFScorer.h
new file mode 100644
index 000000000..eb67ef0f9
--- /dev/null
+++ b/mert/CHRFScorer.h
@@ -0,0 +1,102 @@
+/*
+ * CHRFScorer.h
+ *
+ *  Created on: Dec 28, 2016
+ *      Author: pramathur
+ */
+#pragma once
+
+#ifndef MERT_CHRFSCORER_H_
+#define MERT_CHRFSCORER_H_
+
+#include <fstream>
+#include <string>
+#include <vector>
+#include <set>
+#include <boost/shared_ptr.hpp>
+
+#include "Ngram.h"
+#include "Reference.h"
+#include "ScopedVector.h"
+#include "ScoreData.h"
+#include "StatisticsBasedScorer.h"
+#include "Types.h"
+
+namespace MosesTuning {
+
+const size_t CHRFNgramOrder = 6;
+class CHRFScorer : public StatisticsBasedScorer{
+public:
+	enum ReferenceLengthType {
+	    AVERAGE,
+	    CLOSEST,
+	    SHORTEST
+	  };
+
+  explicit CHRFScorer(const std::string& config = "");
+  ~CHRFScorer();
+
+  virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles);
+  virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry);
+  virtual statscore_t calculateScore(const std::vector<ScoreStatsType>& comps) const;
+  virtual std::size_t NumberOfScores() const {
+    return 3*CHRFNgramOrder + 2;
+  }
+
+  void CalcCHRFStats(const Reference& ref, const std::string& text, ScoreStats& entry) const;
+
+  int CalcReferenceLength(const Reference& ref, std::size_t length) const;
+
+  ReferenceLengthType GetReferenceLengthType() const {
+      return m_ref_length_type;
+    }
+
+  void SetReferenceLengthType(ReferenceLengthType type) {
+    m_ref_length_type = type;
+  }
+
+  const std::vector<Reference*>& GetReferences() const {
+    return m_references.get();
+  }
+
+  virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const {
+    return totals[CHRFNgramOrder*3+2];
+  }
+
+  /**
+   * Count the ngrams of each type, up to the given length in the input line.
+   */
+  size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const;
+
+  void DumpCounts(std::ostream* os, const NgramCounts& counts) const;
+
+  // NOTE: this function is also used for unit testing.
+  bool OpenReferenceStream(std::istream* is, std::size_t file_id);
+
+  void ProcessReferenceLine(const std::string& line, Reference* ref) const;
+
+  bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const;
+
+protected:
+  ReferenceLengthType m_ref_length_type;
+  // reference translations.
+  ScopedVector<Reference> m_references;
+
+  // no copying allowed
+  CHRFScorer(const CHRFScorer&);
+  CHRFScorer& operator=(const CHRFScorer&);
+
+
+private:
+  float m_beta;
+  float m_smooth;
+  // data extracted from reference files
+  std::vector<std::size_t> m_ref_lengths;
+  std::vector<std::multiset<int> > m_ref_tokens;
+
+
+};
+
+} /* namespace MosesTuning */
+
+#endif /* MERT_CHRFSCORER_H_ */
author	Prashant Mathur <pramathur@ebay.com>	2018-05-18 17:16:22 +0300
committer	Prashant Mathur <pramathur@ebay.com>	2018-05-18 17:16:22 +0300
commit	8b5964494550f34dd5b807f090d64f55f040a2a8 (patch)
tree	f76937cb585b0bd6fa086270bccb9815905580ba /mert
parent	999e83d12893ea5de67722a2596d8a0c46658833 (diff)