diff options
author | Prashant Mathur <pramathur@ebay.com> | 2018-05-18 17:16:22 +0300 |
---|---|---|
committer | Prashant Mathur <pramathur@ebay.com> | 2018-05-18 17:16:22 +0300 |
commit | 8b5964494550f34dd5b807f090d64f55f040a2a8 (patch) | |
tree | f76937cb585b0bd6fa086270bccb9815905580ba /mert | |
parent | 999e83d12893ea5de67722a2596d8a0c46658833 (diff) |
Adding chrf scorers
Diffstat (limited to 'mert')
-rw-r--r-- | mert/CHRFScorer.cpp | 288 | ||||
-rw-r--r-- | mert/CHRFScorer.h | 102 |
2 files changed, 390 insertions, 0 deletions
diff --git a/mert/CHRFScorer.cpp b/mert/CHRFScorer.cpp new file mode 100644 index 000000000..2fa2afc25 --- /dev/null +++ b/mert/CHRFScorer.cpp @@ -0,0 +1,288 @@ +/* + * CHRFScorer.cpp + * + * Created on: Dec 28, 2016 + * Author: pramathur + */ + +#include "CHRFScorer.h" +#include <fstream> +#include <stdexcept> + + +#include "Util.h" +#include "math.h" +#include <algorithm> +#include <cassert> +#include <cmath> +#include <climits> +#include <fstream> +#include <iostream> +#include <stdexcept> +#include "ScoreStats.h" +#include "util/exception.hh" +#include "Util.h" +#include "ScoreDataIterator.h" +#include "FeatureDataIterator.h" +#include "Vocabulary.h" + +namespace { + +const char KEY_REFLEN[] = "reflen"; +const char REFLEN_AVERAGE[] = "average"; +const char REFLEN_SHORTEST[] = "shortest"; +const char REFLEN_CLOSEST[] = "closest"; +const char KEY_BETA[] = "beta"; +const char KEY_BETA_DEF[] = "3"; +const char KEY_SMOOTH[] = "smooth"; +const char KEY_SMOOTH_DEF[] = "0"; +float BETA=3; +float SMOOTH=0; + +} + +namespace MosesTuning { + +CHRFScorer::CHRFScorer(const std::string& config) + : StatisticsBasedScorer("CHRF",config), m_ref_length_type(CLOSEST), m_beta(3), m_smooth(0) { + const std::string reflen = getConfig(KEY_REFLEN, REFLEN_CLOSEST); + if (reflen == REFLEN_AVERAGE) { + m_ref_length_type = AVERAGE; + } else if (reflen == REFLEN_SHORTEST) { + m_ref_length_type = SHORTEST; + } else if (reflen == REFLEN_CLOSEST) { + m_ref_length_type = CLOSEST; + } else { + UTIL_THROW2("Unknown reference length strategy: " + reflen); + } + const std::string beta = getConfig(KEY_BETA, KEY_BETA_DEF); + const std::string smooth = getConfig(KEY_SMOOTH, KEY_SMOOTH_DEF); + if(beta == KEY_BETA_DEF){ + m_beta=3.0; + } else{ + m_beta = ::atof(beta.c_str()); + } + if(smooth == KEY_SMOOTH_DEF){ + m_smooth=0.0; + }else{ + m_smooth = ::atof(smooth.c_str()); + } + BETA= m_beta; + SMOOTH = m_smooth; +} + +CHRFScorer::~CHRFScorer() {} + +void CHRFScorer::setReferenceFiles(const std::vector<std::string>& referenceFiles) +{ + // Make sure reference data is clear + m_references.reset(); + mert::VocabularyFactory::GetVocabulary()->clear(); + + //load reference data + for (size_t i = 0; i < referenceFiles.size(); ++i) { + TRACE_ERR("Loading reference from " << referenceFiles[i] << std::endl); + + std::ifstream ifs(referenceFiles[i].c_str()); + if (!OpenReferenceStream(&ifs, i)) { + UTIL_THROW2("Cannot open " + referenceFiles[i]); + } + } + +} + +bool CHRFScorer::OpenReferenceStream(std::istream* is, size_t file_id) +{ + if (is == NULL) return false; + + std::string line; + size_t sid = 0; + while (getline(*is, line)) { + // TODO: rather than loading the whole reference corpus into memory, can we stream it line by line? + // (loading the whole reference corpus can take gigabytes of RAM if done with millions of sentences) + line = preprocessSentence(line); + + // chrf stuff here + // split line into characters + std::string temp_line; + for(size_t i=0; i<line.size(); i++){ + if(line[i]!=' ') + temp_line.append(line[i]+" "); + } + temp_line.substr(0, temp_line.size()-1); + line = temp_line; +// std::cerr<<line<<std::endl; + + if (file_id == 0) { + Reference* ref = new Reference; + m_references.push_back(ref); // Take ownership of the Reference object. + } + UTIL_THROW_IF2(m_references.size() <= sid, "Reference " << file_id << "has too many sentences."); + + ProcessReferenceLine(line, m_references[sid]); + + if (sid > 0 && sid % 100 == 0) { + TRACE_ERR("."); + } + ++sid; + } + return true; +} + +void CHRFScorer::ProcessReferenceLine(const std::string& line, Reference* ref) const +{ + NgramCounts counts; + size_t length = CountNgrams(line, counts, CHRFNgramOrder); + + //for any counts larger than those already there, merge them in + for (NgramCounts::const_iterator ci = counts.begin(); ci != counts.end(); ++ci) { + const NgramCounts::Key& ngram = ci->first; + const NgramCounts::Value newcount = ci->second; + + NgramCounts::Value oldcount = 0; + ref->get_counts()->Lookup(ngram, &oldcount); + if (newcount > oldcount) { + ref->get_counts()->operator[](ngram) = newcount; + } + } + //add in the length + ref->push_back(length); +} + +size_t CHRFScorer::CountNgrams(const std::string& line, NgramCounts& counts, + unsigned int n, bool is_testing) const +{ + assert(n > 0); + std::vector<int> encoded_tokens; + + // When performing tokenization of a hypothesis translation, we don't have + // to update the Scorer's word vocabulary. However, the tokenization of + // reference translations requires modifying the vocabulary, which means + // this procedure might be slower than the tokenization the hypothesis + // translation. + if (is_testing) { + TokenizeAndEncodeTesting(line, encoded_tokens); + } else { + TokenizeAndEncode(line, encoded_tokens); + } + const size_t len = encoded_tokens.size(); + std::vector<int> ngram; + + for (size_t k = 1; k <= n; ++k) { + //ngram order longer than sentence - no point + if (k > len) { + continue; + } + for (size_t i = 0; i < len - k + 1; ++i) { + ngram.clear(); + ngram.reserve(len); + for (size_t j = i; j < i+k && j < len; ++j) { + ngram.push_back(encoded_tokens[j]); + } + counts.Add(ngram); + } + } +// DumpCounts(&std::cerr, counts); + return len; +} + +void CHRFScorer::prepareStats(size_t sid, const std::string& text, ScoreStats& entry) +{ + UTIL_THROW_IF2(sid >= m_references.size(), "Sentence id (" << sid << ") not found in reference set"); + CalcCHRFStats(*(m_references[sid]), text, entry); +} + +void CHRFScorer::CalcCHRFStats(const Reference& ref, const std::string& text, ScoreStats& entry) const +{ + NgramCounts testcounts; + // stats for this line + std::vector<ScoreStatsType> stats(CHRFNgramOrder * 3); + std::string sentence = preprocessSentence(text); + // chrf stuff here + // split line into characters + std::string temp_line; + for(size_t i=0; i<sentence.size(); i++){ + if(sentence[i]!=' ') + temp_line.append(sentence[i]+" "); + } + temp_line.substr(0, temp_line.size()-1); + sentence=temp_line; +// std::cerr<<sentence<<std::endl; + stats.push_back(sentence.size()); + const size_t length = CountNgrams(sentence, testcounts, CHRFNgramOrder, true); + + const int reference_len = CalcReferenceLength(ref, length); + stats.push_back(reference_len); + + //precision on each ngram type + for (NgramCounts::const_iterator testcounts_it = testcounts.begin(); + testcounts_it != testcounts.end(); ++testcounts_it) { + const NgramCounts::Value guess = testcounts_it->second; + const size_t len = testcounts_it->first.size(); + NgramCounts::Value correct = 0; + + NgramCounts::Value v = 0; + if (ref.get_counts()->Lookup(testcounts_it->first, &v)) { + correct = std::min(v, guess); + } + stats[len * 3 - 3] += correct; + stats[len * 3 - 2] += guess; + stats[len * 3 - 1] += v; + } + entry.set(stats); +} + +statscore_t CHRFScorer::calculateScore(const std::vector<ScoreStatsType>& comps) const +{ + UTIL_THROW_IF(comps.size() != CHRFNgramOrder * 3 + 2, util::Exception, "Error"); + float f1=0.0; + float precision = 0.0; + float recall = 0.0; + for (size_t i = 0; i < CHRFNgramOrder; i++){ + precision += ((comps[3*i] + m_smooth)*1.0) / ((comps[3*i+1] + m_smooth)*1.0); + recall += ((comps[3*i] + m_smooth)*1.0) / ((comps[3*i+2] + m_smooth)*1.0); + } + + precision /= CHRFNgramOrder; + recall /= CHRFNgramOrder; + + f1 = ((1 + pow(m_beta, 2) ) * (precision * recall) ) / ( ( pow(m_beta, 2) * precision) + recall) ; + return f1; +} + +int CHRFScorer::CalcReferenceLength(const Reference& ref, std::size_t length) const +{ + switch (m_ref_length_type) { + case AVERAGE: + return ref.CalcAverage(); + break; + case CLOSEST: + return ref.CalcClosest(length); + break; + case SHORTEST: + return ref.CalcShortest(); + break; + default: + UTIL_THROW2("Unknown reference types"); + } +} + +void CHRFScorer::DumpCounts(std::ostream* os, + const NgramCounts& counts) const +{ + for (NgramCounts::const_iterator it = counts.begin(); + it != counts.end(); ++it) { + *os << "("; + const NgramCounts::Key& keys = it->first; + for (size_t i = 0; i < keys.size(); ++i) { + if (i != 0) { + *os << " "; + } + *os << keys[i]; + } + *os << ") : " << it->second << ", "; + } + *os << std::endl; +} + +} /* namespace MosesTuning */ diff --git a/mert/CHRFScorer.h b/mert/CHRFScorer.h new file mode 100644 index 000000000..eb67ef0f9 --- /dev/null +++ b/mert/CHRFScorer.h @@ -0,0 +1,102 @@ +/* + * CHRFScorer.h + * + * Created on: Dec 28, 2016 + * Author: pramathur + */ +#pragma once + +#ifndef MERT_CHRFSCORER_H_ +#define MERT_CHRFSCORER_H_ + +#include <fstream> +#include <string> +#include <vector> +#include <set> +#include <boost/shared_ptr.hpp> + +#include "Ngram.h" +#include "Reference.h" +#include "ScopedVector.h" +#include "ScoreData.h" +#include "StatisticsBasedScorer.h" +#include "Types.h" + +namespace MosesTuning { + +const size_t CHRFNgramOrder = 6; +class CHRFScorer : public StatisticsBasedScorer{ +public: + enum ReferenceLengthType { + AVERAGE, + CLOSEST, + SHORTEST + }; + + explicit CHRFScorer(const std::string& config = ""); + ~CHRFScorer(); + + virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles); + virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry); + virtual statscore_t calculateScore(const std::vector<ScoreStatsType>& comps) const; + virtual std::size_t NumberOfScores() const { + return 3*CHRFNgramOrder + 2; + } + + void CalcCHRFStats(const Reference& ref, const std::string& text, ScoreStats& entry) const; + + int CalcReferenceLength(const Reference& ref, std::size_t length) const; + + ReferenceLengthType GetReferenceLengthType() const { + return m_ref_length_type; + } + + void SetReferenceLengthType(ReferenceLengthType type) { + m_ref_length_type = type; + } + + const std::vector<Reference*>& GetReferences() const { + return m_references.get(); + } + + virtual float getReferenceLength(const std::vector<ScoreStatsType>& totals) const { + return totals[CHRFNgramOrder*3+2]; + } + + /** + * Count the ngrams of each type, up to the given length in the input line. + */ + size_t CountNgrams(const std::string& line, NgramCounts& counts, unsigned int n, bool is_testing=false) const; + + void DumpCounts(std::ostream* os, const NgramCounts& counts) const; + + // NOTE: this function is also used for unit testing. + bool OpenReferenceStream(std::istream* is, std::size_t file_id); + + void ProcessReferenceLine(const std::string& line, Reference* ref) const; + + bool GetNextReferenceFromStreams(std::vector<boost::shared_ptr<std::ifstream> >& referenceStreams, Reference& ref) const; + +protected: + ReferenceLengthType m_ref_length_type; + // reference translations. + ScopedVector<Reference> m_references; + + // no copying allowed + CHRFScorer(const CHRFScorer&); + CHRFScorer& operator=(const CHRFScorer&); + + +private: + float m_beta; + float m_smooth; + // data extracted from reference files + std::vector<std::size_t> m_ref_lengths; + std::vector<std::multiset<int> > m_ref_tokens; + + +}; + +} /* namespace MosesTuning */ + +#endif /* MERT_CHRFSCORER_H_ */ |