diff options
author | Prashant Mathur <pramathur@ebay.com> | 2016-06-15 15:33:42 +0300 |
---|---|---|
committer | Prashant Mathur <pramathur@ebay.com> | 2016-06-15 15:33:42 +0300 |
commit | e31bc247ead9f2b0e048b2394f7726d77b889736 (patch) | |
tree | f391d01d64b972dca9c977ae5f81a91eb16a47a2 | |
parent | dee124b70aed617e62fff8810cc80986d4f050b9 (diff) | |
parent | bc5f8d15c6ce4bc678ba992860bfd4be6719cee8 (diff) |
Merge branch 'master' of ssh://github.com/moses-smt/mosesdecoder
71 files changed, 3284 insertions, 539 deletions
diff --git a/compile.sh b/compile.sh index 45c10325c..f47a697d6 100755 --- a/compile.sh +++ b/compile.sh @@ -3,6 +3,6 @@ # you can install all 3rd-party dependencies by running make -f contrib/Makefiles/install-dependencies.gmake set -e -o pipefail -OPT=${OPT:-$(pwd)/OPT} +OPT=${OPT:-$(pwd)/opt} ./bjam --with-irstlm=$OPT/irstlm-5.80.08 --with-boost=$OPT --with-cmph=$OPT --with-xmlrpc-c=$OPT --with-mm --with-probing-pt -j$(getconf _NPROCESSORS_ONLN) $@ diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 26b838df9..222f19365 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1106,6 +1106,16 @@ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/ControlRecombination.h</locationURI> </link> <link> + <name>FF/CorrectionPattern.cpp</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/FF/CorrectionPattern.cpp</locationURI> + </link> + <link> + <name>FF/CorrectionPattern.h</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/FF/CorrectionPattern.h</locationURI> + </link> + <link> <name>FF/CountNonTerms.cpp</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/moses/FF/CountNonTerms.cpp</locationURI> @@ -1171,6 +1181,16 @@ <locationURI>PARENT-3-PROJECT_LOC/moses/FF/DynamicCacheBasedLanguageModel.h</locationURI> </link> <link> + <name>FF/EditOps.cpp</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/FF/EditOps.cpp</locationURI> + </link> + <link> + <name>FF/EditOps.h</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/FF/EditOps.h</locationURI> + </link> + <link> <name>FF/FFState.cpp</name> <type>1</type> <locationURI>PARENT-3-PROJECT_LOC/moses/FF/FFState.cpp</locationURI> diff --git a/mert/Jamfile b/mert/Jamfile index e5adce76e..e3f083864 100644 --- a/mert/Jamfile +++ b/mert/Jamfile @@ -31,6 +31,8 @@ Point.cpp PerScorer.cpp HwcmScorer.cpp InternalTree.cpp +M2.cpp +M2Scorer.cpp Scorer.cpp ScorerFactory.cpp Optimizer.cpp diff --git a/mert/M2.cpp 
b/mert/M2.cpp new file mode 100644 index 000000000..58181d38e --- /dev/null +++ b/mert/M2.cpp @@ -0,0 +1,61 @@ + +#include <boost/algorithm/string.hpp> + +#include "M2.h" + +namespace MosesTuning +{ + +namespace M2 +{ + +bool Annot::lowercase = true; + +std::string Annot::transform(const std::string& e) +{ + std::string temp = e; + if(lowercase) { + boost::erase_all(temp, " "); + return ToLower(temp); + } else + return e; +} + +const std::string ToLower(const std::string& str) +{ + std::string lc(str); + std::transform(lc.begin(), lc.end(), lc.begin(), (int(*)(int))std::tolower); + return lc; +} + + +Edit operator+(Edit& e1, Edit& e2) +{ + std::string edit; + if(e1.edit.size() > 0 && e2.edit.size() > 0) + edit = e1.edit + " " + e2.edit; + else if(e1.edit.size() > 0) + edit = e1.edit; + else if(e2.edit.size() > 0) + edit = e2.edit; + + return Edit(e1.cost + e2.cost, e1.changed + e2.changed, e1.unchanged + e2.unchanged, edit); +} + + +Edge operator+(Edge e1, Edge e2) +{ + return Edge(e1.v, e2.u, e1.edit + e2.edit); +} + +std::ostream& operator<<(std::ostream& o, Sentence s) +{ + for(Sentence::iterator it = s.begin(); it != s.end(); it++) + o << *it << " "; + return o; +} + + +} + +}
\ No newline at end of file diff --git a/mert/M2.h b/mert/M2.h new file mode 100644 index 000000000..76f1aed6e --- /dev/null +++ b/mert/M2.h @@ -0,0 +1,480 @@ +#pragma once + +#include <cmath> +#include <string> +#include <vector> +#include <set> +#include <map> +#include <queue> +#include <iostream> +#include <fstream> +#include <iterator> +#include <algorithm> +#include <limits> +#include <sstream> +#include <boost/algorithm/string.hpp> + + + +namespace MosesTuning +{ + +namespace M2 +{ + +typedef std::vector<float> Stats; + +typedef std::vector<std::string> Sentence; + +std::ostream& operator<<(std::ostream& o, Sentence s); + +const std::string ToLower(const std::string& str); + +struct Annot { + size_t i; + size_t j; + + std::string type; + std::string edit; + + size_t annotator; + + bool operator<(Annot a) const { + return i < a.i || (i == a.i && j < a.j) + || (i == a.i && j == a.j && annotator < a.annotator) + || (i == a.i && j == a.j && annotator == a.annotator && transform(edit) < transform(a.edit)); + } + + bool operator==(Annot a) const { + return (!(*this < a) && !(a < *this)); + } + + static std::string transform(const std::string& e); + + static bool lowercase; +}; + +typedef std::set<Annot> Annots; +typedef std::set<size_t> Users; + +struct Unit { + Sentence first; + Annots second; + Users third; +}; + +typedef std::vector<Unit> M2File; + +struct Edit { + Edit(float c = 1.0, size_t ch = 0, size_t unch = 1, std::string e = "") + : cost(c), changed(ch), unchanged(unch), edit(e) {} + + float cost; + size_t changed; + size_t unchanged; + std::string edit; +}; + +Edit operator+(Edit& e1, Edit& e2); + +struct Vertex { + Vertex(size_t a = 0, size_t b = 0) : i(a), j(b) {} + + bool operator<(const Vertex &v) const { + return i < v.i || (i == v.i && j < v.j); + } + + bool operator==(const Vertex &v) const { + return i == v.i && j == v.j; + } + + size_t i; + size_t j; +}; + +struct Edge { + Edge(Vertex vv = Vertex(), Vertex uu = Vertex(), Edit editt = Edit()) + 
: v(vv), u(uu), edit(editt) {} + + bool operator<(const Edge &e) const { + return v < e.v || (v == e.v && u < e.u); + } + + Vertex v; + Vertex u; + Edit edit; +}; + +Edge operator+(Edge e1, Edge e2); + +typedef std::vector<size_t> Row; +typedef std::vector<Row> Matrix; + +struct Info { + Info(Vertex vv = Vertex(), Edit editt = Edit()) + : v(vv), edit(editt) {} + + bool operator<(const Info &i) const { + return v < i.v; + } + + Vertex v; + Edit edit; +}; + +typedef std::set<Info> Track; +typedef std::vector<Track> TrackRow; +typedef std::vector<TrackRow> TrackMatrix; + +typedef std::set<Vertex> Vertices; +typedef std::set<Edge> Edges; + +class M2 +{ +private: + M2File m_m2; + + size_t m_max_unchanged; + float m_beta; + bool m_lowercase; + bool m_verbose; + +public: + M2() : m_max_unchanged(2), m_beta(0.5), m_lowercase(true), m_verbose(false) { } + M2(size_t max_unchanged, float beta, bool truecase, bool verbose = false) + : m_max_unchanged(max_unchanged), m_beta(beta), m_lowercase(!truecase), m_verbose(verbose) { + if(!m_lowercase) { + Annot::lowercase = false; + } + } + + float Beta() { + return m_beta; + } + + void ReadM2(const std::string& filename) { + std::ifstream m2file(filename.c_str()); + std::string line; + + Unit unit; + bool first = true; + + while(std::getline(m2file, line)) { + if(line.size() > 2) { + if(line.substr(0, 2) == "S ") { + if(!first) { + if(unit.third.empty()) + unit.third.insert(0); + m_m2.push_back(unit); + } + first = false; + + unit.first = Sentence(); + unit.second = Annots(); + + std::string sentenceLine = line.substr(2); + boost::split(unit.first, sentenceLine, boost::is_any_of(" "), boost::token_compress_on); + } + if(line.substr(0, 2) == "A ") { + std::string annotLine = line.substr(2); + + std::vector<std::string> annot; + boost::iter_split(annot, annotLine, boost::algorithm::first_finder("|||")); + + if(annot[1] != "noop") { + Annot a; + std::stringstream rangeStr(annot[0]); + rangeStr >> a.i >> a.j; + a.type = annot[1]; + a.edit 
= annot[2]; + + std::stringstream annotStr(annot[5]); + annotStr >> a.annotator; + + unit.third.insert(a.annotator); + unit.second.insert(a); + } else { + std::stringstream annotStr(annot[5]); + size_t annotator; + annotStr >> annotator; + unit.third.insert(annotator); + } + } + } + } + if(unit.third.empty()) + unit.third.insert(0); + m_m2.push_back(unit); + } + + size_t LevenshteinMatrix(const Sentence &s1, const Sentence &s2, Matrix &d, TrackMatrix &bt) { + size_t n = s1.size(); + size_t m = s2.size(); + + if (n == 0) + return m; + if (m == 0) + return n; + + d.resize(n + 1, Row(m + 1, 0)); + bt.resize(n + 1, TrackRow(m + 1)); + + for(size_t i = 0; i <= n; ++i) { + d[i][0] = i; + if(i > 0) + bt[i][0].insert(Info(Vertex(i - 1, 0), Edit(1, 1, 0, ""))); + } + for(size_t j = 0; j <= m; ++j) { + d[0][j] = j; + if(j > 0) + bt[0][j].insert(Info(Vertex(0, j - 1), Edit(1, 1, 0, s2[j - 1]))); + } + + int cost; + for(size_t i = 1; i <= n; ++i) { + for(size_t j = 1; j <= m; ++j) { + if(Annot::transform(s1[i-1]) == Annot::transform(s2[j-1])) + cost = 0; + else + cost = 2; + + size_t left = d[i][j - 1] + 1; + size_t down = d[i - 1][j] + 1; + size_t diag = d[i - 1][j - 1] + cost; + + d[i][j] = std::min(left, std::min(down, diag)); + + if(d[i][j] == left) + bt[i][j].insert(Info(Vertex(i, j - 1), Edit(1, 1, 0, s2[j - 1]))); + if(d[i][j] == down) + bt[i][j].insert(Info(Vertex(i - 1, j), Edit(1, 1, 0, ""))); + if(d[i][j] == diag) + bt[i][j].insert(Info(Vertex(i - 1, j - 1), cost ? 
Edit(1, 1, 0, s2[j - 1]) : Edit(1, 0, 1, s2[j - 1]) )); + } + } + return d[n][m]; + } + + + void BuildGraph(const TrackMatrix &bt, Vertices &V, Edges &E) { + Vertex start(bt.size() - 1, bt[0].size() - 1); + + std::queue<Vertex> Q; + Q.push(start); + while(!Q.empty()) { + Vertex v = Q.front(); + Q.pop(); + if(V.count(v) > 0) + continue; + V.insert(v); + for(Track::iterator it = bt[v.i][v.j].begin(); + it != bt[v.i][v.j].end(); ++it) { + Edge e(it->v, v, it->edit); + E.insert(e); + if(V.count(e.v) == 0) + Q.push(e.v); + } + } + + Edges newE; + do { + newE.clear(); + for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) { + for(Edges::iterator it2 = E.begin(); it2 != E.end(); ++it2) { + if(it1->u == it2->v) { + Edge e = *it1 + *it2; + if(e.edit.changed > 0 && + e.edit.unchanged <= m_max_unchanged && + E.count(e) == 0) + newE.insert(e); + } + } + } + E.insert(newE.begin(), newE.end()); + } while(newE.size() > 0); + } + + void AddWeights(Edges &E, const Unit &u, size_t aid) { + for(Edges::iterator it1 = E.begin(); it1 != E.end(); ++it1) { + if(it1->edit.changed > 0) { + const_cast<float&>(it1->edit.cost) += 0.001; + for(Annots::iterator it2 = u.second.begin(); it2 != u.second.end(); ++it2) { + // if matches an annotator + if(it1->v.i == it2->i && it1->u.i == it2->j + && Annot::transform(it1->edit.edit) == Annot::transform(it2->edit) + && it2->annotator == aid) { + int newWeight = -(m_max_unchanged + 1) * E.size(); + const_cast<float&>(it1->edit.cost) = newWeight; + } + } + } + } + } + + void BellmanFord(Vertices &V, Edges &E) { + Vertex source(0, 0); + std::map<Vertex, float> distance; + std::map<Vertex, Vertex> predecessor; + + for(Vertices::iterator it = V.begin(); it != V.end(); ++it) { + if(*it == source) + distance[*it] = 0; + else { + distance[*it] = std::numeric_limits<float>::infinity(); + } + } + + for(size_t i = 1; i < V.size(); ++i) { + for(Edges::iterator it = E.begin(); it != E.end(); ++it) { + if(distance[it->v] + it->edit.cost < distance[it->u]) { + 
distance[it->u] = distance[it->v] + it->edit.cost; + predecessor[it->u] = it->v; + } + } + } + + Edges newE; + + Vertex v = *V.rbegin(); + while(true) { + //std::cout << predecessor[v] << " -> " << v << std::endl; + Edges::iterator it = E.find(Edge(predecessor[v], v)); + if(it != E.end()) { + Edge f = *it; + //std::cout << f << std::endl; + newE.insert(f); + + v = predecessor[v]; + if(v == source) + break; + } else { + std::cout << "Error" << std::endl; + break; + } + } + E.clear(); + E.insert(newE.begin(), newE.end()); + } + + void AddStats(const std::vector<Edges> &Es, const Unit &u, Stats &stats, size_t line) { + + std::map<size_t, Stats> statsPerAnnotator; + for(std::set<size_t>::iterator it = u.third.begin(); + it != u.third.end(); ++it) { + statsPerAnnotator[*it] = Stats(4, 0); + } + + for(Annots::iterator it = u.second.begin(); it != u.second.end(); it++) + statsPerAnnotator[it->annotator][2]++; + + for(std::set<size_t>::iterator ait = u.third.begin(); + ait != u.third.end(); ++ait) { + for(Edges::iterator eit = Es[*ait].begin(); eit != Es[*ait].end(); ++eit) { + if(eit->edit.changed > 0) { + statsPerAnnotator[*ait][1]++; + Annot f; + f.i = eit->v.i; + f.j = eit->u.i; + f.annotator = *ait; + f.edit = eit->edit.edit; + for(Annots::iterator fit = u.second.begin(); fit != u.second.end(); fit++) { + if(f == *fit) + statsPerAnnotator[*ait][0]++; + } + } + } + } + size_t bestAnnot = 0; + float bestF = -1; + for(std::set<size_t>::iterator it = u.third.begin(); + it != u.third.end(); ++it) { + Stats localStats = stats; + localStats[0] += statsPerAnnotator[*it][0]; + localStats[1] += statsPerAnnotator[*it][1]; + localStats[2] += statsPerAnnotator[*it][2]; + if(m_verbose) + std::cerr << *it << " : " << localStats[0] << " " << localStats[1] << " " << localStats[2] << std::endl; + float f = FScore(localStats); + if(m_verbose) + std::cerr << f << std::endl; + if(f > bestF) { + bestF = f; + bestAnnot = *it; + } + } + if(m_verbose) + std::cerr << ">> Chosen Annotator for 
line " << line + 1 << " : " << bestAnnot << std::endl; + stats[0] += statsPerAnnotator[bestAnnot][0]; + stats[1] += statsPerAnnotator[bestAnnot][1]; + stats[2] += statsPerAnnotator[bestAnnot][2]; + } + + void SufStats(const std::string &sStr, size_t i, Stats &stats) { + std::string temp = sStr; + + Sentence s; + boost::split(s, temp, boost::is_any_of(" "), boost::token_compress_on); + + Unit &unit = m_m2[i]; + + Matrix d; + TrackMatrix bt; + size_t distance = LevenshteinMatrix(unit.first, s, d, bt); + + std::vector<Vertices> Vs(unit.third.size()); + std::vector<Edges> Es(unit.third.size()); + + if(distance > unit.first.size()) { + std::cerr << "Levenshtein distance is greater than source size." << std::endl; + stats[0] = 0; + stats[1] = distance; + stats[2] = 0; + stats[3] = unit.first.size(); + return; + } else if(distance > 0) { + for(size_t j = 0; j < unit.third.size(); j++) { + BuildGraph(bt, Vs[j], Es[j]); + AddWeights(Es[j], unit, j); + BellmanFord(Vs[j], Es[j]); + } + } + AddStats(Es, unit, stats, i); + stats[3] = unit.first.size(); + } + + + float FScore(const Stats& stats) { + float p = 1.0; + if(stats[1] != 0) + p = (float)stats[0] / (float)stats[1]; + + float r = 1.0; + if(stats[2] != 0) + r = (float)stats[0] / (float)stats[2]; + + float denom = (m_beta * m_beta * p + r); + float f = 0.0; + if(denom != 0) + f = ((1 + m_beta * m_beta) * p * r) / denom; + return f; + } + + void FScore(const Stats& stats, float &p, float &r, float &f) { + p = 1.0; + if(stats[1] != 0) + p = (float)stats[0] / (float)stats[1]; + + r = 1.0; + if(stats[2] != 0) + r = (float)stats[0] / (float)stats[2]; + + float denom = (m_beta * m_beta * p + r); + f = 0.0; + if(denom != 0) + f = ((1 + m_beta * m_beta) * p * r) / denom; + } +}; + +} + +}
\ No newline at end of file diff --git a/mert/M2Scorer.cpp b/mert/M2Scorer.cpp new file mode 100644 index 000000000..f7e276631 --- /dev/null +++ b/mert/M2Scorer.cpp @@ -0,0 +1,137 @@ +#include "M2Scorer.h" + +#include <algorithm> +#include <fstream> +#include <stdexcept> +#include <sstream> +#include <cstdlib> + +#include <boost/lexical_cast.hpp> + + +using namespace std; + +namespace MosesTuning +{ + +M2Scorer::M2Scorer(const string& config) + : StatisticsBasedScorer("M2Scorer", config), + beta_(Scan<float>(getConfig("beta", "0.5"))), + max_unchanged_words_(Scan<int>(getConfig("max_unchanged_words", "2"))), + truecase_(Scan<bool>(getConfig("truecase", "false"))), + verbose_(Scan<bool>(getConfig("verbose", "false"))), + m2_(max_unchanged_words_, beta_, truecase_) +{} + +void M2Scorer::setReferenceFiles(const vector<string>& referenceFiles) +{ + for(size_t i = 0; i < referenceFiles.size(); ++i) { + m2_.ReadM2(referenceFiles[i]); + break; + } +} + +void M2Scorer::prepareStats(size_t sid, const string& text, ScoreStats& entry) +{ + string sentence = trimStr(this->preprocessSentence(text)); + std::vector<ScoreStatsType> stats(4, 0); + m2_.SufStats(sentence, sid, stats); + entry.set(stats); +} + +float M2Scorer::calculateScore(const vector<ScoreStatsType>& comps) const +{ + + if (comps.size() != NumberOfScores()) { + throw runtime_error("Size of stat vector for M2Scorer is not " + NumberOfScores()); + } + + float beta = beta_; + + + float p = 0.0; + float r = 0.0; + float f = 0.0; + + if(comps[1] != 0) + p = comps[0] / (double)comps[1]; + else + p = 1.0; + + if(comps[2] != 0) + r = comps[0] / (double)comps[2]; + else + r = 1.0; + + float denom = beta * beta * p + r; + if(denom != 0) + f = (1.0 + beta * beta) * p * r / denom; + else + f = 0.0; + + if(verbose_) + std::cerr << comps[0] << " " << comps[1] << " " << comps[2] << std::endl; + + if(verbose_) + std::cerr << p << " " << r << " " << f << std::endl; + + return f; +} + +float M2Scorer::getReferenceLength(const 
vector<ScoreStatsType>& comps) const +{ + return comps[3]; +} + +std::vector<ScoreStatsType> randomStats(float decay, int max) +{ + int gold = rand() % max; + int prop = rand() % max; + int corr = 0.0; + + if(std::min(prop, gold) > 0) + corr = rand() % std::min(prop, gold); + + //std::cerr << corr << " " << prop << " " << gold << std::endl; + + std::vector<ScoreStatsType> stats(3, 0.0); + stats[0] = corr * decay; + stats[1] = prop * decay; + stats[2] = gold * decay; + + return stats; +} + +float sentenceM2(const std::vector<ScoreStatsType>& stats) +{ + float beta = 0.5; + + std::vector<ScoreStatsType> smoothStats(3, 0.0); // = randomStats(0.001, 5); + smoothStats[0] += stats[0]; + smoothStats[1] += stats[1]; + smoothStats[2] += stats[2]; + + float p = 0.0; + float r = 0.0; + float f = 0.0; + + if(smoothStats[1] != 0) + p = smoothStats[0] / smoothStats[1]; + else + p = 1.0; + + if(smoothStats[2] != 0) + r = smoothStats[0] / smoothStats[2]; + else + r = 1.0; + + float denom = beta * beta * p + r; + if(denom != 0) + f = (1.0 + beta * beta) * p * r / denom; + else + f = 0.0; + + return f; +} + +} diff --git a/mert/M2Scorer.h b/mert/M2Scorer.h new file mode 100644 index 000000000..2a807e447 --- /dev/null +++ b/mert/M2Scorer.h @@ -0,0 +1,52 @@ +#ifndef MERT_M2_SCORER_H_ +#define MERT_M2_SCORER_H_ + +#include <string> +#include <vector> +#include <functional> + +#include "Types.h" +#include "Util.h" +#include "StatisticsBasedScorer.h" +#include "M2.h" + +namespace MosesTuning +{ + +/** + * M2Scorer class can compute CoNLL m2 F-score. 
+ */ +class M2Scorer: public StatisticsBasedScorer +{ +public: + explicit M2Scorer(const std::string& config); + + virtual void setReferenceFiles(const std::vector<std::string>& referenceFiles); + virtual void prepareStats(std::size_t sid, const std::string& text, ScoreStats& entry); + + virtual std::size_t NumberOfScores() const { + return 4; + } + + virtual float calculateScore(const std::vector<ScoreStatsType>& comps) const; + virtual float getReferenceLength(const std::vector<ScoreStatsType>& comps) const; + +private: + float beta_; + int max_unchanged_words_; + bool truecase_; + bool verbose_; + M2::M2 m2_; + + std::map<std::pair<size_t, std::string>, std::vector<ScoreStatsType> > seen_; + + // no copying allowed + M2Scorer(const M2Scorer&); + M2Scorer& operator=(const M2Scorer&); +}; + +float sentenceM2 (const std::vector<ScoreStatsType>& stats); + +} + +#endif // MERT_M2_SCORER_H_ diff --git a/mert/ScorerFactory.cpp b/mert/ScorerFactory.cpp index 02573091c..8827f3e5d 100644 --- a/mert/ScorerFactory.cpp +++ b/mert/ScorerFactory.cpp @@ -11,6 +11,7 @@ #include "SemposScorer.h" #include "PermutationScorer.h" #include "MeteorScorer.h" +#include "M2Scorer.h" #include "HwcmScorer.h" #include "Reference.h" @@ -34,6 +35,7 @@ vector<string> ScorerFactory::getTypes() types.push_back(string("LRSCORE")); types.push_back(string("METEOR")); types.push_back(string("HWCM")); + types.push_back(string("M2SCORER")); return types; } @@ -54,6 +56,8 @@ Scorer* ScorerFactory::getScorer(const string& type, const string& config) return new CderScorer(config, false); } else if (type == "SEMPOS") { return new SemposScorer(config); + } else if (type == "M2SCORER") { + return new M2Scorer(config); } else if ((type == "HAMMING") || (type == "KENDALL")) { return (PermutationScorer*) new PermutationScorer(type, config); } else if (type == "METEOR") { diff --git a/moses/FF/CorrectionPattern.cpp b/moses/FF/CorrectionPattern.cpp new file mode 100644 index 000000000..915eaff2c --- /dev/null +++ 
b/moses/FF/CorrectionPattern.cpp @@ -0,0 +1,354 @@ +#include <sstream> +#include "CorrectionPattern.h" +#include "moses/Phrase.h" +#include "moses/TargetPhrase.h" +#include "moses/InputPath.h" +#include "moses/Hypothesis.h" +#include "moses/ChartHypothesis.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/TranslationOption.h" +#include "util/string_piece_hash.hh" +#include "util/exception.hh" + +#include <functional> +#include <algorithm> + +#include <boost/foreach.hpp> +#include <boost/algorithm/string.hpp> + +#include "Diffs.h" + +namespace Moses +{ + +using namespace std; + +std::string MakePair(const std::string &s1, const std::string &s2, bool general) +{ + std::vector<std::string> sourceList; + std::vector<std::string> targetList; + + if(general) { + Diffs diffs = CreateDiff(s1, s2); + + size_t i = 0, j = 0; + char lastType = 'm'; + + std::string source, target; + std::string match; + + int count = 1; + + BOOST_FOREACH(Diff type, diffs) { + if(type == 'm') { + if(lastType != 'm') { + sourceList.push_back(source); + targetList.push_back(target); + } + source.clear(); + target.clear(); + + if(s1[i] == '+') { + if(match.size() >= 3) { + sourceList.push_back("(\\w{3,})·"); + std::string temp = "1"; + sprintf((char*)temp.c_str(), "%d", count); + targetList.push_back("\\" + temp + "·"); + count++; + } else { + sourceList.push_back(match + "·"); + targetList.push_back(match + "·"); + } + match.clear(); + } else + match.push_back(s1[i]); + + i++; + j++; + } else if(type == 'd') { + if(s1[i] == '+') + source += "·"; + else + source.push_back(s1[i]); + i++; + } else if(type == 'i') { + if(s2[j] == '+') + target += "·"; + else + target.push_back(s2[j]); + j++; + } + if(type != 'm' && !match.empty()) { + if(match.size() >= 3) { + sourceList.push_back("(\\w{3,})"); + std::string temp = "1"; + sprintf((char*)temp.c_str(), "%d", count); + targetList.push_back("\\" + temp); + count++; + } else { + sourceList.push_back(match); + targetList.push_back(match); + } 
+ + match.clear(); + } + + lastType = type; + } + if(lastType != 'm') { + sourceList.push_back(source); + targetList.push_back(target); + } + + if(!match.empty()) { + if(match.size() >= 3) { + sourceList.push_back("(\\w{3,})"); + std::string temp = "1"; + sprintf((char*)temp.c_str(), "%d", count); + targetList.push_back("\\"+ temp); + count++; + } else { + sourceList.push_back(match); + targetList.push_back(match); + } + } + match.clear(); + } else { + std::string cs1 = s1; + std::string cs2 = s2; + boost::replace_all(cs1, "+", "·"); + boost::replace_all(cs2, "+", "·"); + + sourceList.push_back(cs1); + targetList.push_back(cs2); + } + + std::stringstream out; + out << "sub(«"; + out << boost::join(sourceList, ""); + out << "»,«"; + out << boost::join(targetList, ""); + out << "»)"; + + return out.str(); +} + +std::string CorrectionPattern::CreateSinglePattern(const Tokens &s1, const Tokens &s2) const +{ + std::stringstream out; + if(s1.empty()) { + out << "ins(«" << boost::join(s2, "·") << "»)"; + return out.str(); + } else if(s2.empty()) { + out << "del(«" << boost::join(s1, "·") << "»)"; + return out.str(); + } else { + typename Tokens::value_type v1 = boost::join(s1, "+"); + typename Tokens::value_type v2 = boost::join(s2, "+"); + out << MakePair(v1, v2, m_general); + return out.str(); + } +} + +std::vector<std::string> GetContext(size_t pos, + size_t len, + size_t window, + const InputType &input, + const InputPath &inputPath, + const std::vector<FactorType>& factorTypes, + bool isRight) +{ + + const Sentence& sentence = static_cast<const Sentence&>(input); + const Range& range = inputPath.GetWordsRange(); + + int leftPos = range.GetStartPos() + pos - len - 1; + int rightPos = range.GetStartPos() + pos; + + std::vector<std::string> contexts; + + for(int length = 1; length <= (int)window; ++length) { + std::vector<std::string> current; + if(!isRight) { + for(int i = 0; i < length; i++) { + if(leftPos - i >= 0) { + current.push_back(sentence.GetWord(leftPos - 
i).GetString(factorTypes, false)); + } else { + current.push_back("<s>"); + } + } + + if(current.back() == "<s>" && current.size() >= 2 && current[current.size()-2] == "<s>") + continue; + + std::reverse(current.begin(), current.end()); + contexts.push_back("left(«" + boost::join(current, "·") + "»)_"); + } + if(isRight) { + for(int i = 0; i < length; i++) { + if(rightPos + i < (int)sentence.GetSize()) { + current.push_back(sentence.GetWord(rightPos + i).GetString(factorTypes, false)); + } else { + current.push_back("</s>"); + } + } + + if(current.back() == "</s>" && current.size() >= 2 && current[current.size()-2] == "</s>") + continue; + + contexts.push_back("_right(«" + boost::join(current, "·") + "»)"); + } + } + return contexts; +} + +std::vector<std::string> +CorrectionPattern::CreatePattern(const Tokens &s1, + const Tokens &s2, + const InputType &input, + const InputPath &inputPath) const +{ + + Diffs diffs = CreateDiff(s1, s2); + size_t i = 0, j = 0; + char lastType = 'm'; + std::vector<std::string> patternList; + Tokens source, target; + BOOST_FOREACH(Diff type, diffs) { + if(type == 'm') { + if(lastType != 'm') { + std::string pattern = CreateSinglePattern(source, target); + patternList.push_back(pattern); + + if(m_context > 0) { + std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false); + std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true); + + BOOST_FOREACH(std::string left, leftContexts) + patternList.push_back(left + pattern); + + BOOST_FOREACH(std::string right, rightContexts) + patternList.push_back(pattern + right); + + BOOST_FOREACH(std::string left, leftContexts) + BOOST_FOREACH(std::string right, rightContexts) + patternList.push_back(left + pattern + right); + } + } + source.clear(); + target.clear(); + if(s1[i] != s2[j]) { + source.push_back(s1[i]); + target.push_back(s2[j]); + } + i++; + j++; + } else 
if(type == 'd') { + source.push_back(s1[i]); + i++; + } else if(type == 'i') { + target.push_back(s2[j]); + j++; + } + lastType = type; + } + if(lastType != 'm') { + std::string pattern = CreateSinglePattern(source, target); + patternList.push_back(pattern); + + if(m_context > 0) { + std::vector<std::string> leftContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, false); + std::vector<std::string> rightContexts = GetContext(i, source.size(), m_context, input, inputPath, m_contextFactors, true); + + BOOST_FOREACH(std::string left, leftContexts) + patternList.push_back(left + pattern); + + BOOST_FOREACH(std::string right, rightContexts) + patternList.push_back(pattern + right); + + BOOST_FOREACH(std::string left, leftContexts) + BOOST_FOREACH(std::string right, rightContexts) + patternList.push_back(left + pattern + right); + } + } + + return patternList; +} + +CorrectionPattern::CorrectionPattern(const std::string &line) + : StatelessFeatureFunction(0, line), m_factors(1, 0), m_general(false), + m_context(0), m_contextFactors(1, 0) +{ + std::cerr << "Initializing correction pattern feature.." 
<< std::endl; + ReadParameters(); +} + +void CorrectionPattern::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "factor") { + m_factors = std::vector<FactorType>(1, Scan<FactorType>(value)); + } else if (key == "context-factor") { + m_contextFactors = std::vector<FactorType>(1, Scan<FactorType>(value)); + } else if (key == "general") { + m_general = Scan<bool>(value); + } else if (key == "context") { + m_context = Scan<size_t>(value); + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} + +void CorrectionPattern::EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore) const +{ + ComputeFeatures(input, inputPath, targetPhrase, &scoreBreakdown); +} + +void CorrectionPattern::ComputeFeatures( + const InputType &input, + const InputPath &inputPath, + const TargetPhrase& target, + ScoreComponentCollection* accumulator) const +{ + const Phrase &source = inputPath.GetPhrase(); + + std::vector<std::string> sourceTokens; + for(size_t i = 0; i < source.GetSize(); ++i) + sourceTokens.push_back(source.GetWord(i).GetString(m_factors, false)); + + std::vector<std::string> targetTokens; + for(size_t i = 0; i < target.GetSize(); ++i) + targetTokens.push_back(target.GetWord(i).GetString(m_factors, false)); + + std::vector<std::string> patternList = CreatePattern(sourceTokens, targetTokens, input, inputPath); + for(size_t i = 0; i < patternList.size(); ++i) + accumulator->PlusEquals(this, patternList[i], 1); + + /* + BOOST_FOREACH(std::string w, sourceTokens) + std::cerr << w << " "; + std::cerr << std::endl; + BOOST_FOREACH(std::string w, targetTokens) + std::cerr << w << " "; + std::cerr << std::endl; + BOOST_FOREACH(std::string w, patternList) + std::cerr << w << " "; + std::cerr << std::endl << std::endl; + */ +} + +bool 
CorrectionPattern::IsUseable(const FactorMask &mask) const +{ + bool ret = true; + for(size_t i = 0; i < m_factors.size(); ++i) + ret = ret && mask[m_factors[i]]; + for(size_t i = 0; i < m_contextFactors.size(); ++i) + ret = ret && mask[m_contextFactors[i]]; + return ret; +} + +} diff --git a/moses/FF/CorrectionPattern.h b/moses/FF/CorrectionPattern.h new file mode 100644 index 000000000..516a56ce2 --- /dev/null +++ b/moses/FF/CorrectionPattern.h @@ -0,0 +1,73 @@ +#ifndef moses_CorrectionPattern_h +#define moses_CorrectionPattern_h + +#include <string> +#include <boost/unordered_set.hpp> + +#include "StatelessFeatureFunction.h" +#include "moses/FactorCollection.h" +#include "moses/AlignmentInfo.h" + +namespace Moses +{ + +typedef std::vector<std::string> Tokens; + +/** Sets the features for length of source phrase, target phrase, both. + */ +class CorrectionPattern : public StatelessFeatureFunction +{ +private: + std::vector<FactorType> m_factors; + bool m_general; + size_t m_context; + std::vector<FactorType> m_contextFactors; + +public: + CorrectionPattern(const std::string &line); + + bool IsUseable(const FactorMask &mask) const; + + void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const + {} + + virtual void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const; + + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const + {} + + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const + {} + void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const + {} + + void 
ComputeFeatures(const InputType &input, + const InputPath &inputPath, + const TargetPhrase& targetPhrase, + ScoreComponentCollection* accumulator) const; + + void SetParameter(const std::string& key, const std::string& value); + + std::vector<std::string> CreatePattern(const Tokens &s1, + const Tokens &s2, + const InputType &input, + const InputPath &inputPath) const; + + std::string CreateSinglePattern(const Tokens &s1, const Tokens &s2) const; + +}; + +} + +#endif // moses_CorrectionPattern_h diff --git a/moses/FF/Diffs.h b/moses/FF/Diffs.h new file mode 100644 index 000000000..8935d1fb9 --- /dev/null +++ b/moses/FF/Diffs.h @@ -0,0 +1,150 @@ +#ifndef moses_Diffs_h +#define moses_Diffs_h + +#include <cmath> + +namespace Moses +{ + +typedef char Diff; +typedef std::vector<Diff> Diffs; + +template <class Sequence, class Pred> +void CreateDiffRec(size_t** c, + const Sequence &s1, + const Sequence &s2, + size_t start, + size_t i, + size_t j, + Diffs& diffs, + Pred pred) +{ + if(i > 0 && j > 0 && pred(s1[i - 1 + start], s2[j - 1 + start])) { + CreateDiffRec(c, s1, s2, start, i - 1, j - 1, diffs, pred); + diffs.push_back(Diff('m')); + } else if(j > 0 && (i == 0 || c[i][j-1] >= c[i-1][j])) { + CreateDiffRec(c, s1, s2, start, i, j-1, diffs, pred); + diffs.push_back(Diff('i')); + } else if(i > 0 && (j == 0 || c[i][j-1] < c[i-1][j])) { + CreateDiffRec(c, s1, s2, start, i-1, j, diffs, pred); + diffs.push_back(Diff('d')); + } +} + +template <class Sequence, class Pred> +Diffs CreateDiff(const Sequence& s1, + const Sequence& s2, + Pred pred) +{ + + Diffs diffs; + + size_t n = s2.size(); + + int start = 0; + int m_end = s1.size() - 1; + int n_end = s2.size() - 1; + + while(start <= m_end && start <= n_end && pred(s1[start], s2[start])) { + diffs.push_back(Diff('m')); + start++; + } + while(start <= m_end && start <= n_end && pred(s1[m_end], s2[n_end])) { + m_end--; + n_end--; + } + + size_t m_new = m_end - start + 1; + size_t n_new = n_end - start + 1; + + size_t** c = new 
size_t*[m_new + 1]; /* (m_new + 1) x (n_new + 1) table of LCS lengths for the differing middle part */ + for(size_t i = 0; i <= m_new; ++i) { + c[i] = new size_t[n_new + 1]; + c[i][0] = 0; + } + for(size_t j = 0; j <= n_new; ++j) + c[0][j] = 0; + for(size_t i = 1; i <= m_new; ++i) + for(size_t j = 1; j <= n_new; ++j) + if(pred(s1[i - 1 + start], s2[j - 1 + start])) + c[i][j] = c[i-1][j-1] + 1; + else + c[i][j] = c[i][j-1] > c[i-1][j] ? c[i][j-1] : c[i-1][j]; + + CreateDiffRec(c, s1, s2, start, m_new, n_new, diffs, pred); + /* NOTE(review): the rows are freed manually; if pred throws while the table is alive they are not released. */ + for(size_t i = 0; i <= m_new; ++i) + delete[] c[i]; + delete[] c; + /* append one m code per element after n_end, i.e. the common suffix trimmed earlier */ + for (size_t i = n_end + 1; i < n; ++i) + diffs.push_back(Diff('m')); + + return diffs; +} +/** Overload that compares elements with std::equal_to of the sequence value_type. + */ +template <class Sequence> +Diffs CreateDiff(const Sequence& s1, const Sequence& s2) +{ + return CreateDiff(s1, s2, std::equal_to<typename Sequence::value_type>()); +} +/** Counts the edit operations between s1 and s2 and adds them to stats, one + * slot per character of sig. + * Counters: m matches, d deletions, i insertions, s substitutions. A d code + * directly followed by an i code is re-counted as a substitution; the loop + * pairs d codes to the left with i codes to the right and then skips the + * paired i codes. + * sig characters: l adds d + i + s; m, d, i and s add the respective counter; + * r adds log(macc), where macc starts at 1 and becomes + * 1 - (d+i+s)/(d+i+s+m) when that total is non-zero, or + * log(1/(d+i+s+m+1)) when macc is not positive. Other characters add nothing. + * Throws a string literal (const char*) when sig.size() differs from + * stats.size(). + */ +template <class Sequence, class Sig, class Stats> +void AddStats(const Sequence& s1, const Sequence& s2, const Sig& sig, Stats& stats) +{ + if(sig.size() != stats.size()) + throw "Signature size differs from score array size."; + + size_t m = 0, d = 0, i = 0, s = 0; + Diffs diff = CreateDiff(s1, s2); + + for(int j = 0; j < (int)diff.size(); ++j) { + if(diff[j] == 'm') + m++; + else if(diff[j] == 'd') { + d++; + int k = 0; + while(j - k >= 0 && j + 1 + k < (int)diff.size() && + diff[j - k] == 'd' && diff[j + 1 + k] == 'i') { + d--; + s++; + k++; + } + j += k; + } else if(diff[j] == 'i') + i++; + } + + for(size_t j = 0; j < sig.size(); ++j) { + switch (sig[j]) { + case 'l': + stats[j] += d + i + s; + break; + case 'm': + stats[j] += m; + break; + case 'd': + stats[j] += d; + break; + case 'i': + stats[j] += i; + break; + case 's': + stats[j] += s; + break; + case 'r': + float macc = 1; + if (d + i + s + m) + macc = 1.0 - (float)(d + i + s)/(float)(d + i + s + m); + if(macc > 0) + stats[j] += log(macc); + else + stats[j] += log(1.0/(float)(d + i + s + m + 1)); + break; + } + } +} + +} + +#endif diff --git a/moses/FF/EditOps.cpp b/moses/FF/EditOps.cpp new file mode 100644 index 000000000..fa66acf1c --- 
/dev/null +++ b/moses/FF/EditOps.cpp @@ -0,0 +1,119 @@ +#include <sstream> +#include "EditOps.h" +#include "moses/Phrase.h" +#include "moses/TargetPhrase.h" +#include "moses/Hypothesis.h" +#include "moses/ChartHypothesis.h" +#include "moses/ScoreComponentCollection.h" +#include "moses/TranslationOption.h" +#include "util/string_piece_hash.hh" +#include "util/exception.hh" + +#include <functional> + +#include <boost/foreach.hpp> +#include <boost/algorithm/string.hpp> + +#include "Diffs.h" + +namespace Moses +{ + +using namespace std; +/** Returns the value of the scores=... argument of a feature configuration + * line, or defaultScores when the line carries no such argument. + * The first token is skipped; every later token is split at its first = sign + * and must yield exactly two parts, otherwise UTIL_THROW_IF2 fires. An empty + * line is rejected the same way. + */ +std::string ParseScores(const std::string &line, const std::string& defaultScores) +{ + std::vector<std::string> toks = Tokenize(line); + UTIL_THROW_IF2(toks.empty(), "Empty line"); + + for (size_t i = 1; i < toks.size(); ++i) { + std::vector<std::string> args = TokenizeFirstOnly(toks[i], "="); + UTIL_THROW_IF2(args.size() != 2, + "Incorrect format for feature function arg: " << toks[i]); + + if (args[0] == "scores") { + return args[1]; + } + } + return defaultScores; +} +/** The number of score components passed to the base class is the length of + * the scores string parsed from line (default: dis); the line is parsed a + * second time to initialise m_scores. + */ +EditOps::EditOps(const std::string &line) + : StatelessFeatureFunction(ParseScores(line, "dis").size(), line) + , m_factorType(0), m_chars(false), m_scores(ParseScores(line, "dis")) +{ + std::cerr << "Initializing EditOps feature.." 
<< std::endl; + ReadParameters(); +} +/** Handles the keys factor, chars and scores; every other key is forwarded to + * StatelessFeatureFunction::SetParameter. + */ +void EditOps::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "factor") { + m_factorType = Scan<FactorType>(value); + } else if (key == "chars") { + m_chars = Scan<bool>(value); + } else if (key == "scores") { + m_scores = value; + } else { + StatelessFeatureFunction::SetParameter(key, value); + } +} +/** Intentionally empty. */ +void EditOps::Load() +{ } +/** Forwards to ComputeFeatures with scoreBreakdown as the accumulator; + * estimatedFutureScore is not used. + */ +void EditOps::EvaluateInIsolation(const Phrase &source + , const TargetPhrase &target + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const +{ + ComputeFeatures(source, target, &scoreBreakdown); +} +/** Computes one value per score component with AddStats and adds the vector + * to accumulator. + * With m_chars set, the GetStringRep strings of both phrases (restricted to + * m_factorType) are compared element by element, i.e. per char. Otherwise the + * m_factorType factor strings of all words that are not non-terminals are + * compared as token lists. + * AddStats throws when m_scores.size() differs from GetNumScoreComponents(). + */ +void EditOps::ComputeFeatures( + const Phrase &source, + const TargetPhrase& target, + ScoreComponentCollection* accumulator) const +{ + std::vector<float> ops(GetNumScoreComponents(), 0); + + if(m_chars) { + std::vector<FactorType> factors; + factors.push_back(m_factorType); + + std::string sourceStr = source.GetStringRep(factors); + std::string targetStr = target.GetStringRep(factors); + + AddStats(sourceStr, targetStr, m_scores, ops); + } else { + std::vector<std::string> sourceTokens; + //std::cerr << "Ed src: "; + for(size_t i = 0; i < source.GetSize(); ++i) { + if(!source.GetWord(i).IsNonTerminal()) + sourceTokens.push_back(source.GetWord(i).GetFactor(m_factorType)->GetString().as_string()); + //std::cerr << sourceTokens.back() << " "; + } + //std::cerr << std::endl; + + std::vector<std::string> targetTokens; + //std::cerr << "Ed trg: "; + for(size_t i = 0; i < target.GetSize(); ++i) { + if(!target.GetWord(i).IsNonTerminal()) + targetTokens.push_back(target.GetWord(i).GetFactor(m_factorType)->GetString().as_string()); + //std::cerr << targetTokens.back() << " "; + } + //std::cerr << std::endl; + + AddStats(sourceTokens, targetTokens, m_scores, ops); + } + + accumulator->PlusEquals(this, ops); +} +/** Usable when the mask has the configured factor set. */ +bool EditOps::IsUseable(const FactorMask &mask) const +{ + bool ret = mask[m_factorType]; + return ret; +} + +} diff 
--git a/moses/FF/EditOps.h b/moses/FF/EditOps.h new file mode 100644 index 000000000..e7e7dd315 --- /dev/null +++ b/moses/FF/EditOps.h @@ -0,0 +1,64 @@ +#ifndef moses_EditOps_h +#define moses_EditOps_h + +#include <string> +#include <boost/unordered_set.hpp> + +#include "StatelessFeatureFunction.h" +#include "moses/FactorCollection.h" +#include "moses/AlignmentInfo.h" + +namespace Moses +{ + +typedef std::vector<std::string> Tokens; + +/** Calculates string edit operations that transform source phrase into target + * phrase using the LCS algorithm. Potentially useful for monolingual tasks + * like paraphrasing, summarization, correction. + * Only EvaluateInIsolation does work; the other Evaluate* overrides defined + * inline below have empty bodies. + */ +class EditOps : public StatelessFeatureFunction +{ +private: + FactorType m_factorType; /* factor that is compared; set by the factor parameter, initialised to 0 */ + bool m_chars; /* set by the chars parameter, initialised to false; selects string comparison instead of token comparison */ + std::string m_scores; /* signature string, one character per score component; initialised from the scores argument or dis */ + +public: + EditOps(const std::string &line); + + bool IsUseable(const FactorMask &mask) const; + + void Load(); + + virtual void EvaluateInIsolation(const Phrase &source + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; + + void EvaluateWithSourceContext(const InputType &input + , const InputPath &inputPath + , const TargetPhrase &targetPhrase + , const StackVec *stackVec + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection *estimatedFutureScore = NULL) const + {} + void EvaluateWhenApplied(const Hypothesis& hypo, + ScoreComponentCollection* accumulator) const + {} + void EvaluateWhenApplied(const ChartHypothesis &hypo, + ScoreComponentCollection* accumulator) const + {} + void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const + {} + + void ComputeFeatures(const Phrase &source, + const TargetPhrase& targetPhrase, + ScoreComponentCollection* accumulator) const; + void SetParameter(const std::string& key, const std::string& value); +}; + +} + +#endif // moses_EditOps_h diff --git 
a/moses/FF/Factory.cpp b/moses/FF/Factory.cpp index 3d9be2fa3..9312f9779 100644 --- a/moses/FF/Factory.cpp +++ b/moses/FF/Factory.cpp @@ -73,8 +73,14 @@ #include "moses/Syntax/InputWeightFF.h" #include "moses/Syntax/RuleTableFF.h" +#include "moses/FF/EditOps.h" +#include "moses/FF/CorrectionPattern.h" + #ifdef HAVE_VW #include "moses/FF/VW/VW.h" +#include "moses/FF/VW/VWFeatureContextBigrams.h" +#include "moses/FF/VW/VWFeatureContextBilingual.h" +#include "moses/FF/VW/VWFeatureContextWindow.h" #include "moses/FF/VW/VWFeatureSourceBagOfWords.h" #include "moses/FF/VW/VWFeatureSourceBigrams.h" #include "moses/FF/VW/VWFeatureSourceIndicator.h" @@ -294,8 +300,14 @@ FeatureRegistry::FeatureRegistry() MOSES_FNAME(SkeletonTranslationOptionListFeature); MOSES_FNAME(SkeletonPT); + MOSES_FNAME(EditOps); + MOSES_FNAME(CorrectionPattern); + #ifdef HAVE_VW MOSES_FNAME(VW); + MOSES_FNAME(VWFeatureContextBigrams); + MOSES_FNAME(VWFeatureContextBilingual); + MOSES_FNAME(VWFeatureContextWindow); MOSES_FNAME(VWFeatureSourceBagOfWords); MOSES_FNAME(VWFeatureSourceBigrams); MOSES_FNAME(VWFeatureSourceIndicator); diff --git a/moses/FF/GlobalLexicalModel.h b/moses/FF/GlobalLexicalModel.h index 6957d7d7c..8391609a2 100644 --- a/moses/FF/GlobalLexicalModel.h +++ b/moses/FF/GlobalLexicalModel.h @@ -76,7 +76,7 @@ public: , const TargetPhrase &targetPhrase , ScoreComponentCollection &scoreBreakdown , ScoreComponentCollection &estimatedScores) const { - } + } void EvaluateWhenApplied(const Hypothesis& hypo, ScoreComponentCollection* accumulator) const { diff --git a/moses/FF/OSM-Feature/KenOSM.cpp b/moses/FF/OSM-Feature/KenOSM.cpp index 25a1e6a93..d20e762f6 100644 --- a/moses/FF/OSM-Feature/KenOSM.cpp +++ b/moses/FF/OSM-Feature/KenOSM.cpp @@ -3,10 +3,11 @@ namespace Moses { -OSMLM* ConstructOSMLM(const char *file) +OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method) { lm::ngram::ModelType model_type; lm::ngram::Config config; + config.load_method = load_method; if 
(lm::ngram::RecognizeBinary(file, model_type)) { switch(model_type) { case lm::ngram::PROBING: diff --git a/moses/FF/OSM-Feature/KenOSM.h b/moses/FF/OSM-Feature/KenOSM.h index 53268442b..ce3872a35 100644 --- a/moses/FF/OSM-Feature/KenOSM.h +++ b/moses/FF/OSM-Feature/KenOSM.h @@ -47,7 +47,7 @@ private: typedef KenOSMBase OSMLM; -OSMLM* ConstructOSMLM(const char *file); +OSMLM* ConstructOSMLM(const char *file, util::LoadMethod load_method); } // namespace diff --git a/moses/FF/OSM-Feature/OpSequenceModel.cpp b/moses/FF/OSM-Feature/OpSequenceModel.cpp index 4118c8690..1c889e329 100644 --- a/moses/FF/OSM-Feature/OpSequenceModel.cpp +++ b/moses/FF/OSM-Feature/OpSequenceModel.cpp @@ -17,6 +17,7 @@ OpSequenceModel::OpSequenceModel(const std::string &line) tFactor = 0; numFeatures = 5; ReadParameters(); + load_method = util::READ; } OpSequenceModel::~OpSequenceModel() @@ -27,7 +28,7 @@ OpSequenceModel::~OpSequenceModel() void OpSequenceModel :: readLanguageModel(const char *lmFile) { string unkOp = "_TRANS_SLF_"; - OSM = ConstructOSMLM(m_lmPath.c_str()); + OSM = ConstructOSMLM(m_lmPath.c_str(), load_method); State startState = OSM->NullContextState(); State endState; @@ -248,6 +249,20 @@ void OpSequenceModel::SetParameter(const std::string& key, const std::string& va sFactor = Scan<int>(value); } else if (key == "output-factor") { tFactor = Scan<int>(value); + } else if (key == "load") { + if (value == "lazy") { + load_method = util::LAZY; + } else if (value == "populate_or_lazy") { + load_method = util::POPULATE_OR_LAZY; + } else if (value == "populate_or_read" || value == "populate") { + load_method = util::POPULATE_OR_READ; + } else if (value == "read") { + load_method = util::READ; + } else if (value == "parallel_read") { + load_method = util::PARALLEL_READ; + } else { + UTIL_THROW2("Unknown KenLM load method " << value); + } } else { StatefulFeatureFunction::SetParameter(key, value); } diff --git a/moses/FF/OSM-Feature/OpSequenceModel.h 
b/moses/FF/OSM-Feature/OpSequenceModel.h index 925f9c83a..94beac5aa 100644 --- a/moses/FF/OSM-Feature/OpSequenceModel.h +++ b/moses/FF/OSM-Feature/OpSequenceModel.h @@ -20,6 +20,7 @@ public: int sFactor; // Source Factor ... int tFactor; // Target Factor ... int numFeatures; // Number of features used ... + util::LoadMethod load_method; // method to load model OpSequenceModel(const std::string &line); ~OpSequenceModel(); diff --git a/moses/FF/VW/AlignmentConstraint.h b/moses/FF/VW/AlignmentConstraint.h new file mode 100644 index 000000000..28ba7d4f3 --- /dev/null +++ b/moses/FF/VW/AlignmentConstraint.h @@ -0,0 +1,40 @@ +#pragma once + +/* std::numeric_limits is used below, so include its header here instead of + * relying on the including file. + */ +#include <limits> + +namespace Moses +{ + +/** + * Helper class for storing alignment constraints. + * Tracks the smallest and largest point passed to Update(). A + * default-constructed object starts with m_min at the largest int and m_max + * at -1; IsSet() reports whether m_max differs from -1. + */ +class AlignmentConstraint +{ +public: + AlignmentConstraint() : m_min(std::numeric_limits<int>::max()), m_max(-1) {} + + AlignmentConstraint(int min, int max) : m_min(min), m_max(max) {} + + /** + * We are aligned to point => our min cannot be larger, our max cannot be smaller. 
+ */ + void Update(int point) { + if (m_min > point) m_min = point; + if (m_max < point) m_max = point; + } + + /** True once m_max is no longer -1. */ + bool IsSet() const { + return m_max != -1; + } + + int GetMin() const { + return m_min; + } + + int GetMax() const { + return m_max; + } + +private: + int m_min, m_max; +}; + +} diff --git a/moses/FF/VW/VW.cpp b/moses/FF/VW/VW.cpp new file mode 100644 index 000000000..e5e5316b6 --- /dev/null +++ b/moses/FF/VW/VW.cpp @@ -0,0 +1,637 @@ +#include <string> +#include <map> +#include <limits> +#include <vector> + +#include <boost/unordered_map.hpp> +#include <boost/functional/hash.hpp> + +#include "moses/FF/StatefulFeatureFunction.h" +#include "moses/PP/CountsPhraseProperty.h" +#include "moses/TranslationOptionList.h" +#include "moses/TranslationOption.h" +#include "moses/Util.h" +#include "moses/TypeDef.h" +#include "moses/StaticData.h" +#include "moses/Phrase.h" +#include "moses/AlignmentInfo.h" +#include "moses/AlignmentInfoCollection.h" +#include "moses/Word.h" +#include "moses/FactorCollection.h" + +#include "Normalizer.h" +#include "Classifier.h" +#include "VWFeatureBase.h" +#include "TabbedSentence.h" +#include "ThreadLocalByFeatureStorage.h" +#include "TrainingLoss.h" +#include "VWTargetSentence.h" +#include "VWState.h" +#include "VW.h" + +namespace Moses +{ + +VW::VW(const std::string &line) + : StatefulFeatureFunction(1, line) + , TLSTargetSentence(this) + , m_train(false) + , m_sentenceStartWord(Word()) +{ + ReadParameters(); + Discriminative::ClassifierFactory *classifierFactory = m_train + ? new Discriminative::ClassifierFactory(m_modelPath) + : new Discriminative::ClassifierFactory(m_modelPath, m_vwOptions); + + m_tlsClassifier = new TLSClassifier(this, *classifierFactory); + + m_tlsFutureScores = new TLSFloatHashMap(this); + m_tlsComputedStateExtensions = new TLSStateExtensions(this); + m_tlsTranslationOptionFeatures = new TLSFeatureVectorMap(this); + m_tlsTargetContextFeatures = new TLSFeatureVectorMap(this); + + if (! 
m_normalizer) { + VERBOSE(1, "VW :: No loss function specified, assuming logistic loss.\n"); + m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer(); + } + + if (! m_trainingLoss) { + VERBOSE(1, "VW :: Using basic 1/0 loss calculation in training.\n"); + m_trainingLoss = (TrainingLoss *) new TrainingLossBasic(); + } + + // create a virtual beginning-of-sentence word with all factors replaced by <S> + const Factor *bosFactor = FactorCollection::Instance().AddFactor(BOS_); + for (size_t i = 0; i < MAX_NUM_FACTORS; i++) + m_sentenceStartWord.SetFactor(i, bosFactor); +} +/** Releases the classifier storage and the normalizer only. + * NOTE(review): the constructor also allocates m_trainingLoss and several + * other members with new; none of them is deleted here, as the TODO below + * already admits -- confirm ownership before extending this. + */ +VW::~VW() +{ + delete m_tlsClassifier; + delete m_normalizer; + // TODO delete more stuff +} +/** Stateful scoring step. + * Without target-context features it returns a default-constructed VWState + * and leaves accumulator untouched. Otherwise all translation options of the + * current span are scored once per cache key built from prevState and the + * span, the cached score of the applied option is added to accumulator, and + * a VWState built from the previous state and curHypo is returned. + */ +FFState* VW::EvaluateWhenApplied( + const Hypothesis& curHypo, + const FFState* prevState, + ScoreComponentCollection* accumulator) const +{ + VERBOSE(3, "VW :: Evaluating translation options\n"); + + const VWState& prevVWState = *static_cast<const VWState *>(prevState); + + const std::vector<VWFeatureBase*>& contextFeatures = + VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription()); + + if (contextFeatures.empty()) { + // no target context features => we already evaluated everything in + // EvaluateTranslationOptionListWithSourceContext(). Nothing to do now, + // no state information to track. 
+ return new VWState(); + } + /* From here on target-context features exist, so scores depend on the previous state. */ + size_t spanStart = curHypo.GetTranslationOption().GetStartPos(); + size_t spanEnd = curHypo.GetTranslationOption().GetEndPos(); + + // compute our current key + size_t cacheKey = MakeCacheKey(prevState, spanStart, spanEnd); + + boost::unordered_map<size_t, FloatHashMap> &computedStateExtensions + = *m_tlsComputedStateExtensions->GetStored(); + + if (computedStateExtensions.find(cacheKey) == computedStateExtensions.end()) { + // we have not computed this set of translation options yet + const TranslationOptionList *topts = + curHypo.GetManager().getSntTranslationOptions()->GetTranslationOptionList(spanStart, spanEnd); + + const InputType& input = curHypo.GetManager().GetSource(); + + Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored(); + + // extract target context features + size_t contextHash = prevVWState.hash(); + + FeatureVectorMap &contextFeaturesCache = *m_tlsTargetContextFeatures->GetStored(); + + FeatureVectorMap::const_iterator contextIt = contextFeaturesCache.find(contextHash); + if (contextIt == contextFeaturesCache.end()) { + // we have not extracted features for this context yet + + const Phrase &targetContext = prevVWState.GetPhrase(); + Discriminative::FeatureVector contextVector; + const AlignmentInfo *alignInfo = TransformAlignmentInfo(curHypo, targetContext.GetSize()); + for(size_t i = 0; i < contextFeatures.size(); ++i) + (*contextFeatures[i])(input, targetContext, *alignInfo, classifier, contextVector); + + contextFeaturesCache[contextHash] = contextVector; + VERBOSE(3, "VW :: context cache miss\n"); + } else { + // context already in cache, simply put feature IDs in the classifier object + classifier.AddLabelIndependentFeatureVector(contextIt->second); + VERBOSE(3, "VW :: context cache hit\n"); + } + + std::vector<float> losses(topts->size()); + + for (size_t toptIdx = 0; toptIdx < topts->size(); toptIdx++) { + const TranslationOption *topt = topts->Get(toptIdx); + const TargetPhrase 
&targetPhrase = topt->GetTargetPhrase(); + size_t toptHash = hash_value(*topt); + /* NOTE(review): both find() results in this loop body are dereferenced without a check against end(); this relies on EvaluateTranslationOptionListWithSourceContext having stored toptHash earlier -- TODO confirm. */ + // start with pre-computed source-context-only VW scores + losses[toptIdx] = m_tlsFutureScores->GetStored()->find(toptHash)->second; + + // add all features associated with this translation option + // (pre-computed when evaluated with source context) + const Discriminative::FeatureVector &targetFeatureVector = + m_tlsTranslationOptionFeatures->GetStored()->find(toptHash)->second; + + classifier.AddLabelDependentFeatureVector(targetFeatureVector); + + // add classifier score with context+target features only to the total loss + losses[toptIdx] += classifier.Predict(MakeTargetLabel(targetPhrase)); + } + + // normalize classifier scores to get a probability distribution + (*m_normalizer)(losses); + + // fill our cache with the results + FloatHashMap &toptScores = computedStateExtensions[cacheKey]; + for (size_t toptIdx = 0; toptIdx < topts->size(); toptIdx++) { + const TranslationOption *topt = topts->Get(toptIdx); + size_t toptHash = hash_value(*topt); + toptScores[toptHash] = FloorScore(TransformScore(losses[toptIdx])); + } + + VERBOSE(3, "VW :: cache miss\n"); + } else { + VERBOSE(3, "VW :: cache hit\n"); + } + + // now our cache is guaranteed to contain the required score, simply look it up + std::vector<float> newScores(m_numScoreComponents); + size_t toptHash = hash_value(curHypo.GetTranslationOption()); + newScores[0] = computedStateExtensions[cacheKey][toptHash]; + VERBOSE(3, "VW :: adding score: " << newScores[0] << "\n"); + accumulator->PlusEquals(this, newScores); + + return new VWState(prevVWState, curHypo); +} +/** Initial state: a phrase made of GetMaximumContextSize() copies of the + * sentence-start word. The input argument is not used. + */ +const FFState* VW::EmptyHypothesisState(const InputType &input) const +{ + size_t maxContextSize = VWFeatureBase::GetMaximumContextSize(GetScoreProducerDescription()); + Phrase initialPhrase; + for (size_t i = 0; i < maxContextSize; i++) + initialPhrase.AddWord(m_sentenceStartWord); + + return new VWState(initialPhrase); +} + +void 
VW::EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const +{ + Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored(); + /* Two modes, selected by m_train: in training mode features are extracted and the classifier is trained once per target start position that has a correct option; otherwise every option is scored with the trained classifier and the score is either added right away or stored as a partial score for EvaluateWhenApplied. */ + if (translationOptionList.size() == 0) + return; // nothing to do + + VERBOSE(3, "VW :: Evaluating translation options\n"); + + // which feature functions do we use (on the source and target side) + const std::vector<VWFeatureBase*>& sourceFeatures = + VWFeatureBase::GetSourceFeatures(GetScoreProducerDescription()); + + const std::vector<VWFeatureBase*>& contextFeatures = + VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription()); + + const std::vector<VWFeatureBase*>& targetFeatures = + VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription()); + + size_t maxContextSize = VWFeatureBase::GetMaximumContextSize(GetScoreProducerDescription()); + + // only use stateful score computation when needed + bool haveTargetContextFeatures = ! contextFeatures.empty(); + + const Range &sourceRange = translationOptionList.Get(0)->GetSourceWordsRange(); + + if (m_train) { + // + // extract features for training the classifier (only call this when using vwtrainer, not in Moses!) + // + + // find which topts are correct + std::vector<bool> correct(translationOptionList.size()); + std::vector<int> startsAt(translationOptionList.size()); + std::set<int> uncoveredStartingPositions; + + for (size_t i = 0; i < translationOptionList.size(); i++) { + std::pair<bool, int> isCorrect = IsCorrectTranslationOption(* translationOptionList.Get(i)); + correct[i] = isCorrect.first; + startsAt[i] = isCorrect.second; + if (isCorrect.first) { + uncoveredStartingPositions.insert(isCorrect.second); + } + } + + // optionally update translation options using leave-one-out + std::vector<bool> keep = (m_leaveOneOut.size() > 0) + ? LeaveOneOut(translationOptionList, correct) + : std::vector<bool>(translationOptionList.size(), true); + + while (! 
uncoveredStartingPositions.empty()) { + int currentStart = *uncoveredStartingPositions.begin(); + uncoveredStartingPositions.erase(uncoveredStartingPositions.begin()); + + // check whether we (still) have some correct translation + int firstCorrect = -1; + for (size_t i = 0; i < translationOptionList.size(); i++) { + if (keep[i] && correct[i] && startsAt[i] == currentStart) { + firstCorrect = i; + break; + } + } + + // do not train if there are no positive examples + if (firstCorrect == -1) { + VERBOSE(3, "VW :: skipping topt collection, no correct translation for span at current tgt start position\n"); + continue; + } + + // the first correct topt can be used by some loss functions + const TargetPhrase &correctPhrase = translationOptionList.Get(firstCorrect)->GetTargetPhrase(); + + // feature extraction *at prediction time* outputs feature hashes which can be cached; + // this is training time, simply store everything in this dummyVector + Discriminative::FeatureVector dummyVector; + + // extract source side features + for(size_t i = 0; i < sourceFeatures.size(); ++i) + (*sourceFeatures[i])(input, sourceRange, classifier, dummyVector); + + // build target-side context + Phrase targetContext; + for (size_t i = 0; i < maxContextSize; i++) + targetContext.AddWord(m_sentenceStartWord); + + const Phrase *targetSent = GetStored()->m_sentence; + + // word alignment info shifted by context size + AlignmentInfo contextAlignment = TransformAlignmentInfo(*GetStored()->m_alignment, maxContextSize, currentStart); + + if (currentStart > 0) + targetContext.Append(targetSent->GetSubString(Range(0, currentStart - 1))); + + // extract target-context features + for(size_t i = 0; i < contextFeatures.size(); ++i) + (*contextFeatures[i])(input, targetContext, contextAlignment, classifier, dummyVector); + + // go over topts, extract target side features and train the classifier + for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) { + + // this topt was discarded 
by leaving one out + if (! keep[toptIdx]) + continue; + + // extract target-side features for each topt + const TargetPhrase &targetPhrase = translationOptionList.Get(toptIdx)->GetTargetPhrase(); + for(size_t i = 0; i < targetFeatures.size(); ++i) + (*targetFeatures[i])(input, targetPhrase, classifier, dummyVector); + + bool isCorrect = correct[toptIdx] && startsAt[toptIdx] == currentStart; + float loss = (*m_trainingLoss)(targetPhrase, correctPhrase, isCorrect); + + // train classifier on current example + classifier.Train(MakeTargetLabel(targetPhrase), loss); + } + } + } else { + // + // predict using a trained classifier, use this in decoding (=at test time) + // + + std::vector<float> losses(translationOptionList.size()); + + Discriminative::FeatureVector outFeaturesSourceNamespace; + + // extract source side features + for(size_t i = 0; i < sourceFeatures.size(); ++i) + (*sourceFeatures[i])(input, sourceRange, classifier, outFeaturesSourceNamespace); + + for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) { + const TranslationOption *topt = translationOptionList.Get(toptIdx); + const TargetPhrase &targetPhrase = topt->GetTargetPhrase(); + Discriminative::FeatureVector outFeaturesTargetNamespace; + + // extract target-side features for each topt + for(size_t i = 0; i < targetFeatures.size(); ++i) + (*targetFeatures[i])(input, targetPhrase, classifier, outFeaturesTargetNamespace); + + // cache the extracted target features (i.e. 
features associated with given topt) + // for future use at decoding time + size_t toptHash = hash_value(*topt); + m_tlsTranslationOptionFeatures->GetStored()->insert( + std::make_pair(toptHash, outFeaturesTargetNamespace)); + /* NOTE(review): presumably insert() keeps an entry that already exists for toptHash, as with standard maps -- TODO confirm for this map type. */ + // get classifier score + losses[toptIdx] = classifier.Predict(MakeTargetLabel(targetPhrase)); + } + + // normalize classifier scores to get a probability distribution + std::vector<float> rawLosses = losses; + (*m_normalizer)(losses); + + // update scores of topts + for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) { + TranslationOption *topt = *(translationOptionList.begin() + toptIdx); + if (! haveTargetContextFeatures) { + // no target context features; evaluate the FF now + std::vector<float> newScores(m_numScoreComponents); + newScores[0] = FloorScore(TransformScore(losses[toptIdx])); + + ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown(); + scoreBreakDown.PlusEquals(this, newScores); + + topt->UpdateScore(); + } else { + // We have target context features => this is just a partial score, + // do not add it to the score component collection. + size_t toptHash = hash_value(*topt); + + // Subtract the score contribution of target-only features, otherwise it would + // be included twice. 
+ Discriminative::FeatureVector emptySource; + const Discriminative::FeatureVector &targetFeatureVector = + m_tlsTranslationOptionFeatures->GetStored()->find(toptHash)->second; + classifier.AddLabelIndependentFeatureVector(emptySource); + classifier.AddLabelDependentFeatureVector(targetFeatureVector); + float targetOnlyLoss = classifier.Predict(VW_DUMMY_LABEL); + + float futureScore = rawLosses[toptIdx] - targetOnlyLoss; + m_tlsFutureScores->GetStored()->insert(std::make_pair(toptHash, futureScore)); + } + } + } +} +/** Handles the keys train, path, vw-options, leave-one-out-from, + * training-loss (basic or bleu) and loss (logistic or squared); unknown + * values for the last two throw, unknown keys go to the base class. + * NOTE(review): a repeated training-loss or loss key overwrites the pointer + * without deleting the previous object. + */ +void VW::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "train") { + m_train = Scan<bool>(value); + } else if (key == "path") { + m_modelPath = value; + } else if (key == "vw-options") { + m_vwOptions = value; + } else if (key == "leave-one-out-from") { + m_leaveOneOut = value; + } else if (key == "training-loss") { + // which type of loss to use for training + if (value == "basic") { + m_trainingLoss = (TrainingLoss *) new TrainingLossBasic(); + } else if (value == "bleu") { + m_trainingLoss = (TrainingLoss *) new TrainingLossBLEU(); + } else { + UTIL_THROW2("Unknown training loss type:" << value); + } + } else if (key == "loss") { + // which normalizer to use (theoretically depends on the loss function used for training the + // classifier (squared/logistic/hinge/...), hence the name "loss" + if (value == "logistic") { + m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer(); + } else if (value == "squared") { + m_normalizer = (Discriminative::Normalizer *) new Discriminative::SquaredLossNormalizer(); + } else { + UTIL_THROW2("Unknown loss type:" << value); + } + } else { + StatefulFeatureFunction::SetParameter(key, value); + } +} +/** Clears the four per-thread caches, then (training mode only) parses the + * reference target sentence and its word alignment from the first two + * columns of the TabbedSentence input and stores them for this thread. + */ +void VW::InitializeForInput(ttasksptr const& ttask) +{ + // do not keep future cost estimates across sentences! 
+ m_tlsFutureScores->GetStored()->clear(); + + // invalidate our caches after each sentence + m_tlsComputedStateExtensions->GetStored()->clear(); + + // it's not certain that we should clear these caches; we do it + // because they shouldn't be allowed to grow indefinitely large but + // target contexts and translation options will have identical features + // the next time we extract them... + m_tlsTargetContextFeatures->GetStored()->clear(); + m_tlsTranslationOptionFeatures->GetStored()->clear(); + + InputType const& source = *(ttask->GetSource().get()); + // tabbed sentence is assumed only in training + if (! m_train) + return; + + UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput, + "This feature function requires the TabbedSentence input type"); + + const TabbedSentence& tabbedSentence = static_cast<const TabbedSentence&>(source); + UTIL_THROW_IF2(tabbedSentence.GetColumns().size() < 2, + "TabbedSentence must contain target<tab>alignment"); + + // target sentence represented as a phrase + Phrase *target = new Phrase(); + target->CreateFromString( + Output + , StaticData::Instance().options()->output.factor_order + , tabbedSentence.GetColumns()[0] + , NULL); + + // word alignment between source and target sentence + // we don't store alignment info in AlignmentInfoCollection because we keep alignments of whole + // sentences, not phrases + AlignmentInfo *alignment = new AlignmentInfo(tabbedSentence.GetColumns()[1]); + /* NOTE(review): target and alignment are raw pointers handed to targetSent below; presumably Clear() releases the previous pair -- TODO confirm. */ + VWTargetSentence &targetSent = *GetStored(); + targetSent.Clear(); + targetSent.m_sentence = target; + targetSent.m_alignment = alignment; + + // pre-compute max- and min- aligned points for faster translation option checking + targetSent.SetConstraints(source.GetSize()); +} + +/************************************************************************************* + * private methods + ************************************************************************************/ + +const AlignmentInfo *VW::TransformAlignmentInfo(const Hypothesis 
&curHypo, size_t contextSize) const +{ + std::set<std::pair<size_t, size_t> > alignmentPoints; + const Hypothesis *contextHypo = curHypo.GetPrevHypo(); + int idxInContext = contextSize - 1; + int processedWordsInHypo = 0; + while (idxInContext >= 0 && contextHypo) { + int idxInHypo = contextHypo->GetCurrTargetLength() - 1 - processedWordsInHypo; + if (idxInHypo >= 0) { + const AlignmentInfo &hypoAlign = contextHypo->GetCurrTargetPhrase().GetAlignTerm(); + std::set<size_t> alignedToTgt = hypoAlign.GetAlignmentsForTarget(idxInHypo); + size_t srcOffset = contextHypo->GetCurrSourceWordsRange().GetStartPos(); + BOOST_FOREACH(size_t srcIdx, alignedToTgt) { + alignmentPoints.insert(std::make_pair(srcOffset + srcIdx, idxInContext)); + } + processedWordsInHypo++; + idxInContext--; + } else { + processedWordsInHypo = 0; + contextHypo = contextHypo->GetPrevHypo(); + } + } + + return AlignmentInfoCollection::Instance().Add(alignmentPoints); +} +/** Training-time variant: copies the alignment points of up to contextSize + * target positions that precede currentStart and shifts each target index + * by contextSize. + */ +AlignmentInfo VW::TransformAlignmentInfo(const AlignmentInfo &alignInfo, size_t contextSize, int currentStart) const +{ + std::set<std::pair<size_t, size_t> > alignmentPoints; + for (int i = std::max(0, currentStart - (int)contextSize); i < currentStart; i++) { + std::set<size_t> alignedToTgt = alignInfo.GetAlignmentsForTarget(i); + BOOST_FOREACH(size_t srcIdx, alignedToTgt) { + alignmentPoints.insert(std::make_pair(srcIdx, i + contextSize)); + } + } + return AlignmentInfo(alignmentPoints); +} +/** Checks a translation option against the stored reference sentence. + * Returns (true, start index in the reference) when the target phrase + * matches the reference words at some start position between targetStart2 + * and targetStart, and (false, -1) otherwise, including when the source + * span has no alignment or the phrase length does not fit the spans. + */ +std::pair<bool, int> VW::IsCorrectTranslationOption(const TranslationOption &topt) const +{ + + //std::cerr << topt.GetSourceWordsRange() << std::endl; + + int sourceStart = topt.GetSourceWordsRange().GetStartPos(); + int sourceEnd = topt.GetSourceWordsRange().GetEndPos(); + + const VWTargetSentence &targetSentence = *GetStored(); + + // [targetStart, targetEnd] spans aligned target words + int targetStart = targetSentence.m_sentence->GetSize(); + int targetEnd = -1; + + // get the left-most and right-most alignment point within source span + 
for(int i = sourceStart; i <= sourceEnd; ++i) { + if(targetSentence.m_sourceConstraints[i].IsSet()) { + if(targetStart > targetSentence.m_sourceConstraints[i].GetMin()) + targetStart = targetSentence.m_sourceConstraints[i].GetMin(); + if(targetEnd < targetSentence.m_sourceConstraints[i].GetMax()) + targetEnd = targetSentence.m_sourceConstraints[i].GetMax(); + } + } + // there was no alignment + if(targetEnd == -1) + return std::make_pair(false, -1); + + //std::cerr << "Shorter: " << targetStart << " " << targetEnd << std::endl; + /* NOTE(review): both scans below start at targetStart and targetEnd themselves, which were taken from aligned source constraints. If m_targetConstraints is set at those indices the scans stop at once and the outer span equals the inner span -- TODO confirm whether they should start at targetStart - 1 and targetEnd + 1. */ + // [targetStart2, targetEnd2] spans unaligned words left and right of [targetStart, targetEnd] + int targetStart2 = targetStart; + for(int i = targetStart2; i >= 0 && !targetSentence.m_targetConstraints[i].IsSet(); --i) + targetStart2 = i; + + int targetEnd2 = targetEnd; + for(int i = targetEnd2; + i < targetSentence.m_sentence->GetSize() && !targetSentence.m_targetConstraints[i].IsSet(); + ++i) + targetEnd2 = i; + + //std::cerr << "Longer: " << targetStart2 << " " << targetEnd2 << std::endl; + + const TargetPhrase &tphrase = topt.GetTargetPhrase(); + //std::cerr << tphrase << std::endl; + + // if target phrase is shorter than inner span return false + if(tphrase.GetSize() < targetEnd - targetStart + 1) + return std::make_pair(false, -1); + + // if target phrase is longer than outer span return false + if(tphrase.GetSize() > targetEnd2 - targetStart2 + 1) + return std::make_pair(false, -1); + + // for each possible starting point + for(int tempStart = targetStart2; tempStart <= targetStart; tempStart++) { + bool found = true; + // check if the target phrase is within longer span + for(int i = tempStart; i <= targetEnd2 && i < tphrase.GetSize() + tempStart; ++i) { + if(tphrase.GetWord(i - tempStart) != targetSentence.m_sentence->GetWord(i)) { + found = false; + break; + } + } + // return true if there was a match + if(found) { + //std::cerr << "Found" << std::endl; + return std::make_pair(true, tempStart); + } + } + + return 
std::make_pair(false, -1); +} + +std::vector<bool> VW::LeaveOneOut(const TranslationOptionList &topts, const std::vector<bool> &correct) const +{ + UTIL_THROW_IF2(m_leaveOneOut.size() == 0 || ! m_train, "LeaveOneOut called in wrong setting!"); + + float sourceRawCount = 0.0; + const float ONE = 1.0001; // I don't understand floating point numbers + + std::vector<bool> keepOpt; + + for (size_t i = 0; i < topts.size(); i++) { + TranslationOption *topt = *(topts.begin() + i); + const TargetPhrase &targetPhrase = topt->GetTargetPhrase(); + + // extract raw counts from phrase-table property + const CountsPhraseProperty *property = + static_cast<const CountsPhraseProperty *>(targetPhrase.GetProperty("Counts")); + + if (! property) { + VERBOSE(2, "VW :: Counts not found for topt! Is this an OOV?\n"); + // keep all translation opts without updating, this is either OOV or bad usage... + keepOpt.assign(topts.size(), true); + return keepOpt; + } + + if (sourceRawCount == 0.0) { + sourceRawCount = property->GetSourceMarginal() - ONE; // discount one occurrence of the source phrase + if (sourceRawCount <= 0) { + // no translation options survived, source phrase was a singleton + keepOpt.assign(topts.size(), false); + return keepOpt; + } + } + + float discount = correct[i] ? 
ONE : 0.0; + float target = property->GetTargetMarginal() - discount; + float joint = property->GetJointCount() - discount; + if (discount != 0.0) VERBOSE(3, "VW :: leaving one out!\n"); + + if (joint > 0) { + // topt survived leaving one out, update its scores + const FeatureFunction *feature = &FindFeatureFunction(m_leaveOneOut); + std::vector<float> scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(feature); + UTIL_THROW_IF2(scores.size() != 4, "Unexpected number of scores in feature " << m_leaveOneOut); + scores[0] = TransformScore(joint / target); // P(f|e) + scores[2] = TransformScore(joint / sourceRawCount); // P(e|f) + + ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown(); + scoreBreakDown.Assign(feature, scores); + topt->UpdateScore(); + keepOpt.push_back(true); + } else { + // they only occurred together once, discard topt + VERBOSE(2, "VW :: discarded topt when leaving one out\n"); + keepOpt.push_back(false); + } + } + + return keepOpt; +} + +} // namespace Moses diff --git a/moses/FF/VW/VW.h b/moses/FF/VW/VW.h index da8a5cfb8..d94cce502 100644 --- a/moses/FF/VW/VW.h +++ b/moses/FF/VW/VW.h @@ -3,8 +3,12 @@ #include <string> #include <map> #include <limits> +#include <vector> -#include "moses/FF/StatelessFeatureFunction.h" +#include <boost/unordered_map.hpp> +#include <boost/functional/hash.hpp> + +#include "moses/FF/StatefulFeatureFunction.h" #include "moses/PP/CountsPhraseProperty.h" #include "moses/TranslationOptionList.h" #include "moses/TranslationOption.h" @@ -13,6 +17,8 @@ #include "moses/StaticData.h" #include "moses/Phrase.h" #include "moses/AlignmentInfo.h" +#include "moses/Word.h" +#include "moses/FactorCollection.h" #include "Normalizer.h" #include "Classifier.h" @@ -20,119 +26,50 @@ #include "TabbedSentence.h" #include "ThreadLocalByFeatureStorage.h" #include "TrainingLoss.h" +#include "VWTargetSentence.h" + +/* + * VW classifier feature. See vw/README.md for further information. 
+ * + * TODO: say which paper to cite. + */ namespace Moses { -const std::string VW_DUMMY_LABEL = "1111"; // VW does not use the actual label, other classifiers might +// dummy class label; VW does not use the actual label, other classifiers might +const std::string VW_DUMMY_LABEL = "1111"; -/** - * Helper class for storing alignment constraints. - */ -class Constraint -{ -public: - Constraint() : m_min(std::numeric_limits<int>::max()), m_max(-1) {} +// thread-specific classifier instance +typedef ThreadLocalByFeatureStorage<Discriminative::Classifier, Discriminative::ClassifierFactory &> TLSClassifier; - Constraint(int min, int max) : m_min(min), m_max(max) {} +// current target sentence, used in VW training (vwtrainer), not in decoding (prediction time) +typedef ThreadLocalByFeatureStorage<VWTargetSentence> TLSTargetSentence; - /** - * We are aligned to point => our min cannot be larger, our max cannot be smaller. - */ - void Update(int point) { - if (m_min > point) m_min = point; - if (m_max < point) m_max = point; - } +// hash table of feature vectors +typedef boost::unordered_map<size_t, Discriminative::FeatureVector> FeatureVectorMap; - bool IsSet() const { - return m_max != -1; - } +// thread-specific feature vector hash +typedef ThreadLocalByFeatureStorage<FeatureVectorMap> TLSFeatureVectorMap; - int GetMin() const { - return m_min; - } +// hash table of partial scores +typedef boost::unordered_map<size_t, float> FloatHashMap; - int GetMax() const { - return m_max; - } +// thread-specific score hash table, used for caching +typedef ThreadLocalByFeatureStorage<FloatHashMap> TLSFloatHashMap; -private: - int m_min, m_max; -}; +// thread-specific hash table for caching full classifier outputs +typedef ThreadLocalByFeatureStorage<boost::unordered_map<size_t, FloatHashMap> > TLSStateExtensions; -/** - * VW thread-specific data about target sentence. +/* + * VW feature function. A discriminative classifier with source and target context features. 
*/ -struct VWTargetSentence { - VWTargetSentence() : m_sentence(NULL), m_alignment(NULL) {} - - void Clear() { - if (m_sentence) delete m_sentence; - if (m_alignment) delete m_alignment; - } - - ~VWTargetSentence() { - Clear(); - } - - void SetConstraints(size_t sourceSize) { - // initialize to unconstrained - m_sourceConstraints.assign(sourceSize, Constraint()); - m_targetConstraints.assign(m_sentence->GetSize(), Constraint()); - - // set constraints according to alignment points - AlignmentInfo::const_iterator it; - for (it = m_alignment->begin(); it != m_alignment->end(); it++) { - int src = it->first; - int tgt = it->second; - - if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) { - UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt); - } - - m_sourceConstraints[src].Update(tgt); - m_targetConstraints[tgt].Update(src); - } - } - - Phrase *m_sentence; - AlignmentInfo *m_alignment; - std::vector<Constraint> m_sourceConstraints, m_targetConstraints; -}; - -typedef ThreadLocalByFeatureStorage<Discriminative::Classifier, Discriminative::ClassifierFactory &> TLSClassifier; - -typedef ThreadLocalByFeatureStorage<VWTargetSentence> TLSTargetSentence; - -class VW : public StatelessFeatureFunction, public TLSTargetSentence +class VW : public StatefulFeatureFunction, public TLSTargetSentence { public: - VW(const std::string &line) - : StatelessFeatureFunction(1, line) - , TLSTargetSentence(this) - , m_train(false) { - ReadParameters(); - Discriminative::ClassifierFactory *classifierFactory = m_train - ? new Discriminative::ClassifierFactory(m_modelPath) - : new Discriminative::ClassifierFactory(m_modelPath, m_vwOptions); - - m_tlsClassifier = new TLSClassifier(this, *classifierFactory); - - if (! m_normalizer) { - VERBOSE(1, "VW :: No loss function specified, assuming logistic loss.\n"); - m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer(); - } - - if (! 
m_trainingLoss) { - VERBOSE(1, "VW :: Using basic 1/0 loss calculation in training.\n"); - m_trainingLoss = (TrainingLoss *) new TrainingLossBasic(); - } - } + VW(const std::string &line); - virtual ~VW() { - delete m_tlsClassifier; - delete m_normalizer; - } + virtual ~VW(); bool IsUseable(const FactorMask &mask) const { return true; @@ -152,335 +89,89 @@ public: , ScoreComponentCollection *estimatedFutureScore = NULL) const { } - void EvaluateTranslationOptionListWithSourceContext(const InputType &input - , const TranslationOptionList &translationOptionList) const { - Discriminative::Classifier &classifier = *m_tlsClassifier->GetStored(); - - if (translationOptionList.size() == 0) - return; // nothing to do - - VERBOSE(2, "VW :: Evaluating translation options\n"); - - // which feature functions do we use (on the source and target side) - const std::vector<VWFeatureBase*>& sourceFeatures = - VWFeatureBase::GetSourceFeatures(GetScoreProducerDescription()); - - const std::vector<VWFeatureBase*>& targetFeatures = - VWFeatureBase::GetTargetFeatures(GetScoreProducerDescription()); - - const Range &sourceRange = translationOptionList.Get(0)->GetSourceWordsRange(); - const InputPath &inputPath = translationOptionList.Get(0)->GetInputPath(); - - if (m_train) { - // - // extract features for training the classifier (only call this when using vwtrainer, not in Moses!) - // - - // find which topts are correct - std::vector<bool> correct(translationOptionList.size()); - for (size_t i = 0; i < translationOptionList.size(); i++) - correct[i] = IsCorrectTranslationOption(* translationOptionList.Get(i)); - - // optionally update translation options using leave-one-out - std::vector<bool> keep = (m_leaveOneOut.size() > 0) - ? 
LeaveOneOut(translationOptionList, correct) - : std::vector<bool>(translationOptionList.size(), true); - - // check whether we (still) have some correct translation - int firstCorrect = -1; - for (size_t i = 0; i < translationOptionList.size(); i++) { - if (keep[i] && correct[i]) { - firstCorrect = i; - break; - } - } - - // do not train if there are no positive examples - if (firstCorrect == -1) { - VERBOSE(2, "VW :: skipping topt collection, no correct translation for span\n"); - return; - } - - // the first correct topt can be used by some loss functions - const TargetPhrase &correctPhrase = translationOptionList.Get(firstCorrect)->GetTargetPhrase(); - - // extract source side features - for(size_t i = 0; i < sourceFeatures.size(); ++i) - (*sourceFeatures[i])(input, inputPath, sourceRange, classifier); - - // go over topts, extract target side features and train the classifier - for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) { - - // this topt was discarded by leaving one out - if (! 
keep[toptIdx]) - continue; - - // extract target-side features for each topt - const TargetPhrase &targetPhrase = translationOptionList.Get(toptIdx)->GetTargetPhrase(); - for(size_t i = 0; i < targetFeatures.size(); ++i) - (*targetFeatures[i])(input, inputPath, targetPhrase, classifier); - - float loss = (*m_trainingLoss)(targetPhrase, correctPhrase, correct[toptIdx]); - - // train classifier on current example - classifier.Train(MakeTargetLabel(targetPhrase), loss); - } - } else { - // - // predict using a trained classifier, use this in decoding (=at test time) - // - - std::vector<float> losses(translationOptionList.size()); - - // extract source side features - for(size_t i = 0; i < sourceFeatures.size(); ++i) - (*sourceFeatures[i])(input, inputPath, sourceRange, classifier); - - for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) { - const TranslationOption *topt = translationOptionList.Get(toptIdx); - const TargetPhrase &targetPhrase = topt->GetTargetPhrase(); - - // extract target-side features for each topt - for(size_t i = 0; i < targetFeatures.size(); ++i) - (*targetFeatures[i])(input, inputPath, targetPhrase, classifier); - - // get classifier score - losses[toptIdx] = classifier.Predict(MakeTargetLabel(targetPhrase)); - } - - // normalize classifier scores to get a probability distribution - (*m_normalizer)(losses); - - // update scores of topts - for (size_t toptIdx = 0; toptIdx < translationOptionList.size(); toptIdx++) { - TranslationOption *topt = *(translationOptionList.begin() + toptIdx); - std::vector<float> newScores(m_numScoreComponents); - newScores[0] = FloorScore(TransformScore(losses[toptIdx])); - - ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown(); - scoreBreakDown.PlusEquals(this, newScores); - - topt->UpdateScore(); - } - } - } - - void EvaluateWhenApplied(const Hypothesis& hypo, - ScoreComponentCollection* accumulator) const { - } + // This behavior of this method depends on whether it's called 
during VW + // training (feature extraction) by vwtrainer or during decoding (prediction + // time) by Moses. + // + // When predicting, it evaluates all translation options with the VW model; + // if no target-context features are defined, this is the final score and it + // is added directly to the TranslationOption score. If there are target + // context features, the score is a partial score and it is only stored in + // cache; the final score is computed based on target context in + // EvaluateWhenApplied(). + // + // This method is also used in training by vwtrainer in which case features + // are written to a file, no classifier predictions take place. Target-side + // context is constant at training time (we know the true target sentence), + // so target-context features are extracted here as well. + virtual void EvaluateTranslationOptionListWithSourceContext(const InputType &input + , const TranslationOptionList &translationOptionList) const; + + // Evaluate VW during decoding. This is only used at prediction time (not in training). + // When no target-context features are defined, VW predictions were already fully calculated + // in EvaluateTranslationOptionListWithSourceContext() and the scores were added to the model. + // If there are target-context features, we compute the context-dependent part of the + // classifier score and combine it with the source-context only partial score which was computed + // in EvaluateTranslationOptionListWithSourceContext(). Various caches are used to make this + // method more efficient. + virtual FFState* EvaluateWhenApplied( + const Hypothesis& curHypo, + const FFState* prevState, + ScoreComponentCollection* accumulator) const; + + virtual FFState* EvaluateWhenApplied( + const ChartHypothesis&, + int, + ScoreComponentCollection* accumulator) const { + throw new std::logic_error("hiearchical/syntax not supported"); + } + + // Initial VW state; contains unaligned BOS symbols. 
+ const FFState* EmptyHypothesisState(const InputType &input) const; + + void SetParameter(const std::string& key, const std::string& value); + + // At prediction time, this clears our caches. At training time, we load the next sentence, its + // translation and word alignment. + virtual void InitializeForInput(ttasksptr const& ttask); - void EvaluateWhenApplied(const ChartHypothesis &hypo, - ScoreComponentCollection* accumulator) const { +private: + inline std::string MakeTargetLabel(const TargetPhrase &targetPhrase) const { + return VW_DUMMY_LABEL; // VW does not care about class labels in our setting (--csoaa_ldf mc). } - void SetParameter(const std::string& key, const std::string& value) { - if (key == "train") { - m_train = Scan<bool>(value); - } else if (key == "path") { - m_modelPath = value; - } else if (key == "vw-options") { - m_vwOptions = value; - } else if (key == "leave-one-out-from") { - m_leaveOneOut = value; - } else if (key == "training-loss") { - // which type of loss to use for training - if (value == "basic") { - m_trainingLoss = (TrainingLoss *) new TrainingLossBasic(); - } else if (value == "bleu") { - m_trainingLoss = (TrainingLoss *) new TrainingLossBLEU(); - } else { - UTIL_THROW2("Unknown training loss type:" << value); - } - } else if (key == "loss") { - // which normalizer to use (theoretically depends on the loss function used for training the - // classifier (squared/logistic/hinge/...), hence the name "loss" - if (value == "logistic") { - m_normalizer = (Discriminative::Normalizer *) new Discriminative::LogisticLossNormalizer(); - } else if (value == "squared") { - m_normalizer = (Discriminative::Normalizer *) new Discriminative::SquaredLossNormalizer(); - } else { - UTIL_THROW2("Unknown loss type:" << value); - } - } else { - StatelessFeatureFunction::SetParameter(key, value); - } + inline size_t MakeCacheKey(const FFState *prevState, size_t spanStart, size_t spanEnd) const { + size_t key = 0; + boost::hash_combine(key, prevState); 
+ boost::hash_combine(key, spanStart); + boost::hash_combine(key, spanEnd); + return key; } - virtual void InitializeForInput(ttasksptr const& ttask) { - InputType const& source = *(ttask->GetSource().get()); - // tabbed sentence is assumed only in training - if (! m_train) - return; - - UTIL_THROW_IF2(source.GetType() != TabbedSentenceInput, - "This feature function requires the TabbedSentence input type"); - - const TabbedSentence& tabbedSentence = static_cast<const TabbedSentence&>(source); - UTIL_THROW_IF2(tabbedSentence.GetColumns().size() < 2, - "TabbedSentence must contain target<tab>alignment"); - - // target sentence represented as a phrase - Phrase *target = new Phrase(); - target->CreateFromString( - Output - , StaticData::Instance().options()->output.factor_order - , tabbedSentence.GetColumns()[0] - , NULL); - - // word alignment between source and target sentence - // we don't store alignment info in AlignmentInfoCollection because we keep alignments of whole - // sentences, not phrases - AlignmentInfo *alignment = new AlignmentInfo(tabbedSentence.GetColumns()[1]); - - VWTargetSentence &targetSent = *GetStored(); - targetSent.Clear(); - targetSent.m_sentence = target; - targetSent.m_alignment = alignment; - - // pre-compute max- and min- aligned points for faster translation option checking - targetSent.SetConstraints(source.GetSize()); - } + // used in decoding to transform the global word alignment information into + // context-phrase internal alignment information (i.e., with target indices corresponding + // to positions in contextPhrase) + const AlignmentInfo *TransformAlignmentInfo(const Hypothesis &curHypo, size_t contextSize) const; + // used during training to extract relevant alignment points from the full sentence alignment + // and shift them by target context size + AlignmentInfo TransformAlignmentInfo(const AlignmentInfo &alignInfo, size_t contextSize, int currentStart) const; -private: - std::string MakeTargetLabel(const TargetPhrase 
&targetPhrase) const { - return VW_DUMMY_LABEL; - } + // At training time, determine whether a translation option is correct for the current target sentence + // based on word alignment. This is a bit complicated because we need to handle various corner-cases + // where some word(s) on phrase borders are unaligned. + std::pair<bool, int> IsCorrectTranslationOption(const TranslationOption &topt) const; - bool IsCorrectTranslationOption(const TranslationOption &topt) const { - - //std::cerr << topt.GetSourceWordsRange() << std::endl; - - int sourceStart = topt.GetSourceWordsRange().GetStartPos(); - int sourceEnd = topt.GetSourceWordsRange().GetEndPos(); - - const VWTargetSentence &targetSentence = *GetStored(); - - // [targetStart, targetEnd] spans aligned target words - int targetStart = targetSentence.m_sentence->GetSize(); - int targetEnd = -1; - - // get the left-most and right-most alignment point within source span - for(int i = sourceStart; i <= sourceEnd; ++i) { - if(targetSentence.m_sourceConstraints[i].IsSet()) { - if(targetStart > targetSentence.m_sourceConstraints[i].GetMin()) - targetStart = targetSentence.m_sourceConstraints[i].GetMin(); - if(targetEnd < targetSentence.m_sourceConstraints[i].GetMax()) - targetEnd = targetSentence.m_sourceConstraints[i].GetMax(); - } - } - // there was no alignment - if(targetEnd == -1) - return false; - - //std::cerr << "Shorter: " << targetStart << " " << targetEnd << std::endl; - - // [targetStart2, targetEnd2] spans unaligned words left and right of [targetStart, targetEnd] - int targetStart2 = targetStart; - for(int i = targetStart2; i >= 0 && !targetSentence.m_targetConstraints[i].IsSet(); --i) - targetStart2 = i; - - int targetEnd2 = targetEnd; - for(int i = targetEnd2; - i < targetSentence.m_sentence->GetSize() && !targetSentence.m_targetConstraints[i].IsSet(); - ++i) - targetEnd2 = i; - - //std::cerr << "Longer: " << targetStart2 << " " << targetEnd2 << std::endl; - - const TargetPhrase &tphrase = 
topt.GetTargetPhrase(); - //std::cerr << tphrase << std::endl; - - // if target phrase is shorter than inner span return false - if(tphrase.GetSize() < targetEnd - targetStart + 1) - return false; - - // if target phrase is longer than outer span return false - if(tphrase.GetSize() > targetEnd2 - targetStart2 + 1) - return false; - - // for each possible starting point - for(int tempStart = targetStart2; tempStart <= targetStart; tempStart++) { - bool found = true; - // check if the target phrase is within longer span - for(int i = tempStart; i <= targetEnd2 && i < tphrase.GetSize() + tempStart; ++i) { - if(tphrase.GetWord(i - tempStart) != targetSentence.m_sentence->GetWord(i)) { - found = false; - break; - } - } - // return true if there was a match - if(found) { - //std::cerr << "Found" << std::endl; - return true; - } - } - - return false; - } - - std::vector<bool> LeaveOneOut(const TranslationOptionList &topts, const std::vector<bool> &correct) const { - UTIL_THROW_IF2(m_leaveOneOut.size() == 0 || ! m_train, "LeaveOneOut called in wrong setting!"); - - float sourceRawCount = 0.0; - const float ONE = 1.0001; // I don't understand floating point numbers - - std::vector<bool> keepOpt; - - for (size_t i = 0; i < topts.size(); i++) { - TranslationOption *topt = *(topts.begin() + i); - const TargetPhrase &targetPhrase = topt->GetTargetPhrase(); - - // extract raw counts from phrase-table property - const CountsPhraseProperty *property = - static_cast<const CountsPhraseProperty *>(targetPhrase.GetProperty("Counts")); - - if (! property) { - VERBOSE(1, "VW :: Counts not found for topt! Is this an OOV?\n"); - // keep all translation opts without updating, this is either OOV or bad usage... 
- keepOpt.assign(topts.size(), true); - return keepOpt; - } - - if (sourceRawCount == 0.0) { - sourceRawCount = property->GetSourceMarginal() - ONE; // discount one occurrence of the source phrase - if (sourceRawCount <= 0) { - // no translation options survived, source phrase was a singleton - keepOpt.assign(topts.size(), false); - return keepOpt; - } - } - - float discount = correct[i] ? ONE : 0.0; - float target = property->GetTargetMarginal() - discount; - float joint = property->GetJointCount() - discount; - if (discount != 0.0) VERBOSE(2, "VW :: leaving one out!\n"); - - if (joint > 0) { - // topt survived leaving one out, update its scores - const FeatureFunction *feature = &FindFeatureFunction(m_leaveOneOut); - std::vector<float> scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(feature); - UTIL_THROW_IF2(scores.size() != 4, "Unexpected number of scores in feature " << m_leaveOneOut); - scores[0] = TransformScore(joint / target); // P(f|e) - scores[2] = TransformScore(joint / sourceRawCount); // P(e|f) - - ScoreComponentCollection &scoreBreakDown = topt->GetScoreBreakdown(); - scoreBreakDown.Assign(feature, scores); - topt->UpdateScore(); - keepOpt.push_back(true); - } else { - // they only occurred together once, discard topt - VERBOSE(2, "VW :: discarded topt when leaving one out\n"); - keepOpt.push_back(false); - } - } - - return keepOpt; - } + // At training time, optionally discount occurrences of phrase pairs from the current sentence, helps prevent + // over-fitting. 
+ std::vector<bool> LeaveOneOut(const TranslationOptionList &topts, const std::vector<bool> &correct) const; bool m_train; // false means predict - std::string m_modelPath; - std::string m_vwOptions; + std::string m_modelPath; // path to the VW model file; at training time, this is where extracted features are stored + std::string m_vwOptions; // options for Vowpal Wabbit + + // BOS token, all factors + Word m_sentenceStartWord; // calculator of training loss TrainingLoss *m_trainingLoss = NULL; @@ -488,9 +179,16 @@ private: // optionally contains feature name of a phrase table where we recompute scores with leaving one out std::string m_leaveOneOut; + // normalizer, typically this means softmax Discriminative::Normalizer *m_normalizer = NULL; + + // thread-specific classifier instance TLSClassifier *m_tlsClassifier; + + // caches for partial scores and feature vectors + TLSFloatHashMap *m_tlsFutureScores; + TLSStateExtensions *m_tlsComputedStateExtensions; + TLSFeatureVectorMap *m_tlsTranslationOptionFeatures, *m_tlsTargetContextFeatures; }; } - diff --git a/moses/FF/VW/VWFeatureBase.cpp b/moses/FF/VW/VWFeatureBase.cpp index 874544203..e51396b3f 100644 --- a/moses/FF/VW/VWFeatureBase.cpp +++ b/moses/FF/VW/VWFeatureBase.cpp @@ -2,11 +2,26 @@ #include <string> #include "VWFeatureBase.h" +#include "VWFeatureContext.h" namespace Moses { std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_features; std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_sourceFeatures; +std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_targetContextFeatures; std::map<std::string, std::vector<VWFeatureBase*> > VWFeatureBase::s_targetFeatures; + +std::map<std::string, size_t> VWFeatureBase::s_targetContextLength; + + +void VWFeatureBase::UpdateContextSize(const std::string &usedBy) +{ + // using the standard map behavior here: if the entry does not + // exist, it will be added and initialized to zero + size_t currentSize = 
s_targetContextLength[usedBy]; + size_t newSize = static_cast<VWFeatureContext *const>(this)->GetContextSize(); + s_targetContextLength[usedBy] = std::max(currentSize, newSize); +} + } diff --git a/moses/FF/VW/VWFeatureBase.h b/moses/FF/VW/VWFeatureBase.h index c8bd60a81..ca3317d31 100644 --- a/moses/FF/VW/VWFeatureBase.h +++ b/moses/FF/VW/VWFeatureBase.h @@ -12,11 +12,17 @@ namespace Moses { +enum VWFeatureType { + vwft_source, + vwft_target, + vwft_targetContext +}; + class VWFeatureBase : public StatelessFeatureFunction { public: - VWFeatureBase(const std::string &line, bool isSource = true) - : StatelessFeatureFunction(0, line), m_usedBy(1, "VW0"), m_isSource(isSource) { + VWFeatureBase(const std::string &line, VWFeatureType featureType = vwft_source) + : StatelessFeatureFunction(0, line), m_usedBy(1, "VW0"), m_featureType(featureType) { // defaults m_sourceFactors.push_back(0); m_targetFactors.push_back(0); @@ -71,26 +77,47 @@ public: return s_sourceFeatures[name]; } + // Return only target-context classifier features + static const std::vector<VWFeatureBase*>& GetTargetContextFeatures(std::string name = "VW0") { + // don't throw an exception when there are no target-context features, this feature type is not mandatory + return s_targetContextFeatures[name]; + } + // Return only target-dependent classifier features static const std::vector<VWFeatureBase*>& GetTargetFeatures(std::string name = "VW0") { UTIL_THROW_IF2(s_targetFeatures.count(name) == 0, "No target features registered for parent classifier: " + name); return s_targetFeatures[name]; } + // Required length context (maximum context size of defined target-context features) + static size_t GetMaximumContextSize(std::string name = "VW0") { + return s_targetContextLength[name]; // 0 by default + } + // Overload to process source-dependent data, create features once for every // source sentence word range. 
virtual void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const = 0; + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const = 0; // Overload to process target-dependent features, create features once for - // every target phrase. One source word range will have at leat one target + // every target phrase. One source word range will have at least one target // phrase, but may have more. virtual void operator()(const InputType &input - , const InputPath &inputPath , const TargetPhrase &targetPhrase - , Discriminative::Classifier &classifier) const = 0; + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const = 0; + + // Overload to process target-context dependent features, these features are + // evaluated during decoding. For efficiency, features are not fed directly into + // the classifier object but instead output in the vector "features" and managed + // separately in VW.h. 
+ virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const = 0; protected: std::vector<FactorType> m_sourceFactors, m_targetFactors; @@ -99,10 +126,15 @@ protected: for(std::vector<std::string>::const_iterator it = m_usedBy.begin(); it != m_usedBy.end(); it++) { s_features[*it].push_back(this); - if(m_isSource) + + if(m_featureType == vwft_source) { s_sourceFeatures[*it].push_back(this); - else + } else if (m_featureType == vwft_targetContext) { + s_targetContextFeatures[*it].push_back(this); + UpdateContextSize(*it); + } else { s_targetFeatures[*it].push_back(this); + } } } @@ -112,11 +144,16 @@ private: Tokenize(m_usedBy, usedBy, ","); } + void UpdateContextSize(const std::string &usedBy); + std::vector<std::string> m_usedBy; - bool m_isSource; + VWFeatureType m_featureType; static std::map<std::string, std::vector<VWFeatureBase*> > s_features; static std::map<std::string, std::vector<VWFeatureBase*> > s_sourceFeatures; + static std::map<std::string, std::vector<VWFeatureBase*> > s_targetContextFeatures; static std::map<std::string, std::vector<VWFeatureBase*> > s_targetFeatures; + + static std::map<std::string, size_t> s_targetContextLength; }; } diff --git a/moses/FF/VW/VWFeatureContext.h b/moses/FF/VW/VWFeatureContext.h new file mode 100644 index 000000000..18632d91b --- /dev/null +++ b/moses/FF/VW/VWFeatureContext.h @@ -0,0 +1,116 @@ +#pragma once + +#include <string> +#include <boost/foreach.hpp> +#include "VWFeatureBase.h" +#include "moses/InputType.h" +#include "moses/TypeDef.h" +#include "moses/Word.h" + +namespace Moses +{ + +// Inherit from this for target-context classifier features. They will +// automatically register with the classifier class named VW0 or one or more +// names specified by the used-by=name1,name2,... parameter. 
+// +// The classifier gets a full list by calling +// VWFeatureBase::GetTargetContextFeatures(GetScoreProducerDescription()) + + +class VWFeatureContext : public VWFeatureBase +{ +public: + VWFeatureContext(const std::string &line, size_t contextSize) + : VWFeatureBase(line, vwft_targetContext), m_contextSize(contextSize) { + } + + // Gets its pure virtual functions from VWFeatureBase + + virtual void operator()(const InputType &input + , const TargetPhrase &targetPhrase + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + } + + virtual void operator()(const InputType &input + , const Range &sourceRange + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + if (key == "size") { + m_contextSize = Scan<size_t>(value); + } else if (key == "factor-positions") { + // factor positions: assuming a factor such as positional morphological tag, use this + // option to select only certain positions; this assumes that only a single + // target-side factor is defined + Tokenize<size_t>(m_factorPositions, value, ","); + } else { + VWFeatureBase::SetParameter(key, value); + } + } + + size_t GetContextSize() { + return m_contextSize; + } + +protected: + // Get word with the correct subset of factors as string. Because we're target + // context features, we look at a limited number of words to the left of the + // current translation. posFromEnd is interpreted like this: + // 0 = last word of the hypothesis + // 1 = next to last word + // ...etc. 
+ inline std::string GetWord(const Phrase &phrase, size_t posFromEnd) const { + const Word &word = phrase.GetWord(phrase.GetSize() - posFromEnd - 1); + if (m_factorPositions.empty()) { + return word.GetString(m_targetFactors, false); + } else { + if (m_targetFactors.size() != 1) + UTIL_THROW2("You can only use factor-positions when a single target-side factor is defined."); + const std::string &fullFactor = word.GetFactor(m_targetFactors[0])->GetString().as_string(); + + // corner cases: at sentence beginning/end, we don't have the correct factors set up + // similarly for UNK + if (fullFactor == BOS_ || fullFactor == EOS_ || fullFactor == UNKNOWN_FACTOR) + return fullFactor; + + std::string subFactor(m_factorPositions.size(), 'x'); // initialize string with correct size and placeholder chars + for (size_t i = 0; i < m_factorPositions.size(); i++) + subFactor[i] = fullFactor[m_factorPositions[i]]; + + return subFactor; + } + } + + // some target-context feature functions also look at the source + inline std::string GetSourceWord(const InputType &input, size_t pos) const { + return input.GetWord(pos).GetString(m_sourceFactors, false); + } + + // get source words aligned to a particular context word + std::vector<std::string> GetAlignedSourceWords(const Phrase &contextPhrase + , const InputType &input + , const AlignmentInfo &alignInfo + , size_t posFromEnd) const { + size_t idx = contextPhrase.GetSize() - posFromEnd - 1; + std::set<size_t> alignedToTarget = alignInfo.GetAlignmentsForTarget(idx); + std::vector<std::string> out; + out.reserve(alignedToTarget.size()); + BOOST_FOREACH(size_t srcIdx, alignedToTarget) { + out.push_back(GetSourceWord(input, srcIdx)); + } + return out; + } + + // required context size + size_t m_contextSize; + + // factor positions: assuming a factor such as positional morphological tag, use this + // option to select only certain positions + std::vector<size_t> m_factorPositions; +}; + +} diff --git a/moses/FF/VW/VWFeatureContextBigrams.h 
b/moses/FF/VW/VWFeatureContextBigrams.h new file mode 100644 index 000000000..92b652123 --- /dev/null +++ b/moses/FF/VW/VWFeatureContextBigrams.h @@ -0,0 +1,40 @@ +#pragma once + +#include <string> +#include <algorithm> +#include "VWFeatureContext.h" +#include "moses/Util.h" + +namespace Moses +{ + +class VWFeatureContextBigrams : public VWFeatureContext +{ +public: + VWFeatureContextBigrams(const std::string &line) + : VWFeatureContext(line, DEFAULT_WINDOW_SIZE) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 1; i < m_contextSize; i++) + outFeatures.push_back(classifier.AddLabelIndependentFeature("tcbigram^-" + SPrint(i + 1) + + "^" + GetWord(contextPhrase, i - 1) + "^" + GetWord(contextPhrase, i))); + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureContext::SetParameter(key, value); + } + +private: + static const int DEFAULT_WINDOW_SIZE = 1; +}; + +} diff --git a/moses/FF/VW/VWFeatureContextBilingual.h b/moses/FF/VW/VWFeatureContextBilingual.h new file mode 100644 index 000000000..f681fcb78 --- /dev/null +++ b/moses/FF/VW/VWFeatureContextBilingual.h @@ -0,0 +1,45 @@ +#pragma once + +#include <string> +#include <boost/foreach.hpp> +#include <algorithm> +#include "VWFeatureContext.h" +#include "moses/Util.h" + +namespace Moses +{ + +class VWFeatureContextBilingual : public VWFeatureContext +{ +public: + VWFeatureContextBilingual(const std::string &line) + : VWFeatureContext(line, DEFAULT_WINDOW_SIZE) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , 
Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 0; i < m_contextSize; i++) { + std::string tgtWord = GetWord(contextPhrase, i); + std::vector<std::string> alignedTo = GetAlignedSourceWords(contextPhrase, input, alignmentInfo, i); + BOOST_FOREACH(const std::string &srcWord, alignedTo) { + outFeatures.push_back(classifier.AddLabelIndependentFeature("tcblng^-" + SPrint(i + 1) + "^" + tgtWord + "^" + srcWord)); + } + } + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureContext::SetParameter(key, value); + } + +private: + static const int DEFAULT_WINDOW_SIZE = 1; +}; + +} diff --git a/moses/FF/VW/VWFeatureContextWindow.h b/moses/FF/VW/VWFeatureContextWindow.h new file mode 100644 index 000000000..66c9c3ec5 --- /dev/null +++ b/moses/FF/VW/VWFeatureContextWindow.h @@ -0,0 +1,39 @@ +#pragma once + +#include <string> +#include <algorithm> +#include "VWFeatureContext.h" +#include "moses/Util.h" + +namespace Moses +{ + +class VWFeatureContextWindow : public VWFeatureContext +{ +public: + VWFeatureContextWindow(const std::string &line) + : VWFeatureContext(line, DEFAULT_WINDOW_SIZE) { + ReadParameters(); + + // Call this last + VWFeatureBase::UpdateRegister(); + } + + virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + for (size_t i = 0; i < m_contextSize; i++) + outFeatures.push_back(classifier.AddLabelIndependentFeature("tcwin^-" + SPrint(i + 1) + "^" + GetWord(contextPhrase, i))); + } + + virtual void SetParameter(const std::string& key, const std::string& value) { + VWFeatureContext::SetParameter(key, value); + } + +private: + static const int DEFAULT_WINDOW_SIZE = 1; +}; + +} diff --git a/moses/FF/VW/VWFeatureSource.h b/moses/FF/VW/VWFeatureSource.h index 564f4a3b6..7a306b59c 100644 --- a/moses/FF/VW/VWFeatureSource.h +++ 
b/moses/FF/VW/VWFeatureSource.h @@ -19,15 +19,22 @@ class VWFeatureSource : public VWFeatureBase { public: VWFeatureSource(const std::string &line) - : VWFeatureBase(line, true) { + : VWFeatureBase(line, vwft_source) { } // Gets its pure virtual functions from VWFeatureBase virtual void operator()(const InputType &input - , const InputPath &inputPath , const TargetPhrase &targetPhrase - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + } + + virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { } virtual void SetParameter(const std::string& key, const std::string& value) { diff --git a/moses/FF/VW/VWFeatureSourceBagOfWords.h b/moses/FF/VW/VWFeatureSourceBagOfWords.h index 97a1cc6c3..b815b4d0e 100644 --- a/moses/FF/VW/VWFeatureSourceBagOfWords.h +++ b/moses/FF/VW/VWFeatureSourceBagOfWords.h @@ -18,11 +18,11 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { for (size_t i = 0; i < input.GetSize(); i++) { - classifier.AddLabelIndependentFeature("bow^" + GetWord(input, i)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("bow^" + GetWord(input, i))); } } diff --git a/moses/FF/VW/VWFeatureSourceBigrams.h b/moses/FF/VW/VWFeatureSourceBigrams.h index ce5430ab8..5de3ab2c3 100644 --- a/moses/FF/VW/VWFeatureSourceBigrams.h +++ b/moses/FF/VW/VWFeatureSourceBigrams.h @@ -18,11 +18,11 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , 
Discriminative::FeatureVector &outFeatures) const { for (size_t i = 1; i < input.GetSize(); i++) { - classifier.AddLabelIndependentFeature("bigram^" + GetWord(input, i - 1) + "^" + GetWord(input, i)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("bigram^" + GetWord(input, i - 1) + "^" + GetWord(input, i))); } } diff --git a/moses/FF/VW/VWFeatureSourceExternalFeatures.h b/moses/FF/VW/VWFeatureSourceExternalFeatures.h index bacc5d231..9995ad1b2 100644 --- a/moses/FF/VW/VWFeatureSourceExternalFeatures.h +++ b/moses/FF/VW/VWFeatureSourceExternalFeatures.h @@ -23,12 +23,12 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { const Features& features = *m_tls.GetStored(); for (size_t i = 0; i < features.size(); i++) { - classifier.AddLabelIndependentFeature("srcext^" + features[i]); + outFeatures.push_back(classifier.AddLabelIndependentFeature("srcext^" + features[i])); } } diff --git a/moses/FF/VW/VWFeatureSourceIndicator.h b/moses/FF/VW/VWFeatureSourceIndicator.h index fda929f13..b0d43eb0f 100644 --- a/moses/FF/VW/VWFeatureSourceIndicator.h +++ b/moses/FF/VW/VWFeatureSourceIndicator.h @@ -20,9 +20,9 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { size_t begin = sourceRange.GetStartPos(); size_t end = sourceRange.GetEndPos() + 1; @@ -31,7 +31,7 @@ public: for (size_t i = 0; i < end - begin; i++) words[i] = GetWord(input, begin + i); - classifier.AddLabelIndependentFeature("sind^" + Join(" ", words)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("sind^" + Join(" ", words))); } virtual void SetParameter(const std::string& 
key, const std::string& value) { diff --git a/moses/FF/VW/VWFeatureSourcePhraseInternal.h b/moses/FF/VW/VWFeatureSourcePhraseInternal.h index 4e7f6e8d1..b346660a0 100644 --- a/moses/FF/VW/VWFeatureSourcePhraseInternal.h +++ b/moses/FF/VW/VWFeatureSourcePhraseInternal.h @@ -20,14 +20,14 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { size_t begin = sourceRange.GetStartPos(); size_t end = sourceRange.GetEndPos() + 1; while (begin < end) { - classifier.AddLabelIndependentFeature("sin^" + GetWord(input, begin++)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("sin^" + GetWord(input, begin++))); } } diff --git a/moses/FF/VW/VWFeatureSourceSenseWindow.h b/moses/FF/VW/VWFeatureSourceSenseWindow.h index 614f7ff52..e7b1e1a71 100644 --- a/moses/FF/VW/VWFeatureSourceSenseWindow.h +++ b/moses/FF/VW/VWFeatureSourceSenseWindow.h @@ -51,9 +51,9 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { int begin = sourceRange.GetStartPos(); int end = sourceRange.GetEndPos() + 1; int inputLen = input.GetSize(); @@ -64,24 +64,24 @@ public: // before current phrase for (int i = std::max(0, begin - m_size); i < begin; i++) { BOOST_FOREACH(const Sense &sense, senses[i]) { - classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob); - classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); + 
outFeatures.push_back(classifier.AddLabelIndependentFeature("snsb^" + forms[i] + sense.m_label, sense.m_prob)); } } // within current phrase for (int i = begin; i < end; i++) { BOOST_FOREACH(const Sense &sense, senses[i]) { - classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob); - classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsin^" + forms[i] + sense.m_label, sense.m_prob)); } } // after current phrase for (int i = end; i < std::min(end + m_size, inputLen); i++) { BOOST_FOREACH(const Sense &sense, senses[i]) { - classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob); - classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + SPrint(i - begin) + "^" + sense.m_label, sense.m_prob)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("snsa^" + forms[i] + sense.m_label, sense.m_prob)); } } } diff --git a/moses/FF/VW/VWFeatureSourceWindow.h b/moses/FF/VW/VWFeatureSourceWindow.h index 5205e4f2f..14c617586 100644 --- a/moses/FF/VW/VWFeatureSourceWindow.h +++ b/moses/FF/VW/VWFeatureSourceWindow.h @@ -20,19 +20,19 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { int begin = sourceRange.GetStartPos(); int end = sourceRange.GetEndPos() + 1; int inputLen = input.GetSize(); for (int i = std::max(0, begin - m_size); i < begin; i++) { - classifier.AddLabelIndependentFeature("c^" + 
SPrint(i - begin) + "^" + GetWord(input, i)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("c^" + SPrint(i - begin) + "^" + GetWord(input, i))); } for (int i = end; i < std::min(end + m_size, inputLen); i++) { - classifier.AddLabelIndependentFeature("c^" + SPrint(i - end + 1) + "^" + GetWord(input, i)); + outFeatures.push_back(classifier.AddLabelIndependentFeature("c^" + SPrint(i - end + 1) + "^" + GetWord(input, i))); } } diff --git a/moses/FF/VW/VWFeatureTarget.h b/moses/FF/VW/VWFeatureTarget.h index 2935b2b4e..ed936ebf3 100644 --- a/moses/FF/VW/VWFeatureTarget.h +++ b/moses/FF/VW/VWFeatureTarget.h @@ -17,15 +17,22 @@ class VWFeatureTarget : public VWFeatureBase { public: VWFeatureTarget(const std::string &line) - : VWFeatureBase(line, false) { + : VWFeatureBase(line, vwft_target) { } // Gets its pure virtual functions from VWFeatureBase virtual void operator()(const InputType &input - , const InputPath &inputPath , const Range &sourceRange - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + } + + virtual void operator()(const InputType &input + , const Phrase &contextPhrase + , const AlignmentInfo &alignmentInfo + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { } virtual void SetParameter(const std::string& key, const std::string& value) { diff --git a/moses/FF/VW/VWFeatureTargetBigrams.h b/moses/FF/VW/VWFeatureTargetBigrams.h index 6f3f35270..30264dbf5 100644 --- a/moses/FF/VW/VWFeatureTargetBigrams.h +++ b/moses/FF/VW/VWFeatureTargetBigrams.h @@ -17,11 +17,11 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const TargetPhrase &targetPhrase - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { for (size_t i = 1; i < targetPhrase.GetSize(); i++) { - 
classifier.AddLabelDependentFeature("tbigram^" + GetWord(targetPhrase, i - 1) + "^" + GetWord(targetPhrase, i)); + outFeatures.push_back(classifier.AddLabelDependentFeature("tbigram^" + GetWord(targetPhrase, i - 1) + "^" + GetWord(targetPhrase, i))); } } diff --git a/moses/FF/VW/VWFeatureTargetIndicator.h b/moses/FF/VW/VWFeatureTargetIndicator.h index 39d8a37a0..0195990d0 100644 --- a/moses/FF/VW/VWFeatureTargetIndicator.h +++ b/moses/FF/VW/VWFeatureTargetIndicator.h @@ -17,10 +17,10 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const TargetPhrase &targetPhrase - , Discriminative::Classifier &classifier) const { - classifier.AddLabelDependentFeature("tind^" + targetPhrase.GetStringRep(m_targetFactors)); + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { + outFeatures.push_back(classifier.AddLabelDependentFeature("tind^" + targetPhrase.GetStringRep(m_targetFactors))); } virtual void SetParameter(const std::string& key, const std::string& value) { diff --git a/moses/FF/VW/VWFeatureTargetPhraseInternal.h b/moses/FF/VW/VWFeatureTargetPhraseInternal.h index e376a1ed3..8a9928aaa 100644 --- a/moses/FF/VW/VWFeatureTargetPhraseInternal.h +++ b/moses/FF/VW/VWFeatureTargetPhraseInternal.h @@ -17,11 +17,11 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const TargetPhrase &targetPhrase - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { for (size_t i = 0; i < targetPhrase.GetSize(); i++) { - classifier.AddLabelDependentFeature("tin^" + GetWord(targetPhrase, i)); + outFeatures.push_back(classifier.AddLabelDependentFeature("tin^" + GetWord(targetPhrase, i))); } } diff --git a/moses/FF/VW/VWFeatureTargetPhraseScores.h b/moses/FF/VW/VWFeatureTargetPhraseScores.h index 5a4519fb1..6c9ab63d2 100644 --- a/moses/FF/VW/VWFeatureTargetPhraseScores.h +++ 
b/moses/FF/VW/VWFeatureTargetPhraseScores.h @@ -20,9 +20,9 @@ public: } void operator()(const InputType &input - , const InputPath &inputPath , const TargetPhrase &targetPhrase - , Discriminative::Classifier &classifier) const { + , Discriminative::Classifier &classifier + , Discriminative::FeatureVector &outFeatures) const { std::vector<FeatureFunction*> features = FeatureFunction::GetFeatureFunctions(); for (size_t i = 0; i < features.size(); i++) { std::string fname = features[i]->GetScoreProducerDescription(); @@ -31,7 +31,7 @@ public: std::vector<float> scores = targetPhrase.GetScoreBreakdown().GetScoresForProducer(features[i]); for(size_t j = 0; j < scores.size(); ++j) - classifier.AddLabelDependentFeature(fname + "^" + boost::lexical_cast<std::string>(j), scores[j]); + outFeatures.push_back(classifier.AddLabelDependentFeature(fname + "^" + boost::lexical_cast<std::string>(j), scores[j])); } } diff --git a/moses/FF/VW/VWState.cpp b/moses/FF/VW/VWState.cpp new file mode 100644 index 000000000..000b8532b --- /dev/null +++ b/moses/FF/VW/VWState.cpp @@ -0,0 +1,77 @@ +#include "VWState.h" + +#include "moses/FF/FFState.h" +#include "moses/Phrase.h" +#include "moses/Hypothesis.h" +#include "moses/Util.h" +#include "moses/TypeDef.h" +#include "moses/StaticData.h" +#include "moses/TranslationOption.h" +#include <boost/functional/hash.hpp> + +namespace Moses +{ + +VWState::VWState() : m_spanStart(0), m_spanEnd(0) +{ + ComputeHash(); +} + +VWState::VWState(const Phrase &phrase) + : m_phrase(phrase), m_spanStart(0), m_spanEnd(0) +{ + ComputeHash(); +} + +VWState::VWState(const VWState &prevState, const Hypothesis &curHypo) +{ + VERBOSE(3, "VW :: updating state\n>> previous state: " << prevState << "\n"); + + // copy phrase from previous state + Phrase phrase = prevState.GetPhrase(); + size_t contextSize = phrase.GetSize(); // identical to VWFeatureBase::GetMaximumContextSize() + + // add words from current hypothesis + phrase.Append(curHypo.GetCurrTargetPhrase()); + + 
VERBOSE(3, ">> current hypo: " << curHypo.GetCurrTargetPhrase() << "\n"); + + // get a slice of appropriate length + Range range(phrase.GetSize() - contextSize, phrase.GetSize() - 1); + m_phrase = phrase.GetSubString(range); + + // set current span start/end + m_spanStart = curHypo.GetTranslationOption().GetStartPos(); + m_spanEnd = curHypo.GetTranslationOption().GetEndPos(); + + // compute our hash + ComputeHash(); + + VERBOSE(3, ">> updated state: " << *this << "\n"); +} + +bool VWState::operator==(const FFState& o) const +{ + const VWState &other = static_cast<const VWState &>(o); + + return m_phrase == other.GetPhrase() + && m_spanStart == other.GetSpanStart() + && m_spanEnd == other.GetSpanEnd(); +} + +void VWState::ComputeHash() +{ + m_hash = 0; + + boost::hash_combine(m_hash, m_phrase); + boost::hash_combine(m_hash, m_spanStart); + boost::hash_combine(m_hash, m_spanEnd); +} + +std::ostream &operator<<(std::ostream &out, const VWState &state) +{ + out << state.GetPhrase() << "::" << state.GetSpanStart() << "-" << state.GetSpanEnd(); + return out; +} + +} diff --git a/moses/FF/VW/VWState.h b/moses/FF/VW/VWState.h new file mode 100644 index 000000000..d83035553 --- /dev/null +++ b/moses/FF/VW/VWState.h @@ -0,0 +1,56 @@ +#pragma once + +#include <ostream> + +#include "moses/FF/FFState.h" +#include "moses/Phrase.h" +#include "moses/Hypothesis.h" + +namespace Moses +{ + +/** + * VW state, used in decoding (when target context is enabled). 
+ */ +class VWState : public FFState +{ +public: + // empty state, used only when VWState is ignored + VWState(); + + // used for construction of the initial VW state + VWState(const Phrase &phrase); + + // continue from previous VW state with a new hypothesis + VWState(const VWState &prevState, const Hypothesis &curHypo); + + virtual bool operator==(const FFState& o) const; + + inline virtual size_t hash() const { + return m_hash; + } + + inline const Phrase &GetPhrase() const { + return m_phrase; + } + + inline size_t GetSpanStart() const { + return m_spanStart; + } + + inline size_t GetSpanEnd() const { + return m_spanEnd; + } + +private: + void ComputeHash(); + + Phrase m_phrase; + size_t m_spanStart, m_spanEnd; + size_t m_hash; +}; + +// how to print a VW state +std::ostream &operator<<(std::ostream &out, const VWState &state); + +} diff --git a/moses/FF/VW/VWTargetSentence.h b/moses/FF/VW/VWTargetSentence.h new file mode 100644 index 000000000..1387bc042 --- /dev/null +++ b/moses/FF/VW/VWTargetSentence.h @@ -0,0 +1,55 @@ +#pragma once + +#include <vector> + +#include "moses/AlignmentInfo.h" +#include "moses/Phrase.h" + +#include "AlignmentConstraint.h" + +namespace Moses +{ + +/** + * VW thread-specific data about target sentence. 
+ */ +class VWTargetSentence +{ +public: + VWTargetSentence() : m_sentence(NULL), m_alignment(NULL) {} + + void Clear() { + if (m_sentence) delete m_sentence; + if (m_alignment) delete m_alignment; + } + + ~VWTargetSentence() { + Clear(); + } + + void SetConstraints(size_t sourceSize) { + // initialize to unconstrained + m_sourceConstraints.assign(sourceSize, AlignmentConstraint()); + m_targetConstraints.assign(m_sentence->GetSize(), AlignmentConstraint()); + + // set constraints according to alignment points + AlignmentInfo::const_iterator it; + for (it = m_alignment->begin(); it != m_alignment->end(); it++) { + int src = it->first; + int tgt = it->second; + + if (src >= m_sourceConstraints.size() || tgt >= m_targetConstraints.size()) { + UTIL_THROW2("VW :: alignment point out of bounds: " << src << "-" << tgt); + } + + m_sourceConstraints[src].Update(tgt); + m_targetConstraints[tgt].Update(src); + } + } + + Phrase *m_sentence; + AlignmentInfo *m_alignment; + std::vector<AlignmentConstraint> m_sourceConstraints, m_targetConstraints; +}; + +} diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index ada728919..67267ce90 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -59,6 +59,7 @@ Parameter::Parameter() AddParam(main_opts,"version", "show version of Moses and libraries used"); AddParam(main_opts,"show-weights", "print feature weights and exit"); AddParam(main_opts,"time-out", "seconds after which is interrupted (-1=no time-out, default is -1)"); + AddParam(main_opts,"segment-time-out", "seconds for single segment after which is interrupted (-1=no time-out, default is -1)"); /////////////////////////////////////////////////////////////////////////////////////// // factorization options diff --git a/moses/ReorderingConstraint.cpp b/moses/ReorderingConstraint.cpp index a5627508f..c4950daad 100644 --- a/moses/ReorderingConstraint.cpp +++ b/moses/ReorderingConstraint.cpp @@ -54,8 +54,8 @@ void ReorderingConstraint::SetWall( size_t pos, bool value ) 
void ReorderingConstraint::FinalizeWalls() { for(size_t z = 0; z < m_zone.size(); z++ ) { - const size_t startZone = m_zone[z][0]; - const size_t endZone = m_zone[z][1];// note: wall after endZone is not local + const size_t startZone = m_zone[z].first; + const size_t endZone = m_zone[z].second;// note: wall after endZone is not local for( size_t pos = startZone; pos < endZone; pos++ ) { if (m_wall[ pos ]) { m_localWall[ pos ] = z; @@ -65,8 +65,8 @@ void ReorderingConstraint::FinalizeWalls() // enforce that local walls only apply to innermost zone else if (m_localWall[ pos ] != NOT_A_ZONE) { size_t assigned_z = m_localWall[ pos ]; - if ((m_zone[assigned_z][0] < startZone) || - (m_zone[assigned_z][1] > endZone)) { + if ((m_zone[assigned_z].first < startZone) || + (m_zone[assigned_z].second > endZone)) { m_localWall[ pos ] = z; } } @@ -97,9 +97,9 @@ void ReorderingConstraint::SetMonotoneAtPunctuation( const Phrase &sentence ) void ReorderingConstraint::SetZone( size_t startPos, size_t endPos ) { VERBOSE(3,"SETTING zone " << startPos << "-" << endPos << std::endl); - std::vector< size_t > newZone; - newZone.push_back( startPos ); - newZone.push_back( endPos ); + std::pair<size_t,size_t> newZone; + newZone.first = startPos; + newZone.second = endPos; m_zone.push_back( newZone ); m_active = true; } @@ -138,8 +138,8 @@ bool ReorderingConstraint::Check( const Bitmap &bitmap, size_t startPos, size_t // check zones for(size_t z = 0; z < m_zone.size(); z++ ) { - const size_t startZone = m_zone[z][0]; - const size_t endZone = m_zone[z][1]; + const size_t startZone = m_zone[z].first; + const size_t endZone = m_zone[z].second; // fine, if translation has not reached zone yet and phrase outside zone if (lastPos < startZone && ( endPos < startZone || startPos > endZone ) ) { @@ -236,4 +236,25 @@ bool ReorderingConstraint::Check( const Bitmap &bitmap, size_t startPos, size_t return true; } +std::ostream& operator<<(std::ostream& out, const ReorderingConstraint &obj) +{ + out << 
"Zones:"; + for (size_t i = 0; i < obj.m_zone.size(); ++i) { + const std::pair<size_t,size_t> &zone1 = obj.m_zone[i]; + out << zone1.first << "-" << zone1.second << " "; + } + + out << "Walls:"; + for (size_t i = 0; i < obj.m_size; ++i) { + out << obj.m_wall[i]; + } + + out << " Local walls:"; + for (size_t i = 0; i < obj.m_size; ++i) { + out << obj.m_localWall[i] << " "; + } + + return out; +} + } diff --git a/moses/ReorderingConstraint.h b/moses/ReorderingConstraint.h index fc74dea7d..047382076 100644 --- a/moses/ReorderingConstraint.h +++ b/moses/ReorderingConstraint.h @@ -45,13 +45,13 @@ class Bitmap; */ class ReorderingConstraint { - friend std::ostream& operator<<(std::ostream& out, const ReorderingConstraint& reorderingConstraint); + friend std::ostream& operator<<(std::ostream& out, const ReorderingConstraint &obj); protected: // const size_t m_size; /**< number of words in sentence */ size_t m_size; /**< number of words in sentence */ bool *m_wall; /**< flag for each word if it is a wall */ size_t *m_localWall; /**< flag for each word if it is a local wall */ - std::vector< std::vector< size_t > > m_zone; /** zones that limit reordering */ + std::vector< std::pair<size_t,size_t> > m_zone; /** zones that limit reordering */ bool m_active; /**< flag indicating, if there are any active constraints */ int m_max_distortion; public: @@ -93,7 +93,7 @@ public: void SetZone( size_t startPos, size_t endPos ); //! 
returns the vector of zones - std::vector< std::vector< size_t > > & GetZones() { + std::vector< std::pair<size_t,size_t> > & GetZones() { return m_zone; } diff --git a/moses/Search.cpp b/moses/Search.cpp index 2d8c74b5f..caf9425cf 100644 --- a/moses/Search.cpp +++ b/moses/Search.cpp @@ -17,21 +17,34 @@ Search::Search(Manager& manager) , interrupted_flag(0) { m_initialTransOpt.SetInputPath(m_inputPath); + m_timer.start(); } - bool Search:: out_of_time() { int const& timelimit = m_options.search.timeout; - if (!timelimit) return false; - double elapsed_time = GetUserTime(); - if (elapsed_time <= timelimit) return false; - VERBOSE(1,"Decoding is out of time (" << elapsed_time << "," - << timelimit << ")" << std::endl); - interrupted_flag = 1; - return true; + if (timelimit > 0) { + double elapsed_time = GetUserTime(); + if (elapsed_time > timelimit) { + VERBOSE(1,"Decoding is out of time (" << elapsed_time << "," + << timelimit << ")" << std::endl); + interrupted_flag = 1; + return true; + } + } + int const& segment_timelimit = m_options.search.segment_timeout; + if (segment_timelimit > 0) { + double elapsed_time = m_timer.get_elapsed_time(); + if (elapsed_time > segment_timelimit) { + VERBOSE(1,"Decoding for segment is out of time (" << elapsed_time << "," + << segment_timelimit << ")" << std::endl); + interrupted_flag = 1; + return true; + } + } + return false; } } diff --git a/moses/Search.h b/moses/Search.h index a0e07870d..7797f07a0 100644 --- a/moses/Search.h +++ b/moses/Search.h @@ -7,6 +7,7 @@ #include "Phrase.h" #include "InputPath.h" #include "Bitmaps.h" +#include "Timer.h" namespace Moses { @@ -48,6 +49,7 @@ protected: /** flag indicating that decoder ran out of time (see switch -time-out) */ size_t interrupted_flag; + Timer m_timer; bool out_of_time(); }; diff --git a/moses/SearchCubePruning.cpp b/moses/SearchCubePruning.cpp index 9984ecadb..f921b9860 100644 --- a/moses/SearchCubePruning.cpp +++ b/moses/SearchCubePruning.cpp @@ -97,7 +97,6 @@ void 
SearchCubePruning::Decode() // go through each stack size_t stackNo = 1; - int timelimit = m_options.search.timeout; std::vector < HypothesisStack* >::iterator iterStack; for (iterStack = m_hypoStackColl.begin() + 1 ; iterStack != m_hypoStackColl.end() ; ++iterStack) { // BOOST_FOREACH(HypothesisStack* hstack, m_hypoStackColl) { diff --git a/moses/Sentence.cpp b/moses/Sentence.cpp index 4db022e5e..98bfb9e0a 100644 --- a/moses/Sentence.cpp +++ b/moses/Sentence.cpp @@ -155,7 +155,9 @@ aux_interpret_xml(std::string& line, std::vector<size_t> & xmlWalls, m_xmlOptions, m_reorderingConstraint, xmlWalls, placeholders); - UTIL_THROW_IF2(!OK, "Unable to parse XML in line: " << line); + if (!OK) { + TRACE_ERR("Unable to parse XML in line: " << line); + } } } diff --git a/moses/TranslationModel/CompactPT/CanonicalHuffman.h b/moses/TranslationModel/CompactPT/CanonicalHuffman.h index 9f6c14e56..10f3019b1 100644 --- a/moses/TranslationModel/CompactPT/CanonicalHuffman.h +++ b/moses/TranslationModel/CompactPT/CanonicalHuffman.h @@ -76,8 +76,9 @@ private: MinHeapSorter hs(A); std::make_heap(A.begin(), A.begin() + n, hs); - size_t h = n; - size_t m1, m2; + // marked volatile to prevent the intel compiler from generating bad code + volatile size_t h = n; + volatile size_t m1, m2; while(h > 1) { m1 = A[0]; std::pop_heap(A.begin(), A.begin() + h, hs); diff --git a/moses/parameters/SearchOptions.cpp b/moses/parameters/SearchOptions.cpp index 678f9bfe0..958569e94 100644 --- a/moses/parameters/SearchOptions.cpp +++ b/moses/parameters/SearchOptions.cpp @@ -38,6 +38,7 @@ namespace Moses param.SetParameter(early_discarding_threshold, "early-discarding-threshold", DEFAULT_EARLY_DISCARDING_THRESHOLD); param.SetParameter(timeout, "time-out", 0); + param.SetParameter(segment_timeout, "segment-time-out", 0); param.SetParameter(max_phrase_length, "max-phrase-length", DEFAULT_MAX_PHRASE_LENGTH); param.SetParameter(trans_opt_threshold, "translation-option-threshold", diff --git 
a/moses/parameters/SearchOptions.h b/moses/parameters/SearchOptions.h index 46c53e95b..30a612f05 100644 --- a/moses/parameters/SearchOptions.h +++ b/moses/parameters/SearchOptions.h @@ -25,6 +25,7 @@ namespace Moses float beam_width; int timeout; + int segment_timeout; bool consensus; //! Use Consensus decoding (DeNero et al 2009) diff --git a/scripts/Transliteration/train-transliteration-module.pl b/scripts/Transliteration/train-transliteration-module.pl index d072719d1..8d22ae6ce 100755 --- a/scripts/Transliteration/train-transliteration-module.pl +++ b/scripts/Transliteration/train-transliteration-module.pl @@ -240,7 +240,7 @@ sub train_transliteration_module{ `$MOSES_SRC_DIR/scripts/ems/support/substitute-filtered-tables.perl $OUT_DIR/tuning/filtered/moses.ini < $OUT_DIR/model/moses.ini > $OUT_DIR/tuning/moses.filtered.ini`; - `$MOSES_SRC_DIR/scripts/training/mert-moses.pl $OUT_DIR/tuning/input $OUT_DIR/tuning/reference $DECODER $OUT_DIR/tuning/moses.filtered.ini --nbest 100 --working-dir $OUT_DIR/tuning/tmp --decoder-flags "-threads 16 -drop-unknown -v 0 -distortion-limit 0" --rootdir $MOSES_SRC_DIR/scripts -mertdir $MOSES_SRC_DIR/mert -threads=16 --no-filter-phrase-table`; + `$MOSES_SRC_DIR/scripts/training/mert-moses.pl $OUT_DIR/tuning/input $OUT_DIR/tuning/reference $DECODER $OUT_DIR/tuning/moses.filtered.ini --nbest 100 --working-dir $OUT_DIR/tuning/tmp --decoder-flags "-threads 16 -drop-unknown -v 0 -distortion-limit 0" --rootdir $MOSES_SRC_DIR/scripts -mertdir $MOSES_SRC_DIR/bin -threads=16 --no-filter-phrase-table`; `cp $OUT_DIR/tuning/tmp/moses.ini $OUT_DIR/tuning/moses.ini`; diff --git a/scripts/ems/example/config.basic b/scripts/ems/example/config.basic index 257166721..e6b2d4a5c 100644 --- a/scripts/ems/example/config.basic +++ b/scripts/ems/example/config.basic @@ -54,7 +54,7 @@ output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-ext # For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/) # 
Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016) # "Farasa: A Fast and Furious Segmenter for Arabic" -input-tokenizer = "$farasa-dir/farasa_moses.sh" +#input-tokenizer = "$farasa-dir/farasa_moses.sh" # truecasers - comment out if you do not use the truecaser diff --git a/scripts/ems/example/config.factored b/scripts/ems/example/config.factored index 6f7beb438..7e1004db6 100644 --- a/scripts/ems/example/config.factored +++ b/scripts/ems/example/config.factored @@ -54,7 +54,7 @@ output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-ext # For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/) # Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016) # "Farasa: A Fast and Furious Segmenter for Arabic" -input-tokenizer = "$farasa-dir/farasa_moses.sh" +#input-tokenizer = "$farasa-dir/farasa_moses.sh" # truecasers - comment out if you do not use the truecaser input-truecaser = $moses-script-dir/recaser/truecase.perl diff --git a/scripts/ems/example/config.hierarchical b/scripts/ems/example/config.hierarchical index 6fb77a18a..3d00ffd79 100644 --- a/scripts/ems/example/config.hierarchical +++ b/scripts/ems/example/config.hierarchical @@ -57,7 +57,7 @@ output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-ext # For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/) # Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016) # "Farasa: A Fast and Furious Segmenter for Arabic" -input-tokenizer = "$farasa-dir/farasa_moses.sh" +#input-tokenizer = "$farasa-dir/farasa_moses.sh" # truecasers - comment out if you do not use the truecaser input-truecaser = $moses-script-dir/recaser/truecase.perl diff --git a/scripts/ems/example/config.syntax b/scripts/ems/example/config.syntax index ddde6baad..bdbd2b4e0 100644 --- a/scripts/ems/example/config.syntax +++ b/scripts/ems/example/config.syntax @@ -57,7 +57,7 @@ output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-ext # 
For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/) # Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016) # "Farasa: A Fast and Furious Segmenter for Arabic" -input-tokenizer = "$farasa-dir/farasa_moses.sh" +#input-tokenizer = "$farasa-dir/farasa_moses.sh" # truecasers - comment out if you do not use the truecaser input-truecaser = $moses-script-dir/recaser/truecase.perl diff --git a/scripts/ems/example/config.toy b/scripts/ems/example/config.toy index dff4ed10d..6667a9744 100644 --- a/scripts/ems/example/config.toy +++ b/scripts/ems/example/config.toy @@ -54,7 +54,7 @@ output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-ext # For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/) # Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016) # "Farasa: A Fast and Furious Segmenter for Arabic" -input-tokenizer = "$farasa-dir/farasa_moses.sh" +#input-tokenizer = "$farasa-dir/farasa_moses.sh" # truecasers - comment out if you do not use the truecaser input-truecaser = $moses-script-dir/recaser/truecase.perl diff --git a/scripts/ems/example/config.toy.bilinguallm b/scripts/ems/example/config.toy.bilinguallm index f4730a80f..9bf94613f 100644 --- a/scripts/ems/example/config.toy.bilinguallm +++ b/scripts/ems/example/config.toy.bilinguallm @@ -54,7 +54,7 @@ output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-ext # For Arabic tokenizer try Farasa (download: http://qatsdemo.cloudapp.net/farasa/) # Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016) # "Farasa: A Fast and Furious Segmenter for Arabic" -input-tokenizer = "$farasa-dir/farasa_moses.sh" +#input-tokenizer = "$farasa-dir/farasa_moses.sh" # truecasers - comment out if you do not use the truecaser input-truecaser = $moses-script-dir/recaser/truecase.perl diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 11c69eab4..8713af8bf 100644 --- a/scripts/ems/experiment.meta +++ 
b/scripts/ems/experiment.meta @@ -827,7 +827,7 @@ create-config in: sigtest-filter-reordering-table sigtest-filter-phrase-translation-table transliteration-table generation-table-pruned sparse corpus-mml-prefilter=OR=corpus-mml-postfilter=OR=domains osm-model INTERPOLATED-LM:binlm LM:binlm out: config ignore-if: use-hiero thot - rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature + rerun-on-change: decoding-steps alignment-factors translation-factors reordering-factors generation-factors lexicalized-reordering training-options script decoding-graph-backoff score-settings additional-ini mmsapt no-glue-grammar dont-tune-glue-grammar use-syntax-input-weight-feature operation-sequence-model-load-method default-name: model/moses.ini error: Unknown option error: requires an argument @@ -1540,6 +1540,150 @@ analysis-precision rerun-on-change: precision-by-coverage-base final-model: yes +[QUALITY-ESTIMATION] single +tokenize-input + in: raw-input + out: tokenized-input + default-name: quality-estimation/input.tok + pass-unless: input-tokenizer + template: $input-tokenizer < IN > OUT +tokenize-input-devtest + in: raw-input-devtest + out: tokenized-input-devtest + default-name: quality-estimation/input.devtest.tok + pass-unless: input-tokenizer + template: $input-tokenizer < IN > OUT +lowercase-input + in: tokenized-input + out: truecased-input + default-name: quality-estimation/input.lc + pass-unless: input-lowercaser + ignore-if: input-truecaser + template: $input-lowercaser < IN > OUT +lowercase-input-devtest + in: tokenized-input-devtest + out: truecased-input-devtest + default-name: quality-estimation/input.devtest.lc + pass-unless: input-lowercaser + ignore-if: input-truecaser + template: $input-lowercaser < IN > OUT +truecase-input + in: 
tokenized-input TRUECASER:truecase-model + out: truecased-input + rerun-on-change: input-truecaser + default-name: quality-estimation/input.tc + ignore-unless: input-truecaser + template: $input-truecaser -model IN1.$input-extension < IN > OUT +truecase-input-devtest + in: tokenized-input-devtest TRUECASER:truecase-model + out: truecased-input-devtest + rerun-on-change: input-truecaser + ignore-unless: input-truecaser + default-name: quality-estimation/input.devtest.tc + template: $input-truecaser -model IN1.$input-extension < IN > OUT +split-input + in: truecased-input SPLITTER:splitter-model + out: split-input + rerun-on-change: input-splitter + default-name: quality-estimation/input.split + pass-unless: input-splitter + template: $input-splitter -model IN1.$input-extension < IN > OUT +split-input-devtest + in: truecased-input-devtest SPLITTER:splitter-model + out: split-input-devtest + rerun-on-change: input-splitter + default-name: quality-estimation/input.devtest.split + pass-unless: input-splitter + template: $input-splitter -model IN1.$input-extension < IN > OUT +tokenize-reference + in: raw-reference + out: tokenized-reference + default-name: quality-estimation/reference.tok + pass-unless: output-tokenizer + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-tokenizer < IN > OUT +tokenize-reference-devtest + in: raw-reference-devtest + out: tokenized-reference-devtest + default-name: quality-estimation/reference.devtest.tok + pass-unless: output-tokenizer + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-tokenizer < IN > OUT +lowercase-reference + in: tokenized-reference + out: truecased-reference + default-name: quality-estimation/reference.lc + pass-unless: output-lowercaser + ignore-if: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-lowercaser < IN > OUT +lowercase-reference-devtest + in: 
tokenized-reference-devtest + out: truecased-reference-devtest + default-name: quality-estimation/reference.devtest.lc + pass-unless: output-lowercaser + ignore-if: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-lowercaser < IN > OUT +truecase-reference + in: tokenized-reference TRUECASER:truecase-model + out: truecased-reference + rerun-on-change: output-truecaser + default-name: quality-estimation/reference.tc + ignore-unless: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-truecaser -model IN1.$output-extension < IN > OUT +truecase-reference-devtest + in: tokenized-reference-devtest TRUECASER:truecase-model + out: truecased-reference-devtest + rerun-on-change: output-truecaser + default-name: quality-estimation/reference.devtest.tc + ignore-unless: output-truecaser + multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl + template: $output-truecaser -model IN1.$output-extension < IN > OUT +decode + in: TUNING:config-with-reused-weights split-input + out: rich-output + default-name: quality-estimation/output + template: $decoder -v 0 -tt -f IN < IN1 > OUT + error: Translation was not performed correctly + not-error: trans: No such file or directory +decode-devtest + in: TUNING:config-with-reused-weights split-input-devtest + out: rich-output-devtest + default-name: quality-estimation/output-devtest + template: $decoder -v 0 -tt -f IN < IN1 > OUT + error: Translation was not performed correctly + not-error: trans: No such file or directory +remove-markup + in: rich-output + out: cleaned-output + default-name: quality-estimation/tokenized-output + template: $moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT +remove-markup-devtest + in: rich-output-devtest + out: cleaned-output-devtest + default-name: quality-estimation/tokenized-output-devtest + template: 
$moses-script-dir/ems/support/remove-segmentation-markup.perl < IN > OUT +score-output + in: cleaned-output truecased-reference + out: scored-output + default-name: quality-estimation/output-scored + tmp-name: quality-estimation/ter + template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT +score-output-devtest + in: cleaned-output-devtest truecased-reference-devtest + out: scored-output-devtest + default-name: quality-estimation/output-scored-devtest + tmp-name: quality-estimation/ter-devtest + template: mkdir TMP ; $moses-script-dir/ems/support/ter.perl $tercom IN IN1 TMP > OUT +train + in: input rich-output scored-output input-devtest rich-output-devtest scored-output-devtest + out: quality-estimation-model + default-name: quality-estimation/model + template: $trainer --train-rich IN1 --train-ter IN2 --eval-rich IN4 --eval-ter IN5 --model OUT + final-model: yes + [REPORTING] single report in: EVALUATION:nist-bleu-score EVALUATION:nist-bleu-c-score EVALUATION:bolt-bleu-score EVALUATION:bolt-bleu-c-score EVALUATION:multi-bleu-score EVALUATION:multi-bleu-c-score EVALUATION:multi-bleu-detok-score EVALUATION:multi-bleu-c-detok-score EVALUATION:meteor-score EVALUATION:ter-score EVALUATION:wer-score EVALUATION:ibm-bleu-score EVALUATION:ibm-bleu-c-score EVALUATION:analysis EVALUATION:analysis-coverage EVALUATION:analysis-prec TRAINING:biconcor-model EVALUATION:wade-analysis diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 6d0019838..e52c82319 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2660,12 +2660,16 @@ sub define_training_create_config { if ($osm) { my $osm_settings = &get("TRAINING:operation-sequence-model-settings"); - if ($osm_settings =~ /-factor *(\S+)/){ + if ($osm_settings =~ /-factor *(\S+)/) { $cmd .= "-osm-model $osm/ -osm-setting $1 "; } else { $cmd .= "-osm-model $osm/operationLM.bin "; } + my $osm_load_method = 
&get("TRAINING:operation-sequence-model-load-method"); + if (defined($osm_load_method)) { + $cmd .= "-osm-load-method $osm_load_method "; + } } if (&get("TRAINING:phrase-orientation")) { diff --git a/scripts/ems/support/create-xml.perl b/scripts/ems/support/create-xml.perl new file mode 100755 index 000000000..610c2ccf8 --- /dev/null +++ b/scripts/ems/support/create-xml.perl @@ -0,0 +1,42 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use warnings; +use strict; + +my ($type) = @ARGV; +if ($type =~ /^s/i) { + print "<srcset setid=\"test\" srclang=\"any\">\n"; + print "<doc docid=\"doc\">\n"; +} +elsif ($type =~ /^t/i) { + print "<tstset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n"; + print "<doc sysid=\"moses\" docid=\"doc\">\n"; +} +elsif ($type =~ /^r/i) { + print "<refset setid=\"test\" tgtlang=\"any\" srclang=\"any\">\n"; + print "<doc sysid=\"ref\" docid=\"doc\">\n"; +} +else { + die("ERROR: specify source / target / ref"); +} + +my $i = 0; +while(<STDIN>) { + chomp; + print "<seg id=\"".(++$i)."\">$_</seg>\n"; +} + +print "</doc>\n"; + +if ($type =~ /^s/i) { + print "</srcset>\n"; +} +elsif ($type =~ /^t/i) { + print "</tstset>\n"; +} +elsif ($type =~ /^r/i) { + print "</refset>\n"; +} diff --git a/scripts/ems/support/remove-segmentation-markup.perl b/scripts/ems/support/remove-segmentation-markup.perl index 3b02bceaf..1e5820dd5 100755 --- a/scripts/ems/support/remove-segmentation-markup.perl +++ b/scripts/ems/support/remove-segmentation-markup.perl @@ -9,7 +9,16 @@ use strict; $|++; while(<STDIN>) { - s/ \|\d+\-\d+\| / /g; - s/ \|\d+\-\d+\|$//; - print $_; + chop; + s/\|[^\|]+\|//g; + s/\s+/ /g; + s/^ //; + s/ $//; + print $_."\n"; } + +#while(<STDIN>) { +# s/ \|\d+\-\d+\| / /g; +# s/ \|\d+\-\d+\|$//; +# print $_; +#} diff --git a/scripts/ems/support/ter.perl b/scripts/ems/support/ter.perl new file mode 100644 index 
000000000..1bae6f146 --- /dev/null +++ b/scripts/ems/support/ter.perl @@ -0,0 +1,15 @@ +#!/usr/bin/env perl +# +# This file is part of moses. Its use is licensed under the GNU Lesser General +# Public License version 2.1 or, at your option, any later version. + +use strict; +use FindBin qw($RealBin); + +my ($jar, $hyp,$ref,$tmp) = @ARGV; +`mkdir -p $tmp`; +`$RealBin/create-xml.perl test < $hyp > $tmp/hyp`; +`$RealBin/create-xml.perl ref < $ref > $tmp/ref`; +`java -jar $jar -h $tmp/hyp -r $tmp/ref -o ter -n $tmp/out`; +print `cat $tmp/out.ter`; + diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 3e8dabb79..9fae8ec8b 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -83,6 +83,7 @@ my($_EXTERNAL_BINDIR, $_CONFIG, $_OSM, $_OSM_FACTORS, + $_OSM_LOAD_METHOD, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE, $_HIERARCHICAL, @@ -238,6 +239,7 @@ $_HELP = 1 'config=s' => \$_CONFIG, 'osm-model=s' => \$_OSM, 'osm-setting=s' => \$_OSM_FACTORS, + 'osm-load-method=s' => \$_OSM_LOAD_METHOD, 'post-decoding-translit=s' => \$_POST_DECODING_TRANSLIT, 'transliteration-phrase-table=s' => \$_TRANSLITERATION_PHRASE_TABLE, 'mmsapt' => \$_MMSAPT, @@ -2249,6 +2251,8 @@ sub create_ini { if($_OSM) { + my $load_method = ""; + $load_method = " load=$_OSM_LOAD_METHOD" if defined($_OSM_LOAD_METHOD); if (defined($_OSM_FACTORS)) { my $count = 0; @@ -2258,11 +2262,11 @@ sub create_ini { my ($factor_f,$factor_e) = split(/\-/,$factor_val); if($count == 0){ - $feature_spec .= "OpSequenceModel name=OpSequenceModel$count num-features=5 path=". $_OSM . $factor_val . "/operationLM.bin" . " input-factor=". $factor_f . " output-factor=". $factor_e . " support-features=yes \n"; + $feature_spec .= "OpSequenceModel$load_method name=OpSequenceModel$count num-features=5 path=". $_OSM . $factor_val . "/operationLM.bin" . " input-factor=". $factor_f . " output-factor=". $factor_e . 
" support-features=yes \n"; $weight_spec .= "OpSequenceModel$count= 0.08 -0.02 0.02 -0.001 0.03\n"; } else{ - $feature_spec .= "OpSequenceModel name=OpSequenceModel$count num-features=1 path=". $_OSM . $factor_val . "/operationLM.bin" . " input-factor=". $factor_f . " output-factor=". $factor_e . " support-features=no \n"; + $feature_spec .= "OpSequenceModel$load_method name=OpSequenceModel$count num-features=1 path=". $_OSM . $factor_val . "/operationLM.bin" . " input-factor=". $factor_f . " output-factor=". $factor_e . " support-features=no \n"; $weight_spec .= "OpSequenceModel$count= 0.08 \n"; } @@ -2271,7 +2275,7 @@ sub create_ini { } else { - $feature_spec .= "OpSequenceModel name=OpSequenceModel0 num-features=5 path=". $_OSM . " \n"; + $feature_spec .= "OpSequenceModel$load_method name=OpSequenceModel0 num-features=5 path=". $_OSM . " \n"; $weight_spec .= "OpSequenceModel0= 0.08 -0.02 0.02 -0.001 0.03\n"; } } @@ -2292,7 +2296,9 @@ sub create_ini { } $type = "KENLM" unless defined $type; # default to KENLM if no type given - if ($type =~ /^\d+$/) { + if ($type =~ /^8-(.+)/) { + $type = "KENLM load=$1"; + } elsif ($type =~ /^\d+$/) { # backwards compatibility if the type is given not as string but as a number if ($type == 0) { $type = "SRILM"; diff --git a/vw/Classifier.h b/vw/Classifier.h index 39b3461ad..cb2c8b227 100644 --- a/vw/Classifier.h +++ b/vw/Classifier.h @@ -24,6 +24,8 @@ class ezexample; namespace Discriminative { +typedef std::pair<uint32_t, float> FeatureType; // feature hash (=ID) and value +typedef std::vector<FeatureType> FeatureVector; /** * Abstract class to be implemented by classifiers. @@ -34,12 +36,22 @@ public: /** * Add a feature that does not depend on the class (label). */ - virtual void AddLabelIndependentFeature(const StringPiece &name, float value) = 0; + virtual FeatureType AddLabelIndependentFeature(const StringPiece &name, float value) = 0; /** * Add a feature that is specific for the given class. 
*/ - virtual void AddLabelDependentFeature(const StringPiece &name, float value) = 0; + virtual FeatureType AddLabelDependentFeature(const StringPiece &name, float value) = 0; + + /** + * Efficient addition of features when their IDs are already computed. + */ + virtual void AddLabelIndependentFeatureVector(const FeatureVector &features) = 0; + + /** + * Efficient addition of features when their IDs are already computed. + */ + virtual void AddLabelDependentFeatureVector(const FeatureVector &features) = 0; /** * Train using current example. Use loss to distinguish positive and negative training examples. @@ -54,12 +66,12 @@ public: virtual float Predict(const StringPiece &label) = 0; // helper methods for indicator features - void AddLabelIndependentFeature(const StringPiece &name) { - AddLabelIndependentFeature(name, 1.0); + FeatureType AddLabelIndependentFeature(const StringPiece &name) { + return AddLabelIndependentFeature(name, 1.0); } - void AddLabelDependentFeature(const StringPiece &name) { - AddLabelDependentFeature(name, 1.0); + FeatureType AddLabelDependentFeature(const StringPiece &name) { + return AddLabelDependentFeature(name, 1.0); } virtual ~Classifier() {} @@ -95,8 +107,10 @@ public: VWTrainer(const std::string &outputFile); virtual ~VWTrainer(); - virtual void AddLabelIndependentFeature(const StringPiece &name, float value); - virtual void AddLabelDependentFeature(const StringPiece &name, float value); + virtual FeatureType AddLabelIndependentFeature(const StringPiece &name, float value); + virtual FeatureType AddLabelDependentFeature(const StringPiece &name, float value); + virtual void AddLabelIndependentFeatureVector(const FeatureVector &features); + virtual void AddLabelDependentFeatureVector(const FeatureVector &features); virtual void Train(const StringPiece &label, float loss); virtual float Predict(const StringPiece &label); @@ -121,15 +135,17 @@ public: VWPredictor(const std::string &modelFile, const std::string &vwOptions); virtual 
~VWPredictor(); - virtual void AddLabelIndependentFeature(const StringPiece &name, float value); - virtual void AddLabelDependentFeature(const StringPiece &name, float value); + virtual FeatureType AddLabelIndependentFeature(const StringPiece &name, float value); + virtual FeatureType AddLabelDependentFeature(const StringPiece &name, float value); + virtual void AddLabelIndependentFeatureVector(const FeatureVector &features); + virtual void AddLabelDependentFeatureVector(const FeatureVector &features); virtual void Train(const StringPiece &label, float loss); virtual float Predict(const StringPiece &label); friend class ClassifierFactory; protected: - void AddFeature(const StringPiece &name, float values); + FeatureType AddFeature(const StringPiece &name, float values); ::vw *m_VWInstance, *m_VWParser; ::ezexample *m_ex; diff --git a/vw/Normalizer.h b/vw/Normalizer.h index 74d94a79f..210b29060 100644 --- a/vw/Normalizer.h +++ b/vw/Normalizer.h @@ -2,6 +2,7 @@ #define moses_Normalizer_h #include <vector> +#include <algorithm> #include "Util.h" namespace Discriminative @@ -45,16 +46,25 @@ public: virtual ~SquaredLossNormalizer() {} }; +// safe softmax class LogisticLossNormalizer : public Normalizer { public: virtual void operator()(std::vector<float> &losses) const { - float sum = 0; std::vector<float>::iterator it; + + float sum = 0; + float max = 0; for (it = losses.begin(); it != losses.end(); it++) { - *it = exp(-*it); + *it = -*it; + max = std::max(max, *it); + } + + for (it = losses.begin(); it != losses.end(); it++) { + *it = exp(*it - max); sum += *it; } + for (it = losses.begin(); it != losses.end(); it++) { *it /= sum; } diff --git a/vw/VWPredictor.cpp b/vw/VWPredictor.cpp index 01192a9c6..88d8cfa7f 100644 --- a/vw/VWPredictor.cpp +++ b/vw/VWPredictor.cpp @@ -36,7 +36,7 @@ VWPredictor::~VWPredictor() VW::finish(*m_VWInstance); } -void VWPredictor::AddLabelIndependentFeature(const StringPiece &name, float value) +FeatureType 
VWPredictor::AddLabelIndependentFeature(const StringPiece &name, float value) { // label-independent features are kept in a different feature namespace ('s' = source) @@ -48,10 +48,10 @@ void VWPredictor::AddLabelIndependentFeature(const StringPiece &name, float valu m_ex->addns('s'); if (DEBUG) std::cerr << "VW :: Setting source namespace\n"; } - AddFeature(name, value); // namespace 's' is set up, add the feature + return AddFeature(name, value); // namespace 's' is set up, add the feature } -void VWPredictor::AddLabelDependentFeature(const StringPiece &name, float value) +FeatureType VWPredictor::AddLabelDependentFeature(const StringPiece &name, float value) { // VW does not use the label directly, instead, we do a Cartesian product between source and target feature // namespaces, where the source namespace ('s') contains label-independent features and the target @@ -63,7 +63,37 @@ void VWPredictor::AddLabelDependentFeature(const StringPiece &name, float value) m_ex->addns('t'); if (DEBUG) std::cerr << "VW :: Setting target namespace\n"; } - AddFeature(name, value); + return AddFeature(name, value); +} + +void VWPredictor::AddLabelIndependentFeatureVector(const FeatureVector &features) +{ + if (m_isFirstSource) { + // the first feature of a new example => create the source namespace for + // label-independent features to live in + m_isFirstSource = false; + m_ex->finish(); + m_ex->addns('s'); + if (DEBUG) std::cerr << "VW :: Setting source namespace\n"; + } + + // add each feature index using this "low level" call to VW + for (FeatureVector::const_iterator it = features.begin(); it != features.end(); it++) + m_ex->addf(it->first, it->second); +} + +void VWPredictor::AddLabelDependentFeatureVector(const FeatureVector &features) +{ + if (m_isFirstTarget) { + // the first target-side feature => create namespace 't' + m_isFirstTarget = false; + m_ex->addns('t'); + if (DEBUG) std::cerr << "VW :: Setting target namespace\n"; + } + + // add each feature index using 
this "low level" call to VW + for (FeatureVector::const_iterator it = features.begin(); it != features.end(); it++) + m_ex->addf(it->first, it->second); } void VWPredictor::Train(const StringPiece &label, float loss) @@ -82,10 +112,10 @@ float VWPredictor::Predict(const StringPiece &label) return loss; } -void VWPredictor::AddFeature(const StringPiece &name, float value) +FeatureType VWPredictor::AddFeature(const StringPiece &name, float value) { if (DEBUG) std::cerr << "VW :: Adding feature: " << EscapeSpecialChars(name.as_string()) << ":" << value << "\n"; - m_ex->addf(EscapeSpecialChars(name.as_string()), value); + return std::make_pair(m_ex->addf(EscapeSpecialChars(name.as_string()), value), value); } } // namespace Discriminative diff --git a/vw/VWTrainer.cpp b/vw/VWTrainer.cpp index e513de3d2..c019bc0c6 100644 --- a/vw/VWTrainer.cpp +++ b/vw/VWTrainer.cpp @@ -25,7 +25,7 @@ VWTrainer::~VWTrainer() close(m_bfos); } -void VWTrainer::AddLabelIndependentFeature(const StringPiece &name, float value) +FeatureType VWTrainer::AddLabelIndependentFeature(const StringPiece &name, float value) { if (m_isFirstSource) { if (m_isFirstExample) { @@ -43,9 +43,11 @@ void VWTrainer::AddLabelIndependentFeature(const StringPiece &name, float value) } AddFeature(name, value); + + return std::make_pair(0, value); // we don't hash features } -void VWTrainer::AddLabelDependentFeature(const StringPiece &name, float value) +FeatureType VWTrainer::AddLabelDependentFeature(const StringPiece &name, float value) { if (m_isFirstTarget) { m_isFirstTarget = false; @@ -56,6 +58,18 @@ void VWTrainer::AddLabelDependentFeature(const StringPiece &name, float value) } AddFeature(name, value); + + return std::make_pair(0, value); // we don't hash features +} + +void VWTrainer::AddLabelIndependentFeatureVector(const FeatureVector &features) +{ + throw logic_error("VW trainer does not support feature IDs."); +} + +void VWTrainer::AddLabelDependentFeatureVector(const FeatureVector &features) +{ + throw 
logic_error("VW trainer does not support feature IDs."); } void VWTrainer::Train(const StringPiece &label, float loss) |