diff options
Diffstat (limited to 'moses/TranslationModel/UG/mm/ug_phrasepair.h')
-rw-r--r-- | moses/TranslationModel/UG/mm/ug_phrasepair.h | 246 |
1 files changed, 246 insertions, 0 deletions
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h new file mode 100644 index 000000000..28a926587 --- /dev/null +++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h @@ -0,0 +1,246 @@ +// -*- c++ -*- +#pragma once +#include <vector> +#include "ug_typedefs.h" +#include "ug_bitext_pstats.h" + +namespace Moses +{ + namespace bitext + { + template<typename Token> + class + PhrasePair + { + public: + class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; }; + Token const* start1; + Token const* start2; + uint32_t len1; + uint32_t len2; + uint64_t p1, p2; + uint32_t raw1, raw2, sample1, sample2, good1, good2, joint; + std::vector<float> fvals; + float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs? + float dbwd[Moses::LRModel::NONE+1]; // distortion counts + std::vector<uchar> aln; + float score; + bool inverse; + std::vector<uint32_t> indoc; + PhrasePair() { }; + PhrasePair(PhrasePair const& o); + + PhrasePair const& operator+=(PhrasePair const& other); + + bool operator<(PhrasePair const& other) const; + bool operator>(PhrasePair const& other) const; + bool operator<=(PhrasePair const& other) const; + bool operator>=(PhrasePair const& other) const; + + void init(); + void init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps = NULL, size_t const numfeats=0); + + PhrasePair const& + update(uint64_t const pid2, Token const* x, + uint32_t const len, jstats const& js); + + class SortByTargetIdSeq + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + + class SortDescendingByJointCount + { + public: + int cmp(PhrasePair const& a, PhrasePair const& b) const; + bool operator()(PhrasePair const& a, PhrasePair const& b) const; + }; + }; + + template<typename Token> + void PhrasePair<Token> + ::init(uint64_t const pid1, bool is_inverse, + Token const* x, uint32_t const len, + pstats const* ps, size_t const numfeats) + { + inverse = is_inverse; + start1 = x; len1 = len; + p1 = pid1; + p2 = 0; + if (ps) + { + raw1 = ps->raw_cnt; + sample1 = ps->sample_cnt; + good1 = ps->good; + } + else raw1 = sample1 = good1 = 0; + joint = 0; + good2 = 0; + sample2 = 0; + raw2 = 0; + fvals.resize(numfeats); + } + + template<typename Token> + PhrasePair<Token> const& + PhrasePair<Token> + ::update(uint64_t const pid2, + Token const* x, uint32_t const len, jstats const& js) + { + p2 = pid2; + start2 = x; len2 = len; + raw2 = js.cnt2(); + joint = js.rcnt(); + assert(js.aln().size()); + if (js.aln().size()) + aln = js.aln()[0].second; + float total_fwd = 0, total_bwd = 0; + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + total_fwd += js.dcnt_fwd(po)+1; + total_bwd += js.dcnt_bwd(po)+1; + } + + // should we do that here or leave the raw counts? + for (int i = 0; i <= Moses::LRModel::NONE; i++) + { + PhraseOrientation po = static_cast<PhraseOrientation>(i); + dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd; + dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd; + } + + indoc = js.indoc; + return *this; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator<(PhrasePair const& other) const + { + return this->score < other.score; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator>(PhrasePair const& other) const + { + return this->score > other.score; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator<=(PhrasePair const& other) const + { + return this->score <= other.score; + } + + template<typename Token> + bool + PhrasePair<Token> + ::operator>=(PhrasePair const& other) const + { + return this->score >= other.score; + } + + template<typename Token> + PhrasePair<Token> const& + PhrasePair<Token> + ::operator+=(PhrasePair const& o) + { + raw1 += o.raw1; + raw2 += o.raw2; + good1 += o.good1; + good2 += o.good2; + joint += o.joint; + sample1 += o.sample1; + sample2 += o.sample2; + return *this; + } + + template<typename Token> + PhrasePair<Token> + ::PhrasePair(PhrasePair<Token> const& o) + : start1(o.start1) , start2(o.start2) + , len1(o.len1) , len2(o.len2) + , p1(o.p1) , p2(o.p2) + , raw1(o.raw1) , raw2(o.raw2) + , sample1(o.sample1) , sample2(o.sample2) + , good1(o.good1) , good2(o.good2) + , joint(o.joint) + , fvals(o.fvals) + , aln(o.aln) + , score(o.score) + , inverse(o.inverse) + , indoc(o.indoc) + { + for (int i = 0; i <= Moses::LRModel::NONE; ++i) + { + dfwd[i] = o.dfwd[i]; + dbwd[i] = o.dbwd[i]; + } + } + + template<typename Token> + int PhrasePair<Token> + ::SortByTargetIdSeq + ::cmp(PhrasePair const& a, PhrasePair const& b) const + { + size_t i = 0; + Token const* x = a.start2; + Token const* y = b.start2; + while (i < a.len2 && i < b.len2 && x->id() == y->id()) + { + x = x->next(); + y = y->next(); + ++i; + } + if (i == a.len2 && i == b.len2) return 0; + if (i == a.len2) return -1; + if (i == b.len2) return 1; + return x->id() < y->id() ? -1 : 1; + } + + template<typename Token> + bool PhrasePair<Token> + ::SortByTargetIdSeq + ::operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template<typename Token> + int PhrasePair<Token> + ::SortDescendingByJointCount + ::cmp(PhrasePair const& a, PhrasePair const& b) const + { + if (a.joint == b.joint) return 0; + return a.joint > b.joint ? -1 : 1; + } + + template<typename Token> + bool PhrasePair<Token> + ::SortDescendingByJointCount + ::operator()(PhrasePair const& a, PhrasePair const& b) const + { + return this->cmp(a,b) < 0; + } + + template<typename Token> + void PhrasePair<Token> + ::init() + { + inverse = false; + len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0; + start1 = start2 = NULL; + p1 = p2 = 0; + } + } +} |