Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
Diffstat (limited to 'moses/TranslationModel/UG/mm/ug_phrasepair.h')
-rw-r--r--moses/TranslationModel/UG/mm/ug_phrasepair.h243
1 files changed, 243 insertions, 0 deletions
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..8cd43dc18
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,243 @@
+//-*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+using namespace ugdiss;
+using namespace std;
+
+namespace Moses {
+ namespace bitext
+ {
+
+ template<typename Token>
+ string
+ toString(TokenIndex const& V, Token const* x, size_t const len)
+ {
+ if (!len) return "";
+ UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+ ostringstream buf;
+ buf << V[x->id()];
+ size_t i = 1;
+ for (x = x->next(); x && i < len; ++i, x = x->next())
+ buf << " " << V[x->id()];
+ UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
+ return buf.str();
+ }
+
+ template<typename Token>
+ class
+ PhrasePair
+ {
+ public:
+ Token const* start1;
+ Token const* start2;
+ uint32_t len1;
+ uint32_t len2;
+ // uint64_t p1, p2;
+ uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
+ vector<float> fvals;
+ float dfwd[po_other+1]; // distortion counts // counts or probs?
+ float dbwd[po_other+1]; // distortion counts
+ vector<uchar> aln;
+ float score;
+ PhrasePair() { };
+ PhrasePair(PhrasePair const& o);
+
+ PhrasePair const& operator+=(PhrasePair const& other);
+
+ bool operator<(PhrasePair const& other) const;
+ bool operator>(PhrasePair const& other) const;
+ bool operator<=(PhrasePair const& other) const;
+ bool operator>=(PhrasePair const& other) const;
+
+ void init();
+ void init(Token const* x, uint32_t const len,
+ pstats const* ps = NULL, size_t const numfeats=0);
+
+ // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
+ // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
+ // size_t const numfeats);
+
+ // PhrasePair const&
+ // update(uint64_t const pid2, size_t r2 = 0);
+
+ PhrasePair const&
+ update(Token const* x, uint32_t const len, jstats const& js);
+
+ // PhrasePair const&
+ // update(uint64_t const pid2, jstats const& js1, jstats const& js2);
+
+ // PhrasePair const&
+ // update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
+
+ // float
+ // eval(vector<float> const& w);
+
+ class SortByTargetIdSeq
+ {
+ public:
+ int cmp(PhrasePair const& a, PhrasePair const& b) const;
+ bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+ };
+ };
+
+ template<typename Token>
+ void
+ PhrasePair<Token>::
+ init(Token const* x, uint32_t const len,
+ pstats const* ps, size_t const numfeats)
+ {
+ start1 = x; len1 = len;
+ // p1 = pid1;
+ // p2 = 0;
+ if (ps)
+ {
+ raw1 = ps->raw_cnt;
+ sample1 = ps->sample_cnt;
+ good1 = ps->good;
+ }
+ else raw1 = sample1 = good1 = 0;
+ joint = 0;
+ good2 = 0;
+ sample2 = 0;
+ raw2 = 0;
+ fvals.resize(numfeats);
+ }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ update(Token const* x, uint32_t const len, jstats const& js)
+ {
+ // p2 = pid2;
+ start2 = x; len2 = len;
+ raw2 = js.cnt2();
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ float total_fwd = 0, total_bwd = 0;
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ total_fwd += js.dcnt_fwd(po)+1;
+ total_bwd += js.dcnt_bwd(po)+1;
+ }
+
+ // should we do that here or leave the raw counts?
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
+ dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
+ }
+
+ return *this;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator<(PhrasePair const& other) const
+ { return this->score < other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>(PhrasePair const& other) const
+ { return this->score > other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator<=(PhrasePair const& other) const
+ { return this->score <= other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>=(PhrasePair const& other) const
+ { return this->score >= other.score; }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ operator+=(PhrasePair const& o)
+ {
+ raw1 += o.raw1;
+ raw2 += o.raw2;
+ sample1 += o.sample1;
+ sample2 += o.sample2;
+ good1 += o.good1;
+ good2 += o.good2;
+ joint += o.joint;
+ return *this;
+ }
+
+ template<typename Token>
+ PhrasePair<Token>::
+ PhrasePair(PhrasePair<Token> const& o)
+ : start1(o.start1)
+ , start2(o.start2)
+ , len1(o.len1)
+ , len2(o.len2)
+ , raw1(o.raw1)
+ , raw2(o.raw2)
+ , sample1(o.sample1)
+ , sample2(o.sample2)
+ , good1(o.good1)
+ , good2(o.good2)
+ , joint(o.joint)
+ , fvals(o.fvals)
+ , aln(o.aln)
+ , score(o.score)
+ {
+ for (size_t i = 0; i <= po_other; ++i)
+ {
+ dfwd[i] = o.dfwd[i];
+ dbwd[i] = o.dbwd[i];
+ }
+ }
+
+ template<typename Token>
+ int
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ cmp(PhrasePair const& a, PhrasePair const& b) const
+ {
+ size_t i = 0;
+ Token const* x = a.start2;
+ Token const* y = b.start2;
+ while (i < a.len2 && i < b.len2 && x->id() == y->id())
+ {
+ x = x->next();
+ y = y->next();
+ ++i;
+ }
+ if (i == a.len2 && i == b.len2) return 0;
+ if (i == a.len2) return -1;
+ if (i == b.len2) return 1;
+ return x->id() < y->id() ? -1 : 1;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ operator()(PhrasePair const& a, PhrasePair const& b) const
+ {
+ return this->cmp(a,b) < 0;
+ }
+
+ template<typename Token>
+ void
+ PhrasePair<Token>::
+ init()
+ {
+ len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ start1 = start2 = NULL;
+ }
+
+
+ } // namespace bitext
+} // namespace Moses