Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
Diffstat (limited to 'moses/TranslationModel/UG/mm')
-rw-r--r-- moses/TranslationModel/UG/mm/Jamfile 19
-rw-r--r-- moses/TranslationModel/UG/mm/custom-pt.cc 9
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext.cc 183
-rw-r--r-- moses/TranslationModel/UG/mm/ug_bitext.h 39
-rw-r--r-- moses/TranslationModel/UG/mm/ug_im_ttrack.h 34
-rw-r--r-- moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h 21
-rw-r--r-- moses/TranslationModel/UG/mm/ug_phrasepair.cc 97
-rw-r--r-- moses/TranslationModel/UG/mm/ug_phrasepair.h 243
-rw-r--r-- moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h 46
9 files changed, 464 insertions, 227 deletions
diff --git a/moses/TranslationModel/UG/mm/Jamfile b/moses/TranslationModel/UG/mm/Jamfile
index 2cc923581..8d8af050a 100644
--- a/moses/TranslationModel/UG/mm/Jamfile
+++ b/moses/TranslationModel/UG/mm/Jamfile
@@ -72,15 +72,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
$(TOP)/util//kenutil
;
-exe custom-pt :
-custom-pt.cc
-$(TOP)/moses//moses
-$(TOP)//boost_iostreams
-$(TOP)//boost_program_options
-$(TOP)/moses/TranslationModel/UG/mm//mm
-$(TOP)/moses/TranslationModel/UG/generic//generic
-$(TOP)/util//kenutil
-;
+# exe custom-pt :
+# custom-pt.cc
+# $(TOP)/moses//moses
+# $(TOP)//boost_iostreams
+# $(TOP)//boost_program_options
+# $(TOP)/moses/TranslationModel/UG/mm//mm
+# $(TOP)/moses/TranslationModel/UG/generic//generic
+# $(TOP)/util//kenutil
+# ;
exe calc-coverage :
@@ -98,7 +98,6 @@ mtt-dump
mtt-count-words
symal2mam
mam2symal
-custom-pt
mmlex-build
mmlex-lookup
mam_verify
diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc
index 1c1e0893c..e52772b48 100644
--- a/moses/TranslationModel/UG/mm/custom-pt.cc
+++ b/moses/TranslationModel/UG/mm/custom-pt.cc
@@ -1,6 +1,6 @@
// build a phrase table for the given input
// #include "ug_lexical_phrase_scorer2.h"
-
+#if 0
#include <stdint.h>
#include <string>
#include <vector>
@@ -25,7 +25,7 @@
#include "ug_bitext.h"
#include "../mmsapt_phrase_scorers.h"
#include "ug_lexical_phrase_scorer2.h"
-
+#include "../sapt_phrase_scorers.h"
using namespace std;
using namespace ugdiss;
using namespace Moses;
@@ -110,6 +110,7 @@ int main(int argc, char* argv[])
{
// assert(argc == 4);
#if 0
+#if 0
string base = argv[1];
string L1 = argv[2];
string L2 = argv[3];
@@ -182,7 +183,7 @@ int main(int argc, char* argv[])
}
}
}
-
+#endif
exit(0);
}
-
+#endif
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 8dbbdcb92..a1a6dff7b 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -158,99 +158,25 @@ namespace Moses
jstats::
invalidate()
{
- my_rcnt = 0;
+ if (my_wcnt > 0)
+ my_wcnt *= -1;
}
- bool
+ void
jstats::
- valid()
- {
- return my_rcnt != 0;
- }
-
- bool
- PhrasePair::
- operator<=(PhrasePair const& other) const
+ validate()
{
- return this->score <= other.score;
+ if (my_wcnt < 0)
+ my_wcnt *= -1;
}
bool
- PhrasePair::
- operator>=(PhrasePair const& other) const
- {
- return this->score >= other.score;
- }
-
- bool
- PhrasePair::
- operator<(PhrasePair const& other) const
- {
- return this->score < other.score;
- }
-
- bool
- PhrasePair::
- operator>(PhrasePair const& other) const
- {
- return this->score > other.score;
- }
-
- PhrasePair::
- PhrasePair() {}
-
- PhrasePair::
- PhrasePair(PhrasePair const& o)
- : p1(o.p1),
- p2(o.p2),
- raw1(o.raw1),
- raw2(o.raw2),
- sample1(o.sample1),
- sample2(o.sample2),
- good1(o.good1),
- good2(o.good2),
- joint(o.joint),
- fvals(o.fvals),
- aln(o.aln),
- score(o.score)
- {
- for (size_t i = 0; i <= po_other; ++i)
- {
- dfwd[i] = o.dfwd[i];
- dbwd[i] = o.dbwd[i];
- }
- }
-
- void
- PhrasePair::
- init(uint64_t const pid1, pstats const& ps, size_t const numfeats)
+ jstats::
+ valid()
{
- p1 = pid1;
- p2 = 0;
- raw1 = ps.raw_cnt;
- sample1 = ps.sample_cnt;
- sample2 = 0;
- good1 = ps.good;
- good2 = 0;
- raw2 = 0;
- fvals.resize(numfeats);
+ return my_wcnt >= 0;
}
- void
- PhrasePair::
- init(uint64_t const pid1,
- pstats const& ps1,
- pstats const& ps2,
- size_t const numfeats)
- {
- p1 = pid1;
- raw1 = ps1.raw_cnt + ps2.raw_cnt;
- sample1 = ps1.sample_cnt + ps2.sample_cnt;
- sample2 = 0;
- good1 = ps1.good + ps2.good;
- good2 = 0;
- fvals.resize(numfeats);
- }
float
lbop(size_t const tries, size_t const succ, float const confidence)
@@ -261,85 +187,6 @@ namespace Moses
find_lower_bound_on_p(tries, succ, confidence)));
}
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, jstats const& js)
- {
- p2 = pid2;
- raw2 = js.cnt2();
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- float total_fwd = 0, total_bwd = 0;
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- total_fwd += js.dcnt_fwd(po)+1;
- total_bwd += js.dcnt_bwd(po)+1;
- }
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
- dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
- }
- return *this;
- }
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2, jstats const& js1, jstats const& js2)
- {
- p2 = pid2;
- raw2 = js1.cnt2() + js2.cnt2();
- joint = js1.rcnt() + js2.rcnt();
- assert(js1.aln().size() || js2.aln().size());
- if (js1.aln().size())
- aln = js1.aln()[0].second;
- else if (js2.aln().size())
- aln = js2.aln()[0].second;
- for (int i = po_first; i < po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
- dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
- }
- return *this;
- }
-
- PhrasePair const&
- PhrasePair::
- update(uint64_t const pid2,
- size_t const raw2extra,
- jstats const& js)
- {
- p2 = pid2;
- raw2 = js.cnt2() + raw2extra;
- joint = js.rcnt();
- assert(js.aln().size());
- if (js.aln().size())
- aln = js.aln()[0].second;
- for (int i = po_first; i <= po_other; i++)
- {
- PhraseOrientation po = static_cast<PhraseOrientation>(i);
- dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
- dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
- }
- return *this;
- }
-
- float
- PhrasePair::
- eval(vector<float> const& w)
- {
- assert(w.size() == this->fvals.size());
- this->score = 0;
- for (size_t i = 0; i < w.size(); ++i)
- this->score += w[i] * this->fvals[i];
- return this->score;
- }
-
template<>
sptr<imBitext<L2R_Token<SimpleWordId> > >
imBitext<L2R_Token<SimpleWordId> >::
@@ -371,7 +218,8 @@ namespace Moses
uint32_t row,col; char c;
while (ibuf >> row >> c >> col)
{
- assert(c == '-');
+ UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+ << "Error in alignment information:\n" << a);
binwrite(obuf,row);
binwrite(obuf,col);
}
@@ -639,7 +487,6 @@ namespace Moses
cout << string(90,'-') << endl;
}
-
PhraseOrientation
find_po_fwd(vector<vector<ushort> >& a1,
vector<vector<ushort> >& a2,
@@ -654,13 +501,13 @@ namespace Moses
ushort ns1,ne1,ne2;
if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
- {
- return po_other;
- }
+ return po_other;
+
if (ns1 >= e1)
{
for (ushort j = e1; j < ns1; ++j)
- if (a1[j].size()) return po_jfwd;
+ if (a1[j].size())
+ return po_jfwd;
return po_mono;
}
else
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 397253973..4cb34c02d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -56,6 +56,7 @@ namespace Moses {
class Mmsapt;
namespace bitext
{
+ template<typename TKN> class Bitext;
using namespace ugdiss;
template<typename TKN> class Bitext;
@@ -120,6 +121,7 @@ namespace Moses {
void add(float w, vector<uchar> const& a, uint32_t const cnt2,
uint32_t fwd_orient, uint32_t bwd_orient);
void invalidate();
+ void validate();
bool valid();
uint32_t dcnt_fwd(PhraseOrientation const idx) const;
uint32_t dcnt_bwd(PhraseOrientation const idx) const;
@@ -157,43 +159,6 @@ namespace Moses {
uint32_t fwd_o, uint32_t bwd_o);
};
- class
- PhrasePair
- {
- public:
- uint64_t p1, p2;
- uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
- vector<float> fvals;
- float dfwd[po_other+1];
- float dbwd[po_other+1];
- vector<uchar> aln;
- // float avlex12,avlex21; // average lexical probs (Moses std)
- // float znlex1,znlex2; // zens-ney lexical smoothing
- // float colex1,colex2; // based on raw lexical occurrences
- float score;
- PhrasePair();
- PhrasePair(PhrasePair const& o);
- bool operator<(PhrasePair const& other) const;
- bool operator>(PhrasePair const& other) const;
- bool operator<=(PhrasePair const& other) const;
- bool operator>=(PhrasePair const& other) const;
-
- void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
- void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
- size_t const numfeats);
-
- PhrasePair const&
- update(uint64_t const pid2, jstats const& js);
-
- PhrasePair const&
- update(uint64_t const pid2, jstats const& js1, jstats const& js2);
-
- PhrasePair const&
- update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
-
- float eval(vector<float> const& w);
- };
-
template<typename TKN>
class Bitext
diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
index 05066c922..0c6e4afbf 100644
--- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
@@ -16,6 +16,9 @@
#include "tpt_tokenindex.h"
#include "ug_ttrack_base.h"
#include "tpt_tokenindex.h"
+#include "util/exception.hh"
+#include "moses/Util.h"
+
// #include "ug_vocab.h"
// define the corpus buffer size (in sentences) and the
@@ -49,6 +52,8 @@ namespace ugdiss
typename boost::shared_ptr<imTtrack<Token> >
append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
+ void m_check_token_count(); // debugging function
+
public:
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
@@ -70,6 +75,22 @@ namespace ugdiss
};
template<typename Token>
+ void
+ imTtrack<Token>::
+ m_check_token_count()
+ { // sanity check
+ size_t check = 0;
+ BOOST_FOREACH(vector<Token> const& s, *myData)
+ check += s.size();
+ UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]"
+ << " Wrong token count after appending sentence!"
+ << " Counted " << check << " but expected "
+ << this->numToks << " in a total of " << myData->size()
+ << " sentences.");
+
+ }
+
+ template<typename Token>
Token const*
imTtrack<Token>::
sntStart(size_t sid) const // return pointer to beginning of sentence
@@ -111,9 +132,9 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL)
+ : numToks(0)
{
myData.reset(new vector<vector<Token> >());
- numToks = 0;
string line,w;
size_t linectr=0;
boost::unordered_map<string,id_type> H;
@@ -135,6 +156,7 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(size_t reserve)
+ : numToks(0)
{
myData.reset(new vector<vector<Token> >());
if (reserve) myData->reserve(reserve);
@@ -143,9 +165,9 @@ namespace ugdiss
template<typename Token>
imTtrack<Token>::
imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
+ : numToks(0)
{
myData = d;
- numToks = 0;
BOOST_FOREACH(vector<Token> const& v, *d)
numToks += v.size();
}
@@ -171,6 +193,9 @@ namespace ugdiss
shared_ptr<imTtrack<TOKEN> >
append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
{
+#if 1
+ if (crp) crp->m_check_token_count();
+#endif
shared_ptr<imTtrack<TOKEN> > ret;
if (crp == NULL)
{
@@ -185,6 +210,11 @@ namespace ugdiss
}
else ret = crp;
ret->myData->push_back(snt);
+ ret->numToks += snt.size();
+
+#if 1
+ ret->m_check_token_count();
+#endif
return ret;
}
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
index 558b5a7fa..b7e359223 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
@@ -27,7 +27,6 @@ namespace ugdiss
typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
table_t COOC;
void open(string const& fname);
-
template<typename someint>
void
score(TKN const* snt1, size_t const s1, size_t const e1,
@@ -104,7 +103,19 @@ namespace ugdiss
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
<< ": alpha parameter must be >= 0");
- return float(COOC[s][t]+alpha)/(COOC.m1(s)+alpha);
+ float ret = COOC[s][t]+alpha;
+ ret = (ret?ret:1.)/(COOC.m1(s)+alpha);
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ << ": result not > 0 and <= 1. alpha = " << alpha << "; "
+ << COOC[s][t] << "/" << COOC.m1(s));
+
+#if 0
+ cerr << "[" << s << "," << t << "] "
+ << COOC.m1(s) << "/"
+ << COOC[s][t] << "/"
+ << COOC.m2(t) << endl;
+#endif
+ return ret;
}
template<typename TKN>
@@ -115,7 +126,11 @@ namespace ugdiss
if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
<< ": alpha parameter must be >= 0");
- return float(COOC[s][t]+alpha)/(COOC.m2(t)+alpha);
+ float ret = float(COOC[s][t]+alpha);
+ ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
+ UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__
+ << ": result not > 0 and <= 1.");
+ return ret;
}
template<typename TKN>
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
new file mode 100644
index 000000000..6373f8468
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
@@ -0,0 +1,97 @@
+#include "ug_phrasepair.h"
+namespace Moses {
+ namespace bitext
+ {
+
+#if 0
+ void
+ PhrasePair::
+ init()
+ {
+ p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ }
+
+ void
+ PhrasePair::
+ init(uint64_t const pid1,
+ pstats const& ps1,
+ pstats const& ps2,
+ size_t const numfeats)
+ {
+ p1 = pid1;
+ raw1 = ps1.raw_cnt + ps2.raw_cnt;
+ sample1 = ps1.sample_cnt + ps2.sample_cnt;
+ sample2 = 0;
+ good1 = ps1.good + ps2.good;
+ good2 = 0;
+ joint = 0;
+ fvals.resize(numfeats);
+ }
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2, jstats const& js1, jstats const& js2)
+ {
+ p2 = pid2;
+ raw2 = js1.cnt2() + js2.cnt2();
+ joint = js1.rcnt() + js2.rcnt();
+ assert(js1.aln().size() || js2.aln().size());
+ if (js1.aln().size())
+ aln = js1.aln()[0].second;
+ else if (js2.aln().size())
+ aln = js2.aln()[0].second;
+ for (int i = po_first; i < po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
+ dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
+ }
+ return *this;
+ }
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2, size_t r2)
+ {
+ p2 = pid2;
+ raw2 = r2;
+ joint = 0;
+ return *this;
+ }
+
+
+ PhrasePair const&
+ PhrasePair::
+ update(uint64_t const pid2,
+ size_t const raw2extra,
+ jstats const& js)
+ {
+ p2 = pid2;
+ raw2 = js.cnt2() + raw2extra;
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
+ dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
+ }
+ return *this;
+ }
+
+ float
+ PhrasePair::
+ eval(vector<float> const& w)
+ {
+ assert(w.size() == this->fvals.size());
+ this->score = 0;
+ for (size_t i = 0; i < w.size(); ++i)
+ this->score += w[i] * this->fvals[i];
+ return this->score;
+ }
+#endif
+ } // namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..8cd43dc18
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,243 @@
+//-*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+using namespace ugdiss;
+using namespace std;
+
+namespace Moses {
+ namespace bitext
+ {
+
+ template<typename Token>
+ string
+ toString(TokenIndex const& V, Token const* x, size_t const len)
+ {
+ if (!len) return "";
+ UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+ ostringstream buf;
+ buf << V[x->id()];
+ size_t i = 1;
+ for (x = x->next(); x && i < len; ++i, x = x->next())
+ buf << " " << V[x->id()];
+ UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
+ return buf.str();
+ }
+
+ template<typename Token>
+ class
+ PhrasePair
+ {
+ public:
+ Token const* start1;
+ Token const* start2;
+ uint32_t len1;
+ uint32_t len2;
+ // uint64_t p1, p2;
+ uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
+ vector<float> fvals;
+ float dfwd[po_other+1]; // distortion counts // counts or probs?
+ float dbwd[po_other+1]; // distortion counts
+ vector<uchar> aln;
+ float score;
+ PhrasePair() { };
+ PhrasePair(PhrasePair const& o);
+
+ PhrasePair const& operator+=(PhrasePair const& other);
+
+ bool operator<(PhrasePair const& other) const;
+ bool operator>(PhrasePair const& other) const;
+ bool operator<=(PhrasePair const& other) const;
+ bool operator>=(PhrasePair const& other) const;
+
+ void init();
+ void init(Token const* x, uint32_t const len,
+ pstats const* ps = NULL, size_t const numfeats=0);
+
+ // void init(uint64_t const pid1, pstats const& ps, size_t const numfeats);
+ // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2,
+ // size_t const numfeats);
+
+ // PhrasePair const&
+ // update(uint64_t const pid2, size_t r2 = 0);
+
+ PhrasePair const&
+ update(Token const* x, uint32_t const len, jstats const& js);
+
+ // PhrasePair const&
+ // update(uint64_t const pid2, jstats const& js1, jstats const& js2);
+
+ // PhrasePair const&
+ // update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
+
+ // float
+ // eval(vector<float> const& w);
+
+ class SortByTargetIdSeq
+ {
+ public:
+ int cmp(PhrasePair const& a, PhrasePair const& b) const;
+ bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+ };
+ };
+
+ template<typename Token>
+ void
+ PhrasePair<Token>::
+ init(Token const* x, uint32_t const len,
+ pstats const* ps, size_t const numfeats)
+ {
+ start1 = x; len1 = len;
+ // p1 = pid1;
+ // p2 = 0;
+ if (ps)
+ {
+ raw1 = ps->raw_cnt;
+ sample1 = ps->sample_cnt;
+ good1 = ps->good;
+ }
+ else raw1 = sample1 = good1 = 0;
+ joint = 0;
+ good2 = 0;
+ sample2 = 0;
+ raw2 = 0;
+ fvals.resize(numfeats);
+ }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ update(Token const* x, uint32_t const len, jstats const& js)
+ {
+ // p2 = pid2;
+ start2 = x; len2 = len;
+ raw2 = js.cnt2();
+ joint = js.rcnt();
+ assert(js.aln().size());
+ if (js.aln().size())
+ aln = js.aln()[0].second;
+ float total_fwd = 0, total_bwd = 0;
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ total_fwd += js.dcnt_fwd(po)+1;
+ total_bwd += js.dcnt_bwd(po)+1;
+ }
+
+ // should we do that here or leave the raw counts?
+ for (int i = po_first; i <= po_other; i++)
+ {
+ PhraseOrientation po = static_cast<PhraseOrientation>(i);
+ dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
+ dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
+ }
+
+ return *this;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator<(PhrasePair const& other) const
+ { return this->score < other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>(PhrasePair const& other) const
+ { return this->score > other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator<=(PhrasePair const& other) const
+ { return this->score <= other.score; }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ operator>=(PhrasePair const& other) const
+ { return this->score >= other.score; }
+
+ template<typename Token>
+ PhrasePair<Token> const&
+ PhrasePair<Token>::
+ operator+=(PhrasePair const& o)
+ {
+ raw1 += o.raw1;
+ raw2 += o.raw2;
+ sample1 += o.sample1;
+ sample2 += o.sample2;
+ good1 += o.good1;
+ good2 += o.good2;
+ joint += o.joint;
+ return *this;
+ }
+
+ template<typename Token>
+ PhrasePair<Token>::
+ PhrasePair(PhrasePair<Token> const& o)
+ : start1(o.start1)
+ , start2(o.start2)
+ , len1(o.len1)
+ , len2(o.len2)
+ , raw1(o.raw1)
+ , raw2(o.raw2)
+ , sample1(o.sample1)
+ , sample2(o.sample2)
+ , good1(o.good1)
+ , good2(o.good2)
+ , joint(o.joint)
+ , fvals(o.fvals)
+ , aln(o.aln)
+ , score(o.score)
+ {
+ for (size_t i = 0; i <= po_other; ++i)
+ {
+ dfwd[i] = o.dfwd[i];
+ dbwd[i] = o.dbwd[i];
+ }
+ }
+
+ template<typename Token>
+ int
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ cmp(PhrasePair const& a, PhrasePair const& b) const
+ {
+ size_t i = 0;
+ Token const* x = a.start2;
+ Token const* y = b.start2;
+ while (i < a.len2 && i < b.len2 && x->id() == y->id())
+ {
+ x = x->next();
+ y = y->next();
+ ++i;
+ }
+ if (i == a.len2 && i == b.len2) return 0;
+ if (i == a.len2) return -1;
+ if (i == b.len2) return 1;
+ return x->id() < y->id() ? -1 : 1;
+ }
+
+ template<typename Token>
+ bool
+ PhrasePair<Token>::
+ SortByTargetIdSeq::
+ operator()(PhrasePair const& a, PhrasePair const& b) const
+ {
+ return this->cmp(a,b) < 0;
+ }
+
+ template<typename Token>
+ void
+ PhrasePair<Token>::
+ init()
+ {
+ len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+ start1 = start2 = NULL;
+ }
+
+
+ } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
index 14bf6cdad..ab7f96bf0 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@@ -7,6 +7,8 @@
#include "ug_typedefs.h"
#include "tpt_tokenindex.h"
#include <iostream>
+#include "util/exception.hh"
+#include "moses/Util.h"
//#include <cassert>
// #include "ug_bv_iter.h"
@@ -60,10 +62,15 @@ namespace ugdiss
// TSA_tree_iterator(TSA_tree_iterator const& other);
TSA_tree_iterator(TSA<Token> const* s);
+ TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
// TSA_tree_iterator(TSA<Token> const* s, Token const& t);
TSA_tree_iterator(TSA<Token> const* s,
Token const* kstart,
+ size_t const len,
+ bool full_match_only=true);
+ TSA_tree_iterator(TSA<Token> const* s,
+ Token const* kstart,
Token const* kend,
bool full_match_only=true);
// TSA_tree_iterator(TSA<Token> const* s,
@@ -150,9 +157,12 @@ namespace ugdiss
double approxOccurrenceCount(int p=-1) const
{
assert(root);
+ if (p < 0) p += lower.size();
double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize();
- assert(ret < root->corpus->numTokens());
if (ret < 25) ret = rawCnt(p);
+ UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] "
+ << "Word count mismatch.");
+ assert(ret <= root->corpus->numTokens());
return ret;
}
@@ -320,6 +330,18 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
+ TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
+ : root(s)
+ {
+ Token const* x = other.getToken(0);
+ for (size_t i = 0; i < other.size() && this->extend(x->id()); ++i)
+ x = x->next();
+ };
+
+
+
+ template<typename Token>
+ TSA_tree_iterator<Token>::
TSA_tree_iterator
(TSA<Token> const* r,
id_type const* s,
@@ -385,6 +407,25 @@ namespace ugdiss
template<typename Token>
TSA_tree_iterator<Token>::
TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
+ size_t const len, bool full_match_only)
+ : root(s)
+ {
+ if (!root) return;
+ size_t i = 0;
+ for (; i < len && kstart && extend(*kstart); ++i)
+ kstart = kstart->next();
+ if (full_match_only && i != len)
+ {
+ lower.clear();
+ upper.clear();
+ }
+ };
+
+ // DEPRECATED: DO NOT USE. Use the one that takes the length
+ // instead of kend.
+ template<typename Token>
+ TSA_tree_iterator<Token>::
+ TSA_tree_iterator(TSA<Token> const* s, Token const* kstart,
Token const* kend, bool full_match_only)
: root(s)
{
@@ -561,8 +602,7 @@ namespace ugdiss
TSA_tree_iterator<Token>::
rawCnt(int p) const
{
- if (p < 0)
- p = lower.size()+p;
+ if (p < 0) p += lower.size();
assert(p>=0);
if (lower.size() == 0) return root->getCorpusSize();
return root->rawCnt(lower[p],upper[p]);