9 files changed, 464 insertions, 227 deletions
diff --git a/moses/TranslationModel/UG/mm/Jamfile b/moses/TranslationModel/UG/mm/Jamfile
index 2cc923581..8d8af050a 100644
--- a/moses/TranslationModel/UG/mm/Jamfile
+++ b/moses/TranslationModel/UG/mm/Jamfile
@@ -72,15 +72,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
 $(TOP)/util//kenutil 
 ; 
 
-exe custom-pt : 
-custom-pt.cc 
-$(TOP)/moses//moses
-$(TOP)//boost_iostreams 
-$(TOP)//boost_program_options 
-$(TOP)/moses/TranslationModel/UG/mm//mm 
-$(TOP)/moses/TranslationModel/UG/generic//generic 
-$(TOP)/util//kenutil 
-; 
+# exe custom-pt : 
+# custom-pt.cc 
+# $(TOP)/moses//moses
+# $(TOP)//boost_iostreams 
+# $(TOP)//boost_program_options 
+# $(TOP)/moses/TranslationModel/UG/mm//mm 
+# $(TOP)/moses/TranslationModel/UG/generic//generic 
+# $(TOP)/util//kenutil 
+# ; 
 
 
 exe calc-coverage : 
@@ -98,7 +98,6 @@ mtt-dump
 mtt-count-words 
 symal2mam 
 mam2symal 
-custom-pt 
 mmlex-build 
 mmlex-lookup
 mam_verify 
diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc
index 1c1e0893c..e52772b48 100644
--- a/moses/TranslationModel/UG/mm/custom-pt.cc
+++ b/moses/TranslationModel/UG/mm/custom-pt.cc
@@ -1,6 +1,6 @@
 // build a phrase table for the given input
 // #include "ug_lexical_phrase_scorer2.h"
-
+#if 0
 #include <stdint.h>
 #include <string>
 #include <vector>
@@ -25,7 +25,7 @@
 #include "ug_bitext.h"
 #include "../mmsapt_phrase_scorers.h"
 #include "ug_lexical_phrase_scorer2.h"
-
+#include "../sapt_phrase_scorers.h"
 using namespace std;
 using namespace ugdiss;
 using namespace Moses;
@@ -110,6 +110,7 @@ int main(int argc, char* argv[])
 {
   // assert(argc == 4);
 #if 0
+#if 0
   string base = argv[1];
   string L1   = argv[2];
   string L2   = argv[3];
@@ -182,7 +183,7 @@ int main(int argc, char* argv[])
       	    }
       	}
     }
-  
+#endif  
     exit(0);
 }
-
+#endif
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 8dbbdcb92..a1a6dff7b 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -158,99 +158,25 @@ namespace Moses
     jstats::
     invalidate()
     {
-      my_rcnt = 0;
+      if (my_wcnt > 0) 
+	my_wcnt *= -1;
     }
 
-    bool
+    void 
     jstats::
-    valid()
-    {
-      return my_rcnt != 0;
-    }
-
-    bool
-    PhrasePair::
-    operator<=(PhrasePair const& other) const
+    validate()
     {
-      return this->score <= other.score;
+      if (my_wcnt < 0) 
+	my_wcnt *= -1;
     }
 
     bool
-    PhrasePair::
-    operator>=(PhrasePair const& other) const
-    {
-      return this->score >= other.score;
-    }
-
-    bool
-    PhrasePair::
-    operator<(PhrasePair const& other) const
-    {
-      return this->score < other.score;
-    }
-    
-    bool
-    PhrasePair::
-    operator>(PhrasePair const& other) const
-    {
-      return this->score > other.score;
-    }
-    
-    PhrasePair::
-    PhrasePair() {}
-
-    PhrasePair::
-    PhrasePair(PhrasePair const& o) 
-      : p1(o.p1), 
-	p2(o.p2),
-	raw1(o.raw1), 
-	raw2(o.raw2), 
-	sample1(o.sample1),
-	sample2(o.sample2),
-	good1(o.good1),
-	good2(o.good2),
-	joint(o.joint),
-	fvals(o.fvals),
-	aln(o.aln),
-	score(o.score)
-    {
-      for (size_t i = 0; i <= po_other; ++i)
-	{
-	  dfwd[i] = o.dfwd[i];
-	  dbwd[i] = o.dbwd[i];
-	}
-    }
-    
-    void
-    PhrasePair::
-    init(uint64_t const pid1, pstats const& ps, size_t const numfeats)
+    jstats::
+    valid()
     {
-      p1      = pid1;
-      p2      = 0;
-      raw1    = ps.raw_cnt;
-      sample1 = ps.sample_cnt;
-      sample2 = 0;
-      good1   = ps.good;
-      good2   = 0;
-      raw2    = 0;
-      fvals.resize(numfeats);
+      return my_wcnt >= 0;
     }
 
-    void
-    PhrasePair::
-    init(uint64_t const pid1, 
-	 pstats const& ps1, 
-	 pstats const& ps2, 
-	 size_t const numfeats)
-    {
-      p1      = pid1;
-      raw1    = ps1.raw_cnt    + ps2.raw_cnt;
-      sample1 = ps1.sample_cnt + ps2.sample_cnt;
-      sample2 = 0;
-      good1   = ps1.good       + ps2.good;
-      good2   = 0;
-      fvals.resize(numfeats);
-    }
     
     float 
     lbop(size_t const tries, size_t const succ, float const confidence)
@@ -261,85 +187,6 @@ namespace Moses
 		 find_lower_bound_on_p(tries, succ, confidence)));
     }
     
-    PhrasePair const&
-    PhrasePair::
-    update(uint64_t const pid2, jstats const& js)   
-    {
-      p2    = pid2;
-      raw2  = js.cnt2();
-      joint = js.rcnt();
-      assert(js.aln().size());
-      if (js.aln().size()) 
-	aln = js.aln()[0].second;
-      float total_fwd = 0, total_bwd = 0;
-      for (int i = po_first; i <= po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  total_fwd += js.dcnt_fwd(po)+1;
-	  total_bwd += js.dcnt_bwd(po)+1;
-	}
-      for (int i = po_first; i <= po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
-	  dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
-	}
-      return *this;
-    }
-
-    PhrasePair const&
-    PhrasePair::
-    update(uint64_t const pid2, jstats const& js1, jstats const& js2)   
-    {
-      p2    = pid2;
-      raw2  = js1.cnt2() + js2.cnt2();
-      joint = js1.rcnt() + js2.rcnt();
-      assert(js1.aln().size() || js2.aln().size());
-      if (js1.aln().size()) 
-	aln = js1.aln()[0].second;
-      else if (js2.aln().size()) 
-	aln = js2.aln()[0].second;
-      for (int i = po_first; i < po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
-	  dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
-	}
-      return *this;
-    }
-
-    PhrasePair const&
-    PhrasePair::
-    update(uint64_t const pid2, 
-	   size_t   const raw2extra,
-	   jstats   const& js)   
-    {
-      p2    = pid2;
-      raw2  = js.cnt2() + raw2extra;
-      joint = js.rcnt();
-      assert(js.aln().size());
-      if (js.aln().size()) 
-	aln = js.aln()[0].second;
-      for (int i = po_first; i <= po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
-	  dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
-	}
-      return *this;
-    }
-
-    float
-    PhrasePair::
-    eval(vector<float> const& w)
-    {
-      assert(w.size() == this->fvals.size());
-      this->score = 0;
-      for (size_t i = 0; i < w.size(); ++i)
-	this->score += w[i] * this->fvals[i];
-      return this->score;
-    }
-  
     template<>
     sptr<imBitext<L2R_Token<SimpleWordId> > > 
     imBitext<L2R_Token<SimpleWordId> >::
@@ -371,7 +218,8 @@ namespace Moses
 	  uint32_t row,col; char c;
 	  while (ibuf >> row >> c >> col)
 	    {
-	      assert(c == '-');
+	      UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+			     << "Error in alignment information:\n" << a);
 	      binwrite(obuf,row);
 	      binwrite(obuf,col);
 	    }
@@ -639,7 +487,6 @@ namespace Moses
       cout  << string(90,'-') << endl;
     }
 
-
     PhraseOrientation 
     find_po_fwd(vector<vector<ushort> >& a1,
 		vector<vector<ushort> >& a2,
@@ -654,13 +501,13 @@ namespace Moses
       
       ushort ns1,ne1,ne2;
       if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
-	{
-	  return po_other;
-	}
+	return po_other;
+
       if (ns1 >= e1)
 	{
 	  for (ushort j = e1; j < ns1; ++j)
-	    if (a1[j].size()) return po_jfwd;
+	    if (a1[j].size()) 
+	      return po_jfwd;
 	  return po_mono;
 	}
       else
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 397253973..4cb34c02d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -56,6 +56,7 @@ namespace Moses {
   class Mmsapt;
   namespace bitext
   {
+    template<typename TKN> class Bitext;
     using namespace ugdiss;
 
     template<typename TKN> class Bitext;
@@ -120,6 +121,7 @@ namespace Moses {
       void add(float w, vector<uchar> const& a, uint32_t const cnt2,
 	       uint32_t fwd_orient, uint32_t bwd_orient);
       void invalidate();
+      void validate();
       bool valid();
       uint32_t dcnt_fwd(PhraseOrientation const idx) const;
       uint32_t dcnt_bwd(PhraseOrientation const idx) const;
@@ -157,43 +159,6 @@ namespace Moses {
 	  uint32_t fwd_o, uint32_t bwd_o);
     };
     
-    class 
-    PhrasePair
-    {
-    public:
-      uint64_t p1, p2;
-      uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
-      vector<float> fvals;
-      float dfwd[po_other+1];
-      float dbwd[po_other+1];
-      vector<uchar> aln;
-      // float    avlex12,avlex21; // average lexical probs (Moses std)
-      // float    znlex1,znlex2;   // zens-ney lexical smoothing
-      // float    colex1,colex2;   // based on raw lexical occurrences
-      float score;
-      PhrasePair();
-      PhrasePair(PhrasePair const& o);
-      bool operator<(PhrasePair const& other) const;
-      bool operator>(PhrasePair const& other) const;
-      bool operator<=(PhrasePair const& other) const;
-      bool operator>=(PhrasePair const& other) const;
-
-      void init(uint64_t const pid1, pstats const& ps,  size_t const numfeats);
-      void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, 
-		size_t const numfeats);
-
-      PhrasePair const& 
-      update(uint64_t const pid2, jstats const& js);
-
-      PhrasePair const& 
-      update(uint64_t const pid2, jstats   const& js1, jstats   const& js2);
-
-      PhrasePair const& 
-      update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
-
-      float eval(vector<float> const& w);
-    };
-
 
     template<typename TKN>
     class Bitext 
diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
index 05066c922..0c6e4afbf 100644
--- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
@@ -16,6 +16,9 @@
 #include "tpt_tokenindex.h"
 #include "ug_ttrack_base.h"
 #include "tpt_tokenindex.h"
+#include "util/exception.hh"
+#include "moses/Util.h"
+
 // #include "ug_vocab.h"
 
 // define the corpus buffer size (in sentences) and the
@@ -49,6 +52,8 @@ namespace ugdiss
     typename boost::shared_ptr<imTtrack<Token> > 
     append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
 
+    void m_check_token_count(); // debugging function
+
   public:
 
     imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
@@ -70,6 +75,22 @@ namespace ugdiss
   };
 
   template<typename Token>
+  void
+  imTtrack<Token>::
+  m_check_token_count()
+  { // Sanity check: this->numToks must equal the summed length of all stored sentences.
+    size_t check = 0; // token total recomputed from *myData; one full pass per call
+    BOOST_FOREACH(vector<Token> const& s, *myData)
+      check += s.size();
+    UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]" 
+		   << " Wrong token count after appending sentence!"
+		   << " Counted " << check << " but expected " 
+		   << this->numToks << " in a total of " << myData->size() 
+		   << " sentences.");
+    
+  }
+
+  template<typename Token>
   Token const* 
   imTtrack<Token>::
   sntStart(size_t sid) const // return pointer to beginning of sentence
@@ -111,9 +132,9 @@ namespace ugdiss
   template<typename Token>
   imTtrack<Token>::
   imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL)
+    : numToks(0)
   {
     myData.reset(new vector<vector<Token> >());
-    numToks = 0;
     string line,w;
     size_t linectr=0;
     boost::unordered_map<string,id_type> H;
@@ -135,6 +156,7 @@ namespace ugdiss
   template<typename Token>
   imTtrack<Token>::
   imTtrack(size_t reserve)
+    : numToks(0)
   {
     myData.reset(new vector<vector<Token> >());
     if (reserve) myData->reserve(reserve);
@@ -143,9 +165,9 @@ namespace ugdiss
   template<typename Token>
   imTtrack<Token>::
   imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
+    : numToks(0)
   {
     myData  = d;
-    numToks = 0;
     BOOST_FOREACH(vector<Token> const& v, *d)
       numToks += v.size();
   }
@@ -171,6 +193,9 @@ namespace ugdiss
   shared_ptr<imTtrack<TOKEN> > 
   append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
   {
+#if 1
+    if (crp) crp->m_check_token_count();
+#endif
     shared_ptr<imTtrack<TOKEN> > ret;
     if (crp == NULL)
       {
@@ -185,6 +210,11 @@ namespace ugdiss
       }
     else ret = crp;
     ret->myData->push_back(snt);
+    ret->numToks += snt.size();
+
+#if 1
+    ret->m_check_token_count();
+#endif
     return ret;
   }
 
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
index 558b5a7fa..b7e359223 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
@@ -27,7 +27,6 @@ namespace ugdiss
     typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
     table_t COOC;
     void open(string const& fname);
-
     template<typename someint>
     void 
     score(TKN const* snt1, size_t const s1, size_t const e1,
@@ -104,7 +103,19 @@ namespace ugdiss
     if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
     UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
 		   << ": alpha parameter must be >= 0");
-    return float(COOC[s][t]+alpha)/(COOC.m1(s)+alpha);
+    float ret = COOC[s][t]+alpha;
+    ret =  (ret?ret:1.)/(COOC.m1(s)+alpha);
+    UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ 
+		   << ": result not > 0 and <= 1. alpha = " << alpha << "; "
+		   << COOC[s][t] << "/" << COOC.m1(s));
+
+#if 0
+    cerr << "[" << s << "," << t << "] " 
+	 << COOC.m1(s) << "/" 
+	 << COOC[s][t] << "/" 
+	 << COOC.m2(t) << endl;
+#endif
+    return ret;
   }
   
   template<typename TKN>
@@ -115,7 +126,11 @@ namespace ugdiss
     if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
     UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
 		   << ": alpha parameter must be >= 0");
-    return float(COOC[s][t]+alpha)/(COOC.m2(t)+alpha);
+    float ret = float(COOC[s][t]+alpha);
+    ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
+    UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ 
+		   << ": result not > 0 and <= 1.");
+    return ret;
   }
   
   template<typename TKN>
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
new file mode 100644
index 000000000..6373f8468
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
@@ -0,0 +1,97 @@
+#include "ug_phrasepair.h"
+namespace Moses {
+  namespace bitext
+  {
+
+#if 0 // Compiled out up to the matching #endif; written for a non-template PhrasePair with p1/p2 ids.
+    void 
+    PhrasePair::
+    init()
+    {
+      p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+    }
+
+    void
+    PhrasePair::
+    init(uint64_t const pid1, 
+	 pstats const& ps1, 
+	 pstats const& ps2, 
+	 size_t const numfeats)
+    {
+      p1      = pid1;
+      raw1    = ps1.raw_cnt    + ps2.raw_cnt;
+      sample1 = ps1.sample_cnt + ps2.sample_cnt;
+      sample2 = 0;
+      good1   = ps1.good       + ps2.good;
+      good2   = 0;
+      joint   = 0;
+      fvals.resize(numfeats);
+    }
+
+    PhrasePair const&
+    PhrasePair::
+    update(uint64_t const pid2, jstats const& js1, jstats const& js2)   
+    {
+      p2    = pid2;
+      raw2  = js1.cnt2() + js2.cnt2();
+      joint = js1.rcnt() + js2.rcnt();
+      assert(js1.aln().size() || js2.aln().size());
+      if (js1.aln().size()) 
+	aln = js1.aln()[0].second;
+      else if (js2.aln().size()) 
+	aln = js2.aln()[0].second;
+      for (int i = po_first; i < po_other; i++) // NOTE(review): skips po_other, the other loops use <= -- confirm
+	{
+	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
+	  dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
+	  dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
+	}
+      return *this;
+    }
+
+    PhrasePair const&
+    PhrasePair::
+    update(uint64_t const pid2, size_t r2)
+    {
+      p2    = pid2;
+      raw2  = r2;
+      joint = 0;
+      return *this;
+    } 
+
+
+    PhrasePair const&
+    PhrasePair::
+    update(uint64_t const pid2, 
+	   size_t   const raw2extra,
+	   jstats   const& js)   
+    {
+      p2    = pid2;
+      raw2  = js.cnt2() + raw2extra;
+      joint = js.rcnt();
+      assert(js.aln().size());
+      if (js.aln().size()) 
+	aln = js.aln()[0].second;
+      for (int i = po_first; i <= po_other; i++)
+	{
+	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
+	  dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
+	  dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
+	}
+      return *this;
+    }
+
+    float
+    PhrasePair::
+    eval(vector<float> const& w)
+    { // score = dot product of the weights w with fvals
+      assert(w.size() == this->fvals.size());
+      this->score = 0;
+      for (size_t i = 0; i < w.size(); ++i)
+	this->score += w[i] * this->fvals[i];
+      return this->score;
+    }
+#endif // closes the "#if 0" that disables the definitions above
+  } // namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..8cd43dc18
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,243 @@
+//-*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+using namespace ugdiss;
+using namespace std;
+
+namespace Moses {
+  namespace bitext
+  {
+
+    template<typename Token>
+    string 
+    toString(TokenIndex const& V, Token const* x, size_t const len)
+    { // Renders the len tokens starting at x as V[id] strings joined by single spaces.
+      if (!len) return ""; // zero-length phrase: x is never dereferenced
+      UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+      ostringstream buf; 
+      buf << V[x->id()]; // first token is written without a leading separator
+      size_t i = 1; // number of tokens written so far
+      for (x = x->next(); x && i < len; ++i, x = x->next())
+	buf << " " << V[x->id()];
+      UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!"); // next() chain ended early
+      return buf.str();
+    }
+
+    template<typename Token>
+    class 
+    PhrasePair
+    { // Counts, alignment and feature values for one pair of phrases (phrase 1 / phrase 2).
+    public:
+      Token const* start1; // first token of phrase 1; set by init(x,len,...)
+      Token const* start2; // first token of phrase 2; set by update(x,len,js)
+      uint32_t len1; // number of tokens in phrase 1
+      uint32_t len2; // number of tokens in phrase 2
+      // uint64_t p1, p2;
+      uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
+      vector<float> fvals;
+      float dfwd[po_other+1]; // fwd. orientation; update() stores add-one smoothed relative frequencies
+      float dbwd[po_other+1]; // bwd. orientation; normalized the same way as dfwd
+      vector<uchar> aln;
+      float score; // the only member consulted by operator<, >, <= and >=
+      PhrasePair() : dfwd(), dbwd(), score(0) { init(); } // fully zeroed, so copies never read indeterminate values
+      PhrasePair(PhrasePair const& o);
+      // operator+= adds up the raw/sample/good/joint counts and nothing else.
+      PhrasePair const& operator+=(PhrasePair const& other);
+
+      bool operator<(PhrasePair const& other) const;
+      bool operator>(PhrasePair const& other) const;
+      bool operator<=(PhrasePair const& other) const; 
+      bool operator>=(PhrasePair const& other) const;
+
+      void init(); // resets pointers, lengths and counts
+      void init(Token const* x,   uint32_t const len,
+		pstats const* ps = NULL, size_t const numfeats=0);
+      
+      // void init(uint64_t const pid1, pstats const& ps,  size_t const numfeats);
+      // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, 
+      // size_t const numfeats);
+
+      // PhrasePair const&
+      // update(uint64_t const pid2, size_t r2 = 0);
+
+      PhrasePair const& 
+      update(Token const* x, uint32_t const len, jstats const& js);
+      
+      // PhrasePair const& 
+      // update(uint64_t const pid2, jstats   const& js1, jstats   const& js2);
+
+      // PhrasePair const& 
+      // update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
+
+      // float 
+      // eval(vector<float> const& w);
+
+      class SortByTargetIdSeq // orders phrase pairs by the token id sequence of phrase 2
+      {
+      public:
+	int cmp(PhrasePair const& a, PhrasePair const& b) const;
+	bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+      };
+    };
+
+    template<typename Token>
+    void
+    PhrasePair<Token>::
+    init(Token const* x, uint32_t const len, 
+	 pstats const* ps, size_t const numfeats)
+    { // (Re)starts this record for the phrase of len tokens that begins at x.
+      start1 = x; 
+      len1   = len;
+      // Phrase-1 counts come from ps when it is given and are zero otherwise.
+      if (!ps) 
+	raw1 = sample1 = good1 = 0;
+      else
+	{
+	  raw1    = ps->raw_cnt;
+	  sample1 = ps->sample_cnt;
+	  good1   = ps->good;
+	}
+      // Phrase-2 and joint counts are cleared; update() sets raw2 and joint.
+      raw2 = sample2 = good2 = joint = 0;
+      // size the feature vector to numfeats entries
+      fvals.resize(numfeats);
+    }
+
+    template<typename Token>
+    PhrasePair<Token> const&
+    PhrasePair<Token>::
+    update(Token const* x, uint32_t const len, jstats const& js)   
+    { // Fills in phrase 2 (start x, length len) and the joint statistics from js.
+      start2 = x; 
+      len2   = len;
+      raw2   = js.cnt2();
+      joint  = js.rcnt();
+      assert(js.aln().size());
+      if (js.aln().size()) // with assertions off, an empty list leaves aln untouched
+	aln = js.aln()[0].second;
+
+      // Pass 1: add-one smoothed totals over all orientation classes.
+      float sum_fwd = 0, sum_bwd = 0;
+      for (int k = po_first; k <= po_other; ++k)
+	{
+	  PhraseOrientation const o = static_cast<PhraseOrientation>(k);
+	  sum_fwd += js.dcnt_fwd(o)+1;
+	  sum_bwd += js.dcnt_bwd(o)+1;
+	}
+      // Pass 2: relative frequencies (open question: normalize here or keep raw counts?)
+      for (int k = po_first; k <= po_other; ++k)
+	{
+	  PhraseOrientation const o = static_cast<PhraseOrientation>(k);
+	  dfwd[k] = float(js.dcnt_fwd(o)+1)/sum_fwd;
+	  dbwd[k] = float(js.dcnt_bwd(o)+1)/sum_bwd;
+	}
+      return *this;
+    }
+
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator<(PhrasePair const& other) const 
+    { return score < other.score; } // phrase pairs are ordered by score alone
+    
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator>(PhrasePair const& other) const
+    { return score > other.score; } // phrase pairs are ordered by score alone
+
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator<=(PhrasePair const& other) const 
+    { return score <= other.score; } // phrase pairs are ordered by score alone
+    
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator>=(PhrasePair const& other) const
+    { return score >= other.score; } // phrase pairs are ordered by score alone
+
+    template<typename Token>
+    PhrasePair<Token> const&
+    PhrasePair<Token>::
+    operator+=(PhrasePair const& o) 
+    { // Pools the counts of o into this object; every other member stays as it is.
+      raw1    += o.raw1;
+      sample1 += o.sample1;
+      good1   += o.good1;
+      raw2    += o.raw2;
+      sample2 += o.sample2;
+      good2   += o.good2;
+      joint   += o.joint;
+      return *this;
+    }
+
+    // Copy constructor, written out by hand. It copies every data member,
+    // including both orientation arrays; the Token pointers are copied 
+    // as pointers (the tokens they point to are not duplicated).
+    template<typename Token>
+    PhrasePair<Token>::
+    PhrasePair(PhrasePair<Token> const& o) 
+      : start1(o.start1), start2(o.start2)
+      , len1(o.len1), len2(o.len2)
+      , raw1(o.raw1), raw2(o.raw2) 
+      , sample1(o.sample1), sample2(o.sample2)
+      , good1(o.good1), good2(o.good2)
+      , joint(o.joint)
+      , fvals(o.fvals)
+      , aln(o.aln)
+      , score(o.score)
+    { 
+      // The two fixed-size arrays are not part of the initializer list
+      // above, so they are copied element by element here.
+      size_t const n = po_other + 1;
+      for (size_t k = 0; k < n; ++k)
+	dfwd[k] = o.dfwd[k];
+      for (size_t k = 0; k < n; ++k)
+	dbwd[k] = o.dbwd[k];
+    }
+    
+    // Returns 0 if both phrase-2 id sequences are identical, -1 if a sorts
+    // before b, and 1 otherwise.
+    template<typename Token>
+    int
+    PhrasePair<Token>::
+    SortByTargetIdSeq::
+    cmp(PhrasePair const& a, PhrasePair const& b) const
+    { // Three-way lexicographic comparison over token ids.
+      Token const* ta = a.start2;
+      Token const* tb = b.start2;
+      size_t k = 0;
+      // skip the common prefix
+      for (; k < a.len2 && k < b.len2; ++k, ta = ta->next(), tb = tb->next())
+	if (ta->id() != tb->id()) break;
+      bool const a_done = (k == a.len2);
+      bool const b_done = (k == b.len2);
+      if (a_done || b_done) // a proper prefix sorts before the longer sequence
+	return a_done ? (b_done ? 0 : -1) : 1;
+      return ta->id() < tb->id() ? -1 : 1;
+    }
+    
+    template<typename Token>
+    bool
+    PhrasePair<Token>::
+    SortByTargetIdSeq::
+    operator()(PhrasePair const& a, PhrasePair const& b) const
+    { // "less than" predicate: true iff cmp() places a strictly before b
+      int const order = cmp(a,b); return order < 0;
+    }
+
+    template<typename Token>
+    void 
+    PhrasePair<Token>::
+    init()
+    { // Clears phrase pointers, lengths and counts; fvals, aln, score, dfwd, dbwd are not touched.
+      start1 = start2 = NULL; len1 = len2 = 0;
+      raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+    }
+
+
+  } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
index 14bf6cdad..ab7f96bf0 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@@ -7,6 +7,8 @@
 #include "ug_typedefs.h"
 #include "tpt_tokenindex.h"
 #include <iostream>
+#include "util/exception.hh"
+#include "moses/Util.h"
 //#include <cassert>
 
 // #include "ug_bv_iter.h"
@@ -60,10 +62,15 @@ namespace ugdiss
 
     // TSA_tree_iterator(TSA_tree_iterator const& other);
     TSA_tree_iterator(TSA<Token> const* s);
+    TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
     TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
     // TSA_tree_iterator(TSA<Token> const* s, Token const& t);
     TSA_tree_iterator(TSA<Token> const* s, 
 		      Token const* kstart, 
+		      size_t const len, 
+		      bool full_match_only=true);
+    TSA_tree_iterator(TSA<Token> const* s, 
+		      Token const* kstart, 
 		      Token const* kend, 
 		      bool full_match_only=true);
     // TSA_tree_iterator(TSA<Token> const* s, 
@@ -150,9 +157,12 @@ namespace ugdiss
     double approxOccurrenceCount(int p=-1) const
     {
       assert(root);
+      if (p < 0) p += lower.size(); // negative p counts back from lower.size(); NOTE(review): stays negative if lower is empty
       double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize();
-      assert(ret < root->corpus->numTokens());
       if (ret < 25) ret = rawCnt(p);
+      UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] "
+		     << "Word count mismatch.");
+      assert(ret <= root->corpus->numTokens()); // NOTE(review): overlaps the throw above; differs only if ret is NaN
       return ret;
     }
 
@@ -320,6 +330,18 @@ namespace ugdiss
 
   template<typename Token>
   TSA_tree_iterator<Token>::
+  TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
+    : root(s) 
+  { // Re-traces other's token sequence in s; stops at the first token extend() rejects.
+    Token const* x = (root && other.size()) ? other.getToken(0) : 0; // no index or empty other: nothing to trace
+    for (size_t i = 0; x && i < other.size() && this->extend(x->id()); ++i)
+      x = x->next(); 
+  }
+
+
+
+  template<typename Token>
+  TSA_tree_iterator<Token>::
   TSA_tree_iterator
   (TSA<Token> const* r,
    id_type    const* s, 
@@ -385,6 +407,25 @@ namespace ugdiss
   template<typename Token>
   TSA_tree_iterator<Token>::
   TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, 
+		    size_t const len, bool full_match_only)
+    : root(s) 
+  { // Looks up the len-token key starting at kstart, one token at a time.
+    if (!root) return;
+    size_t matched = 0; // number of key tokens consumed so far
+    while (matched < len && kstart && extend(*kstart))
+      { kstart = kstart->next(); ++matched; }
+    bool const complete = (matched == len);
+    if (full_match_only && !complete) 
+      { // partial match not wanted: drop whatever prefix was matched
+        lower.clear(); upper.clear();
+      }
+  }
+
+  // DEPRECATED: DO NOT USE. Use the one that takes the length 
+  // instead of kend.
+  template<typename Token>
+  TSA_tree_iterator<Token>::
+  TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, 
 		    Token const* kend, bool full_match_only)
     : root(s) 
   {
@@ -561,8 +602,7 @@ namespace ugdiss
   TSA_tree_iterator<Token>::
   rawCnt(int p) const
   {
-    if (p < 0)
-      p = lower.size()+p;
+    if (p < 0) p += lower.size();
     assert(p>=0);
     if (lower.size() == 0) return root->getCorpusSize();
     return root->rawCnt(lower[p],upper[p]);