- Code refactoring for Bitext class.

- Bug fixes and conceptual improvements in biased sampling. Sampling now tries to adhere to the bias even when an unsuitable corpus dominates the occurrences of the phrase being sampled.
author: Ulrich Germann <Ulrich.Germann@gmail.com> 2015-04-05 16:17:47 +0300
committer: Ulrich Germann <Ulrich.Germann@gmail.com> 2015-04-05 16:29:00 +0300
commit: 46e31a285c8f9257a9d6ab411db74b5cbec9d0fe (patch)
tree: 9bf1afa3827e7252e6b9fd38e8ee27cef8693a9a /moses/TranslationModel/UG/mm
parent: 05c4e382ff7914369700eb516a61a45238292bdf (diff)
16 files changed, 1475 insertions, 1598 deletions
diff --git a/moses/TranslationModel/UG/mm/mmlex-build.cc b/moses/TranslationModel/UG/mm/mmlex-build.cc
index 4ef0842e4..5e5ea194c 100644
--- a/moses/TranslationModel/UG/mm/mmlex-build.cc
+++ b/moses/TranslationModel/UG/mm/mmlex-build.cc
@@ -24,6 +24,7 @@
 #include <boost/unordered_set.hpp> 
 
 #include "moses/TranslationModel/UG/generic/program_options/ug_get_options.h"
+#include "moses/Util.h"
 #include "ug_mm_2d_table.h"
 #include "ug_mm_ttrack.h"
 #include "ug_corpus_token.h"
@@ -241,10 +242,14 @@ processSentence(id_type sid)
       p = binread(p,r);
       p = binread(p,c);
       // cout << sid << " " << r << "-" << c << endl;
-      assert(r < check1.size());
-      assert(c < check2.size());
-      assert(s1+r < e1);
-      assert(s2+c < e2);
+      UTIL_THROW_IF2(r >= check1.size(), "out of bounds at line " << sid);
+      UTIL_THROW_IF2(c >= check2.size(), "out of bounds at line " << sid);
+      // assert(r < check1.size());
+      // assert(c < check2.size());
+      UTIL_THROW_IF2(s1+r >= e1, "out of bounds at line " << sid);
+      UTIL_THROW_IF2(s2+c >= e2, "out of bounds at line " << sid);
+      // assert(s1+r < e1);
+      // assert(s2+c < e2);
       check1.reset(r);
       check2.reset(c);
       id_type id1 = (s1+r)->id();
@@ -266,66 +271,6 @@ processSentence(id_type sid)
     CNT[wpair(0,(s2+i)->id())].a++;
 }
 
-// void
-// writeTable(string ofname, 
-// 	   vector<vector<uint32_t> >& FREQ,
-// 	   vector<map<id_type,uint32_t> >& RARE)
-// {
-//   ofstream out(ofname.c_str());
-//   filepos_type idxOffset=0;
-
-//   vector<uint32_t> m1; // marginals L1
-//   vector<uint32_t> m2; // marginals L2
-//   m1.resize(max(first_rare_id,V1.getNumTokens()),0);
-//   m2.resize(V2.getNumTokens(),0);
-//   vector<id_type> index(V1.getNumTokens()+1,0);
-//   numwrite(out,idxOffset); // blank for the time being
-//   numwrite(out,id_type(m1.size()));
-//   numwrite(out,id_type(m2.size()));
-
-//   id_type cellCount=0;
-//   id_type stop = min(first_rare_id,id_type(m1.size()));
-//   for (id_type id1 = 0; id1 < stop; ++id1)
-//     {
-//       index[id1]  = cellCount;
-//       vector<uint32_t> const& v = FREQ[id1];
-//       for (id_type id2 = 0; id2 < id_type(v.size()); ++id2)
-//         {
-//           if (!v[id2]) continue;
-//           cellCount++;
-//           numwrite(out,id2);
-//           out.write(reinterpret_cast<char const*>(&v[id2]),sizeof(uint32_t));
-//           m1[id1] += v[id2];
-//           m2[id2] += v[id2];
-//         }
-//     }
-//   for (id_type id1 = stop; id1 < id_type(m1.size()); ++id1)
-//     {
-//       index[id1]  = cellCount;
-//       map<id_type,uint32_t> const& M = RARE[id1];
-//       for (map<id_type,uint32_t>::const_iterator m = M.begin(); m != M.end(); ++m)
-//         {
-//           if (m->second == 0) continue;
-//           cellCount++;
-//           numwrite(out,m->first);
-//           out.write(reinterpret_cast<char const*>(&m->second),sizeof(float));
-//           m1[id1] += m->second;
-//           m2[m->first] += m->second;
-//         }
-//     }
-//   index[m1.size()] = cellCount;
-//   idxOffset    = out.tellp();
-//   for (size_t i = 0; i < index.size(); ++i)
-//     numwrite(out,index[i]);
-//   out.write(reinterpret_cast<char const*>(&m1[0]),m1.size()*sizeof(float));
-//   out.write(reinterpret_cast<char const*>(&m2[0]),m2.size()*sizeof(float));
-  
-//   // re-write the file header
-//   out.seekp(0);
-//   numwrite(out,idxOffset);
-//   out.close();
-// }
-
 int 
 main(int argc, char* argv[])
 {
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 29104aaec..fe95596ab 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -11,192 +11,6 @@ namespace Moses
   namespace bitext 
   {
 
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
-    ThreadSafeCounter pstats::active;
-#endif
-    
-    pstats::
-    pstats()
-      : raw_cnt     (0)
-      , sample_cnt  (0)
-      , good        (0)
-      , sum_pairs   (0)
-      , in_progress (0)
-    {
-      for (int i = 0; i <= Moses::LRModel::NONE; ++i) 
-	ofwd[i] = obwd[i] = 0;
-    }
-
-    pstats::
-    ~pstats()
-    {
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
-      // counter may not exist any more at destruction time, so try ... catch
-      try { --active; } catch (...) {} 
-#endif
-    }
-
-    void
-    pstats::
-    register_worker()
-    {
-      this->lock.lock();
-      ++this->in_progress;
-      this->lock.unlock();
-    }
-  
-    void
-    pstats::
-    release()
-    {
-      this->lock.lock();
-      if (this->in_progress-- == 1) // last one - >we're done
-	this->ready.notify_all();
-      this->lock.unlock();
-    }
-
-    bool
-    pstats::
-    add(uint64_t pid, float const w, 
-	vector<uchar> const& a, 
-	uint32_t const cnt2, 
-	uint32_t fwd_o, 
-	uint32_t bwd_o, int const docid)
-    {
-      boost::lock_guard<boost::mutex> guard(this->lock);
-      jstats& entry = this->trg[pid];
-      entry.add(w,a,cnt2,fwd_o,bwd_o,docid);
-      if (this->good < entry.rcnt())
-	{
-	  UTIL_THROW(util::Exception, "more joint counts than good counts:" 
-		     << entry.rcnt() << "/" << this->good << "!");
-	}
-
-      if (docid >= 0)
-	{
-	  while (int(indoc.size()) <= docid) indoc.push_back(0);
-	  ++indoc[docid];
-	}
-
-      return true;
-    }
-
-    jstats::
-    jstats()
-      : my_rcnt(0), my_wcnt(0), my_cnt2(0)
-    { 
-      for (int i = 0; i <= Moses::LRModel::NONE; ++i) 
-	ofwd[i] = obwd[i] = 0;
-      my_aln.reserve(1);
-    }
-
-    jstats::
-    jstats(jstats const& other)
-    {
-      my_rcnt = other.rcnt();
-      my_wcnt = other.wcnt();
-      my_aln  = other.aln();
-      indoc = other.indoc;
-      for (int i = 0; i <= Moses::LRModel::NONE; i++)
-	{
-	  ofwd[i] = other.ofwd[i];
-	  obwd[i] = other.obwd[i];
-	}
-    }
-  
-    uint32_t 
-    jstats::
-    dcnt_fwd(PhraseOrientation const idx) const
-    {
-      assert(idx <= Moses::LRModel::NONE);
-      return ofwd[idx];
-    }
-
-    uint32_t 
-    jstats::
-    dcnt_bwd(PhraseOrientation const idx) const
-    {
-      assert(idx <= Moses::LRModel::NONE);
-      return obwd[idx];
-    }
-    
-    void 
-    jstats::
-    add(float w, vector<uchar> const& a, uint32_t const cnt2,
-	uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
-    {
-      boost::lock_guard<boost::mutex> lk(this->lock);
-      my_rcnt += 1;
-      my_wcnt += w;
-      // my_cnt2 += cnt2; // could I really be that stupid? [UG]
-      my_cnt2 = cnt2;
-      if (a.size())
-	{
-	  size_t i = 0;
-	  while (i < my_aln.size() && my_aln[i].second != a) ++i;
-	  if (i == my_aln.size()) 
-	    my_aln.push_back(pair<size_t,vector<uchar> >(1,a));
-	  else
-	    my_aln[i].first++;
-	  if (my_aln[i].first > my_aln[i/2].first)
-	    push_heap(my_aln.begin(),my_aln.begin()+i+1);
-	}
-      ++ofwd[fwd_orient];
-      ++obwd[bwd_orient];
-      if (docid >= 0)
-	{
-	  while (int(indoc.size()) <= docid) indoc.push_back(0);
-	  ++indoc[docid];
-
-	  // cout << docid << " => " << indoc[docid] << " " << HERE << endl;
-
-	}
-    }
-    
-    uint32_t 
-    jstats::
-    rcnt() const 
-    { return my_rcnt; }
-    
-    float
-    jstats::
-    wcnt() const 
-    { return my_wcnt; }
-
-    uint32_t
-    jstats::
-    cnt2() const 
-    { return my_cnt2; }
-   
-    vector<pair<size_t, vector<uchar> > > const&
-    jstats::
-    aln() const 
-    { return my_aln; }
-
-    void 
-    jstats::
-    invalidate()
-    {
-      if (my_wcnt > 0) 
-	my_wcnt *= -1;
-    }
-
-    void 
-    jstats::
-    validate()
-    {
-      if (my_wcnt < 0) 
-	my_wcnt *= -1;
-    }
-
-    bool
-    jstats::
-    valid()
-    {
-      return my_wcnt >= 0;
-    }
-
-    
     float 
     lbop(size_t const tries, size_t const succ, float const confidence)
     {
@@ -206,83 +20,6 @@ namespace Moses
 		 find_lower_bound_on_p(tries, succ, confidence)));
     }
     
-    template<>
-    sptr<imBitext<L2R_Token<SimpleWordId> > > 
-    imBitext<L2R_Token<SimpleWordId> >::
-    add(vector<string> const& s1, 
-	vector<string> const& s2, 
-	vector<string> const& aln) const
-    {
-      typedef L2R_Token<SimpleWordId> TKN;
-      assert(s1.size() == s2.size() && s1.size() == aln.size());
-      
-#ifndef NDEBUG
-      size_t first_new_snt = this->T1 ? this->T1->size() : 0;
-#endif
-
-      sptr<imBitext<TKN> > ret;
-      {
-	boost::unique_lock<boost::shared_mutex> guard(m_lock);
-	ret.reset(new imBitext<TKN>(*this));
-      }
-      
-      // we add the sentences in separate threads (so it's faster)
-      boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
-      // thread1.join(); // for debugging
-      boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
-      BOOST_FOREACH(string const& a, aln)
-	{
-	  istringstream ibuf(a);
-	  ostringstream obuf;
-	  uint32_t row,col; char c;
-	  while (ibuf >> row >> c >> col)
-	    {
-	      UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
-			     << "Error in alignment information:\n" << a);
-	      binwrite(obuf,row);
-	      binwrite(obuf,col);
-	    }
-	  // important: DO NOT replace the two lines below this comment by 
-	  // char const* x = obuf.str().c_str(), as the memory x is pointing 
-	  // to is freed immediately upon deconstruction of the string object.
-	  string foo = obuf.str(); 
-	  char const* x = foo.c_str();
-	  vector<char> v(x,x+foo.size());
-	  ret->myTx = append(ret->myTx, v);
-	}
-
-      thread1.join();
-      thread2.join();
-
-      ret->Tx = ret->myTx;
-      ret->T1 = ret->myT1;
-      ret->T2 = ret->myT2;
-      ret->I1 = ret->myI1;
-      ret->I2 = ret->myI2;
-
-#ifndef NDEBUG
-      // sanity check
-      for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
-	{
-	  size_t slen1  = ret->T1->sntLen(i);
-	  size_t slen2  = ret->T2->sntLen(i);
-	  char const* p = ret->Tx->sntStart(i);
-	  char const* q = ret->Tx->sntEnd(i);
-	  size_t k;
-	  while (p < q)
-	    {
-	      p = binread(p,k);
-	      assert(p);
-	      assert(p < q);
-	      assert(k < slen1);
-	      p = binread(p,k);
-	      assert(p);
-	      assert(k < slen2);
-	    }
-	}
-#endif
-      return ret;
-    }
 
     // template<>
     void
@@ -425,6 +162,5 @@ namespace Moses
 	}
       cout  << string(90,'-') << endl;
     }
-
   }
 }
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index bd2975cf7..89aeeaa7a 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -1,7 +1,5 @@
 //-*- c++ -*-
-
-#ifndef __ug_bitext_h
-#define __ug_bitext_h
+#pragma once
 // Implementations of word-aligned bitext.
 // Written by Ulrich Germann
 // 
@@ -26,11 +24,11 @@
 #include <iomanip>
 #include <algorithm>
 
-#include <boost/unordered_map.hpp>
 #include <boost/foreach.hpp>
-#include <boost/thread.hpp>
 #include <boost/random.hpp>
 #include <boost/format.hpp>
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
 #include <boost/math/distributions/binomial.hpp>
 
 #include "moses/TranslationModel/UG/generic/sorting/VectorIndexSorter.h"
@@ -59,6 +57,7 @@
 #include "ug_lru_cache.h"
 #include "ug_lexical_reordering.h"
 #include "ug_sampling_bias.h"
+#include "ug_phrasepair.h"
 
 #define PSTATS_CACHE_THRESHOLD 50
 
@@ -66,101 +65,10 @@ namespace Moses {
   class Mmsapt;
   namespace bitext
   {
-    // using namespace ugdiss;
-    // using namespace std;
-
-    template<typename TKN> class Bitext;
-    template<typename TKN> class PhrasePair;
     using namespace ugdiss;
 
-    template<typename TKN> class Bitext;
-
-    template<typename sid_t, typename off_t, typename len_t>
-    void 
-    parse_pid(uint64_t const pid, sid_t & sid, 
-	      off_t & off, len_t& len)
-    {
-      static uint64_t two32 = uint64_t(1)<<32;
-      static uint64_t two16 = uint64_t(1)<<16;
-      len = pid%two16;
-      off = (pid%two32)>>16;
-      sid = pid>>32;
-    }
-
-    float 
-    lbop(size_t const tries, size_t const succ, 
-	 float const confidence);
-
-    // "joint" (i.e., phrase pair) statistics
-    class
-    jstats
-    {
-      boost::mutex lock;
-      uint32_t my_rcnt; // unweighted count
-      float    my_wcnt; // weighted count 
-      uint32_t my_cnt2;
-      vector<pair<size_t, vector<uchar> > > my_aln; 
-      uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1];
-    public:
-      vector<uint32_t> indoc;
-      jstats();
-      jstats(jstats const& other);
-      uint32_t rcnt() const;
-      uint32_t cnt2() const; // raw target phrase occurrence count
-      float    wcnt() const;
-      
-      vector<pair<size_t, vector<uchar> > > const & aln() const;
-      void add(float w, vector<uchar> const& a, uint32_t const cnt2,
-	       uint32_t fwd_orient, uint32_t bwd_orient, int const docid);
-      void invalidate();
-      void validate();
-      bool valid();
-      uint32_t dcnt_fwd(PhraseOrientation const idx) const;
-      uint32_t dcnt_bwd(PhraseOrientation const idx) const;
-    };
-
-    struct 
-    pstats
-    {
-      typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t;
-      typedef ThreadSafeContainer<uint64_t, sptr<pstats>, map_t> cache_t;
-
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
-      static ThreadSafeCounter active;
-#endif
-      boost::mutex lock;               // for parallel gathering of stats
-      boost::condition_variable ready; /* consumers can wait for this
-					* data structure to be ready. */
-      
-      size_t raw_cnt;    // (approximate) raw occurrence count 
-      size_t sample_cnt; // number of instances selected during sampling
-      size_t good;       // number of selected instances with valid word alignments
-      size_t sum_pairs;
-      size_t in_progress; // keeps track of how many threads are currently working on this
-
-      // size_t Moses::LRModel::ReorderingType 
-      uint32_t ofwd[Moses::LRModel::NONE+1], obwd[Moses::LRModel::NONE+1];
+    float lbop(size_t const tries, size_t const succ, float const confidence);
 
-      vector<uint32_t> indoc;
-
-      
-      // typedef typename boost::unordered_map<typename uint64_t, jstats> trg_map_t;
-      typedef std::map<uint64_t, jstats> trg_map_t;
-      trg_map_t trg;
-      pstats();
-      ~pstats();
-      void release();
-      void register_worker();
-      size_t count_workers() { return in_progress; } 
-
-      bool 
-      add(uint64_t const pid, 
-	  float    const w, 
-	  vector<uchar> const& a, 
-	  uint32_t      const cnt2,
-	  uint32_t fwd_o, uint32_t bwd_o, int const docid);
-    };
-    
     struct 
     ContextForQuery
     {
@@ -174,297 +82,36 @@ namespace Moses {
       ContextForQuery() : bias_log(NULL) { }
     };
 
-    template<typename Token>
-    string 
-    toString(TokenIndex const& V, Token const* x, size_t const len)
-    {
-      if (!len) return "";
-      UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
-      ostringstream buf; 
-      buf << V[x->id()];
-      size_t i = 1;
-      for (x = x->next(); x && i < len; ++i, x = x->next())
-	buf << " " << V[x->id()];
-      UTIL_THROW_IF2(i != len, HERE << ": Unexpected end of phrase!");
-      return buf.str();
-    }
 
-    template<typename Token>
-    class 
-    PhrasePair
+    template<typename TKN>
+    class Bitext 
     {
     public:
-      class Scorer { public: virtual float operator()(PhrasePair& pp) const = 0; };
-      Token const* start1;
-      Token const* start2;
-      uint32_t len1;
-      uint32_t len2;
-      uint64_t p1, p2;
-      uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
-      vector<float> fvals;
-      float dfwd[Moses::LRModel::NONE+1]; // distortion counts // counts or probs?
-      float dbwd[Moses::LRModel::NONE+1]; // distortion counts
-      vector<uchar> aln;
-      float score;
-      bool inverse;
-      vector<uint32_t> indoc;
-      PhrasePair() { };
-      PhrasePair(PhrasePair const& o);
-
-      PhrasePair const& operator+=(PhrasePair const& other);
-
-      bool operator<(PhrasePair const& other) const;
-      bool operator>(PhrasePair const& other) const;
-      bool operator<=(PhrasePair const& other) const; 
-      bool operator>=(PhrasePair const& other) const;
-
-      void init();
-      void init(uint64_t const pid1, bool is_inverse, 
-		Token const* x,   uint32_t const len,
-		pstats const* ps = NULL, size_t const numfeats=0);
-      
-      // void init(uint64_t const pid1, pstats const& ps,  size_t const numfeats);
-      // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, 
-      // size_t const numfeats);
-
-      // PhrasePair const&
-      // update(uint64_t const pid2, size_t r2 = 0);
-
-      PhrasePair const& 
-      update(uint64_t const pid2, Token const* x, 
-	     uint32_t const len, jstats const& js);
-      
-      // PhrasePair const& 
-      // update(uint64_t const pid2, jstats   const& js1, jstats   const& js2);
-
-      // PhrasePair const& 
-      // update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
-
-      // float 
-      // eval(vector<float> const& w);
-
-      class SortByTargetIdSeq
-      {
-      public:
-	int cmp(PhrasePair const& a, PhrasePair const& b) const;
-	bool operator()(PhrasePair const& a, PhrasePair const& b) const;
-      };
-
-      class SortDescendingByJointCount
-      {
-      public:
-	int cmp(PhrasePair const& a, PhrasePair const& b) const;
-	bool operator()(PhrasePair const& a, PhrasePair const& b) const;
-      };
-    };
+      typedef TKN Token;
+      typedef typename TSA<Token>::tree_iterator   iter;
+      typedef typename std::vector<PhrasePair<Token> > vec_ppair;
+      typedef typename lru_cache::LRU_Cache<uint64_t, vec_ppair> pplist_cache_t;
 
-    template<typename Token>
-    void
-    PhrasePair<Token>::
-    init(uint64_t const pid1, bool is_inverse, Token const* x, uint32_t const len, 
-	 pstats const* ps, size_t const numfeats)
-    {
-      inverse = is_inverse;
-      start1 = x; len1 = len;
-      p1     = pid1;
-      p2     = 0;
-      if (ps)
-	{
-	  raw1    = ps->raw_cnt;
-	  sample1 = ps->sample_cnt;
-	  good1   = ps->good;
-	}
-      else raw1 = sample1 = good1 = 0;
-      joint   = 0;
-      good2   = 0;
-      sample2 = 0;
-      raw2    = 0;
-      fvals.resize(numfeats);
-    }
+      friend class Moses::Mmsapt;
+    protected:
+      mutable boost::shared_mutex m_lock; // for thread-safe operation
 
-    template<typename Token>
-    PhrasePair<Token> const&
-    PhrasePair<Token>::
-    update(uint64_t const pid2, 
-	   Token const* x, uint32_t const len, jstats const& js)   
-    {
-      p2    = pid2;
-      start2 = x; len2 = len;
-      raw2  = js.cnt2();
-      joint = js.rcnt();
-      assert(js.aln().size());
-      if (js.aln().size()) 
-	aln = js.aln()[0].second;
-      float total_fwd = 0, total_bwd = 0;
-      for (int i = 0; i <= Moses::LRModel::NONE; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  total_fwd += js.dcnt_fwd(po)+1;
-	  total_bwd += js.dcnt_bwd(po)+1;
-	}
+      class agenda; // for parallel sampling see ug_bitext_agenda.h
+      mutable sptr<agenda> ag; 
+      size_t m_num_workers; // number of workers available to the agenda
 
-      // should we do that here or leave the raw counts?
-      for (int i = 0; i <= Moses::LRModel::NONE; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
-	  dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
-	}
+      size_t m_default_sample_size;    
+      size_t m_pstats_cache_threshold; // threshold for caching sampling results
+      sptr<pstats::cache_t> m_cache1, m_cache2; // caches for sampling results
       
-      indoc = js.indoc;
-      return *this;
-    }
-
-    template<typename Token>
-    bool 
-    PhrasePair<Token>::
-    operator<(PhrasePair const& other) const 
-    { return this->score < other.score; }
-    
-    template<typename Token>
-    bool 
-    PhrasePair<Token>::
-    operator>(PhrasePair const& other) const
-    { return this->score > other.score; }
-
-    template<typename Token>
-    bool 
-    PhrasePair<Token>::
-    operator<=(PhrasePair const& other) const 
-    { return this->score <= other.score; }
-    
-    template<typename Token>
-    bool 
-    PhrasePair<Token>::
-    operator>=(PhrasePair const& other) const
-    { return this->score >= other.score; }
-
-    template<typename Token>
-    PhrasePair<Token> const&
-    PhrasePair<Token>::
-    operator+=(PhrasePair const& o) 
-    { 
-      raw1 += o.raw1;
-      raw2 += o.raw2;
-      sample1 += o.sample1;
-      sample2 += o.sample2;
-      good1 += o.good1;
-      good2 += o.good2;
-      joint += o.joint;
-      return *this;
-    }
-
-    template<typename Token>
-    PhrasePair<Token>::
-    PhrasePair(PhrasePair<Token> const& o) 
-      : start1(o.start1)
-      , start2(o.start2)
-      , len1(o.len1)
-      , len2(o.len2)
-      , p1(o.p1) 
-      , p2(o.p2)
-      , raw1(o.raw1) 
-      , raw2(o.raw2) 
-      , sample1(o.sample1)
-      , sample2(o.sample2)
-      ,	good1(o.good1)
-      , good2(o.good2)
-      , joint(o.joint)
-      , fvals(o.fvals)
-      , aln(o.aln)
-      , score(o.score)
-      , inverse(o.inverse)
-      , indoc(o.indoc)
-    {
-      for (int i = 0; i <= Moses::LRModel::NONE; ++i)
-	{
-	  dfwd[i] = o.dfwd[i];
-	  dbwd[i] = o.dbwd[i];
-	}
-    }
-    
-    template<typename Token>
-    int
-    PhrasePair<Token>::
-    SortByTargetIdSeq::
-    cmp(PhrasePair const& a, PhrasePair const& b) const
-    {
-      size_t i = 0;
-      Token const* x = a.start2;
-      Token const* y = b.start2;
-      while (i < a.len2 && i < b.len2 && x->id() == y->id()) 
-	{
-	  x = x->next();
-	  y = y->next();
-	  ++i;
-	}
-      if (i == a.len2 && i == b.len2) return 0;
-      if (i == a.len2) return -1;
-      if (i == b.len2) return  1;
-      return x->id() < y->id() ? -1 : 1;
-    }
-    
-    template<typename Token>
-    bool
-    PhrasePair<Token>::
-    SortByTargetIdSeq::
-    operator()(PhrasePair const& a, PhrasePair const& b) const
-    {
-      return this->cmp(a,b) < 0;
-    }
-
-    template<typename Token>
-    int
-    PhrasePair<Token>::
-    SortDescendingByJointCount::
-    cmp(PhrasePair const& a, PhrasePair const& b) const
-    {
-      // size_t i = 0;
-      if (a.joint == b.joint) return 0;
-      return a.joint > b.joint ? -1 : 1;
-    }
-
-    template<typename Token>
-    bool
-    PhrasePair<Token>::
-    SortDescendingByJointCount::
-    operator()(PhrasePair const& a, PhrasePair const& b) const
-    {
-      return this->cmp(a,b) < 0;
-    }
+      map<string,id_type>  m_docname2docid; // maps from doc names to ids
+      sptr<std::vector<id_type> >   m_sid2docid; // maps from sentences to docs (ids)
 
-    template<typename Token>
-    void 
-    PhrasePair<Token>::
-    init()
-    {
-      inverse = false;
-      len1 = len2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
-      start1 = start2 = NULL;
-      p1 = p2 = 0;
-    }
-    
-    template<typename TKN>
-    class Bitext 
-    {
-      friend class Moses::Mmsapt;
+      mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
+      // caches for unbiased sampling; biased sampling uses the caches that 
+      // are stored locally on the translation task
 
-    protected:
-      mutable boost::shared_mutex m_lock;
     public:
-      typedef TKN Token;
-      typedef typename TSA<Token>::tree_iterator iter;
-
-      class agenda;
-      // stores the list of unfinished jobs;
-      // maintains a pool of workers and assigns the jobs to them
-      
-      // to be done: work with multiple agendas for faster lookup
-      // (multiplex jobs); not sure if an agenda having more than 
-      // four or so workers is efficient, because workers get into 
-      // each other's way. 
-      mutable sptr<agenda> ag; 
-      
       sptr<Ttrack<char> >  Tx; // word alignments
       sptr<Ttrack<Token> > T1; // token track
       sptr<Ttrack<Token> > T2; // token track
@@ -473,76 +120,43 @@ namespace Moses {
       sptr<TSA<Token> >    I1; // indices
       sptr<TSA<Token> >    I2; // indices
 
-      map<string,id_type>  m_docname2docid; // maps from doc names to ids
-      sptr<vector<id_type> >   m_sid2docid; // maps from sentences to docs (ids)
-      
       /// given the source phrase sid[start:stop]
       //  find the possible start (s1 .. s2) and end (e1 .. e2) 
       //  points of the target phrase; if non-NULL, store word
       //  alignments in *core_alignment. If /flip/, source phrase is 
       //  L2.
-      bool 
-      find_trg_phr_bounds
+      bool find_trg_phr_bounds
       ( size_t const sid,    // sentence to investigate
 	size_t const start,  // start of source phrase
 	size_t const stop,   // last position of source phrase
         size_t & s1, size_t & s2, // beginning and end of target start
 	size_t & e1, size_t & e2, // beginning and end of target end
         int& po_fwd, int& po_bwd, // phrase orientations
-	vector<uchar> * core_alignment, // stores the core alignment
+	std::vector<uchar> * core_alignment, // stores the core alignment
 	bitvector* full_alignment, // stores full word alignment for this sent.
 	bool const flip) const;   // flip source and target (reverse lookup) 
       
-      sptr<pstats::cache_t> m_cache1, m_cache2; 
-      // caches for unbiased sampling; biased sampling uses the caches that 
-      // are stored locally on the translation task
-    protected:
-      typedef typename 
-      lru_cache::LRU_Cache<uint64_t, vector<PhrasePair<Token> > >  
-      pplist_cache_t;
-
-      size_t m_default_sample_size;
-      size_t m_num_workers;
-      size_t m_pstats_cache_threshold;
-      mutable pplist_cache_t m_pplist_cache1, m_pplist_cache2;
-
-    protected:
-
+      // prep2 launches sampling and returns immediately. 
+      // lookup (below) waits for the job to finish before it returns
       sptr<pstats> 
       prep2(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
-      // prep2 launches sampling and returns immediately, lookup (below) waits
-      // for the job to finish before it returns
-
+      
     public:
-      Bitext(size_t const max_sample = 1000, 
-	     size_t const xnum_workers = 16);
+      Bitext(size_t const max_sample = 1000, size_t const xnum_workers = 16);
 
-      Bitext(Ttrack<Token>* const t1, 
-	     Ttrack<Token>* const t2, 
-	     Ttrack<char>*  const tx,
-	     TokenIndex*    const v1, 
-	     TokenIndex*    const v2,
-	     TSA<Token>* const i1, 
-	     TSA<Token>* const i2,
-	     size_t const max_sample=1000,
+      Bitext(Ttrack<Token>* const t1, Ttrack<Token>* const t2, 
+	     Ttrack<char>*  const tx, 
+	     TokenIndex*    const v1, TokenIndex*    const v2,
+	     TSA<Token>*    const i1, TSA<Token>*    const i2,
+	     size_t const max_sample=1000, 
 	     size_t const xnum_workers=16);
 	     
-      virtual void open(string const base, string const L1, string const L2) = 0;
+      virtual void 
+      open(string const base, string const L1, string const L2) = 0;
       
       sptr<pstats> 
       lookup(ttasksptr const& ttask, iter const& phrase, int max_sample = -1) const;
 
-#if 0
-      // needs to be adapted to the new API
-      void
-      lookup(vector<Token> const& snt, TSA<Token>& idx, 
-	     vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
-	     vector<vector<uint64_t> >* pidmap = NULL,
-	     typename PhrasePair<Token>::Scorer* scorer=NULL, 
-	     sptr<SamplingBias const> const bias,
-	     bool multithread=true) const;
-#endif
-
       void prep(ttasksptr const& ttask, iter const& phrase) const;
 
       void   setDefaultSampleSize(size_t const max_samples);
@@ -556,11 +170,23 @@ namespace Moses {
       loadSentenceBias(string const& fname) const;
 
       sptr<DocumentBias>
-      SetupDocumentBias(string const& bserver, string const& text, 
-			ostream* log) const;
+      SetupDocumentBias(string const& bserver, string const& text, ostream* log) const;
+
+#if 0
+      // needs to be adapted to the new API
+      void
+      lookup(std::vector<Token> const& snt, TSA<Token>& idx, 
+	     std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest,
+	     std::vector<std::vector<uint64_t> >* pidmap = NULL,
+	     typename PhrasePair<Token>::Scorer* scorer=NULL, 
+	     sptr<SamplingBias const> const bias,
+	     bool multithread=true) const;
+#endif
 
     };
 
+#include "ug_bitext_agenda.h"
+
     template<typename Token>
     sptr<SentenceBias>
     Bitext<Token>::
@@ -594,8 +220,6 @@ namespace Moses {
       return buf.str();
     }
     
-    
-
     template<typename Token>
     size_t 
     Bitext<Token>::
@@ -620,8 +244,8 @@ namespace Moses {
     template<typename Token>
     Bitext<Token>::
     Bitext(size_t const max_sample, size_t const xnum_workers)
-      : m_default_sample_size(max_sample)
-      , m_num_workers(xnum_workers)
+      : m_num_workers(xnum_workers)
+      , m_default_sample_size(max_sample)
       , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
       , m_cache1(new pstats::cache_t)
       , m_cache2(new pstats::cache_t)
@@ -638,639 +262,14 @@ namespace Moses {
 	   TSA<Token>* const i2,
 	   size_t const max_sample,
 	   size_t const xnum_workers)
-      : Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
+      : m_num_workers(xnum_workers)
       , m_default_sample_size(max_sample)
-      , m_num_workers(xnum_workers)
       , m_pstats_cache_threshold(PSTATS_CACHE_THRESHOLD)
       , m_cache1(new pstats::cache_t)
       , m_cache2(new pstats::cache_t)
+      , Tx(tx), T1(t1), T2(t2), V1(v1), V2(v2), I1(i1), I2(i2)
     { }
 
-    // agenda is a pool of jobs 
-    template<typename Token>
-    class 
-    Bitext<Token>::
-    agenda
-    {
-      boost::mutex lock; 
-      class job 
-      {
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
-	static ThreadSafeCounter active;
-#endif
-	Bitext<Token> const* const m_bitext;
-	boost::mutex lock; 
-	friend class agenda;
-	boost::taus88 rnd;  // every job has its own pseudo random generator 
-	double rnddenom;    // denominator for scaling random sampling
-	size_t min_diverse; // minimum number of distinct translations
-      public:
-	size_t         workers; // how many workers are working on this job?
-	sptr<TSA<Token> const> root; // root of the underlying suffix array
-	char const*       next; // next position to read from 
-	char const*       stop; // end of index range
-	size_t     max_samples; // how many samples to extract at most
-	size_t             ctr; /* # of phrase occurrences considered so far
-				 * # of samples chosen is stored in stats->good 
-				 */
-	size_t             len; // phrase length
-	bool               fwd; // if true, source phrase is L1 
-	sptr<pstats>     stats; // stores statistics collected during sampling
-	sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling
-	float bias_total;
-	bool step(uint64_t & sid, uint64_t & offset); // select another occurrence
-	bool done() const;
-	job(Bitext<Token> const* const theBitext, 
-	    typename TSA<Token>::tree_iterator const& m, 
-	    sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, 
-	    sptr<SamplingBias const> const& bias);
-	~job();
-      };
-    public:      
-      class 
-      worker
-      {
-	agenda& ag;
-      public:
-	worker(agenda& a) : ag(a) {}
-	void operator()();
-      };
-    private:
-      list<sptr<job> > joblist;
-      vector<sptr<boost::thread> > workers;
-      bool shutdown;
-      size_t doomed;
-    public:
-      Bitext<Token>   const& bt;
-      agenda(Bitext<Token> const& bitext);
-      ~agenda();
-      void add_workers(int n);
-
-      sptr<pstats> 
-      add_job(Bitext<Token> const* const theBitext, 
-	      typename TSA<Token>::tree_iterator const& phrase, 
-	      size_t const max_samples, sptr<SamplingBias const> const& bias);
-
-      sptr<job> get_job();
-    };
-    
-    template<typename Token>
-    bool
-    Bitext<Token>::
-    agenda::
-    job::
-    step(uint64_t & sid, uint64_t & offset)
-    {
-      boost::lock_guard<boost::mutex> jguard(lock);
-      bool ret = (max_samples == 0) && (next < stop);
-      if (ret)
-	{
-	  next = root->readSid(next,stop,sid);
-	  next = root->readOffset(next,stop,offset);
-	  boost::lock_guard<boost::mutex> sguard(stats->lock);
-	  if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-	  if (m_bias && (*m_bias)[sid] == 0)
-	    return false;
-	  stats->sample_cnt++;
-	}
-      else 
-	{
-	  while (next < stop && (stats->good < max_samples || 
-				 stats->trg.size() < min_diverse))
-	    {
-	      next = root->readSid(next,stop,sid);
-	      next = root->readOffset(next,stop,offset);
-	      if (m_bias)
-		{
-		  id_type docid = m_bias->GetClass(sid);
-		  if (stats->indoc.size() > docid)
-		    {
-		      uint32_t N = stats->good;
-		      float k = min(stats->indoc[docid],N);
-		      float p = (*m_bias)[sid];
-		      
-		      typedef boost::math::binomial_distribution<> binomial;
-		      using namespace boost::math;
-		      if (cdf(complement(binomial(N+1, p), k)) < .05) continue;
-		    }
-		}
-	      { // brackets required for lock scoping; 
-		// see sguard immediately below
-		boost::lock_guard<boost::mutex> sguard(stats->lock); 
-		if (stats->raw_cnt == ctr) ++stats->raw_cnt;
-		size_t scalefac = (stats->raw_cnt - ctr++);
-		size_t rnum = scalefac * (rnd()/(rnd.max()+1.));
-		size_t th = (bias_total 
-			     ? ((*m_bias)[sid]/bias_total * stats->raw_cnt 
-				* max_samples)
-			     : max_samples);
-#if 0
-		cerr << rnum << "/" << scalefac << " vs. " 
-		     << max_samples - stats->good << " ("
-		     << max_samples << " - " << stats->good << ")" 
-		     << " th=" << th;
-		if (m_bias) 
-		  cerr << " with bias " << (*m_bias)[sid] 
-		       << " => " << th;
-		else cerr << " without bias";
-		cerr << endl;
-#endif
-#if 0
-		cerr << "bias total: " << bias_total 
-		     << " bias local: " << (*m_bias)[sid] 
-		     << " rnum: " << rnum 
- 		     << " good: " << stats->good 
-		     << " th: " << th 
-		     << " raw: " << stats->raw_cnt 
-		     << endl;
-#endif
-		if (rnum + stats->good < th)
-		  {
-		    stats->sample_cnt++;
-		    ret = true;
-		    break;
-		  }
-	      }
-	    }
-	}
-      
-      // boost::lock_guard<boost::mutex> sguard(stats->lock); 
-      // abuse of lock for clean output to cerr
-      // cerr << stats->sample_cnt++;
-      return ret;
-    }
-
-    template<typename Token>
-    void
-    Bitext<Token>::
-    agenda::
-    add_workers(int n)
-    {
-      static boost::posix_time::time_duration nodelay(0,0,0,0); 
-      boost::lock_guard<boost::mutex> guard(this->lock);
-
-      int target  = max(1, int(n + workers.size() - this->doomed));
-      // house keeping: remove all workers that have finished
-      for (size_t i = 0; i < workers.size(); )
-	{
-	  if (workers[i]->timed_join(nodelay))
-	    {
-	      if (i + 1 < workers.size())
-		workers[i].swap(workers.back());
-	      workers.pop_back();
-	    }
-	  else ++i;
-	}
-      // cerr << workers.size() << "/" << target << " active" << endl;
-      if (int(workers.size()) > target)
-	this->doomed = workers.size() - target;
-      else 
-	while (int(workers.size()) < target)
-	  {
-	    sptr<boost::thread> w(new boost::thread(worker(*this)));
-	    workers.push_back(w);
-	  }
-    }
-
-    template<typename Token>
-    void
-    Bitext<Token>::
-    agenda::
-    worker::
-    operator()()
-    {
-      // things to do:
-      // - have each worker maintain their own pstats object and merge results at the end;
-      // - ensure the minimum size of samples considered by a non-locked counter that is only 
-      //   ever incremented -- who cares if we look at more samples than required, as long
-      //   as we look at at least the minimum required
-      // This way, we can reduce the number of lock / unlock operations we need to do during 
-      // sampling. 
-      size_t s1=0, s2=0, e1=0, e2=0;
-      uint64_t sid=0, offset=0; // of the source phrase
-      while(sptr<job> j = ag.get_job())
-	{
-	  j->stats->register_worker();
-	  vector<uchar> aln;
-	  bitvector full_alignment(100*100);
-	  while (j->step(sid,offset))
-	    {
-	      int docid = j->m_bias ? j->m_bias->GetClass(sid) : -1;
-
-	      Token const* t = ag.bt.T2->sntStart(sid);
-	      Token const* eos = ag.bt.T2->sntEnd(sid);
-#if 0
-	      cerr << "[" << j->stats->good + 1 << "] ";
-	      while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " "; 
-	      cerr << "[" << docid << "]" << endl;
-#endif
-	      aln.clear();
-	      int po_fwd=Moses::LRModel::NONE,po_bwd=Moses::LRModel::NONE;
-	      if (j->fwd)
-		{
-		  if (!ag.bt.find_trg_phr_bounds
-		      (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
-		       &aln,&full_alignment,false))
-		    continue;
-		}
-	      else if (!ag.bt.find_trg_phr_bounds
-		       (sid,offset,offset+j->len,s1,s2,e1,e2,po_fwd,po_bwd,
-			&aln,NULL,true)) // NULL,NULL,true))
-		continue;
-	      j->stats->lock.lock(); 
-	      j->stats->good += 1; 
-	      j->stats->sum_pairs += (s2-s1+1)*(e2-e1+1);
-	      ++j->stats->ofwd[po_fwd];
-	      ++j->stats->obwd[po_bwd];
-	      j->stats->lock.unlock();
-	      // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) 
-	      for (size_t k = 1; k < aln.size(); k += 2) 
-		aln[k] += s2 - s1;
-	      Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
-	      float sample_weight = 1./((s2-s1+1)*(e2-e1+1));
-
-	      vector<uint64_t> seen; 
-	      seen.reserve(100);
-	      // It is possible that the phrase extraction extracts the same
-	      // phrase twice, e.g., when word a co-occurs with sequence b b b
-	      // but is aligned only to the middle word. We can only count
-	      // each phrase pair once per source phrase occurrence, or else
-	      // run the risk of having more joint counts than marginal
-	      // counts.
-
-	      for (size_t s = s1; s <= s2; ++s)
-		{
-		  sptr<iter> b = (j->fwd ? ag.bt.I2 : ag.bt.I1)->find(o+s,e1-s);
-		  if (!b || b->size() < e1 -s)
-		    UTIL_THROW(util::Exception, "target phrase not found");
-		  // assert(b);
-		  for (size_t i = e1; i <= e2; ++i)
-		    {
-		      uint64_t tpid = b->getPid();
-		      size_t s = 0;
-		      while (s < seen.size() && seen[s] != tpid) ++s;
-		      if (s < seen.size())
-			{
-#if 0
-			  size_t sid, off, len;
-			  parse_pid(tpid,sid,off,len);
-			  cerr << "HA, gotcha! " << sid << ":" << off << " at " << HERE << endl;
-			  for (size_t z = 0; z < len; ++z)
-			    {
-			      id_type tid = ag.bt.T2->sntStart(sid)[off+z].id();
-			      cerr << (*ag.bt.V2)[tid] << " "; 
-			    }
-			  cerr << endl;
-#endif
-			  continue;
-			}
-		      seen.push_back(tpid);
-		      if (! j->stats->add(tpid,sample_weight,aln,
-					  b->approxOccurrenceCount(),
-					  po_fwd,po_bwd,docid))
-			{
-			  cerr << "FATAL ERROR AT " << __FILE__ 
-			       << ":" << __LINE__ << endl;
-			  assert(0);
-			  ostringstream msg;
-			  for (size_t z = 0; z < j->len; ++z)
-			    {
-			      id_type tid = ag.bt.T1->sntStart(sid)[offset+z].id();
-			      cerr << (*ag.bt.V1)[tid] << " "; 
-			    }
-			  cerr << endl;
-			  for (size_t z = s; z <= i; ++z)
-			    cerr << (*ag.bt.V2)[(o+z)->id()] << " "; 
-			  cerr << endl;
-			  assert(0);
-			  UTIL_THROW(util::Exception,"Error in sampling.");
-			}
-		      if (i < e2)
-			{
-#ifndef NDEBUG
-			  bool ok = b->extend(o[i].id());
-			  assert(ok);
-#else
-			  b->extend(o[i].id());
-			  // cerr << "boo" << endl;
-#endif 
-			}
-		    }
-		  // if (j->fwd && s < s2) 
-		  // for (size_t k = j->fwd ? 1 : 0; k < aln.size(); k += 2) 
-		  if (s < s2)
-		    for (size_t k = 1; k < aln.size(); k += 2) 
-		      --aln[k];
-		}
-	      // j->stats->lock.unlock();
-	    }
-	  j->stats->release();
-	}
-    }
-
-    template<typename Token>
-    Bitext<Token>::
-    agenda::
-    job::
-    ~job()
-    {
-      if (stats) stats.reset();
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
-      try { --active; } catch (...) {} 
-#endif
-      // counter may not exist any more at destruction time
-    }
-
-    template<typename Token>
-    Bitext<Token>::
-    agenda::
-    job::
-    job(Bitext<Token> const* const theBitext,
-	typename TSA<Token>::tree_iterator const& m, 
-	sptr<TSA<Token> > const& r, size_t maxsmpl, 
-	bool isfwd, sptr<SamplingBias const> const& bias)
-      : m_bitext(theBitext)
-      , rnd(0)
-      , rnddenom(rnd.max() + 1.)
-      , min_diverse(1)
-      , workers(0)
-      , root(r)
-      , next(m.lower_bound(-1))
-      , stop(m.upper_bound(-1))
-      , max_samples(maxsmpl)
-      , ctr(0)
-      , len(m.size())
-      , fwd(isfwd)
-      , m_bias(bias)
-    {
-      stats.reset(new pstats());
-      stats->raw_cnt = m.approxOccurrenceCount();
-      bias_total = 0; 
-      // we need to renormalize on the fly, as the summ of all sentence probs over 
-      // all candidates (not all sentences in the corpus) needs to add to 1.
-      // Profiling question: how much does that cost us?
-      if (m_bias)
-	{
-	  int ctr = 0;
-	  stats->raw_cnt = 0;
-	  for (char const* x = m.lower_bound(-1); x < stop;)
-	    {
-	      uint32_t sid; ushort offset;
-	      x = root->readSid(x,stop,sid);
-	      x = root->readOffset(x,stop,offset);
-#if 0
-	      cerr << ctr++ << " " << m.str(m_bitext->V1.get()) 
-		   << " " << sid << "/" << root->getCorpusSize() 
-		   << " " << offset << " " << stop-x << endl;
-#endif
-	      bias_total += (*m_bias)[sid];
-	      ++stats->raw_cnt;
-	    }
-	}
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
-      ++active;
-      // if (active%5 == 0) 
-      // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
-#endif
-    }
-
-    template<typename Token>
-    sptr<pstats> 
-    Bitext<Token>::
-    agenda::
-    add_job(Bitext<Token> const* const theBitext,
-	    typename TSA<Token>::tree_iterator const& phrase, 
-	    size_t const max_samples, sptr<SamplingBias const> const& bias)
-    {
-      boost::unique_lock<boost::mutex> lk(this->lock);
-      static boost::posix_time::time_duration nodelay(0,0,0,0); 
-      bool fwd = phrase.root == bt.I1.get();
-      sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2, 
-			  max_samples, fwd, bias));
-      j->stats->register_worker();
-      
-      joblist.push_back(j);
-      if (joblist.size() == 1)
-	{
-	  size_t i = 0;
-	  while (i < workers.size())
-	    {
-	      if (workers[i]->timed_join(nodelay))
-		{
-		  if (doomed)
-		    {
-		      if (i+1 < workers.size())
-			workers[i].swap(workers.back());
-		      workers.pop_back();
-		      --doomed;
-		    }
-		  else
-		    workers[i++] = sptr<boost::thread>(new boost::thread(worker(*this)));
-		}
-	      else ++i;
-	    }
-	}
-      return j->stats;
-    }
-    
-    template<typename Token>
-    sptr<typename Bitext<Token>::agenda::job>
-    Bitext<Token>::
-    agenda::
-    get_job()
-    {
-      // cerr << workers.size() << " workers on record" << endl;
-      sptr<job> ret;
-      if (this->shutdown) return ret;
-      boost::unique_lock<boost::mutex> lock(this->lock);
-      if (this->doomed) 
-	{
-	  --this->doomed;
-	  return ret;
-	}
-      typename list<sptr<job> >::iterator j = joblist.begin();
-      while (j != joblist.end())
-	{
-	  if ((*j)->done()) 
-	    {
-	      (*j)->stats->release();
-	      joblist.erase(j++);
-	    } 
-	  else if ((*j)->workers >= 4) 
-	    {
-	      ++j;
-	    }
-	  else break;
-	}
-      if (joblist.size())
-	{
-	  ret = j == joblist.end() ? joblist.front() : *j;
-	  boost::lock_guard<boost::mutex> jguard(ret->lock);
-	  ++ret->workers;
-	}
-      return ret;
-    }
-
-   
-    template<typename TKN>
-    class mmBitext : public Bitext<TKN>
-    {
-      void load_document_map(string const& fname);
-    public:
-      void open(string const base, string const L1, string L2);
-      mmBitext();
-    };
-
-    template<typename TKN>
-    mmBitext<TKN>::
-    mmBitext()
-      : Bitext<TKN>(new mmTtrack<TKN>(),
-		    new mmTtrack<TKN>(),
-		    new mmTtrack<char>(),
-		    new TokenIndex(),
-		    new TokenIndex(),
-		    new mmTSA<TKN>(),
-		    new mmTSA<TKN>())
-    {};
-    
-    template<typename TKN>
-    void
-    mmBitext<TKN>::
-    load_document_map(string const& fname)
-    {
-	  ifstream docmap(fname.c_str());
-	  // the docmap file should list the documents in the corpus 
-	  // in the order in which they appear with one line per document:
-	  // <docname> <number of lines / sentences>
-	  //
-	  // in the future, we might also allow listing documents with
-	  // sentence ranges.
-	  string buffer,docname; size_t a=0,b;
-	  this->m_sid2docid.reset(new vector<id_type>(this->T1->size()));
-	  while(getline(docmap,buffer))
-	    {
-	      istringstream line(buffer); 
-	      if (!(line>>docname)) continue; // empty line
-	      if (docname.size() && docname[0] == '#') continue; // comment
-	      size_t docid = this->m_docname2docid.size();
-	      this->m_docname2docid[docname] = docid;
-	      line >> b;
-	      VERBOSE(1, "DOCUMENT MAP " << docname 
-		      << " " << a << "-" << b+a << endl);
-	      for (b += a; a < b; ++a)
-		(*this->m_sid2docid)[a] = docid;
-	    }
-	  UTIL_THROW_IF2(b != this->T1->size(), 
-			 "Document map doesn't match corpus!");
-    }
-
-    template<typename TKN>
-    void
-    mmBitext<TKN>::
-    open(string const base, string const L1, string L2)
-    {
-      mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
-      mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
-      mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
-      t1.open(base+L1+".mct");
-      t2.open(base+L2+".mct");
-      tx.open(base+L1+"-"+L2+".mam");
-      this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
-      this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
-      mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
-      mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
-      i1.open(base+L1+".sfa", this->T1);
-      i2.open(base+L2+".sfa", this->T2);
-      assert(this->T1->size() == this->T2->size());
-
-      string docmapfile = base+"dmp";
-      if (!access(docmapfile.c_str(),F_OK))
-	load_document_map(docmapfile);
-    }
-
-   
-    template<typename TKN>
-    class imBitext : public Bitext<TKN>
-    {
-      sptr<imTtrack<char> > myTx;
-      sptr<imTtrack<TKN> >  myT1;
-      sptr<imTtrack<TKN> >  myT2;
-      sptr<imTSA<TKN> >     myI1; 
-      sptr<imTSA<TKN> >     myI2;
-      static ThreadSafeCounter my_revision;
-    public:
-      size_t revision() const { return my_revision; }
-      void open(string const base, string const L1, string L2);
-      imBitext(sptr<TokenIndex> const& V1,
-	       sptr<TokenIndex> const& V2,
-	       size_t max_sample = 5000, size_t num_workers=4);
-      imBitext(size_t max_sample = 5000, size_t num_workers=4);
-      imBitext(imBitext const& other);
-      
-      // sptr<imBitext<TKN> > 
-      // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
-
-      sptr<imBitext<TKN> > 
-      add(vector<string> const& s1, 
-	  vector<string> const& s2, 
-	  vector<string> const& a) const;
-
-    };
-
-    template<typename TKN>
-    ThreadSafeCounter 
-    imBitext<TKN>::my_revision;
-
-    template<typename TKN>
-    imBitext<TKN>::
-    imBitext(size_t max_sample, size_t num_workers)
-      : Bitext<TKN>(max_sample, num_workers)
-    { 
-      this->m_default_sample_size = max_sample;
-      this->V1.reset(new TokenIndex());
-      this->V2.reset(new TokenIndex());
-      this->V1->setDynamic(true);
-      this->V2->setDynamic(true);
-      ++my_revision;
-    }
-    
-    template<typename TKN>
-    imBitext<TKN>::
-    imBitext(sptr<TokenIndex> const& v1,
-	     sptr<TokenIndex> const& v2,
-	     size_t max_sample, size_t num_workers)
-      : Bitext<TKN>(max_sample, num_workers)
-    { 
-      // this->default_sample_size = max_sample;
-      this->V1 = v1;
-      this->V2 = v2;
-      this->V1->setDynamic(true);
-      this->V2->setDynamic(true);
-      ++my_revision;
-    }
-    
-
-    template<typename TKN>
-    imBitext<TKN>::
-    imBitext(imBitext<TKN> const& other)
-    { 
-      this->myTx = other.myTx;
-      this->myT1 = other.myT1;
-      this->myT2 = other.myT2;
-      this->myI1 = other.myI1;
-      this->myI2 = other.myI2;
-      this->Tx = this->myTx;
-      this->T1 = this->myT1;
-      this->T2 = this->myT2;
-      this->I1 = this->myI1;
-      this->I2 = this->myI2;
-      this->V1 = other.V1;
-      this->V2 = other.V2;
-      this->m_default_sample_size = other.m_default_sample_size;
-      this->m_num_workers = other.m_num_workers;
-      ++my_revision;
-    }
-    
     template<typename TKN> class snt_adder;
     template<>             class snt_adder<L2R_Token<SimpleWordId> >;
 
@@ -1278,147 +277,17 @@ namespace Moses {
     class snt_adder<L2R_Token<SimpleWordId> >
     {
       typedef L2R_Token<SimpleWordId> TKN;
-      vector<string> const & snt;
+      std::vector<string> const & snt;
       TokenIndex           & V;
       sptr<imTtrack<TKN> > & track;
       sptr<imTSA<TKN > >   & index;
     public:
-      snt_adder(vector<string> const& s, TokenIndex& v, 
+      snt_adder(std::vector<string> const& s, TokenIndex& v, 
     		sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
       
       void operator()();
     };
 
-    // template<typename TKN>
-    // class snt_adder
-    // {
-    //   vector<string> const & snt;
-    //   TokenIndex           & V;
-    //   sptr<imTtrack<TKN> > & track;
-    //   sptr<imTSA<TKN > >   & index;
-    // public:
-    //   snt_adder(vector<string> const& s, TokenIndex& v, 
-    //  		sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i);
-
-    //   template<typename T>
-    //   void operator()();
-    // };
-
-    // // template<>
-    // void
-    // snt_adder<L2R_Token<SimpleWordId> >::
-    // operator()();
-
-    //  template<>
-    //  void
-    //  snt_adder<char>::
-    //  operator()()
-    //  {
-    // 	vector<id_type> sids;
-    // 	sids.reserve(snt.size());
-    // 	BOOST_FOREACH(string const& s, snt)
-    // 	  {
-    // 	    sids.push_back(track ? track->size() : 0);
-    // 	    istringstream buf(s);
-    // 	    string w;
-    // 	    vector<char> s;
-    // 	    s.reserve(100);
-    // 	    while (buf >> w) 
-    // 	      s.push_back(vector<char>(V[w]));
-    // 	    track = append(track,s);
-    // 	  }
-    // 	index.reset(new imTSA<char>(*index,track,sids,V.tsize()));
-    // }
-    
-    // template<typename TKN>
-    // snt_adder<TKN>::
-    // snt_adder(vector<string> const& s, TokenIndex& v, 
-    //  	      sptr<imTtrack<TKN> >& t, sptr<imTSA<TKN> >& i)
-    //   : snt(s), V(v), track(t), index(i) 
-    // {
-    //   throw "Not implemented yet.";
-    // }
-
-    template<>
-    sptr<imBitext<L2R_Token<SimpleWordId> > > 
-    imBitext<L2R_Token<SimpleWordId> >::
-    add(vector<string> const& s1, 
-	vector<string> const& s2, 
-	vector<string> const& aln) const;
-
-    template<typename TKN>
-    sptr<imBitext<TKN> > 
-    imBitext<TKN>::
-    add(vector<string> const& s1, 
-	vector<string> const& s2, 
-	vector<string> const& aln) const
-    {
-      throw "Not yet implemented";
-    }
-    // template<typename TKN>
-    // sptr<imBitext<TKN> > 
-    // imBitext<TKN>::
-    // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a)
-    // {
-    //   boost::unique_lock<boost::shared_mutex> guard(m_lock);
-    //   sptr<imBitext<TKN> > ret(new imBitext<TKN>());
-    //   vector<id_type> sids(1,this->myT1.size()-1);
-    //   ret->myT1 = add(this->myT1,s1);
-    //   ret->myT2 = add(this->myT2,s2);
-    //   size_t v1size = this->V1.tsize();
-    //   size_t v2size = this->V2.tsize();
-    //   BOOST_FOREACH(TKN const& t, s1) { if (t->id() >= v1size) v1size = t->id() + 1; }
-    //   BOOST_FOREACH(TKN const& t, s2) { if (t->id() >= v2size) v2size = t->id() + 1; }
-    //   ret->myI1.reset(new imTSA<TKN>(*this->I1,ret->myT1,sids,v1size));
-    //   ret->myI2.reset(new imTSA<TKN>(*this->I2,ret->myT2,sids,v2size));
-    //   ostringstream abuf; 
-    //   BOOST_FOREACH(ushort x, a) binwrite(abuf,x);
-    //   vector<char> foo(abuf.str().begin(),abuf.str().end());
-    //   ret->myTx = add(this->myTx,foo);
-    //   ret->T1 = ret->myT1;
-    //   ret->T2 = ret->myT2;
-    //   ret->Tx = ret->myTx;
-    //   ret->I1 = ret->myI1;
-    //   ret->I2 = ret->myI2;
-    //   ret->V1 = this->V1;
-    //   ret->V2 = this->V2; 
-    //   return ret;
-    // }
-
-
-    // template<typename TKN>
-    // imBitext<TKN>::
-    // imBitext()
-    //   : Bitext<TKN>(new imTtrack<TKN>(),
-    // 		    new imTtrack<TKN>(),
-    // 		    new imTtrack<char>(),
-    // 		    new TokenIndex(),
-    // 		    new TokenIndex(),
-    // 		    new imTSA<TKN>(),
-    // 		    new imTSA<TKN>())
-    //   {}
-    
-
-    template<typename TKN>
-    void
-    imBitext<TKN>::
-    open(string const base, string const L1, string L2)
-    {
-      mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
-      mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
-      mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
-      t1.open(base+L1+".mct");
-      t2.open(base+L2+".mct");
-      tx.open(base+L1+"-"+L2+".mam");
-      this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
-      this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
-      mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
-      mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
-      i1.open(base+L1+".sfa", this->T1);
-      i2.open(base+L2+".sfa", this->T2);
-      assert(this->T1->size() == this->T2->size());
-    }
-
     template<typename Token>
     bool
     Bitext<Token>::
@@ -1427,7 +296,7 @@ namespace Moses {
      size_t const start, size_t const stop,
      size_t & s1, size_t & s2, size_t & e1, size_t & e2,
      int & po_fwd, int & po_bwd,
-     vector<uchar>* core_alignment, bitvector* full_alignment, 
+     std::vector<uchar>* core_alignment, bitvector* full_alignment, 
      bool const flip) const
     {
       // if (core_alignment) cout << "HAVE CORE ALIGNMENT" << endl;
@@ -1464,7 +333,7 @@ namespace Moses {
       size_t src,trg;
       size_t lft = forbidden.size();
       size_t rgt = 0;
-      vector<vector<ushort> > aln1(slen1),aln2(slen2);
+      std::vector<std::vector<ushort> > aln1(slen1),aln2(slen2);
       char const* p = Tx->sntStart(sid);
       char const* x = Tx->sntEnd(sid);
 
@@ -1532,33 +401,6 @@ namespace Moses {
       return ret;
     }
 
-//     template<typename Token>
-//     sptr<DocumentBias>
-//     Bitext<Token>::
-//     SetupDocBias(string const& bserver, map<id_type,size_t> const& ctx) const
-//     {
-
-//       sptr<DocumentBias> ret;
-// #ifdef HAVE_CURLPP
-//       map<id_type,size_t>::const_iterator w = ctx.begin();
-//       while(w != ctx.end() && w->second == 0) ++w;
-//       if (w == ctx.end()) return ret;
-//       string context; context.reserve(5000);
-//       context += (*V1)[w->first];
-//       while (++w != ctx.end())
-// 	{
-// 	  if (w->second == 0) continue;
-// 	  context += " ";
-// 	  context += (*V1)[w->first];
-// 	}
-//       cerr << HERE << endl;
-//       cerr << "BIAS LOOKUP CONTEXT: " << context << endl; 
-//       ret =  GetDocBiasFromServer(bserver+curlpp::escape(context));
-// #endif
-//       return ret;
-//     }
-    
-    
     template<typename Token>
     void
     Bitext<Token>::
@@ -1587,7 +429,7 @@ namespace Moses {
       // - no caching for rare phrases and special requests (max_sample)
       //   (still need to test what a good caching threshold is ...)
       // - use the task-specific cache when there is a sampling bias
-      if (max_sample == m_default_sample_size 
+      if (max_sample == int(m_default_sample_size)
 	  && phrase.approxOccurrenceCount() > m_pstats_cache_threshold) 
 	{
 	  cache = (phrase.root == I1.get() 
@@ -1622,7 +464,7 @@ namespace Moses {
     {
       Ttrack<Token> const& m_other;
       sptr<pstats> m_pstats;
-      vector<PhrasePair<Token> >& m_pplist;
+      std::vector<PhrasePair<Token> >& m_pplist;
       typename PhrasePair<Token>::Scorer const* m_scorer;
       PhrasePair<Token> m_pp;
       Token const* m_token;
@@ -1635,7 +477,7 @@ namespace Moses {
       pstats2pplist(typename TSA<Token>::tree_iterator const& m,
 		    Ttrack<Token> const& other,
 		    sptr<pstats> const& ps, 
-		    vector<PhrasePair<Token> >& dest, 
+		    std::vector<PhrasePair<Token> >& dest, 
 		    typename PhrasePair<Token>::Scorer const* scorer)
 	: m_other(other)
 	, m_pstats(ps)
@@ -1665,7 +507,8 @@ namespace Moses {
 	    uint32_t sid,off,len;
 	    parse_pid(a->first, sid, off, len);
 	    m_pp.update(a->first, m_other.sntStart(sid)+off, len, a->second);
-	    m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),m_pp.joint);
+	    m_pp.good2 = max(uint32_t(m_pp.raw2 * float(m_pp.good1)/m_pp.raw1),
+			     m_pp.joint);
 	    size_t J = m_pp.joint<<7; // hard coded threshold of 1/128
 	    if (m_pp.good1 > J || m_pp.good2 > J) continue; 
 	    if (m_scorer) 
@@ -1683,13 +526,13 @@ namespace Moses {
     template<typename Token>
     void
     Bitext<Token>::
-    lookup(vector<Token> const& snt, TSA<Token>& idx, 
-	   vector<vector<sptr<vector<PhrasePair<Token> > > > >& dest,
-	   vector<vector<uint64_t> >* pidmap,
+    lookup(std::vector<Token> const& snt, TSA<Token>& idx, 
+	   std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > >& dest,
+	   std::vector<std::vector<uint64_t> >* pidmap,
 	   typename PhrasePair<Token>::Scorer* scorer,
 	   sptr<SamplingBias const> const& bias, bool multithread) const
     {
-      // typedef vector<vector<sptr<vector<PhrasePair<Token> > > > > ret_t;
+      // typedef std::vector<std::vector<sptr<std::vector<PhrasePair<Token> > > > > ret_t;
       
       dest.clear(); 
       dest.resize(snt.size());
@@ -1698,7 +541,7 @@ namespace Moses {
       // collect statistics in parallel, then build PT entries as 
       // the sampling finishes
       bool fwd = &idx == I1.get();
-      vector<boost::thread*> workers; // background threads doing the lookup
+      std::vector<boost::thread*> workers; // background threads doing the lookup
       pplist_cache_t& C = (fwd ? m_pplist_cache1 : m_pplist_cache2);
       if (C.capacity() < 100000) C.reserve(100000);
       for (size_t i = 0; i < snt.size(); ++i)
@@ -1709,12 +552,12 @@ namespace Moses {
 	    {
 	      uint64_t key = m.getPid();
 	      if (pidmap) (*pidmap)[i].push_back(key);
-	      sptr<vector<PhrasePair<Token> > > pp = C.get(key);
+	      sptr<std::vector<PhrasePair<Token> > > pp = C.get(key);
 	      if (pp) 
 		dest[i].push_back(pp);
 	      else 
 		{
-		  pp.reset(new vector<PhrasePair<Token> >());
+		  pp.reset(new std::vector<PhrasePair<Token> >());
 		  C.set(key,pp);
 		  dest[i].push_back(pp);
 		  sptr<pstats> x = prep2(m, this->default_sample_size,bias);
@@ -1780,49 +623,12 @@ namespace Moses {
       return ret;
     }
 #endif
-
-    template<typename Token>
-    Bitext<Token>::
-    agenda::
-    ~agenda()
-    {
-      this->lock.lock();
-      this->shutdown = true;
-      this->lock.unlock();
-      for (size_t i = 0; i < workers.size(); ++i)
-	workers[i]->join();
-    }
     
     template<typename Token>
-    Bitext<Token>::
-    agenda::
-    agenda(Bitext<Token> const& thebitext)
-      : shutdown(false), doomed(0), bt(thebitext)
-    { }
-    
-    template<typename Token>
-    bool
-    Bitext<Token>::
-    agenda::
-    job::
-    done() const
-    { 
-      return (max_samples && stats->good >= max_samples) || next == stop; 
-    }
-
-#if UG_BITEXT_TRACK_ACTIVE_THREADS
-    template<typename TKN>
-    ThreadSafeCounter 
-    Bitext<TKN>::
-    agenda::
-    job::active;
-#endif
-
-    template<typename Token>
     void 
     expand(typename Bitext<Token>::iter const& m, 
 	   Bitext<Token> const& bt, pstats const& ps, 
-	   vector<PhrasePair<Token> >& dest, ostream* log)
+	   std::vector<PhrasePair<Token> >& dest, ostream* log)
     {
       bool fwd = m.root == bt.I1.get();
       dest.reserve(ps.trg.size());
@@ -1887,5 +693,9 @@ namespace Moses {
 #endif
   } // end of namespace bitext
 } // end of namespace moses
-#endif
+
+#include "ug_im_bitext.h"
+#include "ug_mm_bitext.h"
+
+
 
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
new file mode 100644
index 000000000..a9632c056
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda.h
@@ -0,0 +1,186 @@
+// -*- c++ -*-
+// to be included from ug_bitext.h
+
+// The agenda handles parallel sampling. 
+// It maintains a queue of unfinished sampling jobs and 
+// assigns them to a pool of workers.
+//
+template<typename Token>
+class Bitext<Token>
+::agenda
+{
+public:
+  class job;
+  class worker;
+private:
+  // taken by add_workers(), add_job(), get_job() and the destructor
+  boost::mutex lock; 
+  // jobs that have been added and not yet been removed by get_job()
+  std::list<sptr<job> > joblist;
+  // the threads running a worker functor
+  std::vector<sptr<boost::thread> > workers;
+  // set by the destructor; get_job() returns NULL once this is true
+  bool shutdown;
+  // number of workers that get_job() will still send home with a NULL job
+  size_t doomed;
+
+public:
+
+
+  // reference to the bitext handed to the constructor
+  Bitext<Token>   const& bt;
+
+  agenda(Bitext<Token> const& bitext);
+  ~agenda();
+
+  // change the size of the worker pool by n (at least one worker is kept)
+  void 
+  add_workers(int n);
+
+  // queue a new sampling job for /phrase/; returns the job's statistics object
+  sptr<pstats> 
+  add_job(Bitext<Token> const* const theBitext,
+	  typename TSA<Token>::tree_iterator const& phrase, 
+	  size_t const max_samples, sptr<SamplingBias const> const& bias);
+    // add_job(Bitext<Token> const* const theBitext,
+    // 	  typename TSA<Token>::tree_iterator const& phrase, 
+    // 	  size_t const max_samples, SamplingBias const* const bias);
+
+  // called by workers to fetch their next job; NULL means "quit"
+  sptr<job> 
+  get_job();
+};
+    
+template<typename Token>
+class Bitext<Token>::agenda::worker
+{
+  agenda& ag; // the agenda this worker fetches its jobs from
+public:
+  worker(agenda& a)
+    : ag(a)
+  { }
+
+  // thread body; defined in ug_bitext_agenda_worker.h
+  void operator()();
+};
+
+#include "ug_bitext_agenda_worker.h"
+#include "ug_bitext_agenda_job.h"
+
+template<typename Token>
+void Bitext<Token>
+::agenda
+::add_workers(int n)
+{
+  // Changes the size of the worker pool by n, keeping at least one worker.
+  // Surplus workers are not stopped here; they are recorded in /doomed/
+  // and sent home by get_job().
+  static boost::posix_time::time_duration nodelay(0,0,0,0);
+  boost::lock_guard<boost::mutex> guard(this->lock);
+
+  // the target is computed before finished threads are weeded out below
+  int target = max(1, int(n + workers.size() - this->doomed));
+
+  // house keeping: drop every thread that has already terminated
+  size_t w = 0;
+  while (w < workers.size())
+    {
+      if (!workers[w]->timed_join(nodelay))
+        { // still running, keep it
+          ++w;
+          continue;
+        }
+      if (w + 1 < workers.size())
+        workers[w].swap(workers.back());
+      workers.pop_back();
+    }
+
+  int const active = int(workers.size());
+  if (active > target)
+    { // too many: mark the surplus as doomed
+      this->doomed = workers.size() - target;
+      return;
+    }
+  // too few (or just right): start threads until the target is reached
+  for (int k = active; k < target; ++k)
+    {
+      sptr<boost::thread> t(new boost::thread(worker(*this)));
+      workers.push_back(t);
+    }
+}
+
+
+template<typename Token>
+sptr<pstats> Bitext<Token>
+::agenda
+::add_job(Bitext<Token> const* const theBitext,
+          typename TSA<Token>::tree_iterator const& phrase,
+          size_t const max_samples, sptr<SamplingBias const> const& bias)
+{
+  // Queues a sampling job for /phrase/ and returns the pstats object that
+  // the workers will fill. If the queue was empty before, worker threads
+  // that have terminated are either replaced or, while /doomed/ is
+  // non-zero, removed from the pool.
+  boost::unique_lock<boost::mutex> lk(this->lock);
+  static boost::posix_time::time_duration nodelay(0,0,0,0);
+
+  // the phrase is in L1 if its iterator is rooted in the L1 index
+  bool const fwd = (phrase.root == bt.I1.get());
+  sptr<job> j(new job(theBitext, phrase, fwd ? bt.I1 : bt.I2,
+                      max_samples, fwd, bias));
+  j->stats->register_worker();
+  joblist.push_back(j);
+
+  if (joblist.size() != 1)
+    return j->stats;
+
+  for (size_t i = 0; i < workers.size(); )
+    {
+      if (!workers[i]->timed_join(nodelay))
+        { // thread is still running
+          ++i;
+          continue;
+        }
+      if (doomed)
+        { // pool is to shrink: drop the finished thread
+          if (i+1 < workers.size())
+            workers[i].swap(workers.back());
+          workers.pop_back();
+          --doomed;
+        }
+      else
+        { // replace the finished thread by a fresh one
+          workers[i] = sptr<boost::thread>(new boost::thread(worker(*this)));
+          ++i;
+        }
+    }
+  return j->stats;
+}
+
+template<typename Token>
+sptr<typename Bitext<Token>::agenda::job>
+Bitext<Token>
+::agenda
+::get_job()
+{
+  // Hands the calling worker its next job. A NULL pointer is returned when
+  // the agenda is shutting down, when the caller is one of the /doomed/
+  // workers, or when there is no job in the queue.
+
+  // cerr << workers.size() << " workers on record" << endl;
+  sptr<job> ret;
+
+  // Take the lock BEFORE looking at any shared state: the destructor sets
+  // /shutdown/ while holding this->lock, so reading the flag without the
+  // lock is a data race.
+  boost::unique_lock<boost::mutex> guard(this->lock);
+  if (this->shutdown) return ret;
+  if (this->doomed)
+    { // the number of workers has been reduced, tell the redundant ones to quit
+      --this->doomed;
+      return ret;
+    }
+
+  // remove finished jobs from the queue and look for the first job that
+  // has fewer than 4 workers on it
+  typename std::list<sptr<job> >::iterator j = joblist.begin();
+  while (j != joblist.end())
+    {
+      if ((*j)->done())
+        {
+          (*j)->stats->release();
+          joblist.erase(j++);
+        }
+      else if ((*j)->workers >= 4) ++j; // no more than 4 workers per job
+      else break; // found one
+    }
+  if (joblist.size())
+    {
+      // if we've reached the end of the queue (all jobs have 4 workers on them),
+      // take the first in the queue
+      ret = j == joblist.end() ? joblist.front() : *j;
+      boost::lock_guard<boost::mutex> jguard(ret->lock);
+      ++ret->workers;
+    }
+  return ret;
+}
+
+template<typename Token>
+Bitext<Token>::agenda::
+~agenda()
+{
+  { // raise the shutdown flag while holding the agenda lock
+    boost::lock_guard<boost::mutex> guard(this->lock);
+    this->shutdown = true;
+  }
+  // wait for every worker thread on record to terminate
+  for (size_t w = 0; w < workers.size(); ++w)
+    workers[w]->join();
+}
+    
+template<typename Token>
+Bitext<Token>::agenda::
+agenda(Bitext<Token> const& thebitext)
+  : shutdown(false)
+  , doomed(0)
+  , bt(thebitext)
+{
+  // no worker threads are started here; see add_workers()
+}
+  
+
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
new file mode 100644
index 000000000..efbebad52
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_job.h
@@ -0,0 +1,240 @@
+// -*- c++ -*-
+// class declaration of template<typename Token> class Bitext<Token>::agenda::job
+// to be included by ug_bitext.h
+// todo: add check to enforce this
+
+// A sampling job: walks over the occurrences of one phrase in a suffix
+// array index (the byte range [next,stop)) and decides for each occurrence
+// whether it becomes part of the sample. Results are collected in /stats/.
+template<typename Token>
+class 
+Bitext<Token>::agenda::
+job 
+{
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+  static ThreadSafeCounter active;
+#endif
+  Bitext<Token> const* const m_bitext;
+  boost::mutex lock; // taken by nextSample() and by agenda::get_job()
+  friend class agenda;
+  boost::taus88 rnd;  // every job has its own pseudo random generator 
+  double rnddenom;    // denominator for scaling random sampling
+  size_t min_diverse; // minimum number of distinct translations
+
+  // decide whether the occurrence at sid:offset is included in the sample
+  bool flip_coin(uint64_t & sid, uint64_t & offset); 
+  bool step(uint64_t & sid, uint64_t & offset); // proceed to next occurrence
+
+public:
+  size_t         workers; // how many workers are working on this job?
+  sptr<TSA<Token> const> root; // root of the underlying suffix array
+  char const*       next; // next position to read from 
+  char const*       stop; // end of index range
+  size_t     max_samples; // how many samples to extract at most
+  size_t             ctr; /* # of phrase occurrences considered so far
+			   * # of samples chosen is stored in stats->good 
+			   */
+  size_t             len; // phrase length
+  bool               fwd; // if true, source phrase is L1 
+  sptr<pstats>     stats; // stores statistics collected during sampling
+  sptr<SamplingBias const> const m_bias; // sentence-level bias for sampling
+  float bias_total; // sum of the bias values over all occurrences of the phrase
+  bool nextSample(uint64_t & sid, uint64_t & offset); // select next occurrence
+  
+  int 
+  check_sample_distribution(uint64_t const& sid, uint64_t const& offset);
+  // for biased sampling: ensure the distribution approximately matches 
+  // the bias
+  
+  // true once enough good samples were found or the index range is exhausted
+  bool done() const;
+  job(Bitext<Token> const* const theBitext, 
+      typename TSA<Token>::tree_iterator const& m, 
+      sptr<TSA<Token> > const& r, size_t maxsmpl, bool isfwd, 
+      sptr<SamplingBias const> const& bias);
+  ~job();
+};
+
+template<typename Token>
+Bitext<Token>::agenda::job::
+~job()
+{
+  // give up our share of the statistics object (no-op if there is none)
+  stats.reset();
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+  // the counter may not exist any more at destruction time,
+  // so failures are deliberately ignored
+  try
+    {
+      --active;
+    }
+  catch (...) { }
+#endif
+}
+
+// Sets up a sampling job for the phrase that /m/ points to.
+// @param theBitext  stored in m_bitext
+// @param m          tree iterator for the phrase; provides the index range,
+//                   the phrase length and the approximate occurrence count
+// @param r          suffix array used for reading sentence ids and offsets
+// @param maxsmpl    maximum number of samples (stored in max_samples)
+// @param isfwd      stored in /fwd/
+// @param bias       optional sampling bias; may be NULL
+template<typename Token>
+Bitext<Token>::agenda::job
+::job(Bitext<Token> const* const theBitext,
+      typename TSA<Token>::tree_iterator const& m, 
+      sptr<TSA<Token> > const& r, size_t maxsmpl, 
+      bool isfwd, sptr<SamplingBias const> const& bias)
+  : m_bitext(theBitext)
+  , rnd(0)
+  , rnddenom(rnd.max() + 1.)
+  , min_diverse(1)
+  , workers(0)
+  , root(r)
+  , next(m.lower_bound(-1))
+  , stop(m.upper_bound(-1))
+  , max_samples(maxsmpl)
+  , ctr(0)
+  , len(m.size())
+  , fwd(isfwd)
+  , m_bias(bias)
+{
+  stats.reset(new pstats());
+  stats->raw_cnt = m.approxOccurrenceCount();
+  bias_total = 0; 
+  
+  // we need to renormalize on the fly, as the sum of all sentence probs over 
+  // all candidates (not all sentences in the corpus) needs to add to 1.
+  // Profiling question: how much does that cost us?
+  if (m_bias)
+    {
+      // NOTE: this local /ctr/ shadows the member of the same name; it is
+      // only used by the disabled debugging output below.
+      int ctr = 0;
+      // with a bias, raw_cnt is replaced by an exact count of the
+      // occurrences in the index range
+      stats->raw_cnt = 0;
+      for (char const* x = m.lower_bound(-1); x < stop;)
+	{
+	  uint32_t sid; ushort offset;
+	  x = root->readSid(x,stop,sid);
+	  x = root->readOffset(x,stop,offset);
+#if 0
+	  cerr << ctr++ << " " << m.str(m_bitext->V1.get()) 
+	       << " " << sid << "/" << root->getCorpusSize() 
+	       << " " << offset << " " << stop-x << endl;
+#endif
+	  bias_total += (*m_bias)[sid];
+	  ++stats->raw_cnt;
+	}
+    }
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+  ++active;
+  // if (active%5 == 0) 
+  // cerr << size_t(active) << " active jobs at " << __FILE__ << ":" << __LINE__ << endl;
+#endif
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::done() const
+{
+  // nothing left to read in the index range?
+  if (next == stop) return true;
+  // otherwise we are done only if a sample limit is set and has been reached
+  return max_samples && stats->good >= max_samples;
+}
+
+template<typename Token>
+int Bitext<Token>::agenda::job
+::check_sample_distribution(uint64_t const& sid, uint64_t const& offset)
+{ // ensure that the sampled distribution approximately matches the bias
+  // @param sid    id of the sentence containing the candidate occurrence
+  // @param offset position of the occurrence in that sentence (only used
+  //               for logging)
+  // @return 0: SKIP this occurrence
+  // @return 1: consider this occurrence for sampling 
+  // @return 2: include this occurrence in the sample by all means
+
+  if (!m_bias) return 1;
+  
+  using namespace boost::math;
+  typedef boost::math::binomial_distribution<> binomial;
+  
+  // logging is active only if the bias object asks for log level > 1
+  ostream* log = m_bias->loglevel > 1 ? m_bias->log : NULL;
+  
+  float p = (*m_bias)[sid]; // bias value for this sentence
+  id_type docid = m_bias->GetClass(sid); 
+  // k: number of good samples counted for this document so far
+  uint32_t k = docid < stats->indoc.size() ? stats->indoc[docid] : 0; 
+
+  // always consider candidates from dominating documents and
+  // from documents that have not been considered at all yet
+  bool ret =  (p > .5 || k == 0);
+  
+  // without logging there is no need to compute d in this case
+  if (ret && !log) return 1;
+  
+  uint32_t N = stats->good; // number of trials
+  float d = cdf(complement(binomial(N, p), k)); 
+  // d: upper tail of Binomial(N,p) at k, i.e. the probability that a sample
+  // of size N contains MORE than k instances from doc #docid
+  ret = ret || d >= .05; 
+  
+  if (log)
+    {
+      // print up to 3 tokens of left context and the tokens from the start
+      // of the occurrence up to t+4 (clipped at the end of the sentence)
+      Token const* t = root->getCorpus()->sntStart(sid)+offset;
+      Token const* x = t - min(offset,uint64_t(3));
+      Token const* e = t+4; 
+      if (e > root->getCorpus()->sntEnd(sid))
+	e = root->getCorpus()->sntEnd(sid);
+      *log << docid << ":" << sid << " " << size_t(k) << "/" << N 
+	   << " @" << p << " => " << d << " [";
+      for (size_t i = 0; i < stats->indoc.size(); ++i)
+	{
+	  if (i) *log << " ";
+	  *log << stats->indoc[i];
+	}
+      *log << "] ";
+      // NOTE(review): tokens are always looked up in V1, regardless of
+      // /fwd/ -- confirm that this is intended
+      for (; x < e; ++x) *log << (*m_bitext->V1)[x->id()] << " ";
+      if (!ret) *log << "SKIP";
+      else if (p < .5 && d > .9) *log << "FORCE";
+      *log << endl;
+    }
+  
+  // 0 if ruled out; 2 ("force") if p < .5 and d > .9; 1 otherwise
+  return (ret ? (p < .5 && d > .9) ? 2 : 1 : 0); 
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::flip_coin(uint64_t & sid, uint64_t & offset)
+{
+  // Decides whether the occurrence at sid:offset goes into the sample.
+  // verdict: 0 = no, 1 = maybe (flip a coin), 2 = yes
+  int verdict = 1;
+  if (m_bias) verdict = check_sample_distribution(sid, offset);
+  if (verdict == 0) return false;
+  if (verdict > 1)  return true;
+
+  // undecided: draw a random number and compare it against a threshold
+  size_t const chosen = stats->good;
+  size_t const total  = max(stats->raw_cnt, this->ctr);
+  size_t const left   = total - this->ctr;
+  size_t const dice   = left * (rnd()/(rnd.max()+1.));
+
+  // with a usable bias, the threshold is scaled by this sentence's share of
+  // the total bias mass; without a bias (or if all candidates have prob 0,
+  // which can happen with a very opinionated bias) it is just max_samples
+  size_t const threshold 
+    = (bias_total 
+       ? size_t((*m_bias)[sid]/bias_total * total * max_samples)
+       : max_samples);
+  return dice + chosen < threshold;
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::step(uint64_t & sid, uint64_t & offset)
+{ 
+  // Reads the next occurrence (sentence id and offset) from the index range
+  // and counts it in /ctr/. Returns false if the range is exhausted.
+  // The job lock is NOT taken here: the caller must hold it!
+  if (next == stop) 
+    return false;
+  UTIL_THROW_IF2 
+    ( next > stop, "Fatal error at " << HERE << ". How did that happen?" );
+  char const* const after_sid = root->readSid(next, stop, sid);
+  next = root->readOffset(after_sid, stop, offset);
+  ++ctr;
+  return true;
+}
+
+template<typename Token>
+bool Bitext<Token>::agenda::job
+::nextSample(uint64_t & sid, uint64_t & offset)
+{
+  // Advances to the next occurrence that is to be sampled and stores its
+  // position in sid / offset. Returns false when there is nothing more to do.
+  boost::lock_guard<boost::mutex> jguard(lock);
+
+  // max_samples == 0 means no sampling: every occurrence is considered
+  if (max_samples == 0) 
+    return step(sid, offset);
+
+  for (;;)
+    {
+      if (!step(sid,offset)) 
+        return false; // index range exhausted
+
+      bool const enough = (stats->good >= max_samples && 
+                           stats->trg.size() >= min_diverse);
+      if (enough) 
+        return false; // done
+
+      // flip_coin softly enforces approximation of the sampling to the
+      // bias (occurrences that would steer the sample too far from the
+      // bias are ruled out), and flips a biased coin otherwise.
+      if (flip_coin(sid,offset)) 
+        return true;
+    }
+}
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+// definition of the static counter declared in class job; it is
+// incremented in the job constructor and decremented in the destructor
+template<typename TKN>
+ThreadSafeCounter Bitext<TKN>::agenda
+::job
+::active;
+#endif
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
new file mode 100644
index 000000000..92ed3d36a
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_agenda_worker.h
@@ -0,0 +1,102 @@
+// to be included from ug_bitext_agenda.h
+
+template<typename Token>
+void
+Bitext<Token>::agenda
+::worker
+::operator()()
+{
+  // Thread body of a sampling worker. The worker keeps fetching jobs from
+  // the agenda until get_job() returns NULL. For every occurrence selected
+  // by job::nextSample() it determines the boundaries of the aligned target
+  // phrase and records all extractable phrase pairs in the job's pstats.
+  //
+  // things to do:
+  // 
+  // - have each worker maintain their own pstats object and merge
+  //   results at the end (to minimize mutex locking);
+  // 
+  // - use a non-locked, monotonically increasing counter to
+  //   ensure the minimum size of samples considered --- it's OK if
+  //   we look at more samples than required. This way, we can 
+  //   reduce the number of lock / unlock operations we need to do
+  //   during sampling.
+
+  uint64_t sid=0, offset=0;      // sid and offset of source phrase
+  size_t s1=0, s2=0, e1=0, e2=0; // soft and hard boundaries of target phrase
+  vector<uchar> aln;             // stores phrase-pair-internal alignment
+  while(sptr<job> j = ag.get_job())
+    {
+      j->stats->register_worker();
+      bitvector full_alignment(100*100); // Is full_alignment still needed???
+      while (j->nextSample(sid,offset))
+        {
+          aln.clear();
+          int po_fwd = Moses::LRModel::NONE;
+          int po_bwd = Moses::LRModel::NONE;
+          // -1 means "no document information" (no bias given)
+          int docid  = j->m_bias ? j->m_bias->GetClass(sid) : -1;
+          bitvector* full_aln = j->fwd ? &full_alignment : NULL;
+
+          // find soft and hard boundaries of target phrase
+          bool good = (ag.bt.find_trg_phr_bounds
+                       (sid, offset, offset + j->len,   // input parameters
+                        s1, s2, e1, e2, po_fwd, po_bwd, // bounds & orientation
+                        &aln, full_aln, !j->fwd));      // aln info / flip sides?
+
+          if (!good) 
+            { // no good, probably because phrase is not coherent
+              j->stats->count_sample(docid, 0, po_fwd, po_bwd);
+              continue;
+            }
+
+          // all good: register this sample as valid 
+          size_t num_pairs = (s2-s1+1) * (e2-e1+1);
+          j->stats->count_sample(docid, num_pairs, po_fwd, po_bwd);
+
+#if 0
+          Token const* t = ag.bt.T2->sntStart(sid);
+          Token const* eos = ag.bt.T2->sntEnd(sid);
+          cerr << "[" << j->stats->good + 1 << "] ";
+          while (t != eos) cerr << (*ag.bt.V2)[(t++)->id()] << " "; 
+          cerr << "[" << docid << "]" << endl;
+#endif
+
+          float sample_weight = 1./num_pairs;
+          Token const* o = (j->fwd ? ag.bt.T2 : ag.bt.T1)->sntStart(sid);
+
+          // adjust offsets in phrase-internal alignment 
+          for (size_t k = 1; k < aln.size(); k += 2) aln[k] += s2 - s1;
+
+          vector<uint64_t> seen; seen.reserve(10);
+          // It is possible that the phrase extraction extracts the same
+          // phrase twice, e.g., when word a co-occurs with sequence b b b
+          // but is aligned only to the middle word. We can only count
+          // each phrase pair once per source phrase occurrence, or else
+          // run the risk of having more joint counts than marginal
+          // counts.
+
+          for (size_t s = s1; s <= s2; ++s)
+            {
+              TSA<Token> const& I = j->fwd ? *ag.bt.I2 : *ag.bt.I1;
+              sptr<iter> b = I.find(o + s, e1 - s);
+              UTIL_THROW_IF2(!b || b->size() < e1-s, "target phrase not found");
+
+              for (size_t i = e1; i <= e2; ++i)
+                {
+                  uint64_t tpid = b->getPid();
+
+                  // poor man's protection against over-counting:
+                  // linear search for tpid among the phrase ids seen so far
+                  size_t pos = 0;
+                  while (pos < seen.size() && seen[pos] != tpid) ++pos;
+                  if (pos == seen.size())
+                    { // not seen before for this occurrence: count it
+                      seen.push_back(tpid);
+                      size_t raw2 = b->approxOccurrenceCount();
+                      j->stats->add(tpid, sample_weight, aln, raw2,
+                                    po_fwd, po_bwd, docid);
+                    }
+
+                  // Extend the target phrase even if the current one was a
+                  // duplicate. Skipping the extension (as a plain 'continue'
+                  // after the duplicate check would do) leaves the iterator
+                  // stuck on the duplicate, so that all longer target
+                  // phrases for this start position are silently dropped.
+                  bool ok = (i == e2) || b->extend(o[i].id());
+                  UTIL_THROW_IF2(!ok, "Could not extend target phrase.");
+                }
+              if (s < s2) // shift phrase-internal alignments
+                for (size_t k = 1; k < aln.size(); k += 2) 
+                  --aln[k];
+            }
+        }
+      j->stats->release(); // indicate that you're done working on j->stats
+    }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
new file mode 100644
index 000000000..2dda3ab9a
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.cc
@@ -0,0 +1,91 @@
+#include "ug_bitext_jstats.h"
+namespace Moses
+{
+  namespace bitext
+  {
+
+    // read access to the three counters
+    uint32_t 
+    jstats::
+    rcnt() const 
+    { 
+      return my_rcnt; 
+    }
+
+    float
+    jstats::
+    wcnt() const 
+    { 
+      return my_wcnt; 
+    }
+
+    uint32_t 
+    jstats::
+    cnt2() const 
+    { 
+      return my_cnt2; 
+    }
+
+    // What was that used for again? UG
+    // The sign of the weighted count doubles as a validity flag:
+    // a negative weighted count marks the entry as invalid.
+    bool 
+    jstats::
+    valid() 
+    { 
+      return my_wcnt >= 0; 
+    }
+
+    void 
+    jstats::
+    validate()   
+    { 
+      if (my_wcnt < 0) 
+        my_wcnt = -my_wcnt; 
+    }
+
+    void 
+    jstats::
+    invalidate() 
+    { 
+      if (my_wcnt > 0) 
+        my_wcnt = -my_wcnt; 
+    }
+
+    jstats::
+    jstats()
+      : my_rcnt(0)
+      , my_cnt2(0)
+      , my_wcnt(0)
+    { 
+      // zero the orientation counters, one slot per value up to LRModel::NONE
+      for (int i = Moses::LRModel::NONE; i >= 0; --i)
+        {
+          ofwd[i] = 0;
+          obwd[i] = 0;
+        }
+      my_aln.reserve(1);
+    }
+    
+    // Copy constructor. The mutex is not copied; every jstats object has
+    // its own. All counters are copied, including my_cnt2, which used to be
+    // left uninitialized in the copy.
+    jstats::
+    jstats(jstats const& other)
+      : my_rcnt(other.rcnt())
+      , my_cnt2(other.cnt2())
+      , my_wcnt(other.wcnt())
+      , my_aln(other.aln())
+      , indoc(other.indoc)
+    {
+      for (int i = 0; i <= Moses::LRModel::NONE; i++)
+        {
+          ofwd[i] = other.ofwd[i];
+          obwd[i] = other.obwd[i];
+        }
+    }
+  
+    // number of samples counted with forward orientation /idx/
+    uint32_t
+    jstats::
+    dcnt_fwd(PhraseOrientation const idx) const
+    {
+      // idx must not exceed LRModel::NONE, the last slot of ofwd
+      assert(idx <= Moses::LRModel::NONE);
+      uint32_t const count = ofwd[idx];
+      return count;
+    }
+
+    // number of samples counted with backward orientation /idx/
+    uint32_t
+    jstats::
+    dcnt_bwd(PhraseOrientation const idx) const
+    {
+      // idx must not exceed LRModel::NONE, the last slot of obwd
+      assert(idx <= Moses::LRModel::NONE);
+      uint32_t const count = obwd[idx];
+      return count;
+    }
+    
+    // Records one more observation of this phrase pair.
+    // @param w          weight added to the weighted joint count
+    // @param a          phrase-internal word alignment (may be empty)
+    // @param cnt2       raw count of the target phrase (overwrites my_cnt2)
+    // @param fwd_orient index into the forward orientation counters
+    // @param bwd_orient index into the backward orientation counters
+    // @param docid      document the sample came from; negative values
+    //                   are not counted in /indoc/
+    void 
+    jstats::
+    add(float w, vector<uchar> const& a, uint32_t const cnt2,
+	uint32_t fwd_orient, uint32_t bwd_orient, int const docid)
+    {
+      boost::lock_guard<boost::mutex> lk(this->lock);
+      my_cnt2 = cnt2;
+      my_rcnt += 1;
+      my_wcnt += w;
+      if (a.size())
+	{
+	  // look for an existing entry with the same alignment pattern;
+	  // count it, or append a new entry with count 1
+	  size_t i = 0;
+	  while (i < my_aln.size() && my_aln[i].second != a) ++i;
+	  if (i == my_aln.size()) 
+	    my_aln.push_back(pair<size_t,vector<uchar> >(1,a));
+	  else
+	    my_aln[i].first++;
+	  // NOTE(review): presumably meant to keep my_aln heap-ordered by
+	  // count; verify the parent index (i/2) and the push_heap
+	  // precondition before relying on that order
+	  if (my_aln[i].first > my_aln[i/2].first)
+	    push_heap(my_aln.begin(),my_aln.begin()+i+1);
+	}
+      ++ofwd[fwd_orient];
+      ++obwd[bwd_orient];
+      if (docid >= 0)
+	{
+	  // grow the per-document counter vector on demand
+	  while (int(indoc.size()) <= docid) indoc.push_back(0);
+	  ++indoc[docid];
+	}
+    }
+
+    // read access to the list of (count, alignment pattern) pairs
+    vector<pair<size_t, vector<uchar> > > const&
+    jstats::
+    aln() const 
+    { return my_aln; }
+
+  
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_jstats.h b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
new file mode 100644
index 000000000..13c86e34d
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_jstats.h
@@ -0,0 +1,48 @@
+// -*- c++ -*-
+#pragma once
+#include "ug_typedefs.h"
+#include "ug_lexical_reordering.h"
+#include <boost/thread.hpp>
+
+namespace Moses 
+{
+  namespace bitext
+  {
+    using namespace ugdiss;
+
+    // "joint" (i.e., phrase pair) statistics    
+    class
+    jstats
+    {
+      boost::mutex lock; // taken by add()
+      uint32_t my_rcnt; // unweighted joint count
+      uint32_t my_cnt2; // raw counts L2
+      float    my_wcnt; // weighted joint count 
+
+      // to do: use a static alignment pattern store that stores each pattern only
+      // once, so that we don't have to store so many alignment vectors
+      vector<pair<size_t, vector<uchar> > > my_aln; // internal word alignment
+
+      uint32_t ofwd[Moses::LRModel::NONE+1]; //  forward distortion type counts
+      uint32_t obwd[Moses::LRModel::NONE+1]; // backward distortion type counts
+
+    public:
+      vector<uint32_t> indoc; // counts origin of samples (for biased sampling)
+      jstats();
+      jstats(jstats const& other);
+      uint32_t rcnt() const; // raw joint counts
+      uint32_t cnt2() const; // raw target phrase occurrence count
+      float    wcnt() const; // weighted joint counts
+      
+      // list of (count, alignment pattern) pairs
+      vector<pair<size_t, vector<uchar> > > const & aln() const;
+      // record one more observation of this phrase pair
+      void add(float w, vector<uchar> const& a, uint32_t const cnt2,
+	       uint32_t fwd_orient, uint32_t bwd_orient, 
+	       int const docid);
+      // the sign of the weighted count serves as validity flag:
+      void invalidate(); // make the weighted count negative
+      void validate();   // make the weighted count non-negative
+      bool valid();      // true iff the weighted count is >= 0
+      uint32_t dcnt_fwd(PhraseOrientation const idx) const;
+      uint32_t dcnt_bwd(PhraseOrientation const idx) const;
+    };
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
new file mode 100644
index 000000000..bbae42e85
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.cc
@@ -0,0 +1,83 @@
+#include "ug_bitext_pstats.h"
+
+namespace Moses
+{
+  namespace bitext
+  {
+
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+    // definition of the static counter declared in struct pstats
+    ThreadSafeCounter pstats::active;
+#endif
+    
+    pstats::
+    pstats() 
+      : raw_cnt(0)
+      , sample_cnt(0)
+      , good(0)
+      , sum_pairs(0)
+      , in_progress(0)
+    {
+      // zero the orientation counters, one slot per value up to LRModel::NONE
+      for (int i = Moses::LRModel::NONE; i >= 0; --i)
+        {
+          ofwd[i] = 0;
+          obwd[i] = 0;
+        }
+    }
+
+    pstats::
+    ~pstats()
+    {
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+      // The counter may not exist any more at destruction time,
+      // so failures are deliberately ignored.
+      try 
+        { 
+          --active; 
+        } 
+      catch (...) { }
+#endif
+    }
+
+    // announce that one more thread is working on this object
+    void
+    pstats::
+    register_worker()
+    {
+      boost::lock_guard<boost::mutex> guard(this->lock);
+      ++this->in_progress;
+    }
+  
+    // announce that a thread is done with this object; when the last one
+    // leaves, everybody waiting on /ready/ is woken up
+    void
+    pstats::
+    release()
+    {
+      boost::lock_guard<boost::mutex> guard(this->lock);
+      bool const last_one = (this->in_progress == 1);
+      --this->in_progress;
+      if (last_one) // last one -> we're done
+        this->ready.notify_all();
+    }
+
+    // Registers one sampled occurrence of the source phrase.
+    // @param docid     document the sample was found in; a negative value
+    //                  means "unknown" and is not counted in /indoc/
+    // @param num_pairs number of phrase pairs extractable from this
+    //                  occurrence; 0 marks a sample without a usable
+    //                  alignment, which counts only towards sample_cnt
+    // @param po_fwd    forward phrase orientation (index into ofwd)
+    // @param po_bwd    backward phrase orientation (index into obwd)
+    void 
+    pstats
+    ::count_sample(int const docid, size_t const num_pairs, 
+                   int const po_fwd, int const po_bwd)
+    {
+      boost::lock_guard<boost::mutex> guard(lock);
+      ++sample_cnt;
+      if (num_pairs == 0) return;
+      ++good;
+      sum_pairs += num_pairs;
+      ++ofwd[po_fwd];
+      ++obwd[po_bwd];
+      // Workers pass docid == -1 when sampling without a bias. Indexing
+      // indoc with a negative docid is an out-of-bounds write, so only
+      // valid document ids are counted (as in jstats::add).
+      if (docid >= 0)
+        {
+          while (int(indoc.size()) <= docid) indoc.push_back(0);
+          ++indoc[docid];
+        }
+    }
+
+    // Adds one observation of target phrase /pid/ to the joint statistics.
+    // Throws if that gives the phrase pair more joint counts than there
+    // are good samples; returns true otherwise.
+    bool
+    pstats::
+    add(uint64_t pid, float const w, 
+        vector<uchar> const& a, 
+        uint32_t const cnt2, 
+        uint32_t fwd_o, 
+        uint32_t bwd_o, int const docid)
+    {
+      boost::lock_guard<boost::mutex> guard(this->lock);
+      jstats& entry = this->trg[pid]; // created on first use
+      entry.add(w, a, cnt2, fwd_o, bwd_o, docid);
+      if (!(this->good < entry.rcnt()))
+        return true;
+      UTIL_THROW(util::Exception, "more joint counts than good counts:" 
+                 << entry.rcnt() << "/" << this->good << "!");
+      return true;
+    }
+
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_bitext_pstats.h b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
new file mode 100644
index 000000000..c5b6c0152
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_bitext_pstats.h
@@ -0,0 +1,63 @@
+// -*- c++ -*-
+#pragma once
+
+#include <boost/thread.hpp>
+#include <boost/unordered_map.hpp>
+
+#include "ug_typedefs.h"
+#include "ug_bitext_jstats.h"
+#include "moses/thread_safe_container.h"
+
+namespace Moses
+{
+  namespace bitext
+  {
+    // Statistics collected for one source phrase during sampling: sample
+    // counters, phrase orientation counts, document origin counts and the
+    // joint statistics (jstats) for every target phrase observed.
+    struct 
+    pstats
+    {
+      typedef boost::unordered_map<uint64_t, sptr<pstats> > map_t;
+      typedef ThreadSafeContainer<uint64_t, sptr<pstats>, map_t> cache_t;
+      typedef std::vector<uchar> alnvec;
+#if UG_BITEXT_TRACK_ACTIVE_THREADS
+      static ThreadSafeCounter active;
+#endif
+      boost::mutex lock;               // for parallel gathering of stats
+      boost::condition_variable ready; // consumers can wait for me to be ready
+      
+      size_t raw_cnt;     // (approximate) raw occurrence count 
+      size_t sample_cnt;  // number of instances selected during sampling
+      size_t good;        // number of selected instances with valid word alignments
+      size_t sum_pairs;   // total number of target phrases extracted (can be > raw_cnt)
+      size_t in_progress; // how many threads are currently working on this?
+
+      uint32_t ofwd[Moses::LRModel::NONE+1]; // distribution of fwd phrase orientations
+      uint32_t obwd[Moses::LRModel::NONE+1]; // distribution of bwd phrase orientations
+
+      std::vector<uint32_t> indoc; // distribution over where samples came from
+      
+      // joint statistics, keyed by target phrase id
+      typedef std::map<uint64_t, jstats> trg_map_t;
+      trg_map_t trg;
+      pstats();
+      ~pstats();
+      void release();         // decrements in_progress; notifies /ready/ on last
+      void register_worker(); // increments in_progress
+      // NOTE: reads in_progress without taking the lock
+      size_t count_workers() { return in_progress; } 
+
+      bool 
+      add(uint64_t const  pid, // target phrase id
+	  float const       w, // sample weight (1./(# of phrases extractable))
+	  alnvec const&     a, // local alignment
+	  uint32_t const cnt2, // raw target phrase count
+	  uint32_t fwd_o,      // fwd. phrase orientation
+	  uint32_t bwd_o,      // bwd. phrase orientation
+	  int const docid);    // document where sample was found 
+
+      void 
+      count_sample(int const docid,        // document where sample was found
+		   size_t const num_pairs, // # of phrases extractable here
+		   int const po_fwd,       // fwd phrase orientation
+		   int const po_bwd);      // bwd phrase orientation
+    };
+
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.cc b/moses/TranslationModel/UG/mm/ug_im_bitext.cc
new file mode 100644
index 000000000..9f26a181b
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_im_bitext.cc
@@ -0,0 +1,87 @@
+#include "ug_im_bitext.h"
+
+namespace Moses
+{
+  namespace bitext
+  {
+
+    // Returns a copy of this bitext to which the given sentence pairs have
+    // been added; the object itself is not modified.
+    // @param s1  source sentences, one string per sentence
+    // @param s2  target sentences, parallel to s1
+    // @param aln word alignments, parallel to s1; each string is a sequence
+    //            of "row-col" pairs
+    template<>
+    sptr<imBitext<L2R_Token<SimpleWordId> > > 
+    imBitext<L2R_Token<SimpleWordId> >::
+    add(vector<string> const& s1, 
+	vector<string> const& s2, 
+	vector<string> const& aln) const
+    {
+      typedef L2R_Token<SimpleWordId> TKN;
+      assert(s1.size() == s2.size() && s1.size() == aln.size());
+      
+#ifndef NDEBUG
+      // remember where the new sentences start (for the sanity check below)
+      size_t first_new_snt = this->T1 ? this->T1->size() : 0;
+#endif
+
+      sptr<imBitext<TKN> > ret;
+      {
+	// the copy is made while holding m_lock
+	boost::unique_lock<boost::shared_mutex> guard(m_lock);
+	ret.reset(new imBitext<TKN>(*this));
+      }
+      
+      // we add the sentences in separate threads (so it's faster)
+      boost::thread thread1(snt_adder<TKN>(s1,*ret->V1,ret->myT1,ret->myI1));
+      // thread1.join(); // for debugging
+      boost::thread thread2(snt_adder<TKN>(s2,*ret->V2,ret->myT2,ret->myI2));
+      // NOTE(review): if UTIL_THROW_IF2 throws in the loop below, thread1
+      // and thread2 are not joined before /ret/ goes out of scope -- confirm
+      // that this cannot leave the threads working on a destroyed object
+      BOOST_FOREACH(string const& a, aln)
+	{
+	  // convert the textual alignment into its binary representation
+	  istringstream ibuf(a);
+	  ostringstream obuf;
+	  uint32_t row,col; char c;
+	  while (ibuf >> row >> c >> col)
+	    {
+	      UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+			     << "Error in alignment information:\n" << a);
+	      binwrite(obuf,row);
+	      binwrite(obuf,col);
+	    }
+	  // important: DO NOT replace the two lines below this comment by 
+	  // char const* x = obuf.str().c_str(), as the memory x is pointing 
+	  // to is freed immediately upon deconstruction of the string object.
+	  string foo = obuf.str(); 
+	  char const* x = foo.c_str();
+	  vector<char> v(x,x+foo.size());
+	  ret->myTx = append(ret->myTx, v);
+	}
+
+      thread1.join();
+      thread2.join();
+
+      // publish the extended tracks and indices through the base class members
+      ret->Tx = ret->myTx;
+      ret->T1 = ret->myT1;
+      ret->T2 = ret->myT2;
+      ret->I1 = ret->myI1;
+      ret->I2 = ret->myI2;
+
+#ifndef NDEBUG
+      // sanity check: every alignment point of a new sentence pair must
+      // lie within the two sentences
+      for (size_t i = first_new_snt; i < ret->T1->size(); ++i)
+	{
+	  size_t slen1  = ret->T1->sntLen(i);
+	  size_t slen2  = ret->T2->sntLen(i);
+	  char const* p = ret->Tx->sntStart(i);
+	  char const* q = ret->Tx->sntEnd(i);
+	  size_t k;
+	  while (p < q)
+	    {
+	      p = binread(p,k);
+	      assert(p);
+	      assert(p < q);
+	      assert(k < slen1);
+	      p = binread(p,k);
+	      assert(p);
+	      assert(k < slen2);
+	    }
+	}
+#endif
+      return ret;
+    }
+
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_im_bitext.h b/moses/TranslationModel/UG/mm/ug_im_bitext.h
new file mode 100644
index 000000000..a620b7219
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_im_bitext.h
@@ -0,0 +1,130 @@
+// -*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+namespace Moses
+{
+  namespace bitext 
+  {
+    // Bitext that keeps its tracks and suffix arrays in im* (imTtrack /
+    // imTSA) objects; add() returns an extended copy.
+    template<typename TKN>
+    class imBitext : public Bitext<TKN>
+    {
+      // typed handles to the im* objects; the constructors and add() also
+      // assign them to the corresponding base class members (Tx, T1, ...)
+      sptr<imTtrack<char> > myTx;
+      sptr<imTtrack<TKN> >  myT1;
+      sptr<imTtrack<TKN> >  myT2;
+      sptr<imTSA<TKN> >     myI1; 
+      sptr<imTSA<TKN> >     myI2;
+      // shared by all instances; incremented by every constructor
+      static ThreadSafeCounter my_revision;
+    public:
+      size_t revision() const { return my_revision; }
+      void open(string const base, string const L1, string L2);
+      imBitext(sptr<TokenIndex> const& V1,
+	       sptr<TokenIndex> const& V2,
+	       size_t max_sample = 5000, size_t num_workers=4);
+      imBitext(size_t max_sample = 5000, size_t num_workers=4);
+      imBitext(imBitext const& other);
+      
+      // sptr<imBitext<TKN> > 
+      // add(vector<TKN> const& s1, vector<TKN> const& s2, vector<ushort> & a);
+
+      // returns a copy of this bitext with the given sentence pairs added
+      sptr<imBitext<TKN> > 
+      add(vector<string> const& s1, 
+	  vector<string> const& s2, 
+	  vector<string> const& a) const;
+
+    };
+
+    // definition of the static revision counter declared in imBitext
+    template<typename TKN>
+    ThreadSafeCounter 
+    imBitext<TKN>::my_revision;
+
+    // Creates an empty bitext with two fresh, dynamic vocabularies.
+    template<typename TKN>
+    imBitext<TKN>::
+    imBitext(size_t max_sample, size_t num_workers)
+      : Bitext<TKN>(max_sample, num_workers)
+    {
+      this->m_default_sample_size = max_sample;
+
+      // source side vocabulary
+      this->V1.reset(new TokenIndex());
+      this->V1->setDynamic(true);
+
+      // target side vocabulary
+      this->V2.reset(new TokenIndex());
+      this->V2->setDynamic(true);
+
+      ++my_revision;
+    }
+    
+    // Creates an empty bitext that uses the given vocabularies, which are
+    // switched to dynamic mode. The sample size and the number of workers
+    // are only passed on to the base class constructor.
+    template<typename TKN>
+    imBitext<TKN>::
+    imBitext(sptr<TokenIndex> const& v1,
+             sptr<TokenIndex> const& v2,
+             size_t max_sample, size_t num_workers)
+      : Bitext<TKN>(max_sample, num_workers)
+    {
+      this->V1 = v1;
+      this->V1->setDynamic(true);
+
+      this->V2 = v2;
+      this->V2->setDynamic(true);
+
+      ++my_revision;
+    }
+    
+
+    // Copy constructor: the copy shares tracks, indices and vocabularies
+    // with /other/ (only the smart pointers are copied).
+    template<typename TKN>
+    imBitext<TKN>::
+    imBitext(imBitext<TKN> const& other)
+    {
+      // alignment track
+      this->myTx = other.myTx;
+      this->Tx   = this->myTx;
+
+      // L1 track, index and vocabulary
+      this->myT1 = other.myT1;
+      this->T1   = this->myT1;
+      this->myI1 = other.myI1;
+      this->I1   = this->myI1;
+      this->V1   = other.V1;
+
+      // L2 track, index and vocabulary
+      this->myT2 = other.myT2;
+      this->T2   = this->myT2;
+      this->myI2 = other.myI2;
+      this->I2   = this->myI2;
+      this->V2   = other.V2;
+
+      // sampling parameters
+      this->m_default_sample_size = other.m_default_sample_size;
+      this->m_num_workers = other.m_num_workers;
+
+      ++my_revision;
+    }
+
+    // declaration of the specialization for L2R_Token<SimpleWordId>;
+    // the definition is in ug_im_bitext.cc
+    template<>
+    sptr<imBitext<L2R_Token<SimpleWordId> > > 
+    imBitext<L2R_Token<SimpleWordId> >::
+    add(vector<string> const& s1, 
+	vector<string> const& s2, 
+	vector<string> const& aln) const;
+
+    // Generic fallback for token types without a specialization:
+    // always throws (a C string, not a std::exception).
+    template<typename TKN>
+    sptr<imBitext<TKN> > 
+    imBitext<TKN>::
+    add(vector<string> const& s1, 
+	vector<string> const& s2, 
+	vector<string> const& aln) const
+    {
+      throw "Not yet implemented";
+    }
+
+    // What's up with this function???? UG
+    // Opens the files <base><L1>.mct, <base><L2>.mct, <base><L1>-<L2>.mam,
+    // <base><L1|L2>.tdx and <base><L1|L2>.sfa.
+    // NOTE(review): the tracks and indices are reinterpret_cast to mm*
+    // types although this class keeps im* objects, and T1/T2/Tx/I1/I2 may
+    // still be NULL here -- confirm whether this function can be called
+    // safely at all.
+    template<typename TKN>
+    void
+    imBitext<TKN>::
+    open(string const base, string const L1, string L2)
+    {
+      mmTtrack<TKN>& t1 = *reinterpret_cast<mmTtrack<TKN>*>(this->T1.get());
+      mmTtrack<TKN>& t2 = *reinterpret_cast<mmTtrack<TKN>*>(this->T2.get());
+      mmTtrack<char>& tx = *reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
+      t1.open(base+L1+".mct");
+      t2.open(base+L2+".mct");
+      tx.open(base+L1+"-"+L2+".mam");
+      this->V1->open(base+L1+".tdx"); this->V1->iniReverseIndex();
+      this->V2->open(base+L2+".tdx"); this->V2->iniReverseIndex();
+      mmTSA<TKN>& i1 = *reinterpret_cast<mmTSA<TKN>*>(this->I1.get());
+      mmTSA<TKN>& i2 = *reinterpret_cast<mmTSA<TKN>*>(this->I2.get());
+      i1.open(base+L1+".sfa", this->T1);
+      i2.open(base+L2+".sfa", this->T2);
+      // both sides must have the same number of sentences
+      assert(this->T1->size() == this->T2->size());
+    }
+
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_mm_bitext.h b/moses/TranslationModel/UG/mm/ug_mm_bitext.h
new file mode 100644
index 000000000..211793277
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_mm_bitext.h
@@ -0,0 +1,81 @@
+// -*- c++ -*-
+// don't include this file directly! it is included by ug_bitext.h
+
+namespace Moses
+{
+  namespace bitext 
+  {
+    // Bitext variant derived from Bitext<TKN>; its components are opened
+    // from files by open().
+    template<typename TKN>
+    class mmBitext : public Bitext<TKN>
+    {
+    private:
+      // Reads the document map stored in the file fname.
+      void load_document_map(string const& fname);
+
+    public:
+      mmBitext();
+
+      // Opens the corpus files whose names are built from base, L1 and L2.
+      void open(string const base, string const L1, string L2);
+    };
+
+    // Default constructor: passes newly allocated, default-constructed
+    // components to the Bitext base class -- three mmTtrack objects (two
+    // over TKN, one over char), two TokenIndex objects and two mmTSA
+    // objects, in that order.
+    template<typename TKN>
+    mmBitext<TKN>::mmBitext()
+      : Bitext<TKN>(new mmTtrack<TKN>(),
+                    new mmTtrack<TKN>(),
+                    new mmTtrack<char>(),
+                    new TokenIndex(),
+                    new TokenIndex(),
+                    new mmTSA<TKN>(),
+                    new mmTSA<TKN>())
+    {
+    }
+    
+    // Reads the document map file fname and fills m_sid2docid (one
+    // document id per sentence of T1) and m_docname2docid (document
+    // name -> document id).
+    //
+    // The docmap file should list the documents in the corpus in the
+    // order in which they appear, with one line per document:
+    //   <docname> <number of lines / sentences>
+    // Empty lines and lines whose first field starts with '#' are
+    // skipped. In the future, we might also allow listing documents
+    // with sentence ranges.
+    //
+    // Throws (via UTIL_THROW_IF2) if the file cannot be read, if a line
+    // has no valid sentence count, or if the sentence counts do not add
+    // up to exactly the number of sentences in T1.
+    template<typename TKN>
+    void
+    mmBitext<TKN>::
+    load_document_map(string const& fname)
+    {
+      ifstream docmap(fname.c_str());
+      UTIL_THROW_IF2(!docmap, "Could not open document map " << fname);
+
+      size_t const num_sents = this->T1->size();
+      this->m_sid2docid.reset(new vector<id_type>(num_sents));
+
+      string buffer, docname;
+      size_t first = 0; // id of the first sentence of the current document
+      while (getline(docmap, buffer))
+        {
+          istringstream line(buffer);
+          if (!(line >> docname)) continue; // empty line
+          if (docname[0] == '#') continue;  // comment
+
+          size_t cnt = 0; // number of sentences in this document
+          UTIL_THROW_IF2(!(line >> cnt),
+                         "Missing or malformed sentence count for document "
+                         << docname << " in " << fname);
+
+          // Validate BEFORE writing: a map that lists more sentences
+          // than the corpus has must not run past the end of
+          // m_sid2docid. first <= num_sents always holds here, so the
+          // subtraction cannot wrap around.
+          UTIL_THROW_IF2(cnt > num_sents - first,
+                         "Document map doesn't match corpus!");
+
+          size_t const docid = this->m_docname2docid.size();
+          this->m_docname2docid[docname] = docid;
+
+          size_t const last = first + cnt; // one past the document's end
+          VERBOSE(1, "DOCUMENT MAP " << docname
+                  << " " << first << "-" << last << endl);
+          for (; first < last; ++first)
+            (*this->m_sid2docid)[first] = docid;
+        }
+
+      // Every sentence of the corpus must be covered by some document.
+      UTIL_THROW_IF2(first != num_sents,
+                     "Document map doesn't match corpus!");
+    }
+    
+    // Opens all components of the bitext. File names are formed by
+    // plain concatenation of the arguments:
+    //   base+L1+".mct", base+L2+".mct"  -> T1, T2
+    //   base+L1+"-"+L2+".mam"           -> Tx
+    //   base+L1+".tdx", base+L2+".tdx"  -> V1, V2 (reverse index is built)
+    //   base+L1+".sfa", base+L2+".sfa"  -> I1, I2
+    // If a file named base+"dmp" exists, the document map is loaded
+    // from it afterwards.
+    template<typename TKN>
+    void
+    mmBitext<TKN>::
+    open(string const base, string const L1, string L2)
+    {
+      typedef mmTtrack<TKN> track_t;
+      typedef mmTSA<TKN>    index_t;
+
+      // T1, T2, Tx, I1, I2 are presumably the mm* objects created in
+      // the constructor; the casts themselves are unchecked.
+      track_t* const src_track
+        = reinterpret_cast<track_t*>(this->T1.get());
+      track_t* const trg_track
+        = reinterpret_cast<track_t*>(this->T2.get());
+      mmTtrack<char>* const aln_track
+        = reinterpret_cast<mmTtrack<char>*>(this->Tx.get());
+
+      // token tracks and alignment track
+      src_track->open(base+L1+".mct");
+      trg_track->open(base+L2+".mct");
+      aln_track->open(base+L1+"-"+L2+".mam");
+
+      // vocabularies
+      this->V1->open(base+L1+".tdx");
+      this->V1->iniReverseIndex();
+      this->V2->open(base+L2+".tdx");
+      this->V2->iniReverseIndex();
+
+      // indices; each one is handed the track it belongs to
+      index_t* const src_index
+        = reinterpret_cast<index_t*>(this->I1.get());
+      index_t* const trg_index
+        = reinterpret_cast<index_t*>(this->I2.get());
+      src_index->open(base+L1+".sfa", this->T1);
+      trg_index->open(base+L2+".sfa", this->T2);
+
+      // both sides must contain the same number of sentences
+      assert(this->T1->size() == this->T2->size());
+
+      // the document map is optional: load it only if the file exists
+      string const docmapfile = base+"dmp";
+      if (access(docmapfile.c_str(),F_OK) == 0)
+        load_document_map(docmapfile);
+    }
+    
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..28a926587
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,246 @@
+// -*- c++ -*-
+#pragma once
+#include <vector>
+#include "ug_typedefs.h"
+#include "ug_bitext_pstats.h"
+
+namespace Moses
+{
+  namespace bitext
+  {
+    // A phrase pair together with the counts, feature values and
+    // alignment information collected for it.
+    //   start1/len1, start2/len2 : first token and length of each side
+    //   p1, p2                   : 64-bit phrase ids of the two sides
+    //   raw*, sample*, good*, joint : counts, set by init() / update()
+    //   dfwd, dbwd : one value per orientation; update() stores add-one
+    //                smoothed relative frequencies here, not raw counts
+    //   score      : the key used by the relational operators
+    template<typename Token>
+    class 
+    PhrasePair
+    {
+    public:
+      // Interface of function objects that compute a score for a
+      // phrase pair.
+      class Scorer 
+      { 
+      public: 
+        // Virtual destructor so that a scorer can be destroyed safely
+        // through a pointer to this base class.
+        virtual ~Scorer() {}
+        virtual float operator()(PhrasePair& pp) const = 0; 
+      };
+      Token const* start1;
+      Token const* start2;
+      uint32_t len1;
+      uint32_t len2;
+      uint64_t p1, p2;
+      uint32_t raw1, raw2, sample1, sample2, good1, good2, joint;
+      std::vector<float> fvals;
+      float dfwd[Moses::LRModel::NONE+1]; // forward  orientation values
+      float dbwd[Moses::LRModel::NONE+1]; // backward orientation values
+      std::vector<uchar> aln;
+      float score;
+      bool inverse;
+      std::vector<uint32_t> indoc;
+
+      // All scalar members start out zeroed (pointers NULL), so that
+      // copying a default-constructed PhrasePair -- the copy
+      // constructor reads every member -- never touches indeterminate
+      // values. The initializer order follows the declaration order.
+      PhrasePair() 
+        : start1(NULL), start2(NULL)
+        , len1(0), len2(0)
+        , p1(0), p2(0)
+        , raw1(0), raw2(0), sample1(0), sample2(0)
+        , good1(0), good2(0), joint(0)
+        , score(0), inverse(false)
+      { 
+        for (int i = 0; i <= Moses::LRModel::NONE; ++i)
+          dfwd[i] = dbwd[i] = 0;
+      }
+      PhrasePair(PhrasePair const& o);
+
+      // Adds the counts of other to the counts of this object.
+      PhrasePair const& operator+=(PhrasePair const& other);
+
+      // The relational operators compare the score members only.
+      bool operator<(PhrasePair const& other) const;
+      bool operator>(PhrasePair const& other) const;
+      bool operator<=(PhrasePair const& other) const; 
+      bool operator>=(PhrasePair const& other) const;
+
+      // Resets counts, lengths, phrase ids and start pointers.
+      void init();
+      // Sets up the first side of the pair (phrase id, tokens, counts
+      // from ps if given) and sizes fvals to numfeats entries.
+      void init(uint64_t const pid1, bool is_inverse, 
+		Token const* x,   uint32_t const len,
+		pstats const* ps = NULL, size_t const numfeats=0);
+
+      // Sets up the second side of the pair from the joint statistics js.
+      PhrasePair const& 
+      update(uint64_t const pid2, Token const* x, 
+	     uint32_t const len, jstats const& js);
+      
+      // Ordering by the token id sequence of the second side.
+      class SortByTargetIdSeq
+      {
+      public:
+	int cmp(PhrasePair const& a, PhrasePair const& b) const;
+	bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+      };
+      
+      // Ordering by joint count, highest count first.
+      class SortDescendingByJointCount
+      {
+      public:
+	int cmp(PhrasePair const& a, PhrasePair const& b) const;
+	bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+      };
+    };
+
+    // Prepares the object for a new first-side phrase: stores the
+    // phrase id, direction flag, start token and length; takes raw,
+    // sample and good counts from ps (zero if ps is NULL); clears p2 and
+    // all second-side / joint counts; resizes fvals to numfeats entries.
+    // Members not listed here (e.g. start2, len2, score, aln) are left
+    // untouched.
+    template<typename Token>
+    void PhrasePair<Token>
+    ::init(uint64_t const pid1, bool is_inverse, 
+	   Token const* x, uint32_t const len, 
+	   pstats const* ps, size_t const numfeats)
+    {
+      // first side
+      p1      = pid1;
+      start1  = x; 
+      len1    = len;
+      inverse = is_inverse;
+      if (!ps)
+        {
+          raw1 = sample1 = good1 = 0;
+        }
+      else
+        {
+          raw1    = ps->raw_cnt;
+          sample1 = ps->sample_cnt;
+          good1   = ps->good;
+        }
+
+      // second side and joint statistics start from scratch
+      p2      = 0;
+      raw2    = 0;
+      sample2 = 0;
+      good2   = 0;
+      joint   = 0;
+
+      fvals.resize(numfeats);
+    }
+
+    // Fills in the second side of the pair from the joint statistics
+    // js: phrase id, start token, length, raw2 (js.cnt2()), joint
+    // (js.rcnt()), the first alignment recorded in js (if there is
+    // one), the orientation values and indoc. Returns *this.
+    template<typename Token>
+    PhrasePair<Token> const&
+    PhrasePair<Token>
+    ::update(uint64_t const pid2, 
+	     Token const* x, uint32_t const len, jstats const& js)   
+    {
+      p2     = pid2;
+      start2 = x; 
+      len2   = len;
+      raw2   = js.cnt2();
+      joint  = js.rcnt();
+
+      // js is expected to carry at least one alignment
+      assert(js.aln().size());
+      if (js.aln().size()) 
+        aln = js.aln()[0].second;
+
+      int const last = Moses::LRModel::NONE;
+
+      // add-one smoothed totals over all orientations
+      float fwd_sum = 0, bwd_sum = 0;
+      for (int k = 0; k <= last; k++)
+        {
+          PhraseOrientation po = static_cast<PhraseOrientation>(k);
+          fwd_sum += js.dcnt_fwd(po)+1;
+          bwd_sum += js.dcnt_bwd(po)+1;
+        }
+
+      // should we do that here or leave the raw counts?
+      // -> stores smoothed relative frequencies, not counts
+      for (int k = 0; k <= last; k++)
+        {
+          PhraseOrientation po = static_cast<PhraseOrientation>(k);
+          dfwd[k] = float(js.dcnt_fwd(po)+1)/fwd_sum;
+          dbwd[k] = float(js.dcnt_bwd(po)+1)/bwd_sum;
+        }
+      
+      indoc = js.indoc;
+      return *this;
+    }
+
+    // True iff the score of this phrase pair is lower than other's.
+    template<typename Token>
+    bool 
+    PhrasePair<Token>
+    ::operator<(PhrasePair const& other) const 
+    { 
+      return other.score > this->score; 
+    }
+    
+    // True iff the score of this phrase pair is higher than other's.
+    template<typename Token>
+    bool 
+    PhrasePair<Token>
+    ::operator>(PhrasePair const& other) const
+    { 
+      return other.score < this->score; 
+    }
+
+    // True iff the score of this phrase pair is lower than or equal to
+    // other's.
+    template<typename Token>
+    bool 
+    PhrasePair<Token>
+    ::operator<=(PhrasePair const& other) const 
+    { 
+      return other.score >= this->score; 
+    }
+    
+    // True iff the score of this phrase pair is higher than or equal to
+    // other's.
+    template<typename Token>
+    bool 
+    PhrasePair<Token>
+    ::operator>=(PhrasePair const& other) const
+    { 
+      return other.score <= this->score; 
+    }
+
+    // Adds the seven count members of o (raw, sample and good counts
+    // of both sides, plus the joint count) to the corresponding members
+    // of this object. No other member is changed. Returns *this.
+    template<typename Token>
+    PhrasePair<Token> const&
+    PhrasePair<Token>
+    ::operator+=(PhrasePair const& o) 
+    { 
+      // first side
+      raw1    += o.raw1;
+      sample1 += o.sample1;
+      good1   += o.good1;
+      // second side
+      raw2    += o.raw2;
+      sample2 += o.sample2;
+      good2   += o.good2;
+      // co-occurrences
+      joint   += o.joint;
+      return *this;
+    }
+
+    // Copy constructor: member-wise copy of o. The two orientation
+    // arrays cannot appear in the initializer list and are therefore
+    // copied element by element in the body.
+    template<typename Token>
+    PhrasePair<Token>
+    ::PhrasePair(PhrasePair<Token> const& o) 
+      : start1(o.start1)
+      , start2(o.start2)
+      , len1(o.len1)
+      , len2(o.len2)
+      , p1(o.p1)
+      , p2(o.p2)
+      , raw1(o.raw1)
+      , raw2(o.raw2) 
+      , sample1(o.sample1)
+      , sample2(o.sample2)
+      , good1(o.good1)
+      , good2(o.good2)
+      , joint(o.joint)     
+      , fvals(o.fvals)
+      , aln(o.aln)      
+      , score(o.score)
+      , inverse(o.inverse)
+      , indoc(o.indoc)
+    {
+      int const last = Moses::LRModel::NONE;
+      for (int k = 0; k <= last; ++k)
+        dfwd[k] = o.dfwd[k];
+      for (int k = 0; k <= last; ++k)
+        dbwd[k] = o.dbwd[k];
+    }
+    
+    // Three-way comparison of the second sides of a and b by their
+    // token id sequences. Returns -1 if a sorts first, 1 if b sorts
+    // first, 0 if both sequences are identical. If one sequence is a
+    // proper prefix of the other, the shorter one sorts first.
+    template<typename Token>
+    int PhrasePair<Token>
+    ::SortByTargetIdSeq
+    ::cmp(PhrasePair const& a, PhrasePair const& b) const
+    {
+      Token const* lhs = a.start2;
+      Token const* rhs = b.start2;
+      size_t pos = 0;
+      for (; pos < a.len2 && pos < b.len2; ++pos)
+        {
+          // the first differing token id decides
+          if (lhs->id() != rhs->id())
+            return lhs->id() < rhs->id() ? -1 : 1;
+          lhs = lhs->next();
+          rhs = rhs->next();
+        }
+      // at least one side is used up without any difference found
+      if (pos == a.len2) 
+        return pos == b.len2 ? 0 : -1;
+      return 1;
+    }
+    
+    // Strict weak ordering built on cmp(): true iff a sorts before b.
+    template<typename Token>
+    bool PhrasePair<Token>
+    ::SortByTargetIdSeq
+    ::operator()(PhrasePair const& a, PhrasePair const& b) const
+    {
+      int const order = this->cmp(a,b);
+      return order < 0;
+    }
+
+    // Three-way comparison by joint count, larger counts first:
+    // -1 if a.joint is larger, 1 if b.joint is larger, 0 if equal.
+    template<typename Token>
+    int PhrasePair<Token>
+    ::SortDescendingByJointCount
+    ::cmp(PhrasePair const& a, PhrasePair const& b) const
+    {
+      if (a.joint > b.joint) return -1;
+      if (a.joint < b.joint) return  1;
+      return 0;
+    }
+
+    // Strict weak ordering built on cmp(): true iff a sorts before b,
+    // i.e. iff a has the larger joint count.
+    template<typename Token>
+    bool PhrasePair<Token>
+    ::SortDescendingByJointCount
+    ::operator()(PhrasePair const& a, PhrasePair const& b) const
+    {
+      int const order = this->cmp(a,b);
+      return order < 0;
+    }
+    
+    // Resets the object to an empty state: no tokens on either side,
+    // phrase ids and all counts zero, direction flag false. Other
+    // members (fvals, dfwd, dbwd, aln, score, indoc) are left untouched.
+    template<typename Token>
+    void PhrasePair<Token>
+    ::init()
+    {
+      // both sides: no tokens, no phrase id
+      start1 = NULL;
+      start2 = NULL;
+      len1   = 0;
+      len2   = 0;
+      p1     = 0;
+      p2     = 0;
+      // counts
+      raw1    = raw2    = 0;
+      sample1 = sample2 = 0;
+      good1   = good2   = 0;
+      joint   = 0;
+      inverse = false;
+    }
+  }
+}
diff --git a/moses/TranslationModel/UG/mm/ug_sampling_bias.h b/moses/TranslationModel/UG/mm/ug_sampling_bias.h
index f7c95f439..faed69e63 100644
--- a/moses/TranslationModel/UG/mm/ug_sampling_bias.h
+++ b/moses/TranslationModel/UG/mm/ug_sampling_bias.h
@@ -4,9 +4,9 @@
 #include <map>
 #include<vector>
 #include <string>
+#include <iostream>
 #include "moses/Util.h"
 #include "ug_typedefs.h"
-
 namespace Moses
 {
   namespace bitext
@@ -18,7 +18,8 @@ namespace Moses
     class SamplingBias 
     {
     public:
-      
+      int loglevel;
+      std::ostream* log;
       virtual float 
       operator[](id_type const ID) const = 0;
       // returns (unnormalized bias) for the class of item ID
diff --git a/moses/TranslationModel/UG/mm/ug_ttrack_base.h b/moses/TranslationModel/UG/mm/ug_ttrack_base.h
index 7c11b3942..f9864bda6 100644
--- a/moses/TranslationModel/UG/mm/ug_ttrack_base.h
+++ b/moses/TranslationModel/UG/mm/ug_ttrack_base.h
@@ -17,6 +17,7 @@
 #include "ug_ttrack_position.h"
 #include "tpt_typedefs.h"
 #include "tpt_tokenindex.h"
+#include "moses/Util.h"
 // #include "ug_vocab.h"
 
 namespace ugdiss
@@ -25,6 +26,33 @@ namespace ugdiss
 
   typedef boost::dynamic_bitset<uint64_t> bdBitset;
 
+  // Splits the 64-bit phrase id pid into its three packed fields:
+  //   bits  0..15 -> len
+  //   bits 16..31 -> off
+  //   bits 32..63 -> sid
+  // Each field is converted to the type of the respective output
+  // argument on assignment.
+  template<typename sid_t, typename off_t, typename len_t>
+  void 
+  parse_pid(uint64_t const pid, sid_t & sid, 
+            off_t & off, len_t& len)
+  {
+    uint64_t const low16 = 0xFFFF; // selects the lowest 16 bits
+    len = pid & low16;
+    off = (pid >> 16) & low16;
+    sid = pid >> 32;
+  }
+
+  // Renders the first len tokens of the chain starting at x as a
+  // string: the entries V[id] of the tokens, separated by single
+  // spaces. Returns "" if len is 0. Throws (via UTIL_THROW_IF2) if x is
+  // NULL although len > 0, or if the chain ends before len tokens have
+  // been written.
+  template<typename Token>
+  string 
+  toString(TokenIndex const& V, Token const* x, size_t const len)
+  {
+    if (len == 0) return "";
+    UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+
+    ostringstream buf; 
+    buf << V[x->id()];   // first token, no leading separator
+    size_t seen = 1;     // number of tokens written so far
+    x = x->next();
+    while (x && seen < len)
+      {
+        buf << " " << V[x->id()];
+        ++seen;
+        x = x->next();
+      }
+    UTIL_THROW_IF2(seen != len, HERE << ": Unexpected end of phrase!");
+    return buf.str();
+  }
+
   template<typename TKN=id_type>
   class Ttrack
   {
author	Ulrich Germann <Ulrich.Germann@gmail.com>	2015-04-05 16:17:47 +0300
committer	Ulrich Germann <Ulrich.Germann@gmail.com>	2015-04-05 16:29:00 +0300
commit	46e31a285c8f9257a9d6ab411db74b5cbec9d0fe (patch)
tree	9bf1afa3827e7252e6b9fd38e8ee27cef8693a9a /moses/TranslationModel/UG/mm
parent	05c4e382ff7914369700eb516a61a45238292bdf (diff)