42 files changed, 2365 insertions, 1303 deletions
diff --git a/.gitignore b/.gitignore
index f870bed03..e7c37d86c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -79,3 +79,4 @@ nbproject/
 mingw/MosesGUI/MosesGUI.e4p
 mingw/MosesGUI/_eric4project/
 
+contrib/m4m/merge-sorted
diff --git a/Jamroot b/Jamroot
index 283b4dd6f..79ec39940 100644
--- a/Jamroot
+++ b/Jamroot
@@ -152,13 +152,15 @@ build-projects lm util phrase-extract search moses moses/LM mert moses-cmd moses
 if [ option.get "with-mm" : : "yes" ]
 {
  alias mm :  
+  moses/TranslationModel/UG//spe-check-coverage2
   moses/TranslationModel/UG//ptable-lookup 
+  moses/TranslationModel/UG//sim-pe 
+  moses/TranslationModel/UG//spe-check-coverage 
   moses/TranslationModel/UG/mm//mtt-build 
   moses/TranslationModel/UG/mm//mtt-dump 
   moses/TranslationModel/UG/mm//symal2mam 
   moses/TranslationModel/UG/mm//mam2symal 
   moses/TranslationModel/UG/mm//mam_verify 
-  moses/TranslationModel/UG/mm//custom-pt 
   moses/TranslationModel/UG/mm//mmlex-build 
   moses/TranslationModel/UG/mm//mmlex-lookup 
   moses/TranslationModel/UG/mm//mtt-count-words 
diff --git a/OnDiskPt/queryOnDiskPt.cpp b/OnDiskPt/queryOnDiskPt.cpp
index a38fc5435..77576d956 100644
--- a/OnDiskPt/queryOnDiskPt.cpp
+++ b/OnDiskPt/queryOnDiskPt.cpp
@@ -22,7 +22,7 @@ int main(int argc, char **argv)
 {
   int tableLimit = 20;
   std::string ttable = "";
-  bool useAlignments = false;
+  // bool useAlignments = false;
 
   for(int i = 1; i < argc; i++) {
     if(!strcmp(argv[i], "-tlimit")) {
diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 1ff11f0ae..f14111f33 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -4,6 +4,7 @@
 #include <algorithm>
 
 
+#include "moses/Util.h"
 #include "moses/ChartManager.h"
 #include "moses/Hypothesis.h"
 #include "moses/Manager.h"
@@ -59,7 +60,7 @@ public:
     if(add2ORLM_) {
       //updateORLM();
     }
-    cerr << "Done inserting\n";
+    XVERBOSE(1,"Done inserting\n");
     //PhraseDictionary* pdsa = (PhraseDictionary*) pdf->GetDictionary(*dummy);
     map<string, xmlrpc_c::value> retData;
     //*retvalP = xmlrpc_c::value_struct(retData);
@@ -120,17 +121,17 @@ public:
     if(si == params.end())
       throw xmlrpc_c::fault("Missing source sentence", xmlrpc_c::fault::CODE_PARSE);
     source_ = xmlrpc_c::value_string(si->second);
-    cerr << "source = " << source_ << endl;
+    XVERBOSE(1,"source = " << source_ << endl);
     si = params.find("target");
     if(si == params.end())
       throw xmlrpc_c::fault("Missing target sentence", xmlrpc_c::fault::CODE_PARSE);
     target_ = xmlrpc_c::value_string(si->second);
-    cerr << "target = " << target_ << endl;
+    XVERBOSE(1,"target = " << target_ << endl);
     si = params.find("alignment");
     if(si == params.end())
       throw xmlrpc_c::fault("Missing alignment", xmlrpc_c::fault::CODE_PARSE);
     alignment_ = xmlrpc_c::value_string(si->second);
-    cerr << "alignment = " << alignment_ << endl;
+    XVERBOSE(1,"alignment = " << alignment_ << endl);
     si = params.find("bounded");
     bounded_ = (si != params.end());
     si = params.find("updateORLM");
@@ -224,7 +225,7 @@ public:
     }
     const string source((xmlrpc_c::value_string(si->second)));
 
-    cerr << "Input: " << source << endl;
+    XVERBOSE(1,"Input: " << source << endl);
     si = params.find("align");
     bool addAlignInfo = (si != params.end());
     si = params.find("word-align");
@@ -287,13 +288,13 @@ public:
         }
     } else {
         Sentence sentence;
-        const vector<FactorType> &inputFactorOrder =
-          staticData.GetInputFactorOrder();
+        const vector<FactorType> &
+	  inputFactorOrder = staticData.GetInputFactorOrder();
         stringstream in(source + "\n");
         sentence.Read(in,inputFactorOrder);
 	size_t lineNumber = 0; // TODO: Include sentence request number here?
         Manager manager(lineNumber, sentence, staticData.GetSearchAlgorithm());
-        manager.ProcessSentence();
+	manager.ProcessSentence();
         const Hypothesis* hypo = manager.GetBestHypothesis();
 
         vector<xmlrpc_c::value> alignInfo;
@@ -331,7 +332,7 @@ public:
     pair<string, xmlrpc_c::value>
     text("text", xmlrpc_c::value_string(out.str()));
     retData.insert(text);
-    cerr << "Output: " << out.str() << endl;
+    XVERBOSE(1,"Output: " << out.str() << endl);
     *retvalP = xmlrpc_c::value_struct(retData);
   }
 
@@ -574,7 +575,7 @@ int main(int argc, char** argv)
 {
 
   //Extract port and log, send other args to moses
-  char** mosesargv = new char*[argc+2];
+  char** mosesargv = new char*[argc+2]; // why "+2" [UG]
   int mosesargc = 0;
   int port = 8080;
   const char* logfile = "/dev/null";
@@ -634,11 +635,11 @@ int main(int argc, char** argv)
   myRegistry.addMethod("updater", updater);
   myRegistry.addMethod("optimize", optimizer);
 
-   xmlrpc_c::serverAbyss myAbyssServer(
-					myRegistry,
-					port,              // TCP port on which to listen
-					logfile
-					);
+  xmlrpc_c::serverAbyss myAbyssServer(
+				      myRegistry,
+				      port,              // TCP port on which to listen
+				      logfile
+				      );
   /* doesn't work with xmlrpc-c v. 1.16.33 - ie very old lib on Ubuntu 12.04
   xmlrpc_c::serverAbyss myAbyssServer(
     xmlrpc_c::serverAbyss::constrOpt()
@@ -648,12 +649,10 @@ int main(int argc, char** argv)
     .allowOrigin("*")
   );
   */
-
-  cerr << "Listening on port " << port << endl;
+  
+  XVERBOSE(1,"Listening on port " << port << endl);
   if (isSerial) {
-    while(1) {
-      myAbyssServer.runOnce();
-    }
+    while(1) myAbyssServer.runOnce();
   } else {
     myAbyssServer.run();
   }
diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index bddc10911..d257cd26c 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -3,4 +3,11 @@ alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z
 exe moses : Main.cpp deps ;
 exe lmbrgrid : LatticeMBRGrid.cpp deps ;
 
-alias programs : moses lmbrgrid ;
+exe simulate-pe : 
+simulate-pe.cc 
+$(TOP)/moses/TranslationModel/UG/generic//generic 
+$(TOP)//boost_program_options 
+deps 
+;
+
+alias programs : moses lmbrgrid simulate-pe ;
diff --git a/moses/BitmapContainer.cpp b/moses/BitmapContainer.cpp
index 981b04895..ee2d55fc8 100644
--- a/moses/BitmapContainer.cpp
+++ b/moses/BitmapContainer.cpp
@@ -161,13 +161,17 @@ BackwardsEdge::BackwardsEdge(const BitmapContainer &prevBitmapContainer
   }
 
   if (m_translations.size() > 1) {
-	UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
-			"Non-monotonic future score");
+    UTIL_THROW_IF2(m_translations.Get(0)->GetFutureScore() < m_translations.Get(1)->GetFutureScore(),
+		   "Non-monotonic future score: " 
+		   << m_translations.Get(0)->GetFutureScore() << " vs. " 
+		   << m_translations.Get(1)->GetFutureScore());
   }
 
   if (m_hypotheses.size() > 1) {
     UTIL_THROW_IF2(m_hypotheses[0]->GetTotalScore() < m_hypotheses[1]->GetTotalScore(),
-			  "Non-monotonic total score");
+		   "Non-monotonic total score: " // ": " keeps the first score apart from the text
+		   << m_hypotheses[0]->GetTotalScore() << " vs. "
+		   << m_hypotheses[1]->GetTotalScore());
   }
 
   HypothesisScoreOrdererWithDistortion orderer (&transOptRange);
@@ -442,7 +446,9 @@ BitmapContainer::ProcessBestHypothesis()
   if (!Empty()) {
     HypothesisQueueItem *check = Dequeue(true);
     UTIL_THROW_IF2(item->GetHypothesis()->GetTotalScore() < check->GetHypothesis()->GetTotalScore(),
-    		"Non-monotonic total score");
+		   "Non-monotonic total score: "
+		   << item->GetHypothesis()->GetTotalScore() << " vs. "
+		   << check->GetHypothesis()->GetTotalScore());
   }
 
   // Logging for the criminally insane
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 6bc82378e..196f4d997 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -105,7 +105,9 @@ void Manager::ProcessSentence()
   // some reporting on how long this took
   IFVERBOSE(1) {
     GetSentenceStats().StopTimeCollectOpts();
-    TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took " << GetSentenceStats().GetTimeCollectOpts() << " seconds" << endl);
+    TRACE_ERR("Line "<< m_lineNumber << ": Collecting options took " 
+	      << GetSentenceStats().GetTimeCollectOpts() << " seconds at " 
+	      << __FILE__ << ":" << __LINE__ << endl);
   }
 
   // search for best translation with the specified algorithm
diff --git a/moses/TranslationModel/UG/Jamfile b/moses/TranslationModel/UG/Jamfile
index ecd175a65..c36d4a072 100644
--- a/moses/TranslationModel/UG/Jamfile
+++ b/moses/TranslationModel/UG/Jamfile
@@ -20,6 +20,39 @@ $(TOP)/moses/TranslationModel/UG//mmsapt
 $(TOP)/util//kenutil 
 ; 
 
+exe sim-pe : 
+sim-pe.cc 
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic 
+$(TOP)//boost_iostreams 
+$(TOP)//boost_program_options 
+$(TOP)/moses/TranslationModel/UG/mm//mm 
+$(TOP)/moses/TranslationModel/UG//mmsapt 
+$(TOP)/util//kenutil 
+; 
+
+exe spe-check-coverage : 
+spe-check-coverage.cc 
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic 
+$(TOP)//boost_iostreams 
+$(TOP)//boost_program_options 
+$(TOP)/moses/TranslationModel/UG/mm//mm 
+$(TOP)/moses/TranslationModel/UG//mmsapt 
+$(TOP)/util//kenutil 
+; 
+
+exe spe-check-coverage2 : 
+spe-check-coverage2.cc 
+$(TOP)/moses//moses
+$(TOP)/moses/TranslationModel/UG/generic//generic 
+$(TOP)//boost_iostreams 
+$(TOP)//boost_program_options 
+$(TOP)/moses/TranslationModel/UG/mm//mm 
+$(TOP)/moses/TranslationModel/UG//mmsapt 
+$(TOP)/util//kenutil 
+; 
+
 install $(PREFIX)/bin : try-align ; 
 
-fakelib mmsapt : [ glob *.cpp mmsapt*.cc ] ;
+fakelib mmsapt : [ glob *.cpp mmsapt*.cc sapt*.cc ] ;
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
new file mode 100644
index 000000000..7dc2cd18f
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.cc
@@ -0,0 +1,50 @@
+//-*- c++ -*-
+#include "ug_splice_arglist.h"
+#include "moses/Util.h"
+#include "util/exception.hh"
+#include <boost/foreach.hpp>
+
+namespace Moses {
+  
+  // Split argv_in: options named in /filter/ (each followed by o.second
+  // values) go to *argv_other, everything else goes to *argv_moses. Arrays
+  // and strings are allocated with new[] here and are not freed here.
+  void 
+  filter_arguments(int const argc_in, char const* const* const argv_in,
+		   int & argc_moses, char*** argv_moses,  
+		   int & argc_other, char*** argv_other,
+		   vector<pair<string,int> > const& filter)
+  {
+    *argv_moses = new char*[argc_in];
+    *argv_other = new char*[argc_in]; 
+    (*argv_moses)[0] = new char[strlen(argv_in[0])+1];
+    strcpy((*argv_moses)[0], argv_in[0]);
+    argc_moses = 1;
+    argc_other = 0;
+    typedef pair<string,int> option;
+    int i = 1;
+    while (i < argc_in)
+      {
+        // Test each argument against the whole filter, so that repeated
+        // options and options given in any order are all recognized.
+        option const* hit = NULL;
+        BOOST_FOREACH(option const& o, filter)
+          if (o.first == argv_in[i]) { hit = &o; break; }
+        char*** dest = hit ? argv_other : argv_moses;
+        int&    used = hit ? argc_other : argc_moses;
+        (*dest)[used] = new char[strlen(argv_in[i])+1];
+        strcpy((*dest)[used++], argv_in[i++]);
+        for (int k = 0; hit && k < hit->second; ++k, ++i)
+          { // copy the option's values; a missing value is an error
+            UTIL_THROW_IF2(i >= argc_in || argv_in[i][0] == '-', 
+                           "[" << HERE << "] Missing argument for "
+                           << "parameter " << hit->first << "!");
+            (*argv_other)[argc_other] = new char[strlen(argv_in[i])+1];
+            strcpy((*argv_other)[argc_other++],argv_in[i]);
+          }
+      }
+  }
+  
+} // namespace Moses
+
+
diff --git a/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
new file mode 100644
index 000000000..e56585e8a
--- /dev/null
+++ b/moses/TranslationModel/UG/generic/program_options/ug_splice_arglist.h
@@ -0,0 +1,18 @@
+//-*- c++ -*-
+#pragma once
+#include <vector>
+#include <string>
+namespace Moses {
+  using namespace std;
+
+  // Splits an argument list in two (e.g. before handing it over to the
+  // Moses LoadParam() function). /filter/ lists the argument names to be
+  // moved to argv_other, each with the number of values that follow it.
+  void 
+  filter_arguments(int const argc_in, char const* const* const argv_in,
+		   int & argc_moses, char*** argv_moses,  
+		   int & argc_other, char*** argv_other,
+		   vector<pair<string,int> > const& filter);
+
+
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/Jamfile b/moses/TranslationModel/UG/mm/Jamfile
index 2cc923581..8d8af050a 100644
--- a/moses/TranslationModel/UG/mm/Jamfile
+++ b/moses/TranslationModel/UG/mm/Jamfile
@@ -72,15 +72,15 @@ $(TOP)/moses/TranslationModel/UG/mm//mm
 $(TOP)/util//kenutil 
 ; 
 
-exe custom-pt : 
-custom-pt.cc 
-$(TOP)/moses//moses
-$(TOP)//boost_iostreams 
-$(TOP)//boost_program_options 
-$(TOP)/moses/TranslationModel/UG/mm//mm 
-$(TOP)/moses/TranslationModel/UG/generic//generic 
-$(TOP)/util//kenutil 
-; 
+# exe custom-pt : 
+# custom-pt.cc 
+# $(TOP)/moses//moses
+# $(TOP)//boost_iostreams 
+# $(TOP)//boost_program_options 
+# $(TOP)/moses/TranslationModel/UG/mm//mm 
+# $(TOP)/moses/TranslationModel/UG/generic//generic 
+# $(TOP)/util//kenutil 
+# ; 
 
 
 exe calc-coverage : 
@@ -98,7 +98,6 @@ mtt-dump
 mtt-count-words 
 symal2mam 
 mam2symal 
-custom-pt 
 mmlex-build 
 mmlex-lookup
 mam_verify 
diff --git a/moses/TranslationModel/UG/mm/custom-pt.cc b/moses/TranslationModel/UG/mm/custom-pt.cc
index 1c1e0893c..e52772b48 100644
--- a/moses/TranslationModel/UG/mm/custom-pt.cc
+++ b/moses/TranslationModel/UG/mm/custom-pt.cc
@@ -1,6 +1,6 @@
 // build a phrase table for the given input
 // #include "ug_lexical_phrase_scorer2.h"
-
+#if 0
 #include <stdint.h>
 #include <string>
 #include <vector>
@@ -25,7 +25,7 @@
 #include "ug_bitext.h"
 #include "../mmsapt_phrase_scorers.h"
 #include "ug_lexical_phrase_scorer2.h"
-
+#include "../sapt_phrase_scorers.h"
 using namespace std;
 using namespace ugdiss;
 using namespace Moses;
@@ -110,6 +110,7 @@ int main(int argc, char* argv[])
 {
   // assert(argc == 4);
 #if 0
+#if 0
   string base = argv[1];
   string L1   = argv[2];
   string L2   = argv[3];
@@ -182,7 +183,7 @@ int main(int argc, char* argv[])
       	    }
       	}
     }
-  
+#endif  
     exit(0);
 }
-
+#endif
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.cc b/moses/TranslationModel/UG/mm/ug_bitext.cc
index 8dbbdcb92..a1a6dff7b 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.cc
+++ b/moses/TranslationModel/UG/mm/ug_bitext.cc
@@ -158,99 +158,25 @@ namespace Moses
     jstats::
     invalidate()
     {
-      my_rcnt = 0;
+      if (my_wcnt > 0) 
+	my_wcnt *= -1;
     }
 
-    bool
+    void 
     jstats::
-    valid()
-    {
-      return my_rcnt != 0;
-    }
-
-    bool
-    PhrasePair::
-    operator<=(PhrasePair const& other) const
+    validate()
     {
-      return this->score <= other.score;
+      if (my_wcnt < 0) 
+	my_wcnt *= -1;
     }
 
     bool
-    PhrasePair::
-    operator>=(PhrasePair const& other) const
-    {
-      return this->score >= other.score;
-    }
-
-    bool
-    PhrasePair::
-    operator<(PhrasePair const& other) const
-    {
-      return this->score < other.score;
-    }
-    
-    bool
-    PhrasePair::
-    operator>(PhrasePair const& other) const
-    {
-      return this->score > other.score;
-    }
-    
-    PhrasePair::
-    PhrasePair() {}
-
-    PhrasePair::
-    PhrasePair(PhrasePair const& o) 
-      : p1(o.p1), 
-	p2(o.p2),
-	raw1(o.raw1), 
-	raw2(o.raw2), 
-	sample1(o.sample1),
-	sample2(o.sample2),
-	good1(o.good1),
-	good2(o.good2),
-	joint(o.joint),
-	fvals(o.fvals),
-	aln(o.aln),
-	score(o.score)
-    {
-      for (size_t i = 0; i <= po_other; ++i)
-	{
-	  dfwd[i] = o.dfwd[i];
-	  dbwd[i] = o.dbwd[i];
-	}
-    }
-    
-    void
-    PhrasePair::
-    init(uint64_t const pid1, pstats const& ps, size_t const numfeats)
+    jstats::
+    valid()
     {
-      p1      = pid1;
-      p2      = 0;
-      raw1    = ps.raw_cnt;
-      sample1 = ps.sample_cnt;
-      sample2 = 0;
-      good1   = ps.good;
-      good2   = 0;
-      raw2    = 0;
-      fvals.resize(numfeats);
+      return my_wcnt >= 0;
     }
 
-    void
-    PhrasePair::
-    init(uint64_t const pid1, 
-	 pstats const& ps1, 
-	 pstats const& ps2, 
-	 size_t const numfeats)
-    {
-      p1      = pid1;
-      raw1    = ps1.raw_cnt    + ps2.raw_cnt;
-      sample1 = ps1.sample_cnt + ps2.sample_cnt;
-      sample2 = 0;
-      good1   = ps1.good       + ps2.good;
-      good2   = 0;
-      fvals.resize(numfeats);
-    }
     
     float 
     lbop(size_t const tries, size_t const succ, float const confidence)
@@ -261,85 +187,6 @@ namespace Moses
 		 find_lower_bound_on_p(tries, succ, confidence)));
     }
     
-    PhrasePair const&
-    PhrasePair::
-    update(uint64_t const pid2, jstats const& js)   
-    {
-      p2    = pid2;
-      raw2  = js.cnt2();
-      joint = js.rcnt();
-      assert(js.aln().size());
-      if (js.aln().size()) 
-	aln = js.aln()[0].second;
-      float total_fwd = 0, total_bwd = 0;
-      for (int i = po_first; i <= po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  total_fwd += js.dcnt_fwd(po)+1;
-	  total_bwd += js.dcnt_bwd(po)+1;
-	}
-      for (int i = po_first; i <= po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  dfwd[i] = float(js.dcnt_fwd(po)+1)/total_fwd;
-	  dbwd[i] = float(js.dcnt_bwd(po)+1)/total_bwd;
-	}
-      return *this;
-    }
-
-    PhrasePair const&
-    PhrasePair::
-    update(uint64_t const pid2, jstats const& js1, jstats const& js2)   
-    {
-      p2    = pid2;
-      raw2  = js1.cnt2() + js2.cnt2();
-      joint = js1.rcnt() + js2.rcnt();
-      assert(js1.aln().size() || js2.aln().size());
-      if (js1.aln().size()) 
-	aln = js1.aln()[0].second;
-      else if (js2.aln().size()) 
-	aln = js2.aln()[0].second;
-      for (int i = po_first; i < po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
-	  dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
-	}
-      return *this;
-    }
-
-    PhrasePair const&
-    PhrasePair::
-    update(uint64_t const pid2, 
-	   size_t   const raw2extra,
-	   jstats   const& js)   
-    {
-      p2    = pid2;
-      raw2  = js.cnt2() + raw2extra;
-      joint = js.rcnt();
-      assert(js.aln().size());
-      if (js.aln().size()) 
-	aln = js.aln()[0].second;
-      for (int i = po_first; i <= po_other; i++)
-	{
-	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
-	  dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
-	  dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
-	}
-      return *this;
-    }
-
-    float
-    PhrasePair::
-    eval(vector<float> const& w)
-    {
-      assert(w.size() == this->fvals.size());
-      this->score = 0;
-      for (size_t i = 0; i < w.size(); ++i)
-	this->score += w[i] * this->fvals[i];
-      return this->score;
-    }
-  
     template<>
     sptr<imBitext<L2R_Token<SimpleWordId> > > 
     imBitext<L2R_Token<SimpleWordId> >::
@@ -371,7 +218,8 @@ namespace Moses
 	  uint32_t row,col; char c;
 	  while (ibuf >> row >> c >> col)
 	    {
-	      assert(c == '-');
+	      UTIL_THROW_IF2(c != '-', "[" << HERE << "] "
+			     << "Error in alignment information:\n" << a);
 	      binwrite(obuf,row);
 	      binwrite(obuf,col);
 	    }
@@ -639,7 +487,6 @@ namespace Moses
       cout  << string(90,'-') << endl;
     }
 
-
     PhraseOrientation 
     find_po_fwd(vector<vector<ushort> >& a1,
 		vector<vector<ushort> >& a2,
@@ -654,13 +501,13 @@ namespace Moses
       
       ushort ns1,ne1,ne2;
       if (!expand_phrase_pair(a1,a2,n2,b1,e1,ns1,ne1,ne2))
-	{
-	  return po_other;
-	}
+	return po_other;
+
       if (ns1 >= e1)
 	{
 	  for (ushort j = e1; j < ns1; ++j)
-	    if (a1[j].size()) return po_jfwd;
+	    if (a1[j].size()) 
+	      return po_jfwd;
 	  return po_mono;
 	}
       else
diff --git a/moses/TranslationModel/UG/mm/ug_bitext.h b/moses/TranslationModel/UG/mm/ug_bitext.h
index 397253973..4cb34c02d 100644
--- a/moses/TranslationModel/UG/mm/ug_bitext.h
+++ b/moses/TranslationModel/UG/mm/ug_bitext.h
@@ -56,6 +56,7 @@ namespace Moses {
   class Mmsapt;
   namespace bitext
   {
+    template<typename TKN> class Bitext;
     using namespace ugdiss;
 
     template<typename TKN> class Bitext;
@@ -120,6 +121,7 @@ namespace Moses {
       void add(float w, vector<uchar> const& a, uint32_t const cnt2,
 	       uint32_t fwd_orient, uint32_t bwd_orient);
       void invalidate();
+      void validate();
       bool valid();
       uint32_t dcnt_fwd(PhraseOrientation const idx) const;
       uint32_t dcnt_bwd(PhraseOrientation const idx) const;
@@ -157,43 +159,6 @@ namespace Moses {
 	  uint32_t fwd_o, uint32_t bwd_o);
     };
     
-    class 
-    PhrasePair
-    {
-    public:
-      uint64_t p1, p2;
-      uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
-      vector<float> fvals;
-      float dfwd[po_other+1];
-      float dbwd[po_other+1];
-      vector<uchar> aln;
-      // float    avlex12,avlex21; // average lexical probs (Moses std)
-      // float    znlex1,znlex2;   // zens-ney lexical smoothing
-      // float    colex1,colex2;   // based on raw lexical occurrences
-      float score;
-      PhrasePair();
-      PhrasePair(PhrasePair const& o);
-      bool operator<(PhrasePair const& other) const;
-      bool operator>(PhrasePair const& other) const;
-      bool operator<=(PhrasePair const& other) const;
-      bool operator>=(PhrasePair const& other) const;
-
-      void init(uint64_t const pid1, pstats const& ps,  size_t const numfeats);
-      void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, 
-		size_t const numfeats);
-
-      PhrasePair const& 
-      update(uint64_t const pid2, jstats const& js);
-
-      PhrasePair const& 
-      update(uint64_t const pid2, jstats   const& js1, jstats   const& js2);
-
-      PhrasePair const& 
-      update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
-
-      float eval(vector<float> const& w);
-    };
-
 
     template<typename TKN>
     class Bitext 
diff --git a/moses/TranslationModel/UG/mm/ug_im_ttrack.h b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
index 05066c922..0c6e4afbf 100644
--- a/moses/TranslationModel/UG/mm/ug_im_ttrack.h
+++ b/moses/TranslationModel/UG/mm/ug_im_ttrack.h
@@ -16,6 +16,9 @@
 #include "tpt_tokenindex.h"
 #include "ug_ttrack_base.h"
 #include "tpt_tokenindex.h"
+#include "util/exception.hh"
+#include "moses/Util.h"
+
 // #include "ug_vocab.h"
 
 // define the corpus buffer size (in sentences) and the
@@ -49,6 +52,8 @@ namespace ugdiss
     typename boost::shared_ptr<imTtrack<Token> > 
     append<Token>(typename boost::shared_ptr<imTtrack<Token> > const & crp, vector<Token> const & snt);
 
+    void m_check_token_count(); // debugging function
+
   public:
 
     imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d);
@@ -70,6 +75,22 @@ namespace ugdiss
   };
 
   template<typename Token>
+  void
+  imTtrack<Token>::
+  m_check_token_count()
+  { // sanity check: the cached numToks must equal the summed sentence lengths
+    size_t check = 0; // recount, obtained by walking every sentence in *myData
+    BOOST_FOREACH(vector<Token> const& s, *myData)
+      check += s.size();
+    UTIL_THROW_IF2(check != this->numToks, "[" << HERE << "]" 
+		   << " Wrong token count after appending sentence!"
+		   << " Counted " << check << " but expected " 
+		   << this->numToks << " in a total of " << myData->size() 
+		   << " sentences.");
+    
+  }
+
+  template<typename Token>
   Token const* 
   imTtrack<Token>::
   sntStart(size_t sid) const // return pointer to beginning of sentence
@@ -111,9 +132,9 @@ namespace ugdiss
   template<typename Token>
   imTtrack<Token>::
   imTtrack(istream& in, TokenIndex const& V, ostream* log = NULL)
+    : numToks(0)
   {
     myData.reset(new vector<vector<Token> >());
-    numToks = 0;
     string line,w;
     size_t linectr=0;
     boost::unordered_map<string,id_type> H;
@@ -135,6 +156,7 @@ namespace ugdiss
   template<typename Token>
   imTtrack<Token>::
   imTtrack(size_t reserve)
+    : numToks(0)
   {
     myData.reset(new vector<vector<Token> >());
     if (reserve) myData->reserve(reserve);
@@ -143,9 +165,9 @@ namespace ugdiss
   template<typename Token>
   imTtrack<Token>::
   imTtrack(boost::shared_ptr<vector<vector<Token> > > const& d)
+    : numToks(0)
   {
     myData  = d;
-    numToks = 0;
     BOOST_FOREACH(vector<Token> const& v, *d)
       numToks += v.size();
   }
@@ -171,6 +193,9 @@ namespace ugdiss
   shared_ptr<imTtrack<TOKEN> > 
   append(shared_ptr<imTtrack<TOKEN> > const& crp, vector<TOKEN> const & snt)
   {
+#if 1
+    if (crp) crp->m_check_token_count();
+#endif
     shared_ptr<imTtrack<TOKEN> > ret;
     if (crp == NULL)
       {
@@ -185,6 +210,11 @@ namespace ugdiss
       }
     else ret = crp;
     ret->myData->push_back(snt);
+    ret->numToks += snt.size();
+
+#if 1
+    ret->m_check_token_count();
+#endif
     return ret;
   }
 
diff --git a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
index 558b5a7fa..b7e359223 100644
--- a/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
+++ b/moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h
@@ -27,7 +27,6 @@ namespace ugdiss
     typedef mm2dTable<id_type,id_type,uint32_t,uint32_t> table_t;
     table_t COOC;
     void open(string const& fname);
-
     template<typename someint>
     void 
     score(TKN const* snt1, size_t const s1, size_t const e1,
@@ -104,7 +103,19 @@ namespace ugdiss
     if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
     UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
 		   << ": alpha parameter must be >= 0");
-    return float(COOC[s][t]+alpha)/(COOC.m1(s)+alpha);
+    float ret = COOC[s][t]+alpha;
+    ret =  (ret?ret:1.)/(COOC.m1(s)+alpha);
+    UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ 
+		   << ": result not > 0 and <= 1. alpha = " << alpha << "; "
+		   << COOC[s][t] << "/" << COOC.m1(s));
+
+#if 0
+    cerr << "[" << s << "," << t << "] " 
+	 << COOC.m1(s) << "/" 
+	 << COOC[s][t] << "/" 
+	 << COOC.m2(t) << endl;
+#endif
+    return ret;
   }
   
   template<typename TKN>
@@ -115,7 +126,11 @@ namespace ugdiss
     if (COOC.m1(s) == 0 || COOC.m2(t) == 0) return 1.0;
     UTIL_THROW_IF2(alpha < 0,"At " << __FILE__ << ":" << __LINE__
 		   << ": alpha parameter must be >= 0");
-    return float(COOC[s][t]+alpha)/(COOC.m2(t)+alpha);
+    float ret = float(COOC[s][t]+alpha);
+    ret = (ret?ret:1.)/(COOC.m2(t)+alpha);
+    UTIL_THROW_IF2(ret <= 0 || ret > 1, "At " << __FILE__ << ":" << __LINE__ 
+		   << ": result not > 0 and <= 1.");
+    return ret;
   }
   
   template<typename TKN>
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.cc b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
new file mode 100644
index 000000000..6373f8468
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.cc
@@ -0,0 +1,97 @@
+#include "ug_phrasepair.h"
+namespace Moses {
+  namespace bitext
+  {
+
+#if 0
+    void 
+    PhrasePair::
+    init()
+    {
+      p1 = p2 = raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+    }
+
+    void
+    PhrasePair::
+    init(uint64_t const pid1, 
+	 pstats const& ps1, 
+	 pstats const& ps2, 
+	 size_t const numfeats)
+    {
+      p1      = pid1;
+      raw1    = ps1.raw_cnt    + ps2.raw_cnt;
+      sample1 = ps1.sample_cnt + ps2.sample_cnt;
+      sample2 = 0;
+      good1   = ps1.good       + ps2.good;
+      good2   = 0;
+      joint   = 0;
+      fvals.resize(numfeats);
+    }
+
+    PhrasePair const&
+    PhrasePair::
+    update(uint64_t const pid2, jstats const& js1, jstats const& js2)   // fill target-side fields from two jstats
+    {
+      p2    = pid2;
+      raw2  = js1.cnt2() + js2.cnt2(); // counts of both jstats are summed
+      joint = js1.rcnt() + js2.rcnt();
+      assert(js1.aln().size() || js2.aln().size());
+      if (js1.aln().size()) // use js1's first alignment if present, else js2's
+	aln = js1.aln()[0].second;
+      else if (js2.aln().size()) 
+	aln = js2.aln()[0].second;
+      for (int i = po_first; i < po_other; i++) // NOTE(review): '<' leaves dfwd/dbwd[po_other] unset; the 3-argument update() uses '<=' - confirm
+	{
+	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
+	  dfwd[i] = float(js1.dcnt_fwd(po) + js2.dcnt_fwd(po) + 1)/(sample1+po_other);
+	  dbwd[i] = float(js1.dcnt_bwd(po) + js2.dcnt_bwd(po) + 1)/(sample1+po_other);
+	}
+      return *this;
+    }
+
+    PhrasePair const&
+    PhrasePair::
+    update(uint64_t const pid2, size_t r2)
+    {
+      p2    = pid2;
+      raw2  = r2;
+      joint = 0;
+      return *this;
+    } 
+
+
+    PhrasePair const&
+    PhrasePair::
+    update(uint64_t const pid2, 
+	   size_t   const raw2extra,
+	   jstats   const& js)   
+    {
+      p2    = pid2;
+      raw2  = js.cnt2() + raw2extra;
+      joint = js.rcnt();
+      assert(js.aln().size());
+      if (js.aln().size()) 
+	aln = js.aln()[0].second;
+      for (int i = po_first; i <= po_other; i++)
+	{
+	  PhraseOrientation po = static_cast<PhraseOrientation>(i);
+	  dfwd[i] = float(js.dcnt_fwd(po)+1)/(sample1+po_other);
+	  dbwd[i] = float(js.dcnt_bwd(po)+1)/(sample1+po_other);
+	}
+      return *this;
+    }
+
+    float
+    PhrasePair::
+    eval(vector<float> const& w)
+    {
+      assert(w.size() == this->fvals.size());
+      this->score = 0;
+      for (size_t i = 0; i < w.size(); ++i)
+	this->score += w[i] * this->fvals[i];
+      return this->score;
+    }
+#endif
+  } // namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/mm/ug_phrasepair.h b/moses/TranslationModel/UG/mm/ug_phrasepair.h
new file mode 100644
index 000000000..8cd43dc18
--- /dev/null
+++ b/moses/TranslationModel/UG/mm/ug_phrasepair.h
@@ -0,0 +1,243 @@
+//-*- c++ -*-
+#pragma once
+#include "ug_bitext.h"
+
+using namespace ugdiss;
+using namespace std;
+
+namespace Moses {
+  namespace bitext
+  {
+
+    template<typename Token>
+    string 
+    toString(TokenIndex const& V, Token const* x, size_t const len)
+    { // the len tokens chained from x, looked up in V and joined by single spaces
+      if (len == 0) return "";
+      UTIL_THROW_IF2(!x, HERE << ": Unexpected end of phrase!");
+      ostringstream out; 
+      out << V[x->id()];
+      size_t done = 1; // number of tokens written so far
+      for (x = x->next(); x && done < len; x = x->next(), ++done)
+	out << " " << V[x->id()];
+      UTIL_THROW_IF2(done != len, HERE << ": Unexpected end of phrase!");
+      return out.str();
+    }
+
+    template<typename Token>
+    class 
+    PhrasePair
+    {
+    public:
+      Token const* start1; // first token of the side-1 phrase (set by init)
+      Token const* start2; // first token of the side-2 phrase (set by update)
+      uint32_t len1;       // number of tokens in the side-1 phrase
+      uint32_t len2;       // number of tokens in the side-2 phrase
+      // uint64_t p1, p2;
+      uint32_t raw1,raw2,sample1,sample2,good1,good2,joint;
+      vector<float> fvals; // feature values; sized by init(x,len,ps,numfeats)
+      float dfwd[po_other+1]; // fwd. orientation scores; update() stores smoothed rel. freqs
+      float dbwd[po_other+1]; // bwd. orientation scores; update() stores smoothed rel. freqs
+      vector<uchar> aln;
+      float score; // key used by the relational operators below
+      PhrasePair() : dfwd(), dbwd(), score(0) { init(); } // zeroed, so copies never read garbage
+      PhrasePair(PhrasePair const& o);
+
+      PhrasePair const& operator+=(PhrasePair const& other); // pools the counts
+
+      bool operator<(PhrasePair const& other) const;
+      bool operator>(PhrasePair const& other) const;
+      bool operator<=(PhrasePair const& other) const; 
+      bool operator>=(PhrasePair const& other) const;
+
+      void init(); // zero all counts and detach both phrases
+      void init(Token const* x,   uint32_t const len,
+		pstats const* ps = NULL, size_t const numfeats=0);
+      
+      // void init(uint64_t const pid1, pstats const& ps,  size_t const numfeats);
+      // void init(uint64_t const pid1, pstats const& ps1, pstats const& ps2, 
+      // size_t const numfeats);
+
+      // PhrasePair const&
+      // update(uint64_t const pid2, size_t r2 = 0);
+
+      PhrasePair const& 
+      update(Token const* x, uint32_t const len, jstats const& js);
+      
+      // PhrasePair const& 
+      // update(uint64_t const pid2, jstats   const& js1, jstats   const& js2);
+
+      // PhrasePair const& 
+      // update(uint64_t const pid2, size_t const raw2extra, jstats const& js);
+
+      // float 
+      // eval(vector<float> const& w);
+
+      class SortByTargetIdSeq // orders phrase pairs by their side-2 token id sequence
+      {
+      public:
+	int cmp(PhrasePair const& a, PhrasePair const& b) const;
+	bool operator()(PhrasePair const& a, PhrasePair const& b) const;
+      };
+    };
+
+    // Start a fresh phrase pair for the side-1 phrase of len tokens at x.
+    // Side-1 counts are taken from *ps when ps is given and are zero otherwise;
+    // side-2 and joint counts are zeroed and fvals is resized to numfeats.
+    template<typename Token>
+    void
+    PhrasePair<Token>::
+    init(Token const* x, uint32_t const len, 
+	 pstats const* ps, size_t const numfeats)
+    {
+      start1 = x; 
+      len1   = len;
+      if (!ps) raw1 = sample1 = good1 = 0;
+      else
+	{
+	  raw1    = ps->raw_cnt;
+	  sample1 = ps->sample_cnt;
+	  good1   = ps->good;
+	}
+      // nothing is known about the other side yet
+      raw2 = sample2 = good2 = joint = 0;
+      fvals.resize(numfeats);
+    }
+
+    template<typename Token>
+    PhrasePair<Token> const&
+    PhrasePair<Token>::
+    update(Token const* x, uint32_t const len, jstats const& js)   
+    {
+      // attach the side-2 phrase and take its raw and joint counts from js
+      start2 = x; 
+      len2   = len;
+      joint  = js.rcnt(); raw2 = js.cnt2();
+      assert(js.aln().size());
+      if (js.aln().size()) aln = js.aln()[0].second;
+
+      // add-one smoothed totals over all orientation classes
+      float fwd_sum = 0, bwd_sum = 0;
+      for (int k = po_first; k <= po_other; ++k)
+	{
+	  PhraseOrientation const o = static_cast<PhraseOrientation>(k);
+	  fwd_sum += js.dcnt_fwd(o)+1;
+	  bwd_sum += js.dcnt_bwd(o)+1;
+	}
+
+      // normalize to relative frequencies (open question: keep raw counts instead?)
+      for (int k = po_first; k <= po_other; ++k)
+	{
+	  PhraseOrientation const o = static_cast<PhraseOrientation>(k);
+	  dfwd[k] = float(js.dcnt_fwd(o)+1)/fwd_sum;
+	  dbwd[k] = float(js.dcnt_bwd(o)+1)/bwd_sum;
+	}
+      return *this;
+    }
+
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator<(PhrasePair const& other) const 
+    { return score < other.score; } // all four operators order by score alone
+
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator>(PhrasePair const& other) const
+    { return other.score < score; }
+
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator<=(PhrasePair const& other) const 
+    { return score <= other.score; }
+
+    template<typename Token>
+    bool 
+    PhrasePair<Token>::
+    operator>=(PhrasePair const& other) const
+    { return other.score <= score; }
+
+    template<typename Token>
+    PhrasePair<Token> const&
+    PhrasePair<Token>::
+    operator+=(PhrasePair const& o) 
+    { // pool the counts of o into this pair; no other member is touched
+      joint   += o.joint;
+      raw1    += o.raw1;
+      sample1 += o.sample1;
+      good1   += o.good1;
+      raw2    += o.raw2;
+      sample2 += o.sample2;
+      good2   += o.good2;
+      return *this;
+    }
+
+    // Copy constructor: member-wise copy of o. The initializer list follows
+    // the declaration order of the members; the two orientation arrays are
+    // copied in the constructor body.
+    template<typename Token>
+    PhrasePair<Token>::
+    PhrasePair(PhrasePair<Token> const& o) 
+      : start1(o.start1)
+      , start2(o.start2)
+      , len1(o.len1), len2(o.len2)
+      , raw1(o.raw1), raw2(o.raw2)
+      , sample1(o.sample1), sample2(o.sample2)
+      , good1(o.good1), good2(o.good2)
+      , joint(o.joint)
+      , fvals(o.fvals)
+      , aln(o.aln)
+      , score(o.score)
+    {
+      // element-wise copy of dfwd and dbwd (po_other+1 entries each)
+      for (int k = po_other; k >= 0; --k)
+	{
+	  dfwd[k] = o.dfwd[k];
+	  dbwd[k] = o.dbwd[k];
+	}
+    }
+    
+    template<typename Token>
+    int
+    PhrasePair<Token>::
+    SortByTargetIdSeq::
+    cmp(PhrasePair const& a, PhrasePair const& b) const
+    {
+      // walk both side-2 phrases in lockstep until they differ or one ends
+      Token const* ta = a.start2;
+      Token const* tb = b.start2;
+      size_t const shared = a.len2 < b.len2 ? a.len2 : b.len2;
+      size_t pos = 0;
+      for (; pos < shared && ta->id() == tb->id(); ++pos)
+	{
+	  ta = ta->next();
+	  tb = tb->next();
+	}
+      if (pos < shared) return ta->id() < tb->id() ? -1 : 1; // first mismatch decides
+      if (a.len2 == b.len2) return 0;
+      return a.len2 < b.len2 ? -1 : 1; // a proper prefix sorts first
+    }
+    
+    template<typename Token>
+    bool
+    PhrasePair<Token>::
+    SortByTargetIdSeq::
+    operator()(PhrasePair const& a, PhrasePair const& b) const
+    { // a sorts before b exactly when cmp(a,b) is negative
+      return 0 > this->cmp(a,b);
+    }
+
+    template<typename Token>
+    void 
+    PhrasePair<Token>::
+    init()
+    { // detach both phrases and zero lengths and counts; other members are untouched
+      start1 = start2 = NULL;
+      len1 = len2 = 0;  raw1 = raw2 = sample1 = sample2 = good1 = good2 = joint = 0;
+    }
+
+
+  } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
index 14bf6cdad..ab7f96bf0 100644
--- a/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
+++ b/moses/TranslationModel/UG/mm/ug_tsa_tree_iterator.h
@@ -7,6 +7,8 @@
 #include "ug_typedefs.h"
 #include "tpt_tokenindex.h"
 #include <iostream>
+#include "util/exception.hh"
+#include "moses/Util.h"
 //#include <cassert>
 
 // #include "ug_bv_iter.h"
@@ -60,10 +62,15 @@ namespace ugdiss
 
     // TSA_tree_iterator(TSA_tree_iterator const& other);
     TSA_tree_iterator(TSA<Token> const* s);
+    TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other);
     TSA_tree_iterator(TSA<Token> const* r, id_type const* s, size_t const len);
     // TSA_tree_iterator(TSA<Token> const* s, Token const& t);
     TSA_tree_iterator(TSA<Token> const* s, 
 		      Token const* kstart, 
+		      size_t const len, 
+		      bool full_match_only=true);
+    TSA_tree_iterator(TSA<Token> const* s, 
+		      Token const* kstart, 
 		      Token const* kend, 
 		      bool full_match_only=true);
     // TSA_tree_iterator(TSA<Token> const* s, 
@@ -150,9 +157,12 @@ namespace ugdiss
     double approxOccurrenceCount(int p=-1) const
     {
       assert(root);
+      if (p < 0) p += lower.size(); // a negative p is taken relative to lower.size()
       double ret = arrayByteSpanSize(p)/root->aveIndexEntrySize();
-      assert(ret < root->corpus->numTokens());
       if (ret < 25) ret = rawCnt(p);
+      UTIL_THROW_IF2(ret > root->corpus->numTokens(), "[" << HERE << "] "
+		     << "Word count mismatch.");
+      assert(ret <= root->corpus->numTokens()); // same condition as the throwing check above
       return ret;
     }
 
@@ -320,6 +330,18 @@ namespace ugdiss
 
   template<typename Token>
   TSA_tree_iterator<Token>::
+  TSA_tree_iterator(TSA<Token> const* s, TSA_tree_iterator<Token> const& other)
+    : root(s) 
+  { // extend over s with the token ids of other, stopping where extend() fails
+    Token const* tok = other.getToken(0);
+    for (size_t k = 0; k < other.size() && this->extend(tok->id()); ++k)
+      tok = tok->next(); 
+  };
+
+
+
+  template<typename Token>
+  TSA_tree_iterator<Token>::
   TSA_tree_iterator
   (TSA<Token> const* r,
    id_type    const* s, 
@@ -385,6 +407,25 @@ namespace ugdiss
   template<typename Token>
   TSA_tree_iterator<Token>::
   TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, 
+		    size_t const len, bool full_match_only)
+    : root(s) 
+  {
+    if (!root) return;
+    // extend token by token; stop at the first token that cannot be matched
+    size_t matched = 0;
+    while (matched < len && kstart && extend(*kstart))
+      { kstart = kstart->next(); ++matched; }
+    if (matched == len || !full_match_only) return;
+    // only a partial match, but a full one was requested: drop it entirely
+    lower.clear();
+    upper.clear();
+  };
+
+  // DEPRECATED: DO NOT USE. Use the one that takes the length 
+  // instead of kend.
+  template<typename Token>
+  TSA_tree_iterator<Token>::
+  TSA_tree_iterator(TSA<Token> const* s, Token const* kstart, 
 		    Token const* kend, bool full_match_only)
     : root(s) 
   {
@@ -561,8 +602,7 @@ namespace ugdiss
   TSA_tree_iterator<Token>::
   rawCnt(int p) const
   {
-    if (p < 0)
-      p = lower.size()+p;
+    if (p < 0) p += lower.size();
     assert(p>=0);
     if (lower.size() == 0) return root->getCorpusSize();
     return root->rawCnt(lower[p],upper[p]);
diff --git a/moses/TranslationModel/UG/mmsapt.cpp b/moses/TranslationModel/UG/mmsapt.cpp
index dc9945472..596fec4e6 100644
--- a/moses/TranslationModel/UG/mmsapt.cpp
+++ b/moses/TranslationModel/UG/mmsapt.cpp
@@ -1,13 +1,38 @@
 #include "mmsapt.h"
 #include <boost/foreach.hpp>
+#include <boost/scoped_ptr.hpp>
 #include <boost/tokenizer.hpp>
 #include <algorithm>
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
+#include "util/exception.hh"
+#include <set>
 
 namespace Moses
 {
   using namespace bitext;
   using namespace std;
   using namespace boost;
+
+
+  // uint64_t 
+  // pack_phrasekey(uint64_t const shard_id, uint64_t const snt_id, 
+  // 		 uint64_t const offset, uint64_t const len)
+  // {
+  //   uint64_t one = 1;
+  //   //  8 bits - 256 shards
+  //   // 13 bits - max offset
+  //   // 11 bits - max len
+  //   // 32 bits - max sentence id
+  //   UTIL_THROW_IF2(shard_id >= 256, "[" << HERE << "] " 
+  // 		   << "Shard ID exceeds limit.");
+  //   UTIL_THROW_IF2(snt_id >= 4294967296, "[" << HERE << "] " 
+  // 		   << "Sentence ID exceeds limit.");
+  //   UTIL_THROW_IF2(offset >= 8192, "[" << HERE << "]" 
+  // 		   << "Phrase offset exceeds limit.");
+  //   UTIL_THROW_IF2(len >= 2048, "[" << HERE << "]" 
+  // 		   << "Phrase length exceeds limit.");
+  //   return ((shard_id<<56)+(snt_id<<24)+(offset<<11)+len);
+  // }
   
   void 
   fillIdSeq(Phrase const& mophrase, size_t const ifactor,
@@ -23,7 +48,7 @@ namespace Moses
     
 
   void 
-  parseLine(string const& line, map<string,string> & params)
+  parseLine(string const& line, map<string,string> & param)
   {
     char_separator<char> sep("; ");
     tokenizer<char_separator<char> > tokens(line,sep);
@@ -32,9 +57,14 @@ namespace Moses
 	size_t i = t.find_first_not_of(" =");
 	size_t j = t.find_first_of(" =",i+1);
 	size_t k = t.find_first_not_of(" =",j+1);
+	UTIL_THROW_IF2(i == string::npos || k == string::npos,
+		       "[" << HERE << "] "
+		       << "Parameter specification error near '"
+		       << t << "' in moses ini line\n"
+		      << line);
 	assert(i != string::npos);
 	assert(k != string::npos);
-	params[t.substr(i,j)] = t.substr(k);
+	param[t.substr(i,j)] = t.substr(k);
       }
   }
 
@@ -57,13 +87,13 @@ namespace Moses
   Mmsapt::
   Mmsapt(string const& line)
     : PhraseDictionary(line)
-    , m_lex_alpha(1.0)
-    , withLogCountFeatures(false)
-    , withCoherence(true)
-    , m_pfwd_features("g")
-    , m_pbwd_features("g")
-    , withPbwd(true)
-    , poolCounts(true)
+      // , m_lex_alpha(1.0)
+      // , withLogCountFeatures(false)
+      // , withCoherence(true)
+      // , m_pfwd_features("g")
+      // , m_pbwd_features("g")
+      // , withPbwd(true)
+      // , poolCounts(true)
     , ofactor(1,0)
     , m_tpc_ctr(0)
   {
@@ -94,81 +124,125 @@ namespace Moses
 
   void
   Mmsapt::
+  register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry)
+  { // ff's first feature gets the next free index; its metadata is appended in order
+    ff->setIndex(m_feature_names.size());
+    registry.push_back(ff);
+    for (int k = 0; k < ff->fcnt(); ++k)
+      {
+	m_is_integer.push_back(ff->isIntegerValued(k));
+	m_is_logval.push_back(ff->isLogVal(k));
+	m_feature_names.push_back(ff->fname(k));
+      }
+  }
+
+  bool 
+  Mmsapt::
+  isLogVal(int i) const { return m_is_logval.at(i); } // flag recorded by register_ff(); at() is range-checked
+
+  bool 
+  Mmsapt::
+  isInteger(int i) const { return m_is_integer.at(i); } // flag recorded by register_ff(); at() is range-checked
+
+  void
+  Mmsapt::
   init(string const& line)
   {
     map<string,string>::const_iterator m;
-    map<string,string> param;
-    parseLine(line,param);
+    parseLine(line,this->param); // parsed into the member map; Load() and check_ff() read it later
+
+    this->m_numScoreComponents = atoi(param["num-features"].c_str()); // read before any config file is processed
     
     m = param.find("config");
     if (m != param.end())
       read_config_file(m->second,param);
-    
-    bname = param["base"];
+
+    bname = param["base"]; 
     L1    = param["L1"];
     L2    = param["L2"];
-    assert(bname.size());
-    assert(L1.size());
-    assert(L2.size());
-
-    m = param.find("pfwd-denom");
-    m_pfwd_denom = m != param.end() ? m->second[0] : 's';
-    
-    m = param.find("smooth");
-    m_lbop_parameter = m != param.end() ? atof(m->second.c_str()) : .05;
 
-    m = param.find("max-samples");
-    m_default_sample_size = m != param.end() ? atoi(m->second.c_str()) : 1000;
+    UTIL_THROW_IF2(bname.size() == 0, "Missing corpus base name at " << HERE);
+    UTIL_THROW_IF2(L1.size() == 0, "Missing L1 tag at " << HERE);
+    UTIL_THROW_IF2(L2.size() == 0, "Missing L2 tag at " << HERE);
 
-    if ((m = param.find("logcnt-features")) != param.end())
-      withLogCountFeatures = m->second != "0";
-
-    if ((m = param.find("coh")) != param.end())
-      withCoherence = m->second != "0";
-    
-    if ((m = param.find("pfwd")) != param.end())
-      m_pfwd_features = (m->second == "0" ? "" : m->second);
-
-    if (m_pfwd_features == "1") // legacy; deprecated
-      m_pfwd_features[0] = m_pfwd_denom;
+    // set defaults for all parameters if not specified so far
+    pair<string,string> dflt("input-factor","0");
+    input_factor = atoi(param.insert(dflt).first->second.c_str()); // insert() keeps a value that is already present
+    // shouldn't that be a string?
     
-    if ((m = param.find("pbwd")) != param.end())
-      m_pbwd_features = (m->second == "0" ? "" : m->second);
+    dflt = pair<string,string> ("smooth",".01");
+    m_lbop_conf = atof(param.insert(dflt).first->second.c_str());
 
-    if (m_pbwd_features == "1") 
-      m_pbwd_features = "r"; // lecagy; deprecated
+    dflt = pair<string,string> ("lexalpha","0");
+    m_lex_alpha = atof(param.insert(dflt).first->second.c_str());
 
-    if ((m = param.find("lexalpha")) != param.end())
-      m_lex_alpha = atof(m->second.c_str());
+    dflt = pair<string,string> ("sample","1000");
+    m_default_sample_size = atoi(param.insert(dflt).first->second.c_str());
 
-    m = param.find("workers");
-    m_workers = m != param.end() ? atoi(m->second.c_str()) : 8;
+    dflt = pair<string,string>("workers","8");
+    m_workers = atoi(param.insert(dflt).first->second.c_str());
     m_workers = min(m_workers,24UL);
 
-    if ((m = param.find("limit")) != param.end()) 
-      m_tableLimit = atoi(m->second.c_str());
+    dflt = pair<string,string>("limit","20");
+    m_tableLimit = atoi(param.insert(dflt).first->second.c_str());
 
-    m = param.find("cache-size");
-    m_history.reserve(m != param.end()?max(1000,atoi(m->second.c_str())):10000);
+    dflt = pair<string,string>("cache","10000");
+    size_t hsize = max(1000,atoi(param.insert(dflt).first->second.c_str()));
+    m_history.reserve(hsize);
     // in plain language: cache size is at least 1000, and 10,000 by default
     // this cache keeps track of the most frequently used target phrase collections
     // even when not actively in use
-    
-    this->m_numScoreComponents = atoi(param["num-features"].c_str());
 
-    m = param.find("ifactor");
-    input_factor = m != param.end() ? atoi(m->second.c_str()) : 0;
+    // Feature functions are initialized  in function Load();
+    param.insert(pair<string,string>("pfwd",   "g"));  // defaults only: explicit settings win
+    param.insert(pair<string,string>("pbwd",   "g"));  
+    param.insert(pair<string,string>("logcnt", "0")); // "0" means the feature is off (see check_ff)
+    param.insert(pair<string,string>("coh",    "0")); 
+    param.insert(pair<string,string>("rare",   "1")); 
+    param.insert(pair<string,string>("prov",   "1")); 
     
     poolCounts = true;
     
     if ((m = param.find("extra")) != param.end()) 
       extra_data = m->second;
 
+    // check for unknown parameters (the list must stay sorted: binary_search is used below)
+    vector<string> known_parameters; known_parameters.reserve(50);
+    known_parameters.push_back("L1");
+    known_parameters.push_back("L2");
+    known_parameters.push_back("Mmsapt");
+    known_parameters.push_back("base");
+    known_parameters.push_back("cache");
+    known_parameters.push_back("coh");
+    known_parameters.push_back("config");
+    known_parameters.push_back("extra");
+    known_parameters.push_back("input-factor");
+    known_parameters.push_back("lexalpha");
+    known_parameters.push_back("limit");
+    known_parameters.push_back("logcnt");
+    known_parameters.push_back("name");
+    known_parameters.push_back("num-features");
+    known_parameters.push_back("output-factor");
+    known_parameters.push_back("pbwd");
+    known_parameters.push_back("pfwd");
+    known_parameters.push_back("prov");
+    known_parameters.push_back("rare");
+    known_parameters.push_back("sample");
+    known_parameters.push_back("smooth");
+    known_parameters.push_back("unal");
+    known_parameters.push_back("workers");
+    for (map<string,string>::iterator m = param.begin(); m != param.end(); ++m)
+      {
+	UTIL_THROW_IF2(!binary_search(known_parameters.begin(),
+				      known_parameters.end(), m->first),
+		       HERE << ": Unknown parameter specification for Mmsapt: " 
+		       << m->first);
+      }
   }
 
   void
   Mmsapt::
-  load_extra_data(string bname)
+  load_extra_data(string bname, bool locking = true)
   {
     // TO DO: ADD CHECKS FOR ROBUSTNESS
     // - file existence?
@@ -186,122 +260,120 @@ namespace Moses
     while(getline(in2,line)) text2.push_back(line);
     while(getline(ina,line)) symal.push_back(line);
 
-    lock_guard<mutex> guard(this->lock);
+    boost::scoped_ptr<lock_guard<mutex> > guard;
+    if (locking) guard.reset(new lock_guard<mutex>(this->lock));
     btdyn = btdyn->add(text1,text2,symal);
     assert(btdyn);
     // cerr << "Loaded " << btdyn->T1->size() << " sentence pairs" << endl;
   }
 
-  size_t
+  template<typename fftype>
+  void
   Mmsapt::
-  add_corpus_specific_features
-  (vector<sptr<pscorer > >& ffvec, size_t num_feats)
+  check_ff(string const ffname, vector<sptr<pscorer> >* registry) // registers an fftype scorer if param[ffname] enables it
   {
-    float const lbop = m_lbop_parameter; // just for code readability below
-    // for the time being, we assume that all phrase probability features 
-    // use the same confidence parameter for lower-bound-estimation
-    for (size_t i = 0; i < m_pfwd_features.size(); ++i) 
-      {	
-	UTIL_THROW_IF2(m_pfwd_features[i] != 'g' &&
-		       m_pfwd_features[i] != 'r' &&
-		       m_pfwd_features[i] != 's',
-		       "Can't handle pfwd feature type '" 
-		       << m_pfwd_features[i] << "'.");
-	sptr<PScorePfwd<Token> > ff(new PScorePfwd<Token>());
-	size_t k = num_feats;
-	num_feats = ff->init(num_feats,lbop,m_pfwd_features[i]);
-	for (;k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
-	ffvec.push_back(ff);
+    string const& spec = param[ffname];
+    if (spec == "" || spec == "0") return; // unset or "0": nothing is registered
+    if (registry) // an explicitly given registry takes precedence over the '+' suffix
+      {
+	sptr<fftype> ff(new fftype(spec));
+	register_ff(ff, *registry);
       }
-    
-    for (size_t i = 0; i < m_pbwd_features.size(); ++i) 
-      {	
-	UTIL_THROW_IF2(m_pbwd_features[i] != 'g' &&
-		       m_pbwd_features[i] != 'r' &&
-		       m_pbwd_features[i] != 's',
-		       "Can't handle pbwd feature type '" 
-		       << m_pbwd_features[i] << "'.");
-	sptr<PScorePbwd<Token> > ff(new PScorePbwd<Token>());
-	size_t k = num_feats;
-	num_feats = ff->init(num_feats,lbop,m_pbwd_features[i]);
-	for (;k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
-	ffvec.push_back(ff);
+    else if (spec[spec.size()-1] == '+') // corpus specific
+      {
+	sptr<fftype> ff(new fftype(spec));
+	register_ff(ff, m_active_ff_fix);
+	ff.reset(new fftype(spec)); // second, independent instance for the dynamic corpus
+	register_ff(ff, m_active_ff_dyn);
       }
-
-    // if (withPbwd) 
-    //   {
-    // 	sptr<PScorePbwd<Token> > ff(new PScorePbwd<Token>());
-    // 	size_t k = num_feats;
-    // 	num_feats = ff->init(num_feats,lbop);
-    // 	for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
-    // 	ffvec.push_back(ff);
-    //   }
-    
-    if (withLogCountFeatures) 
+    else // no registry given and no '+' suffix: one scorer in the common registry
       {
-	sptr<PScoreLogCounts<Token> > ff(new PScoreLogCounts<Token>());
-	size_t k = num_feats;
-	num_feats = ff->init(num_feats);
-	for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
-	ffvec.push_back(ff);
+	sptr<fftype> ff(new fftype(spec));
+	register_ff(ff, m_active_ff_common);
       }
+  }
 
-    return num_feats;
+  // Variant of check_ff() for scorers whose constructor takes an extra float
+  // (xtra) ahead of the specification string. Registers nothing when the
+  // parameter named ffname is empty or "0".
+  template<typename fftype>
+  void
+  Mmsapt::
+  check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry)
+  {
+    string const& spec = param[ffname];
+    if (spec.empty() || spec == "0") return; // feature is switched off
+    // the '+' suffix only matters when the caller did not pick a registry
+    bool const per_corpus = !registry && spec[spec.size()-1] == '+';
+    if (per_corpus)
+      { // one independent scorer instance per corpus
+	sptr<fftype> fix_ff(new fftype(xtra,spec));
+	register_ff(fix_ff, m_active_ff_fix);
+	sptr<fftype> dyn_ff(new fftype(xtra,spec));
+	register_ff(dyn_ff, m_active_ff_dyn);
+      }
+    else
+      { // explicit registry if given, otherwise the common one
+	sptr<fftype> ff(new fftype(xtra,spec));
+	register_ff(ff, registry ? *registry : m_active_ff_common);
+      }
   }
 
+  // void
+  // Mmsapt::
+  // add_corpus_specific_features(vector<sptr<pscorer > >& registry)
+  // {
+  //   check_ff<PScorePbwd<Token> >("pbwd",m_lbop_conf,registry);
+  //   check_ff<PScoreLogCnt<Token> >("logcnt",registry);
+  // }
+
   void
   Mmsapt::
   Load()
   {
+    lock_guard<mutex> guard(this->lock);
+
+    // can load only once
+    // UTIL_THROW_IF2(shards.size(),"Mmsapt is already loaded at " << HERE);
+
+    // lexical scores; init() stores, defaults and whitelists alpha as "lexalpha"
+    string lexfile = bname + L1 + "-" + L2 + ".lex";
+    sptr<PScoreLex1<Token> > ff(new PScoreLex1<Token>(param["lexalpha"],lexfile));
+    register_ff(ff,m_active_ff_common);
+
+    // these are always computed on pooled data
+    check_ff<PScoreRareness<Token> > ("rare", &m_active_ff_common);
+    check_ff<PScoreUnaligned<Token> >("unal", &m_active_ff_common);
+    check_ff<PScoreCoherence<Token> >("coh",  &m_active_ff_common);
+    
+    // for these ones either way is possible (specification ends with '+' 
+    // if corpus-specific 
+    check_ff<PScorePfwd<Token> >("pfwd", m_lbop_conf);
+    check_ff<PScorePbwd<Token> >("pbwd", m_lbop_conf);
+    check_ff<PScoreLogCnt<Token> >("logcnt");
+
+    // These are always corpus-specific
+    check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_fix);
+    check_ff<PScoreProvenance<Token> >("prov", &m_active_ff_dyn);
+
+    UTIL_THROW_IF2(this->m_feature_names.size() != this->m_numScoreComponents,
+		   "At " << HERE << ": number of feature values provided by "
+		   << "Phrase table (" << this->m_feature_names.size()
+		   << ") does not match number specified in Moses config file ("
+		   << this->m_numScoreComponents << ")!\n";);
+
+    // Load corpora. For the time being, we can have one memory-mapped static
+    // corpus and one in-memory dynamic corpus
+    // sptr<mmbitext> btfix(new mmbitext());
     btfix.num_workers = this->m_workers;
     btfix.open(bname, L1, L2);
     btfix.setDefaultSampleSize(m_default_sample_size);
+    // shards.push_back(btfix);
     
-    size_t num_feats = 0;
-    
-    // lexical scores are currently always active 
-    sptr<PScoreLex<Token> > ff(new PScoreLex<Token>(m_lex_alpha));
-    size_t k = num_feats;
-    num_feats = ff->init(num_feats, bname + L1 + "-" + L2 + ".lex");
-    for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
-    m_active_ff_common.push_back(ff);
-    
-    if (withCoherence)
-      {
-	sptr<PScoreCoherence<Token> > ff(new PScoreCoherence<Token>());
-	size_t k = num_feats;
-	num_feats = ff->init(num_feats);
-	for (; k < num_feats; ++k) m_feature_names.push_back(ff->fname(k));
-	m_active_ff_common.push_back(ff);
-      }
-
-    num_feats = add_corpus_specific_features(m_active_ff_fix,num_feats);
-    // cerr << num_feats << "/" << this->m_numScoreComponents 
-    // << " at " << __FILE__ << ":" << __LINE__ << endl;
-    poolCounts = poolCounts && num_feats == this->m_numScoreComponents;
-    if (!poolCounts)
-      num_feats = add_corpus_specific_features(m_active_ff_dyn, num_feats);
-    
-#if 0
-    cerr << "MMSAPT provides " << num_feats << " features at " 
-	 << __FILE__ << ":" << __LINE__ << endl;
-    BOOST_FOREACH(string const& fname, m_feature_names)
-      cerr << fname << endl;
-#endif
-    UTIL_THROW_IF2(num_feats != this->m_numScoreComponents,
-		   "At " << __FILE__ << ":" << __LINE__
-		   << ": number of feature values provided by Phrase table (" 
-		   << num_feats << ") does not match number specified in "
-		   << "Moses config file (" << this->m_numScoreComponents 
-		   << ")!\n";);
-    
-    
-    btdyn.reset(new imBitext<Token>(btfix.V1, btfix.V2,m_default_sample_size));
+    btdyn.reset(new imbitext(btfix.V1, btfix.V2, m_default_sample_size));
     btdyn->num_workers = this->m_workers;
     if (extra_data.size()) 
-      {
-	load_extra_data(extra_data);
-      }
+      load_extra_data(extra_data,false);
     
 #if 0
     // currently not used
@@ -330,258 +402,345 @@ namespace Moses
 
   TargetPhrase* 
   Mmsapt::
-  createTargetPhrase(Phrase        const& src, 
-		     Bitext<Token> const& bt, 
-		     PhrasePair    const& pp) const
+  mkTPhrase(Phrase const& src,
+	    PhrasePair<Token>* fix, 
+	    PhrasePair<Token>* dyn, 
+	    sptr<Bitext<Token> > const& dynbt) const
   {
-    Word w; uint32_t sid,off,len;    
+    UTIL_THROW_IF2(!fix && !dyn, HERE << 
+		   ": Can't create target phrase from nothing.");
+    vector<float> fvals(this->m_numScoreComponents);
+    PhrasePair<Token> pool = fix ? *fix : *dyn;
+    if (fix) 
+      {
+	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+	  (*ff)(btfix, *fix, &fvals);
+      }
+    if (dyn)
+      {
+	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+	  (*ff)(*dynbt, *dyn, &fvals);
+      }
+    
+    if (fix && dyn) { pool += *dyn; }
+    else if (fix)
+      {
+	PhrasePair<Token> zilch; zilch.init();
+	TSA<Token>::tree_iterator m(dynbt->I2.get(), fix->start2, fix->len2);
+	if (m.size() == fix->len2)
+	  zilch.raw2 = m.approxOccurrenceCount();
+	pool += zilch;
+	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+	  (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+      }
+    else if (dyn)
+      {
+	PhrasePair<Token> zilch; zilch.init();
+	TSA<Token>::tree_iterator m(btfix.I2.get(), dyn->start2, dyn->len2);
+	if (m.size() == dyn->len2)
+	  zilch.raw2 = m.approxOccurrenceCount();
+	pool += zilch;
+	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+	  (*ff)(*dynbt, ff->allowPooling() ? pool : zilch, &fvals);
+      }
+    if (fix) 
+      {
+ 	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+	  (*ff)(btfix, pool, &fvals);
+      }
+    else
+      {
+ 	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+	  (*ff)(*dynbt, pool, &fvals);
+      }
     TargetPhrase* tp = new TargetPhrase();
-    parse_pid(pp.p2, sid, off, len);
-    Token const* x = bt.T2->sntStart(sid) + off;
-    for (uint32_t k = 0; k < len; ++k)
+    Token const* x = fix ? fix->start2 : dyn->start2;
+    uint32_t len = fix ? fix->len2 : dyn->len2;
+    for (uint32_t k = 0; k < len; ++k, x = x->next())
       {
-	// cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl;
-	StringPiece wrd = (*bt.V2)[x[k].id()];
-	// if ((off+len) > bt.T2->sntLen(sid))
-	// cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl;
-	assert(off+len <= bt.T2->sntLen(sid));
-	w.CreateFromString(Output,ofactor,wrd,false);
+	StringPiece wrd = (*(btfix.V2))[x->id()];
+	Word w; w.CreateFromString(Output,ofactor,wrd,false);
 	tp->AddWord(w);
       }
-    tp->GetScoreBreakdown().Assign(this, pp.fvals);
+    tp->GetScoreBreakdown().Assign(this, fvals);
     tp->Evaluate(src);
     return tp;
   }
 
-  // process phrase stats from a single parallel corpus
-  void
-  Mmsapt::
-  process_pstats
-  (Phrase   const& src,
-   uint64_t const  pid1, 
-   pstats   const& stats, 
-   Bitext<Token> const & bt, 
-   TargetPhraseCollection* tpcoll
-   ) const
-  {
-    PhrasePair pp;   
-    pp.init(pid1, stats, this->m_numScoreComponents);
-    pstats::trg_map_t::const_iterator t;
-    for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
-      {
-   	pp.update(t->first,t->second);
-	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
-	  (*ff)(bt,pp);
-	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
-	  (*ff)(bt,pp);
-	tpcoll->Add(createTargetPhrase(src,bt,pp));
-      }
-  }
+  // TargetPhrase* 
+  // Mmsapt::
+  // mkTPhrase(Phrase        const& src, 
+  // 		     Bitext<Token> const& bt, 
+  // 		     PhrasePair    const& pp) const
+  // {
+  //   Word w; uint32_t sid,off,len;    
+  //   TargetPhrase* tp = new TargetPhrase();
+  //   parse_pid(pp.p2, sid, off, len);
+  //   Token const* x = bt.T2->sntStart(sid) + off;
+  //   for (uint32_t k = 0; k < len; ++k)
+  //     {
+  // 	// cerr << (*bt.V2)[x[k].id()] << " at " << __FILE__ << ":" << __LINE__ << endl;
+  // 	StringPiece wrd = (*bt.V2)[x[k].id()];
+  // 	// if ((off+len) > bt.T2->sntLen(sid))
+  // 	// cerr << off << ";" << len << " " << bt.T2->sntLen(sid) << endl;
+  // 	assert(off+len <= bt.T2->sntLen(sid));
+  // 	w.CreateFromString(Output,ofactor,wrd,false);
+  // 	tp->AddWord(w);
+  //     }
+  //   tp->GetScoreBreakdown().Assign(this, pp.fvals);
+  //   tp->Evaluate(src);
+  //   return tp;
+  // }
+
+  // // process phrase stats from a single parallel corpus
+  // void
+  // Mmsapt::
+  // process_pstats
+  // (Phrase   const& src,
+  //  uint64_t const  pid1, 
+  //  pstats   const& stats, 
+  //  Bitext<Token> const & bt, 
+  //  TargetPhraseCollection* tpcoll
+  //  ) const
+  // {
+  //   PhrasePair pp;   
+  //   pp.init(pid1, stats, this->m_numScoreComponents);
+  //   pstats::trg_map_t::const_iterator t;
+  //   for (t = stats.trg.begin(); t != stats.trg.end(); ++t)
+  //     {
+  //  	pp.update(t->first,t->second);
+  // 	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+  // 	  (*ff)(bt,pp);
+  // 	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+  // 	  (*ff)(bt,pp);
+  // 	tpcoll->Add(mkTPhrase(src,bt,pp));
+  //     }
+  // }
+
+  // void
+  // Mmsapt::
+  // ScorePPfix(PhrasePair& pp) const
+  // {
+  //   BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+  //     (*ff)(btfix,pp);
+  //   BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+  //     (*ff)(btfix,pp);
+  // }
+
+//   // process phrase stats from a single parallel corpus
+//   bool
+//   Mmsapt::
+//   pool_pstats(Phrase   const& src,
+// 	      uint64_t const  pid1a, 
+// 	      pstats        * statsa, 
+// 	      Bitext<Token> const & bta,
+// 	      uint64_t const  pid1b, 
+// 	      pstats   const* statsb, 
+// 	      Bitext<Token> const & btb,
+// 	      TargetPhraseCollection* tpcoll) const
+//   {
+//     PhrasePair pp;
+//     if (statsa && statsb)
+//       pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
+//     else if (statsa)
+//       pp.init(pid1a, *statsa, this->m_numScoreComponents);
+//     else if (statsb)
+//       pp.init(pid1b, *statsb, this->m_numScoreComponents);
+//     else return false; // throw "no stats for pooling available!";
+
+//     pstats::trg_map_t::const_iterator b;
+//     pstats::trg_map_t::iterator a;
+//     if (statsb)
+//       {
+// 	for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
+// 	  {
+// 	    uint32_t sid,off,len;    
+// 	    parse_pid(b->first, sid, off, len);
+// 	    Token const* x = btb.T2->sntStart(sid) + off;
+// 	    TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
+// 	    if (m.size() == len) 
+// 	      {
+// 		;
+// 		if (statsa && ((a = statsa->trg.find(m.getPid())) 
+// 			       != statsa->trg.end()))
+// 		  {
+// 		    pp.update(b->first,a->second,b->second);
+// 		    a->second.invalidate();
+// 		  }
+// 		else 
+// 		  pp.update(b->first,m.approxOccurrenceCount(),
+// 			    b->second);
+// 	      }
+// 	    else pp.update(b->first,b->second);
+// 	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+// 	      (*ff)(btb,pp);
+// 	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+// 	      (*ff)(btb,pp);
+// 	    tpcoll->Add(mkTPhrase(src,btb,pp));
+// 	  }
+//       }
+//     if (!statsa) return statsb != NULL;
+//     for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
+//       {
+// 	uint32_t sid,off,len;
+// 	if (!a->second.valid()) continue;
+// 	parse_pid(a->first, sid, off, len);
+// 	if (btb.T2)
+// 	  {
+// 	    Token const* x = bta.T2->sntStart(sid) + off;
+// 	    TSA<Token>::tree_iterator m(btb.I2.get(), x, len);
+// 	    if (m.size() == len) 
+// 	      pp.update(a->first,m.approxOccurrenceCount(),a->second);
+// 	    else 
+// 	      pp.update(a->first,a->second);
+// 	  }
+// 	else pp.update(a->first,a->second);
+// #if 0
+// 	// jstats const& j = a->second;
+// 	cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " 
+// 	     << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl;
+// 	cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " 
+// 	     << pp.joint << " " << pp.raw2 << endl;
+// #endif
+
+// 	UTIL_THROW_IF2(pp.raw2 == 0, 
+// 		       "OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " 
+// 		       << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": "
+// 		       << pp.raw1 << " " << pp.sample1 << " " 
+// 		       << pp.good1 << " " << pp.joint << " " 
+// 		       << pp.raw2);
+// 	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+// 	  (*ff)(bta,pp);
+// 	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+// 	  (*ff)(bta,pp);
+// 	tpcoll->Add(mkTPhrase(src,bta,pp));
+//       }
+//     return true;
+//   }
 
-  void
-  Mmsapt::
-  ScorePPfix(bitext::PhrasePair& pp) const
-  {
-    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
-      (*ff)(btfix,pp);
-    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
-      (*ff)(btfix,pp);
-  }
 
-  // process phrase stats from a single parallel corpus
-  bool
-  Mmsapt::
-  pool_pstats(Phrase   const& src,
-	      uint64_t const  pid1a, 
-	      pstats        * statsa, 
-	      Bitext<Token> const & bta,
-	      uint64_t const  pid1b, 
-	      pstats   const* statsb, 
-	      Bitext<Token> const & btb,
-	      TargetPhraseCollection* tpcoll) const
-  {
-    PhrasePair pp;
-    if (statsa && statsb)
-      pp.init(pid1b, *statsa, *statsb, this->m_numScoreComponents);
-    else if (statsa)
-      pp.init(pid1a, *statsa, this->m_numScoreComponents);
-    else if (statsb)
-      pp.init(pid1b, *statsb, this->m_numScoreComponents);
-    else return false; // throw "no stats for pooling available!";
-
-    pstats::trg_map_t::const_iterator b;
-    pstats::trg_map_t::iterator a;
-    if (statsb)
-      {
-	for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
-	  {
-	    uint32_t sid,off,len;    
-	    parse_pid(b->first, sid, off, len);
-	    Token const* x = bta.T2->sntStart(sid) + off;
-	    TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
-	    if (m.size() == len) 
-	      {
-		;
-		if (statsa && ((a = statsa->trg.find(m.getPid())) 
-			       != statsa->trg.end()))
-		  {
-		    pp.update(b->first,a->second,b->second);
-		    a->second.invalidate();
-		  }
-		else 
-		  pp.update(b->first,m.approxOccurrenceCount(),
-			    b->second);
-	      }
-	    else pp.update(b->first,b->second);
-	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
-	      (*ff)(btb,pp);
-	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
-	      (*ff)(btb,pp);
-	    tpcoll->Add(createTargetPhrase(src,btb,pp));
-	  }
-      }
-    if (!statsa) return statsb != NULL;
-    for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
-      {
-	uint32_t sid,off,len;
-	if (!a->second.valid()) continue;
-	parse_pid(a->first, sid, off, len);
-	if (btb.T2)
-	  {
-	    Token const* x = bta.T2->sntStart(sid) + off;
-	    TSA<Token>::tree_iterator m(btb.I2.get(), x, x+len);
-	    if (m.size() == len) 
-	      pp.update(a->first,m.approxOccurrenceCount(),a->second);
-	    else 
-	      pp.update(a->first,a->second);
-	  }
-	else 
-	  pp.update(a->first,a->second);
-#if 0
-	// jstats const& j = a->second;
-	cerr << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " 
-	     << bta.T2->pid2str(bta.V2.get(),pp.p2) << endl;
-	cerr << pp.raw1 << " " << pp.sample1 << " " << pp.good1 << " " 
-	     << pp.joint << " " << pp.raw2 << endl;
-#endif
 
-	UTIL_THROW_IF2(pp.raw2 == 0, 
-		       "OOPS" << bta.T1->pid2str(bta.V1.get(),pp.p1) << " ::: " 
-		       << bta.T2->pid2str(bta.V2.get(),pp.p2) << ": "
-		       << pp.raw1 << " " << pp.sample1 << " " 
-		       << pp.good1 << " " << pp.joint << " " 
-		       << pp.raw2);
-	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
-	  (*ff)(bta,pp);
-	BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
-	  (*ff)(bta,pp);
-	tpcoll->Add(createTargetPhrase(src,bta,pp));
-      }
-    return true;
-  }
   
-  
-  // process phrase stats from a single parallel corpus
-  bool
-  Mmsapt::
-  combine_pstats
-  (Phrase   const& src,
-   uint64_t const  pid1a, pstats      * statsa, Bitext<Token> const & bta,
-   uint64_t const  pid1b, pstats const* statsb, Bitext<Token> const & btb,
-   TargetPhraseCollection* tpcoll) const
-  {
-    PhrasePair ppfix,ppdyn,pool; 
-    // ppfix: counts from btfix
-    // ppdyn: counts from btdyn
-    // pool: pooled counts from both
-    Word w;
-    if (statsa) ppfix.init(pid1a,*statsa,this->m_numScoreComponents);
-    if (statsb) ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
-    pstats::trg_map_t::const_iterator b;
-    pstats::trg_map_t::iterator a;
-
-    if (statsb)
-      {
-	pool.init(pid1b,*statsb,0);
-	for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
-	  {
-	    ppdyn.update(b->first,b->second);
-	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
-	      (*ff)(btb,ppdyn);
+  // // process phrase stats from a single parallel corpus
+  // bool
+  // Mmsapt::
+  // combine_pstats
+  // (Phrase   const& src,
+  //  uint64_t const  pid1a, pstats      * statsa, Bitext<Token> const & bta,
+  //  uint64_t const  pid1b, pstats const* statsb, Bitext<Token> const & btb,
+  //  TargetPhraseCollection* tpcoll) const
+  // {
+  //   if (!statsa && !statsb) return false; 
+
+  //   PhrasePair ppfix,ppdyn,pool; Word w;
+  //   // ppfix: counts from btfix
+  //   // ppdyn: counts from btdyn
+  //   // pool: pooled counts from both
+
+  //   pstats::trg_map_t::const_iterator b;
+  //   pstats::trg_map_t::iterator a;
+
+    
+  //   set<uint64_t> check;
+  //   if (statsb)
+  //     {
+  // 	ppdyn.init(pid1b,*statsb,this->m_numScoreComponents);
+  // 	if (statsa)
+  // 	  {
+  // 	    pool.init(pid1b, *statsa, *statsb, 0);
+  // 	    ppfix.init(pid1a,*statsa, 0);
+  // 	  }
+  // 	else 
+  // 	  {
+  // 	    pool.init(pid1b, *statsb,0);
+  // 	    ppfix.init();
+  // 	  }
+	
+  // 	for (b = statsb->trg.begin(); b != statsb->trg.end(); ++b)
+  // 	  {
+  // 	    ppdyn.update(b->first,b->second);
+  // 	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+  // 	      (*ff)(btb,ppdyn);
 	    
-	    uint32_t sid,off,len;    
-	    parse_pid(b->first, sid, off, len);
-	    Token const* x = bta.T2->sntStart(sid) + off;
-	    TSA<Token>::tree_iterator m(bta.I2.get(),x,x+len);
+  // 	    uint32_t sid,off,len;    
+  // 	    parse_pid(b->first, sid, off, len);
+  // 	    Token const* x = btb.T2->sntStart(sid) + off;
+  // 	    TSA<Token>::tree_iterator m(bta.I2.get(),x,len);
 	    
-	    if (m.size() && statsa && 
-		((a = statsa->trg.find(m.getPid())) != statsa->trg.end()))
-	      {
-		// phrase pair found also in btfix
-		ppfix.update(a->first,a->second);
-		BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
-		  (*ff)(bta,ppfix,&ppdyn.fvals);
-		BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
-		  (*ff)(bta,ppfix,&ppdyn.fvals);
-		a->second.invalidate();
-	      }
-	    else 
-	      {
-		// phrase pair was not found in btfix
-
-		// ... but the source phrase was  
-		if (m.size()) 
-		  pool.update(b->first,m.approxOccurrenceCount(), b->second);
-
-		// ... and not even the source phrase 
-		else 
-		  pool.update(b->first,b->second);
-		
-		BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
-		  (*ff)(btb,pool,&ppdyn.fvals);
-		BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
-		  (*ff)(btb,pool,&ppdyn.fvals);
-		
-	      }
-
-	    tpcoll->Add(createTargetPhrase(src,btb,ppdyn));
-	  }
-      }
-
-    // now deal with all phraise pairs that are ONLY in btfix
-    // (the ones that are in both were dealt with above)
-    if (statsa)
-      {
-	pool.init(pid1a,*statsa,0);
-	for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
-	  {
-	    if (!a->second.valid()) continue; // done above
-	    ppfix.update(a->first,a->second);
-	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
-	      (*ff)(bta,ppfix);
-	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
-	      (*ff)(bta,ppfix);
+  // 	    Token const* y = m.getToken(0);
+  // 	    for (size_t i = 0; i < len; ++i)
+  // 	      cout << x[i].id() << " " << endl;
+  // 	    for (size_t i = 0; i < m.size(); ++i)
+  // 	      cout << y[i].id() << " " << endl;
 	    
-	    if (btb.I2)
-	      {
-		uint32_t sid,off,len;    
-		parse_pid(a->first, sid, off, len);
-		Token const* x = bta.T2->sntStart(sid) + off;
-		TSA<Token>::tree_iterator m(btb.I2.get(),x,x+len);
-		if (m.size())
-		  pool.update(a->first,m.approxOccurrenceCount(),a->second);
-		else
-		  pool.update(a->first,a->second);
-	      }
-	    else pool.update(a->first,a->second);
-	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
-	      (*ff)(btb,pool,&ppfix.fvals);
-	    if (ppfix.p2)
-	      tpcoll->Add(createTargetPhrase(src,bta,ppfix));
-	  }
-      }
-    return (statsa || statsb);
-  }
+  // 	    if (statsa && m.size() &&  
+  // 		((a = statsa->trg.find(m.getPid())) != statsa->trg.end()))
+  // 	      { // i.e., phrase pair found also in btfix
+  // 		ppfix.update(a->first,a->second);
+  // 		pool.update(b->first, b->second, a->second);
+  // 		BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+  // 		  (*ff)(bta, ppfix, &ppdyn.fvals);
+  // 		check.insert(a->first); 
+  // 	      }
+  // 	    else // phrase pair was not found in btfix
+  // 	      {
+  // 		if (m.size()) // ... but the source phrase was  
+  // 		  {
+  // 		    pool.update(b->first, m.approxOccurrenceCount(), b->second);
+  // 		    ppfix.update(b->first,m.approxOccurrenceCount());
+  // 		  }
+  // 		else // ... and not even the source phrase 
+  // 		  {
+  // 		    pool.update(b->first, b->second);
+  // 		    ppfix.update(b->first,0);
+  // 		  }		    
+  // 		BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+  // 		  (*ff)(btb, ff->allowPooling() ? pool : ppfix, &ppdyn.fvals);
+  // 	      }
+  // 	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+  // 	      (*ff)(btb, pool, &ppdyn.fvals);
+  // 	    tpcoll->Add(mkTPhrase(src,btb,ppdyn));
+  // 	  }
+  //     }
+
+  //   // now deal with all phrase pairs that are ONLY in btfix
+  //   // (the ones that are in both were dealt with above)
+  //   if (statsa)
+  //     {
+  // 	ppfix.init(pid1a, *statsa, this->m_numScoreComponents);
+  // 	pool.init(pid1a,  *statsa, 0);
+  // 	ppdyn.init();
+  // 	for (a = statsa->trg.begin(); a != statsa->trg.end(); ++a)
+  // 	  {
+  // 	    if (check.find(a->first) != check.end()) 
+  // 	      continue;
+
+  // 	    ppfix.update(a->first, a->second);
+  // 	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_fix)
+  // 	      (*ff)(bta, ppfix);
+	    
+  // 	    if (btb.I2)
+  // 	      {
+  // 		uint32_t sid,off,len;    
+  // 		parse_pid(a->first, sid, off, len);
+  // 		Token const* x = bta.T2->sntStart(sid) + off;
+  // 		TSA<Token>::tree_iterator m(btb.I2.get(), x, len);
+  // 		if (m.size())
+  // 		  pool.update(a->first, m.approxOccurrenceCount(), a->second);
+  // 		else
+  // 		  pool.update(a->first, a->second);
+  // 	      }
+  // 	    else pool.update(a->first, a->second);
+  // 	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_dyn)
+  // 	      (*ff)(btb, ff->allowPooling() ? pool : ppdyn, &ppfix.fvals);
+  // 	    BOOST_FOREACH(sptr<pscorer> const& ff, m_active_ff_common)
+  // 	      (*ff)(bta, pool, &ppfix.fvals);
+  // 	    if (ppfix.p2)
+  // 	      tpcoll->Add(mkTPhrase(src, bta, ppfix));
+  // 	  }
+  //     }
+  //   return true;
+  // }
   
   Mmsapt::
   TargetPhraseCollectionWrapper::
@@ -595,8 +754,34 @@ namespace Moses
   {
     assert(this->refCount == 0);
   }
-
   
+  template<typename Token>
+  void 
+  expand(typename Bitext<Token>::iter const& m, 
+	 Bitext<Token> const& bt, 
+	 pstats const& ps, vector<PhrasePair<Token> >& dest)
+  {
+    dest.reserve(ps.trg.size());
+    PhrasePair<Token> pp;
+    pp.init(m.getToken(0), m.size(), &ps, 0);
+    // cout << HERE << " " << toString(*(bt.V1),pp.start1,pp.len1) << endl;
+    pstats::trg_map_t::const_iterator a;
+    for (a = ps.trg.begin(); a != ps.trg.end(); ++a)
+      {
+	uint32_t sid,off,len;
+	parse_pid(a->first, sid, off, len);
+	pp.update(bt.T2->sntStart(sid)+off, len, a->second);
+	dest.push_back(pp);
+      }
+    typename PhrasePair<Token>::SortByTargetIdSeq sorter;
+    sort(dest.begin(), dest.end(),sorter);
+#if 0
+    BOOST_FOREACH(PhrasePair<Token> const& p, dest)
+      cout << toString (*bt.V1,p.start1,p.len1) << " ::: " 
+	   << toString (*bt.V2,p.start2,p.len2) << " " 
+	   << p.joint << endl;
+#endif
+  }
 
   // This is not the most efficient way of phrase lookup! 
   TargetPhraseCollection const* 
@@ -605,13 +790,9 @@ namespace Moses
   {
     // map from Moses Phrase to internal id sequence
     vector<id_type> sphrase; 
-    fillIdSeq(src,input_factor,*btfix.V1,sphrase);
+    fillIdSeq(src,input_factor,*(btfix.V1),sphrase);
     if (sphrase.size() == 0) return NULL;
     
-    // lookup in static bitext 
-    TSA<Token>::tree_iterator mfix(btfix.I1.get(),&sphrase[0],sphrase.size());
-
-    // lookup in dynamic bitext
     // Reserve a local copy of the dynamic bitext in its current form. /btdyn/
     // is set to a new copy of the dynamic bitext every time a sentence pair
     // is added. /dyn/ keeps the old bitext around as long as we need it.
@@ -621,12 +802,13 @@ namespace Moses
       dyn = btdyn;
     }
     assert(dyn);
+
+    // lookup phrases in both bitexts
+    TSA<Token>::tree_iterator mfix(btfix.I1.get(), &sphrase[0], sphrase.size());
     TSA<Token>::tree_iterator mdyn(dyn->I1.get());
     if (dyn->I1.get())
-      {
-	for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
-	  mdyn.extend(sphrase[i]);
-      }
+      for (size_t i = 0; mdyn.size() == i && i < sphrase.size(); ++i)
+	mdyn.extend(sphrase[i]);
 
 #if 0
     cerr << src << endl;
@@ -634,43 +816,62 @@ namespace Moses
 	 << mdyn.size() << " " << mdyn.getPid() << endl;
 #endif
 
-    // phrase not found in either
-    if (mdyn.size() != sphrase.size() && 
-	mfix.size() != sphrase.size()) 
-      return NULL; // not found
+    if (mdyn.size() != sphrase.size() && mfix.size() != sphrase.size()) 
+      return NULL; // phrase not found in either bitext
 
     // cache lookup:
-
-    uint64_t phrasekey;
-    if (mfix.size() == sphrase.size())
-      phrasekey = (mfix.getPid()<<1);
-    else
-      phrasekey = (mdyn.getPid()<<1)+1;
-
+    uint64_t phrasekey = (mfix.size() == sphrase.size() ? (mfix.getPid()<<1) 
+			  : (mdyn.getPid()<<1)+1);
     size_t revision = dyn->revision();
     {
       boost::lock_guard<boost::mutex> guard(this->lock);
       tpc_cache_t::iterator c = m_cache.find(phrasekey);
+      // TO DO: we should revise the revision mechanism: we take the length
+      // of the dynamic bitext (in sentences) at the time the PT entry
+      // was stored as the time stamp. For each word in the
+      // vocabulary, we also store its most recent occurrence in the
+      // bitext. Only if the timestamp of each word in the phrase is
+      // newer than the timestamp of the phrase itself we must update 
+      // the entry. 
       if (c != m_cache.end() && c->second->revision == revision)
 	return encache(c->second);
     }
     
-    // not found or not up to date
+    // OK: pt entry not found or not up to date
+    // lookup and expansion could be done in parallel threads, 
+    // but ppdyn is probably small anyway
+    // TO DO: have Bitexts return lists of PhrasePairs instead of pstats
+    // no need to expand pstats at every single lookup again, especially 
+    // for btfix.
     sptr<pstats> sfix,sdyn;
-    if (mfix.size() == sphrase.size())
-      sfix = btfix.lookup(mfix);
-    if (mdyn.size() == sphrase.size())
-      sdyn = dyn->lookup(mdyn);
+    if (mfix.size() == sphrase.size()) sfix = btfix.lookup(mfix);
+    if (mdyn.size() == sphrase.size()) sdyn = dyn->lookup(mdyn);
+
+    vector<PhrasePair<Token> > ppfix,ppdyn;
+    if (sfix) expand(mfix, btfix, *sfix, ppfix);
+    if (sdyn) expand(mdyn, *dyn, *sdyn, ppdyn);
     
-    TargetPhraseCollectionWrapper* 
-      ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
-    if ((poolCounts && 
-	 pool_pstats(src, mfix.getPid(),sfix.get(),btfix, 
-		     mdyn.getPid(),sdyn.get(),*dyn,ret))
-	|| combine_pstats(src, mfix.getPid(),sfix.get(),btfix, 
-			  mdyn.getPid(),sdyn.get(),*dyn,ret))
+    // now we have two lists of Phrase Pairs, let's merge them
+    TargetPhraseCollectionWrapper* ret;
+    ret = new TargetPhraseCollectionWrapper(revision,phrasekey);
+    PhrasePair<Token>::SortByTargetIdSeq sorter;
+    size_t i = 0; size_t k = 0;
+    while (i < ppfix.size() && k < ppdyn.size())
+      {
+	int cmp = sorter.cmp(ppfix[i], ppdyn[k]);
+	if      (cmp  < 0) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn));
+	else if (cmp == 0) ret->Add(mkTPhrase(src,&ppfix[i++],&ppdyn[k++],dyn));
+	else               ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn));
+      }
+    while (i < ppfix.size()) ret->Add(mkTPhrase(src,&ppfix[i++],NULL,dyn));
+    while (k < ppdyn.size()) ret->Add(mkTPhrase(src,NULL,&ppdyn[k++],dyn));
+    if (m_tableLimit) ret->Prune(true, m_tableLimit);
+    else ret->Prune(true,ret->GetSize());
+#if 0
+    if (combine_pstats(src, 
+		       mfix.getPid(), sfix.get(), btfix, 
+		       mdyn.getPid(), sdyn.get(),  *dyn, ret))
       {
-	if (m_tableLimit) ret->Prune(true,m_tableLimit);
 #if 0
 	sort(ret->begin(), ret->end(), CompareTargetPhrase());
 	cout << "SOURCE PHRASE: " << src << endl;
@@ -686,6 +887,9 @@ namespace Moses
 	  }
 #endif
       }
+#endif
+
+    // put the result in the cache and return
     boost::lock_guard<boost::mutex> guard(this->lock);
     m_cache[phrasekey] = ret;
     return encache(ret);
@@ -839,6 +1043,7 @@ namespace Moses
     TSA<Token>::tree_iterator mfix(btfix.I1.get(),&myphrase[0],myphrase.size());
     if (mfix.size() == myphrase.size()) 
       {
+	btfix.prep(mfix);
 	// cerr << phrase << " " << mfix.approxOccurrenceCount() << endl;
 	return true;
       }
@@ -854,6 +1059,7 @@ namespace Moses
       {
 	for (size_t i = 0; mdyn.size() == i && i < myphrase.size(); ++i)
 	  mdyn.extend(myphrase[i]);
+	if (mdyn.size() == myphrase.size()) dyn->prep(mdyn);
       }
     return mdyn.size() == myphrase.size();
   }
diff --git a/moses/TranslationModel/UG/mmsapt.h b/moses/TranslationModel/UG/mmsapt.h
index b6be36131..a7ece8fdb 100644
--- a/moses/TranslationModel/UG/mmsapt.h
+++ b/moses/TranslationModel/UG/mmsapt.h
@@ -19,6 +19,7 @@
 #include "moses/TranslationModel/UG/mm/ug_typedefs.h"
 #include "moses/TranslationModel/UG/mm/tpt_pickler.h"
 #include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
 #include "moses/TranslationModel/UG/mm/ug_lexical_phrase_scorer2.h"
 
 #include "moses/InputFileStream.h"
@@ -29,7 +30,8 @@
 #include <map>
 
 #include "moses/TranslationModel/PhraseDictionary.h"
-#include "mmsapt_phrase_scorers.h"
+#include "mmsapt_phrase_scorers.h" // deprecated
+#include "sapt_phrase_scorers.h"
 
 // TO DO:
 // - make lexical phrase scorer take addition to the "dynamic overlay" into account
@@ -47,47 +49,68 @@ namespace Moses
 #endif
   {
     friend class Alignment;
+    map<string,string> param;
   public:    
     typedef L2R_Token<SimpleWordId> Token;
     typedef mmBitext<Token> mmbitext;
     typedef imBitext<Token> imbitext;
+    typedef Bitext<Token>     bitext;
     typedef TSA<Token>           tsa;
     typedef PhraseScorer<Token> pscorer;
+
   private:
+    // vector<sptr<bitext> > shards;
     mmbitext btfix; 
-    sptr<imbitext> btdyn;
+    sptr<imbitext> btdyn; 
     string bname,extra_data;
     string L1;
     string L2;
-    float  m_lbop_parameter;
-    float  m_lex_alpha; 
+    float  m_lbop_conf; // confidence level for lbop smoothing
+    float  m_lex_alpha; // alpha parameter (j+a)/(m+a) for lexical smoothing
     // alpha parameter for lexical smoothing (joint+alpha)/(marg + alpha)
     // must be > 0 if dynamic 
     size_t m_default_sample_size;
     size_t m_workers;  // number of worker threads for sampling the bitexts
 
-    // deprecated!
-    char m_pfwd_denom; // denominator for computation of fwd phrase score:
-    // 'r' - divide by raw count
-    // 's' - divide by sample count
-    // 'g' - devide by number of "good" (i.e. coherent) samples 
-    // size_t num_features;
+    // // deprecated!
+    // char m_pfwd_denom; // denominator for computation of fwd phrase score:
+    // // 'r' - divide by raw count
+    // // 's' - divide by sample count
+    // // 'g' - divide by number of "good" (i.e. coherent) samples 
+    // // size_t num_features;
 
     size_t input_factor;
     size_t output_factor; // we can actually return entire Tokens!
 
-    bool withLogCountFeatures; // add logs of counts as features?
-    bool withCoherence; 
-    string m_pfwd_features; // which pfwd functions to use
-    string m_pbwd_features; // which pbwd functions to use
+    // bool withLogCountFeatures; // add logs of counts as features?
+    // bool withCoherence; 
+    // string m_pfwd_features; // which pfwd functions to use
+    // string m_pbwd_features; // which pbwd functions to use
+
+    // for display for human inspection (ttable dumps):
     vector<string> m_feature_names; // names of features activated
+    vector<bool> m_is_logval;  // keeps track of which features are log valued 
+    vector<bool> m_is_integer; // keeps track of which features are integer valued 
+
     vector<sptr<pscorer > > m_active_ff_fix; // activated feature functions (fix)
     vector<sptr<pscorer > > m_active_ff_dyn; // activated feature functions (dyn)
     vector<sptr<pscorer > > m_active_ff_common; // activated feature functions (dyn)
 
-    size_t
-    add_corpus_specific_features
-    (vector<sptr<pscorer > >& ffvec, size_t num_feats);
+    void
+    register_ff(sptr<pscorer> const& ff, vector<sptr<pscorer> > & registry);
+
+    template<typename fftype>
+    void 
+    check_ff(string const ffname,vector<sptr<pscorer> >* registry = NULL);
+    // add feature function if specified 
+    
+    template<typename fftype>
+    void 
+    check_ff(string const ffname, float const xtra, vector<sptr<pscorer> >* registry = NULL);
+    // add feature function if specified
+
+    void
+    add_corpus_specific_features(vector<sptr<pscorer > >& ffvec);
     
     // built-in feature functions
     // PScorePfwd<Token> calc_pfwd_fix, calc_pfwd_dyn;
@@ -140,12 +163,24 @@ namespace Moses
     mm2dtable_t COOCraw;
 
     TargetPhrase* 
-    createTargetPhrase
+    mkTPhrase(Phrase const& src, 
+	      Moses::bitext::PhrasePair<Token>* fix, 
+	      Moses::bitext::PhrasePair<Token>* dyn, 
+	      sptr<Bitext<Token> > const& dynbt) const;
+
+    // template<typename Token>
+    // void 
+    // expand(typename Bitext<Token>::iter const& m, Bitext<Token> const& bt, 
+    // 	   pstats const& pstats, vector<PhrasePair<Token> >& dest);
+    
+#if 0
+    TargetPhrase* 
+    mkTPhrase
     (Phrase        const& src, 
      Bitext<Token> const& bt, 
-     bitext::PhrasePair    const& pp
+     Moses::bitext::PhrasePair const& pp
      ) const;
-
+#endif
     void
     process_pstats
     (Phrase   const& src,
@@ -180,7 +215,7 @@ namespace Moses
      ) const;
 
     void
-    load_extra_data(string bname);
+    load_extra_data(string bname, bool locking);
 
     mutable size_t m_tpc_ctr;
   public:
@@ -231,8 +266,14 @@ namespace Moses
     vector<string> const&
     GetFeatureNames() const;
     
-    void
-    ScorePPfix(bitext::PhrasePair& pp) const;
+    // void
+    // ScorePPfix(bitext::PhrasePair& pp) const;
+
+    bool
+    isLogVal(int i) const;
+    
+    bool
+    isInteger(int i) const;
 
   private:
   };
diff --git a/moses/TranslationModel/UG/mmsapt_align.cc b/moses/TranslationModel/UG/mmsapt_align.cc
index 407df648d..8b6bf1eb2 100644
--- a/moses/TranslationModel/UG/mmsapt_align.cc
+++ b/moses/TranslationModel/UG/mmsapt_align.cc
@@ -1,335 +1,336 @@
 #include "mmsapt.h"
+// NOTE: currently broken; all code below is commented out
 
-namespace Moses
-{
-  using namespace bitext;
-  using namespace std;
-  using namespace boost;
+// namespace Moses
+// {
+//   using namespace bitext;
+//   using namespace std;
+//   using namespace boost;
   
-  struct PPgreater
-  {
-    bool operator()(PhrasePair const& a, PhrasePair const& b)
-    {
-      return a.score > b.score;
-    }
-  };
+//   struct PPgreater
+//   {
+//     bool operator()(PhrasePair const& a, PhrasePair const& b)
+//     {
+//       return a.score > b.score;
+//     }
+//   };
 
-  void
-  Mmsapt::
-  setWeights(vector<float> const & w)
-  {
-    assert(w.size() == this->m_numScoreComponents);
-    this->feature_weights = w;
-  }
+//   void
+//   Mmsapt::
+//   setWeights(vector<float> const & w)
+//   {
+//     assert(w.size() == this->m_numScoreComponents);
+//     this->feature_weights = w;
+//   }
 
-  struct PhraseAlnHyp
-  {
-    PhrasePair pp;
-    ushort   s1,e1,s2,e2; // start and end positions
-    int             prev; // preceding alignment hypothesis
-    float          score; 
-    bitvector       scov; // source coverage
-    PhraseAlnHyp(PhrasePair const& ppx, int slen,
-		 pair<uint32_t,uint32_t> const& sspan,
-		 pair<uint32_t,uint32_t> const& tspan)
-      : pp(ppx), prev(-1), score(ppx.score), scov(slen)
-    {
-      s1 = sspan.first; e1 = sspan.second;
-      s2 = tspan.first; e2 = tspan.second;
-      for (size_t i = s1; i < e1; ++i) 
-	scov.set(i);
-    }
+//   struct PhraseAlnHyp
+//   {
+//     PhrasePair pp;
+//     ushort   s1,e1,s2,e2; // start and end positions
+//     int             prev; // preceding alignment hypothesis
+//     float          score; 
+//     bitvector       scov; // source coverage
+//     PhraseAlnHyp(PhrasePair const& ppx, int slen,
+// 		 pair<uint32_t,uint32_t> const& sspan,
+// 		 pair<uint32_t,uint32_t> const& tspan)
+//       : pp(ppx), prev(-1), score(ppx.score), scov(slen)
+//     {
+//       s1 = sspan.first; e1 = sspan.second;
+//       s2 = tspan.first; e2 = tspan.second;
+//       for (size_t i = s1; i < e1; ++i) 
+// 	scov.set(i);
+//     }
 
-    bool operator<(PhraseAlnHyp const& other) const
-    {
-      return this->score < other.score;
-    }
+//     bool operator<(PhraseAlnHyp const& other) const
+//     {
+//       return this->score < other.score;
+//     }
 
-    bool operator>(PhraseAlnHyp const& other) const
-    {
-      return this->score > other.score;
-    }
+//     bool operator>(PhraseAlnHyp const& other) const
+//     {
+//       return this->score > other.score;
+//     }
 
-    PhraseOrientation
-    po_bwd(PhraseAlnHyp const* prev) const
-    {
-      if (s2 == 0) return po_first;
-      assert(prev);
-      assert(prev->e2 <= s2);
-      if (prev->e2 < s2)  return po_other;
-      if (prev->e1 == s1) return po_mono;
-      if (prev->e1 < s1)  return po_jfwd;
-      if (prev->s1 == e1) return po_swap;
-      if (prev->s1 > e1)  return po_jbwd;
-      return po_other;
-    }
+//     PhraseOrientation
+//     po_bwd(PhraseAlnHyp const* prev) const
+//     {
+//       if (s2 == 0) return po_first;
+//       assert(prev);
+//       assert(prev->e2 <= s2);
+//       if (prev->e2 < s2)  return po_other;
+//       if (prev->e1 == s1) return po_mono;
+//       if (prev->e1 < s1)  return po_jfwd;
+//       if (prev->s1 == e1) return po_swap;
+//       if (prev->s1 > e1)  return po_jbwd;
+//       return po_other;
+//     }
 
-    PhraseOrientation
-    po_fwd(PhraseAlnHyp const* next) const
-    {
-      if (!next) return po_last;
-      assert(next->s2 >= e2);
-      if (next->s2 < e2)  return po_other;
-      if (next->e1 == s1) return po_swap;
-      if (next->e1 < s1)  return po_jbwd;
-      if (next->s1 == e1) return po_mono;
-      if (next->s1 > e1)  return po_jfwd;
-      return po_other;
-    }
+//     PhraseOrientation
+//     po_fwd(PhraseAlnHyp const* next) const
+//     {
+//       if (!next) return po_last;
+//       assert(next->s2 >= e2);
+//       if (next->s2 < e2)  return po_other;
+//       if (next->e1 == s1) return po_swap;
+//       if (next->e1 < s1)  return po_jbwd;
+//       if (next->s1 == e1) return po_mono;
+//       if (next->s1 > e1)  return po_jfwd;
+//       return po_other;
+//     }
 
-    float 
-    dprob_fwd(PhraseAlnHyp const& next)
-    {
-      return pp.dfwd[po_fwd(&next)];
-    }
+//     float 
+//     dprob_fwd(PhraseAlnHyp const& next)
+//     {
+//       return pp.dfwd[po_fwd(&next)];
+//     }
 
-    float 
-    dprob_bwd(PhraseAlnHyp const& prev)
-    {
-      return pp.dbwd[po_bwd(&prev)];
-    }
+//     float 
+//     dprob_bwd(PhraseAlnHyp const& prev)
+//     {
+//       return pp.dbwd[po_bwd(&prev)];
+//     }
 
-  };
+//   };
 
-  class Alignment
-  {
-    typedef L2R_Token<SimpleWordId> Token;
-    typedef TSA<Token>           tsa;
-    typedef pair<uint32_t, uint32_t>  span;
-    typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
-    typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
-    typedef pstats::trg_map_t jStatsTable;
+//   class Alignment
+//   {
+//     typedef L2R_Token<SimpleWordId> Token;
+//     typedef TSA<Token>           tsa;
+//     typedef pair<uint32_t, uint32_t>  span;
+//     typedef vector<vector<uint64_t> > pidmap_t; // span -> phrase ID
+//     typedef boost::unordered_map<uint64_t,vector<span> > pid2span_t;
+//     typedef pstats::trg_map_t jStatsTable;
 
-    Mmsapt const& PT;
-    vector<id_type> s,t; 
-    pidmap_t   sspan2pid, tspan2pid; // span -> phrase ID
-    pid2span_t spid2span,tpid2span;
-    vector<vector<sptr<pstats> > > spstats;
+//     Mmsapt const& PT;
+//     vector<id_type> s,t; 
+//     pidmap_t   sspan2pid, tspan2pid; // span -> phrase ID
+//     pid2span_t spid2span,tpid2span;
+//     vector<vector<sptr<pstats> > > spstats;
 
-    vector<PhrasePair> PP; 
-    // position-independent phrase pair info
-  public:
-    vector<PhraseAlnHyp> PAH;  
-    vector<vector<int> > tpos2ahyp;
-    // maps from target start positions to PhraseAlnHyps starting at
-    // that position
+//     vector<PhrasePair> PP; 
+//     // position-independent phrase pair info
+//   public:
+//     vector<PhraseAlnHyp> PAH;  
+//     vector<vector<int> > tpos2ahyp;
+//     // maps from target start positions to PhraseAlnHyps starting at
+//     // that position
 
-    sptr<pstats> getPstats(span const& sspan);
-    void fill_tspan_maps();
-    void fill_sspan_maps();
-  public:
-    Alignment(Mmsapt const& pt, string const& src, string const& trg);
-    void show(ostream& out); 
-    void show(ostream& out, PhraseAlnHyp const& ah); 
-  };
+//     sptr<pstats> getPstats(span const& sspan);
+//     void fill_tspan_maps();
+//     void fill_sspan_maps();
+//   public:
+//     Alignment(Mmsapt const& pt, string const& src, string const& trg);
+//     void show(ostream& out); 
+//     void show(ostream& out, PhraseAlnHyp const& ah); 
+//   };
 
-  void
-  Alignment::
-  show(ostream& out, PhraseAlnHyp const& ah)
-  {
-#if 0
-    LexicalPhraseScorer2<Token>::table_t const& 
-      COOCjnt = PT.calc_lex.scorer.COOC;
+//   void
+//   Alignment::
+//   show(ostream& out, PhraseAlnHyp const& ah)
+//   {
+// #if 0
+//     LexicalPhraseScorer2<Token>::table_t const& 
+//       COOCjnt = PT.calc_lex.scorer.COOC;
 
-    out << setw(10) << exp(ah.score) << " "
-	<< PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2) 
-	<< " <=> "
-	<< PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
-    vector<uchar> const& a = ah.pp.aln;
-    // BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
-    for (size_t u = 0; u+1 < a.size(); u += 2)
-      out << " " << int(a[u+1]) << "-" << int(a[u]);
+//     out << setw(10) << exp(ah.score) << " "
+// 	<< PT.btfix.T2->pid2str(PT.btfix.V2.get(), ah.pp.p2) 
+// 	<< " <=> "
+// 	<< PT.btfix.T1->pid2str(PT.btfix.V1.get(), ah.pp.p1);
+//     vector<uchar> const& a = ah.pp.aln;
+//     // BOOST_FOREACH(int x,a) cout << "[" << x << "] ";
+//     for (size_t u = 0; u+1 < a.size(); u += 2)
+//       out << " " << int(a[u+1]) << "-" << int(a[u]);
 
-    if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
-      out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
-	  << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
-	  << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
-    out << endl;
-    // float const* ofwdj = ah.pp.dfwd;
-    // float const* obwdj = ah.pp.dbwd;
-    // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
-    // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
-    // out << "   [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
-    // 	 <<     " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
-    // 	 <<     " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
-    // 	 <<     " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
-    // 	 <<     " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
-    // 	 <<     " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
-    // 	 <<     " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
-    // 	 << "]" << endl
-    // 	 << "   [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
-    // 	 <<     " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
-    // 	 <<     " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
-    // 	 <<     " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
-    // 	 <<     " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
-    // 	 <<     " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
-    // 	 <<     " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
-    // 	 << "]" << endl;
-#endif
-  }
+//     if (ah.e2-ah.s2 == 1 and ah.e1-ah.s1 == 1)
+//       out << " " << COOCjnt[s[ah.s1]][t[ah.s2]]
+// 	  << "/" << PT.COOCraw[s[ah.s1]][t[ah.s2]]
+// 	  << "=" << float(COOCjnt[s[ah.s1]][t[ah.s2]])/PT.COOCraw[s[ah.s1]][t[ah.s2]];
+//     out << endl;
+//     // float const* ofwdj = ah.pp.dfwd;
+//     // float const* obwdj = ah.pp.dbwd;
+//     // uint32_t const* ofwdm = spstats[ah.s1][ah.e1-ah.s1-1]->ofwd;
+//     // uint32_t const* obwdm = spstats[ah.s1][ah.e1-ah.s1-1]->obwd;
+//     // out << "   [first: " << ofwdj[po_first]<<"/"<<ofwdm[po_first]
+//     // 	 <<     " last: " << ofwdj[po_last]<<"/"<<ofwdm[po_last]
+//     // 	 <<     " mono: " << ofwdj[po_mono]<<"/"<<ofwdm[po_mono]
+//     // 	 <<     " jfwd: " << ofwdj[po_jfwd]<<"/"<<ofwdm[po_jfwd]
+//     // 	 <<     " swap: " << ofwdj[po_swap]<<"/"<<ofwdm[po_swap]
+//     // 	 <<     " jbwd: " << ofwdj[po_jbwd]<<"/"<<ofwdm[po_jbwd]
+//     // 	 <<     " other: " << ofwdj[po_other]<<"/"<<ofwdm[po_other]
+//     // 	 << "]" << endl
+//     // 	 << "   [first: " << obwdj[po_first]<<"/"<<obwdm[po_first]
+//     // 	 <<     " last: " << obwdj[po_last]<<"/"<<obwdm[po_last]
+//     // 	 <<     " mono: " << obwdj[po_mono]<<"/"<<obwdm[po_mono]
+//     // 	 <<     " jfwd: " << obwdj[po_jfwd]<<"/"<<obwdm[po_jfwd]
+//     // 	 <<     " swap: " << obwdj[po_swap]<<"/"<<obwdm[po_swap]
+//     // 	 <<     " jbwd: " << obwdj[po_jbwd]<<"/"<<obwdm[po_jbwd]
+//     // 	 <<     " other: " << obwdj[po_other]<<"/"<<obwdm[po_other]
+//     // 	 << "]" << endl;
+// #endif
+//   }
   
-  void
-  Alignment::
-  show(ostream& out)
-  {
-    // show what we have so far ...
-    for (size_t s2 = 0; s2 < t.size(); ++s2)
-      {
-	VectorIndexSorter<PhraseAlnHyp> foo(PAH);
-	sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
-	for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
-	  show(out,PAH[tpos2ahyp[s2][h]]);
-      }
-  }
+//   void
+//   Alignment::
+//   show(ostream& out)
+//   {
+//     // show what we have so far ...
+//     for (size_t s2 = 0; s2 < t.size(); ++s2)
+//       {
+// 	VectorIndexSorter<PhraseAlnHyp> foo(PAH);
+// 	sort(tpos2ahyp[s2].begin(), tpos2ahyp[s2].end(), foo);
+// 	for (size_t h = 0; h < tpos2ahyp[s2].size(); ++h)
+// 	  show(out,PAH[tpos2ahyp[s2][h]]);
+//       }
+//   }
 
-  sptr<pstats>
-  Alignment::
-  getPstats(span const& sspan)
-  {
-    size_t k = sspan.second - sspan.first - 1;
-    if (k < spstats[sspan.first].size())
-      return spstats[sspan.first][k];
-    else return sptr<pstats>();
-  }
+//   sptr<pstats>
+//   Alignment::
+//   getPstats(span const& sspan)
+//   {
+//     size_t k = sspan.second - sspan.first - 1;
+//     if (k < spstats[sspan.first].size())
+//       return spstats[sspan.first][k];
+//     else return sptr<pstats>();
+//   }
   
-  void
-  Alignment::
-  fill_tspan_maps()
-  {
-    tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
-    for (size_t i = 0; i < t.size(); ++i)
-      {
-	tsa::tree_iterator m(PT.btfix.I2.get());
-	for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
-	  {
-	    uint64_t pid = m.getPid();
-	    tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
-	    tspan2pid[i][k] = pid;
-	  }
-      } 
-  }
+//   void
+//   Alignment::
+//   fill_tspan_maps()
+//   {
+//     tspan2pid.assign(t.size(),vector<uint64_t>(t.size(),0));
+//     for (size_t i = 0; i < t.size(); ++i)
+//       {
+// 	tsa::tree_iterator m(PT.btfix.I2.get());
+// 	for (size_t k = i; k < t.size() && m.extend(t[k]); ++k)
+// 	  {
+// 	    uint64_t pid = m.getPid();
+// 	    tpid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
+// 	    tspan2pid[i][k] = pid;
+// 	  }
+//       } 
+//   }
 
-  void
-  Alignment::
-  fill_sspan_maps()
-  {
-    sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
-    spstats.resize(s.size());
-    for (size_t i = 0; i < s.size(); ++i)
-      {
-	tsa::tree_iterator m(PT.btfix.I1.get());
-	for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
-	  {
-	    uint64_t pid = m.getPid();
-	    sspan2pid[i][k] = pid;
-	    pid2span_t::iterator p = spid2span.find(pid);
-	    if (p != spid2span.end())
-	      {
-		int x = p->second[0].first;
-		int y = p->second[0].second-1;
-		spstats[i].push_back(spstats[x][y-x]);
-	      }
-	    else 
-	      {
-		spstats[i].push_back(PT.btfix.lookup(m));
-		cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
-		     << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt 
-		     << endl;
-	      }
-	    spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
-	  }
-      }
-  }
+//   void
+//   Alignment::
+//   fill_sspan_maps()
+//   {
+//     sspan2pid.assign(s.size(),vector<uint64_t>(s.size(),0));
+//     spstats.resize(s.size());
+//     for (size_t i = 0; i < s.size(); ++i)
+//       {
+// 	tsa::tree_iterator m(PT.btfix.I1.get());
+// 	for (size_t k = i; k < s.size() && m.extend(s[k]); ++k)
+// 	  {
+// 	    uint64_t pid = m.getPid();
+// 	    sspan2pid[i][k] = pid;
+// 	    pid2span_t::iterator p = spid2span.find(pid);
+// 	    if (p != spid2span.end())
+// 	      {
+// 		int x = p->second[0].first;
+// 		int y = p->second[0].second-1;
+// 		spstats[i].push_back(spstats[x][y-x]);
+// 	      }
+// 	    else 
+// 	      {
+// 		spstats[i].push_back(PT.btfix.lookup(m));
+// 		cout << PT.btfix.T1->pid2str(PT.btfix.V1.get(),pid) << " "
+// 		     << spstats[i].back()->good << "/" << spstats[i].back()->sample_cnt 
+// 		     << endl;
+// 	      }
+// 	    spid2span[pid].push_back(pair<uint32_t,uint32_t>(i,k+1));
+// 	  }
+//       }
+//   }
 
-  Alignment::
-  Alignment(Mmsapt const& pt, string const& src, string const& trg)
-    : PT(pt)
-  {
-    PT.btfix.V1->fillIdSeq(src,s);
-    PT.btfix.V2->fillIdSeq(trg,t);
+//   Alignment::
+//   Alignment(Mmsapt const& pt, string const& src, string const& trg)
+//     : PT(pt)
+//   {
+//     PT.btfix.V1->fillIdSeq(src,s);
+//     PT.btfix.V2->fillIdSeq(trg,t);
 
-    // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
-    // BOOST_FOREACH(id_type i, t)
-    //   {
-    // 	cout << (*PT.btfix.V2)[i];
-    // 	if (i < PT.wlex21.size())
-    // 	  {
-    // 	    BOOST_FOREACH(id_type k, PT.wlex21[i])
-    // 	      {
-    // 		size_t  j = COOC[k][i];
-    // 		size_t m1 = COOC.m1(k);
-    // 		size_t m2 = COOC.m2(i);
-    // 		if (j*1000 > m1 && j*1000 > m2)
-    // 		  cout << " " << (*PT.btfix.V1)[k];
-    // 	      }	 
-    // 	  }
-    // 	cout << endl;
-    //   }
+//     // LexicalPhraseScorer2<Token>::table_t const& COOC = PT.calc_lex.scorer.COOC;
+//     // BOOST_FOREACH(id_type i, t)
+//     //   {
+//     // 	cout << (*PT.btfix.V2)[i];
+//     // 	if (i < PT.wlex21.size())
+//     // 	  {
+//     // 	    BOOST_FOREACH(id_type k, PT.wlex21[i])
+//     // 	      {
+//     // 		size_t  j = COOC[k][i];
+//     // 		size_t m1 = COOC.m1(k);
+//     // 		size_t m2 = COOC.m2(i);
+//     // 		if (j*1000 > m1 && j*1000 > m2)
+//     // 		  cout << " " << (*PT.btfix.V1)[k];
+//     // 	      }	 
+//     // 	  }
+//     // 	cout << endl;
+//     //   }
     
-    fill_tspan_maps();
-    fill_sspan_maps();
-    tpos2ahyp.resize(t.size()); 
-    // now fill the association score table
-    PAH.reserve(1000000);
-    typedef pid2span_t::iterator psiter;
-    for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
-      {
-	if (!L->second.size()) continue; // should never happen anyway
-	int i = L->second[0].first;
-	int k = L->second[0].second - i -1;
-	sptr<pstats> ps = spstats[i][k];
-	PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
-	jStatsTable & J = ps->trg;
-	for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
-	  {
-	    psiter R = tpid2span.find(y->first);
-	    if (R == tpid2span.end()) continue;
-	    pp.update(y->first, y->second);
-	    PT.ScorePPfix(pp);
-	    pp.eval(PT.feature_weights);
-	    PP.push_back(pp);
-	    BOOST_FOREACH(span const& sspan, L->second)
-	      {
-		BOOST_FOREACH(span const& tspan, R->second)
-		  {
-		    tpos2ahyp[tspan.first].push_back(PAH.size());
-		    PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
-		  }
-	      }
-	  }
-      }
-  }
+//     fill_tspan_maps();
+//     fill_sspan_maps();
+//     tpos2ahyp.resize(t.size()); 
+//     // now fill the association score table
+//     PAH.reserve(1000000);
+//     typedef pid2span_t::iterator psiter;
+//     for (psiter L = spid2span.begin(); L != spid2span.end(); ++L)
+//       {
+// 	if (!L->second.size()) continue; // should never happen anyway
+// 	int i = L->second[0].first;
+// 	int k = L->second[0].second - i -1;
+// 	sptr<pstats> ps = spstats[i][k];
+// 	PhrasePair pp; pp.init(L->first,*ps, PT.m_numScoreComponents);
+// 	jStatsTable & J = ps->trg;
+// 	for (jStatsTable::iterator y = J.begin(); y != J.end(); ++y)
+// 	  {
+// 	    psiter R = tpid2span.find(y->first);
+// 	    if (R == tpid2span.end()) continue;
+// 	    pp.update(y->first, y->second);
+// 	    PT.ScorePPfix(pp);
+// 	    pp.eval(PT.feature_weights);
+// 	    PP.push_back(pp);
+// 	    BOOST_FOREACH(span const& sspan, L->second)
+// 	      {
+// 		BOOST_FOREACH(span const& tspan, R->second)
+// 		  {
+// 		    tpos2ahyp[tspan.first].push_back(PAH.size());
+// 		    PAH.push_back(PhraseAlnHyp(PP.back(),s.size(),sspan,tspan));
+// 		  }
+// 	      }
+// 	  }
+//       }
+//   }
 
     
 
-  int
-  extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
-  {
-    if ((PAH[edge].scov & PAH[next].scov).count()) 
-      return -1;
-    int ret = PAH.size();
-    PAH.push_back(PAH[next]);
-    PhraseAlnHyp & h = PAH.back();
-    h.prev  = edge;
-    h.scov |= PAH[edge].scov;
-    h.score += log(PAH[edge].dprob_fwd(PAH[next]));
-    h.score += log(PAH[next].dprob_bwd(PAH[edge]));
-    return ret;
-  }
+//   int
+//   extend(vector<PhraseAlnHyp> & PAH, int edge, int next)
+//   {
+//     if ((PAH[edge].scov & PAH[next].scov).count()) 
+//       return -1;
+//     int ret = PAH.size();
+//     PAH.push_back(PAH[next]);
+//     PhraseAlnHyp & h = PAH.back();
+//     h.prev  = edge;
+//     h.scov |= PAH[edge].scov;
+//     h.score += log(PAH[edge].dprob_fwd(PAH[next]));
+//     h.score += log(PAH[next].dprob_bwd(PAH[edge]));
+//     return ret;
+//   }
 
-  sptr<vector<int> >
-  Mmsapt::
-  align(string const& src, string const& trg) const
-  {
-    // For the time being, we consult only the fixed bitext.
-    // We might also consider the dynamic bitext. => TO DO.
-    Alignment A(*this,src,trg);
-    VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
-    vector<size_t> o; foo.GetOrder(o);
-    BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
-    sptr<vector<int> > aln;
-    return aln;
-}
-}
+//   sptr<vector<int> >
+//   Mmsapt::
+//   align(string const& src, string const& trg) const
+//   {
+//     // For the time being, we consult only the fixed bitext.
+//     // We might also consider the dynamic bitext. => TO DO.
+//     Alignment A(*this,src,trg);
+//     VectorIndexSorter<PhraseAlnHyp> foo(A.PAH);
+//     vector<size_t> o; foo.GetOrder(o);
+//     BOOST_FOREACH(int i, o) A.show(cout,A.PAH[i]);
+//     sptr<vector<int> > aln;
+//     return aln;
+// }
+// }
 
 
diff --git a/moses/TranslationModel/UG/mmsapt_phrase_scorers.h b/moses/TranslationModel/UG/mmsapt_phrase_scorers.h
index 6e852b44b..083afb3a3 100644
--- a/moses/TranslationModel/UG/mmsapt_phrase_scorers.h
+++ b/moses/TranslationModel/UG/mmsapt_phrase_scorers.h
@@ -1,268 +1,17 @@
 // -*- c++ -*-
+// written by Ulrich Germann 
 #pragma once
 #include "moses/TranslationModel/UG/mm/ug_bitext.h"
 #include "util/exception.hh"
+#include "boost/format.hpp"
+#include "sapt_pscore_base.h"
+
+// DEPRECATED CODE: Word and phrase penalties are now 
+// added by the decoder.
 
 namespace Moses {
   namespace bitext
   {
-
-    template<typename Token>
-    class
-    PhraseScorer
-    {
-    protected:
-      int m_index;
-      int m_num_feats;
-      vector<string> m_feature_names;
-    public:
- 
-      virtual 
-      void 
-      operator()(Bitext<Token> const& pt, PhrasePair& pp, vector<float> * dest=NULL) 
-	const = 0;
-    
-      int 
-      fcnt() const 
-      { return m_num_feats; }
-    
-      vector<string> const &
-      fnames() const
-      { return m_feature_names; }
-
-      string const &
-      fname(int i) const
-      { 
-	UTIL_THROW_IF2((i < m_index || i >= m_index + m_num_feats),
-		       "Feature name index out of range at " 
-		       << __FILE__ << ":" << __LINE__);
-	return m_feature_names.at(i - m_index); 
-      }
-    
-      int 
-      getIndex() const 
-      { return m_index; }
-    };
-  
-    ////////////////////////////////////////////////////////////////////////////////
-  
-    template<typename Token>
-    class
-    PScorePfwd : public PhraseScorer<Token>
-    {
-      float conf;
-      char denom;
-    public:
-      PScorePfwd() 
-      {
-	this->m_num_feats = 1;
-      }
-
-      int 
-      init(int const i, float const c, char d) 
-      { 
-	conf  = c; 
-	denom = d;
-	this->m_index = i;
-	ostringstream buf;
-	buf << format("pfwd-%c%.3f") % denom % c;
-	this->m_feature_names.push_back(buf.str());
-	return i + this->m_num_feats;
-      }
-
-      void 
-      operator()(Bitext<Token> const& bt, PhrasePair & pp, 
-		 vector<float> * dest = NULL) const
-      {
-	if (!dest) dest = &pp.fvals;
-	if (pp.joint > pp.good1) 
-	  {
-	    cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
-	    cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
-	  }
-	switch (denom)
-	  {
-	  case 'g': 
-	    (*dest)[this->m_index] = log(lbop(pp.good1, pp.joint, conf)); 
-	    break;
-	  case 's': 
-	    (*dest)[this->m_index] = log(lbop(pp.sample1, pp.joint, conf)); 
-	    break;
-	  case 'r':
-	    (*dest)[this->m_index] = log(lbop(pp.raw1, pp.joint, conf)); 
-	  }
-      }
-    };
-  
-    ////////////////////////////////////////////////////////////////////////////////
-
-    template<typename Token>
-    class
-    PScorePbwd : public PhraseScorer<Token>
-    {
-      float conf;
-      char denom;
-    public:
-      PScorePbwd() 
-      {
-	this->m_num_feats = 1;
-      }
-
-      int 
-      init(int const i, float const c, char d) 
-      { 
-	conf = c; 
-	denom = d;
-	this->m_index = i;
-	ostringstream buf;
-	buf << format("pbwd-%c%.3f") % denom % c;
-	this->m_feature_names.push_back(buf.str());
-	return i + this->m_num_feats;
-      }
-
-      void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, 
-		 vector<float> * dest = NULL) const
-      {
-	if (!dest) dest = &pp.fvals;
-	// we use the denominator specification to scale the raw counts on the 
-	// target side; the clean way would be to counter-sample
-	uint32_t r2 = pp.raw2;
-	if      (denom == 'g') r2 = round(r2 * float(pp.good1)   / pp.raw1);
-	else if (denom == 's') r2 = round(r2 * float(pp.sample1) / pp.raw1);
-	(*dest)[this->m_index] = log(lbop(max(r2, pp.joint),pp.joint,conf));
-      }
-    };
-  
-    ////////////////////////////////////////////////////////////////////////////////
-
-    template<typename Token>
-    class
-    PScoreCoherence : public PhraseScorer<Token>
-    {
-    public:
-      PScoreCoherence() 
-      {
-	this->m_num_feats = 1;
-      }
-    
-      int 
-      init(int const i) 
-      { 
-	this->m_index = i;
-	this->m_feature_names.push_back(string("coherence"));
-	return i + this->m_num_feats;
-      }
-
-      void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, 
-		 vector<float> * dest = NULL) const
-      {
-	if (!dest) dest = &pp.fvals;
-	(*dest)[this->m_index] = log(pp.good1) - log(pp.sample1);
-      }
-    };
-  
-    ////////////////////////////////////////////////////////////////////////////////
-
-    template<typename Token>
-    class
-    PScoreLogCounts : public PhraseScorer<Token>
-    {
-      float conf;
-    public:
-      PScoreLogCounts() 
-      {
-	this->m_num_feats = 5;
-      }
-    
-      int 
-      init(int const i) 
-      { 
-	this->m_index = i;
-	this->m_feature_names.push_back("log-r1");
-	this->m_feature_names.push_back("log-s1");
-	this->m_feature_names.push_back("log-g1");
-	this->m_feature_names.push_back("log-j");
-	this->m_feature_names.push_back("log-r2");
-	return i + this->m_num_feats;
-      }
-    
-      void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, 
-		 vector<float> * dest = NULL) const
-      {
-	if (!dest) dest = &pp.fvals;
-	size_t i = this->m_index;
-	assert(pp.raw1);
-	assert(pp.sample1);
-	assert(pp.good1);
-	assert(pp.joint);
-	assert(pp.raw2);
-	(*dest)[i]   = -log(pp.raw1);
-	(*dest)[++i] = -log(pp.sample1);
-	(*dest)[++i] = -log(pp.good1);
-	(*dest)[++i] = +log(pp.joint);
-	(*dest)[++i] = -log(pp.raw2);
-      }
-    };
-  
-    template<typename Token>
-    class
-    PScoreLex : public PhraseScorer<Token>
-    {
-      float const m_alpha;
-    public:
-      LexicalPhraseScorer2<Token> scorer;
-    
-      PScoreLex(float const a) 
-	: m_alpha(a) 
-      { this->m_num_feats = 2; }
-    
-      int 
-      init(int const i, string const& fname) 
-      { 
-	scorer.open(fname); 
-	this->m_index = i;
-	this->m_feature_names.push_back("lexfwd");
-	this->m_feature_names.push_back("lexbwd");
-	return i + this->m_num_feats;
-      }
-    
-      void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
-      {
-	if (!dest) dest = &pp.fvals;
-	uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
-	parse_pid(pp.p1, sid1, off1, len1);
-	parse_pid(pp.p2, sid2, off2, len2);
-	
-#if 0
-	cout << len1 << " " << len2 << endl;
-	Token const* t1 = bt.T1->sntStart(sid1);
-	for (size_t i = off1; i < off1 + len1; ++i)
-	  cout << (*bt.V1)[t1[i].id()] << " "; 
-	cout << __FILE__ << ":" << __LINE__ << endl;
-	
-	Token const* t2 = bt.T2->sntStart(sid2);
-	for (size_t i = off2; i < off2 + len2; ++i)
-	  cout << (*bt.V2)[t2[i].id()] << " "; 
-	cout << __FILE__ << ":" << __LINE__ << endl;
-	
-	BOOST_FOREACH (int a, pp.aln)
-	  cout << a << " " ;
-	cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
-	
-#endif
-	scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
-		     bt.T2->sntStart(sid2)+off2,0,len2,
-		     pp.aln, m_alpha,
-		     (*dest)[this->m_index],
-		     (*dest)[this->m_index+1]);
-      }
-      
-    };
-  
     /// Word penalty
     template<typename Token>
     class
@@ -280,7 +29,8 @@ namespace Moses {
       }
     
       void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
+      operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
       {
 	if (!dest) dest = &pp.fvals;
 	uint32_t sid2=0,off2=0,len2=0;
@@ -307,7 +57,8 @@ namespace Moses {
       }
     
       void 
-      operator()(Bitext<Token> const& bt, PhrasePair& pp, vector<float> * dest = NULL) const
+      operator()(Bitext<Token> const& bt, PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
       {
 	if (!dest) dest = &pp.fvals;
 	(*dest)[this->m_index] = 1;
diff --git a/moses/TranslationModel/UG/ptable-lookup.cc b/moses/TranslationModel/UG/ptable-lookup.cc
index 106505f05..2cbf89b16 100644
--- a/moses/TranslationModel/UG/ptable-lookup.cc
+++ b/moses/TranslationModel/UG/ptable-lookup.cc
@@ -106,15 +106,11 @@ int main(int argc, char* argv[])
       	  cout << "   ";
       	  for (size_t k = idx.first; k < idx.second; ++k)
       	    {
-      	      if (mmsapt && fname[k-idx.first].substr(0,3) == "log")
-      		{
-      		  if(scores[k] < 0)
-      		    cout << " " << format("%10d") % round(exp(-scores[k]));
-      		  else
-      		    cout << " " << format("%10d") % round(exp(scores[k]));
-      		}
-      	      else
-      		cout << " " << format("%10.8f") % exp(scores[k]);
+	      size_t j = k-idx.first;
+	      float f = (mmsapt ? mmsapt->isLogVal(j) ? exp(scores[k]) : scores[k]
+			 : scores[k] < 0 ? exp(scores[k]) : scores[k]);
+	      string fmt = (mmsapt && mmsapt->isInteger(j)) ? "%10d" : "%10.8f";
+	      cout << " " << format(fmt) % f;
       	    }
       	  cout << endl;
       	}
diff --git a/moses/TranslationModel/UG/sapt_phrase_key.h b/moses/TranslationModel/UG/sapt_phrase_key.h
new file mode 100644
index 000000000..e1ecf1573
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_phrase_key.h
@@ -0,0 +1,13 @@
+//-*- c++ -*-
+#pragma once
+#include <stdint.h>
+
+using namespace std;
+namespace sapt
+{
+  using namespace Moses;
+  using namespace std;
+
+    
+
+}
diff --git a/moses/TranslationModel/UG/sapt_phrase_scorers.h b/moses/TranslationModel/UG/sapt_phrase_scorers.h
new file mode 100644
index 000000000..37cfd26fd
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_phrase_scorers.h
@@ -0,0 +1,12 @@
+// -*- c++ -*-
+// Phrase scoring functions for suffix array-based phrase tables
+// written by Ulrich Germann 
+#pragma once
+#include "sapt_pscore_unaligned.h"   // count # of unaligned words
+#include "sapt_pscore_provenance.h"  // reward for joint phrase occ. per corpus
+#include "sapt_pscore_rareness.h"    // penalty for rare occurrences (global?)
+#include "sapt_pscore_logcnt.h"      // logs of observed counts
+#include "sapt_pscore_lex1.h"        // plain vanilla Moses lexical scores
+#include "sapt_pscore_pfwd.h"        // fwd phrase prob
+#include "sapt_pscore_pbwd.h"        // bwd phrase prob
+#include "sapt_pscore_coherence.h"   // coherence feature: good/sample-size
diff --git a/moses/TranslationModel/UG/sapt_pscore_base.h b/moses/TranslationModel/UG/sapt_pscore_base.h
new file mode 100644
index 000000000..68a491145
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_base.h
@@ -0,0 +1,103 @@
+// -*- c++ -*-
+// Base classes for suffix array-based phrase scorers
+// written by Ulrich Germann 
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "moses/TranslationModel/UG/mm/ug_phrasepair.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+
+namespace Moses {
+  namespace bitext
+  {
+
+    // abstract base class that defines the common API for phrase scorers
+    template<typename Token>
+    class
+    PhraseScorer
+    {
+    protected:
+      int m_index;     // position of this scorer's first feature value (see setIndex)
+      int m_num_feats; // number of features this scorer contributes (see fcnt)
+      string m_tag;    // name prefix; scorer families use it to build feature names
+      vector<string> m_feature_names; // one name per feature
+    public:
+      virtual ~PhraseScorer() {} // polymorphic base: make deletion through a base pointer safe
+      virtual 
+      void 
+      operator()(Bitext<Token> const& pt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest=NULL) 
+	const = 0; // computes the feature value(s) for pp; implemented by each concrete scorer
+
+      void
+      setIndex(int const i) { m_index = i; }
+    
+      int
+      getIndex() const { return m_index; }
+
+      int 
+      fcnt() const { return m_num_feats; }
+    
+      vector<string> const &
+      fnames() const { return m_feature_names; }
+
+      string const &
+      fname(int i) const
+      { 
+	if (i < 0) i += m_num_feats; // negative i counts from the end
+	UTIL_THROW_IF2(i < 0 || i >= m_num_feats,
+		       "Feature name index out of range at " << HERE);
+	return m_feature_names.at(i); 
+      }
+
+      virtual
+      bool
+      isLogVal(int i) const  { return true; }
+      // is this feature log valued? 
+    
+      virtual
+      bool
+      isIntegerValued(int i) const  { return false; }
+      // is this feature integer valued (e.g., count features)? 
+
+      virtual
+      bool
+      allowPooling() const { return true; }
+      // does this feature function allow pooling of counts if 
+      // there are no occurrences in the respective corpus?
+      
+    };
+
+    // base class for 'families' of phrase scorers that have a single 
+    // real-valued parameter: every value in the spec string yields one feature
+    template<typename Token>
+    class
+    SingleRealValuedParameterPhraseScorerFamily 
+      : public PhraseScorer<Token>
+    {
+    protected:
+      vector<float> m_x; // the parameter values parsed from the spec, in spec order
+      virtual 
+      void 
+      init(string const specs) 
+      { 
+	using namespace boost;
+	UTIL_THROW_IF2(this->m_tag.size() == 0, 
+		       "m_tag must be initialized in constructor");
+	UTIL_THROW_IF2(specs.size() == 0,"empty specification string!");
+	UTIL_THROW_IF2(this->m_feature_names.size(),
+		       "PhraseScorer can only be initialized once!");
+	this->m_index = -1;
+	float x; char c;
+	for (istringstream buf(specs); buf>>x; buf>>c) // read a value, then one separator character
+	  {
+	    this->m_x.push_back(x);
+	    string fname = (format("%s-%.2f") % this->m_tag % x).str(); // feature name: <tag>-<value>
+	    this->m_feature_names.push_back(fname);
+	  }
+	this->m_num_feats = this->m_x.size();
+      }
+    };
+  } // namespace bitext
+} // namespace moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_coherence.h b/moses/TranslationModel/UG/sapt_pscore_coherence.h
new file mode 100644
index 000000000..a3211df54
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_coherence.h
@@ -0,0 +1,33 @@
+// -*- c++ -*-
+// written by Ulrich Germann 
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+
+namespace Moses {
+  namespace bitext
+  {
+    template<typename Token>
+    class
+    PScoreCoherence : public PhraseScorer<Token> // one feature, named "coherence"
+    {
+    public:
+      PScoreCoherence(string const dummy) // the argument is not used
+      { 
+	this->m_index = -1;
+	this->m_num_feats = 1;
+	this->m_feature_names.push_back(string("coherence"));
+      }
+      // stores log(pp.good1) - log(pp.sample1) at position m_index of dest
+      void 
+      operator()(Bitext<Token> const& bt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals; // default destination: the phrase pair's own feature vector
+	(*dest)[this->m_index] = log(pp.good1) - log(pp.sample1);
+      }
+    };
+  }
+}
diff --git a/moses/TranslationModel/UG/sapt_pscore_lex1.h b/moses/TranslationModel/UG/sapt_pscore_lex1.h
new file mode 100644
index 000000000..be994b0d3
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_lex1.h
@@ -0,0 +1,70 @@
+// -*- c++ -*-
+// Phrase scorer that computes forward and backward lexical scores for a phrase pair
+// written by Ulrich Germann 
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+namespace Moses {
+  namespace bitext
+  {
+    template<typename Token>
+    class
+    PScoreLex1 : public PhraseScorer<Token>
+    {
+      float m_alpha; // parsed from the constructor's first argument; handed to scorer.score()
+    public:
+      LexicalPhraseScorer2<Token> scorer;
+    
+      PScoreLex1(string const& alpaspec, string const& lexfile) 
+      { 
+	this->m_index = -1;
+	this->m_num_feats = 2; 
+	this->m_feature_names.reserve(2);
+	this->m_feature_names.push_back("lexfwd");
+	this->m_feature_names.push_back("lexbwd");
+	m_alpha = atof(alpaspec.c_str());
+	scorer.open(lexfile); 
+      }
+      // fills dest[m_index] and dest[m_index+1]; the feature names are lexfwd, lexbwd in that order
+      void 
+      operator()(Bitext<Token> const& bt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals;
+	// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
+	// parse_pid(pp.p1, sid1, off1, len1);
+	// parse_pid(pp.p2, sid2, off2, len2);
+#if 0
+	cout << len1 << " " << len2 << endl;
+	Token const* t1 = bt.T1->sntStart(sid1);
+	for (size_t i = off1; i < off1 + len1; ++i)
+	  cout << (*bt.V1)[t1[i].id()] << " "; 
+	cout << __FILE__ << ":" << __LINE__ << endl;
+	
+	Token const* t2 = bt.T2->sntStart(sid2);
+	for (size_t i = off2; i < off2 + len2; ++i)
+	  cout << (*bt.V2)[t2[i].id()] << " "; 
+	cout << __FILE__ << ":" << __LINE__ << endl;
+	
+	BOOST_FOREACH (int a, pp.aln)
+	  cout << a << " " ;
+	cout << __FILE__ << ":" << __LINE__ << "\n" << endl;
+	
+	scorer.score(bt.T1->sntStart(sid1)+off1,0,len1,
+		     bt.T2->sntStart(sid2)+off2,0,len2,
+		     pp.aln, m_alpha,
+		     (*dest)[this->m_index],
+		     (*dest)[this->m_index+1]);
+#endif
+	scorer.score(pp.start1,0, pp.len1, 
+		     pp.start2,0, pp.len2, pp.aln, m_alpha, 
+		     (*dest)[this->m_index], 
+		     (*dest)[this->m_index+1]);
+      }
+    };
+  } //namespace bitext
+} // namespace Moses
+
diff --git a/moses/TranslationModel/UG/sapt_pscore_logcnt.h b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
new file mode 100644
index 000000000..2790323ed
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_logcnt.h
@@ -0,0 +1,65 @@
+// -*- c++ -*-
+// Phrase scorer that provides the logs of observed counts as features; the
+// specification string selects among raw source phrase counts (r1), L1 sample 
+// size (s1), coherent phrases (g1), joint counts (j), raw target counts (r2)
+// written by Ulrich Germann 
+#pragma once
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+  namespace bitext  {
+    
+    template<typename Token>
+    class
+    PScoreLogCnt : public PhraseScorer<Token>
+    {
+      string m_specs; // kept so that operator() selects the same counts as the constructor
+    public:
+      PScoreLogCnt(string const specs) 
+      { 
+	this->m_index = -1;
+	this->m_specs = specs;
+	if (specs.find("r1") != string::npos) // raw source phrase counts
+	  this->m_feature_names.push_back("log-r1");
+	if (specs.find("s1") != string::npos)
+	  this->m_feature_names.push_back("log-s1"); // L1 sample size
+	if (specs.find("g1") != string::npos) // coherent phrases
+	  this->m_feature_names.push_back("log-g1");
+	if (specs.find("j") != string::npos) // joint counts
+	  this->m_feature_names.push_back("log-j");
+	if (specs.find("r2") != string::npos) // raw target phrase counts
+	  this->m_feature_names.push_back("log-r2");
+	this->m_num_feats = this->m_feature_names.size();
+      }
+
+      bool
+      isIntegerValued(int i) const { return true; } 
+
+      void 
+      operator()(Bitext<Token> const& bt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals;
+	assert(pp.raw1);
+	assert(pp.sample1);
+	assert(pp.good1);
+	assert(pp.joint);
+	assert(pp.raw2);
+	size_t i = this->m_index; // next slot to fill; advances once per selected count
+	if (m_specs.find("r1") != string::npos) 
+	  (*dest)[i++] = log(pp.raw1);
+	if (m_specs.find("s1") != string::npos) 
+	  (*dest)[i++] = log(pp.sample1);
+	if (m_specs.find("g1") != string::npos) 
+	  (*dest)[i++] = log(pp.good1);
+	if (m_specs.find("j") != string::npos) 
+	  (*dest)[i++] = log(pp.joint);
+	if (m_specs.find("r2") != string::npos) 
+	  (*dest)[i++] = log(pp.raw2); // post-increment: r2 fills the slot right after the previous feature
+      }
+    };
+  } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_pbwd.h b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
new file mode 100644
index 000000000..f7b4686d7
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_pbwd.h
@@ -0,0 +1,58 @@
+//-*- c++ -*-
+// written by Ulrich Germann 
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "boost/foreach.hpp"
+
+namespace Moses {
+  namespace bitext
+  {
+    template<typename Token>
+    class
+    PScorePbwd : public PhraseScorer<Token>
+    {
+      float   conf; // confidence argument passed on to lbop()
+      string denom; // denominator spec: characters 'g', 's', 'r', optionally joined by '+'
+    public:
+      PScorePbwd(float const c, string d) 
+      { 
+	this->m_index = -1;
+	conf  = c; 
+	denom = d;
+	size_t checksum = d.size();
+	BOOST_FOREACH(char const& x, denom)
+	  {
+	    if (x == '+') { --checksum; continue; }
+	    if (x != 'g' && x != 's' && x != 'r') continue;
+	    string s = (format("pbwd-%c%.3f") % x % c).str();
+	    this->m_feature_names.push_back(s);
+	  }
+	this->m_num_feats = this->m_feature_names.size();
+	UTIL_THROW_IF2(this->m_feature_names.size() != checksum,
+		       "Unknown parameter in specification '"
+		       << d << "' for Pbwd phrase scorer at " << HERE);
+      }
+
+      void 
+      operator()(Bitext<Token> const& bt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals;
+	// we use the denominator specification to scale the raw counts on the 
+	// target side; the clean way would be to counter-sample
+	size_t i = this->m_index;
+	BOOST_FOREACH(char const& x, denom)
+	  {
+	    if (x == '+') continue; // separator only: the constructor registers no feature for it
+	    uint32_t m2 = pp.raw2;
+	    if      (x == 'g') m2 = round(m2 * float(pp.good1)   / pp.raw1);
+	    else if (x == 's') m2 = round(m2 * float(pp.sample1) / pp.raw1);
+	    (*dest)[i++] = log(lbop(max(m2, pp.joint),pp.joint,conf));
+	  }
+      }
+    };
+  } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_pfwd.h b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
new file mode 100644
index 000000000..ed48a93d2
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_pfwd.h
@@ -0,0 +1,70 @@
+// -*- c++ -*-
+// written by Ulrich Germann 
+#pragma once
+#include "moses/TranslationModel/UG/mm/ug_bitext.h"
+#include "util/exception.hh"
+#include "boost/format.hpp"
+#include "boost/foreach.hpp"
+
+namespace Moses {
+  namespace bitext
+  {
+    template<typename Token>
+    class
+    PScorePfwd : public PhraseScorer<Token>
+    {
+      float   conf; // confidence argument passed on to lbop()
+      string denom; // denominator spec as given to the constructor
+
+    public:
+      // c: value passed to lbop(); d: denominator spec made of 'g', 's', 'r', optionally joined by '+'
+      PScorePfwd(float const c, string d) 
+      { 
+	this->m_index = -1;
+	conf  = c; 
+	denom = d;
+	size_t checksum = d.size(); // number of characters that must each yield a feature
+	BOOST_FOREACH(char const& x, denom)
+	  {
+	    if (x == '+') { --checksum; continue; }
+	    if (x != 'g' && x != 's' && x != 'r') continue; // unknown character: caught by the check below
+	    string s = (format("pfwd-%c%.3f") % x % c).str();
+	    this->m_feature_names.push_back(s);
+	  }
+	this->m_num_feats = this->m_feature_names.size();
+	UTIL_THROW_IF2(this->m_feature_names.size() != checksum, 
+		       "Unknown parameter in specification '" 
+		       << d << "' for Pfwd phrase scorer at " << HERE);
+      }
+      // writes one log(lbop(...)) value per 'g'/'s'/'r' in denom, starting at m_index; may lower pp.joint
+      void 
+      operator()(Bitext<Token> const& bt, PhrasePair<Token> & pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals;
+	if (pp.joint > pp.good1) // clamp the joint count to good1; this modifies the phrase pair
+	  {
+	    pp.joint = pp.good1;
+	    // cerr<<bt.toString(pp.p1,0)<<" ::: "<<bt.toString(pp.p2,1)<<endl;
+	    // cerr<<pp.joint<<"/"<<pp.good1<<"/"<<pp.raw2<<endl;
+	  }
+	size_t i = this->m_index;
+	BOOST_FOREACH(char const& c, this->denom)
+	  {
+	    switch (c) // no default case: any other character (e.g. '+') writes nothing
+	      {
+	      case 'g': 
+		(*dest)[i++] = log(lbop(pp.good1, pp.joint, conf)); 
+		break;
+	      case 's': 
+		(*dest)[i++] = log(lbop(pp.sample1, pp.joint, conf)); 
+		break;
+	      case 'r':
+		(*dest)[i++] = log(lbop(pp.raw1, pp.joint, conf)); 
+	      }
+	  }
+      }
+    };
+  }
+}
+  
diff --git a/moses/TranslationModel/UG/sapt_pscore_provenance.h b/moses/TranslationModel/UG/sapt_pscore_provenance.h
new file mode 100644
index 000000000..c33b98fe7
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_provenance.h
@@ -0,0 +1,47 @@
+// -*- c++ -*-
+// Phrase scorer that rewards the number of phrase pair occurrences in a bitext
+// with the asymptotic function j/(j+x) where x > 0 is a function
+// parameter that determines the steepness of the rewards curve
+// written by Ulrich Germann 
+#pragma once
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+  namespace bitext {
+    
+    // asymptotic provenance feature n/(n+x)
+    template<typename Token>
+    class
+    PScoreProvenance : public SingleRealValuedParameterPhraseScorerFamily<Token>
+    {
+    public:
+
+      PScoreProvenance(string const& spec) 
+      {
+	this->m_tag = "prov"; // must be set before init(), which rejects an empty tag
+	this->init(spec);
+      }
+    
+      bool
+      isLogVal(int i) const { return false; } 
+
+      void 
+      operator()(Bitext<Token> const& bt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals;
+	size_t i = this->m_index;
+	BOOST_FOREACH(float const x, this->m_x) // one feature per parameter value
+	  (*dest).at(i++) = pp.joint/(x + pp.joint);
+      }
+
+      bool
+      allowPooling() const 
+      { return false; }
+
+    };
+  } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_rareness.h b/moses/TranslationModel/UG/sapt_pscore_rareness.h
new file mode 100644
index 000000000..58f204c88
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_rareness.h
@@ -0,0 +1,41 @@
+// -*- c++ -*-
+// Phrase scorer that penalizes rare phrase pairs: a pair with j joint 
+// occurrences in a bitext gets the value x/(j+x), where x > 0 is a function
+// parameter that determines the steepness of the penalty curve
+// written by Ulrich Germann 
+#pragma once
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+using namespace std;
+namespace Moses {
+  namespace bitext  {
+    
+    // rareness penalty: x/(n+x)
+    template<typename Token>
+    class
+    PScoreRareness : public SingleRealValuedParameterPhraseScorerFamily<Token>
+    {
+    public:
+      PScoreRareness(string const spec) 
+      {
+	this->m_tag = "rare"; // must be set before init(), which rejects an empty tag
+	this->init(spec);
+      }
+
+      bool
+      isLogVal(int i) const { return false; } 
+
+      void 
+      operator()(Bitext<Token> const& bt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals;
+	size_t i = this->m_index;
+	BOOST_FOREACH(float const x, this->m_x) // one feature per parameter value
+	  (*dest).at(i++) = x/(x + pp.joint);
+      }
+    };
+  } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sapt_pscore_unaligned.h b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
new file mode 100644
index 000000000..bdd2919b4
--- /dev/null
+++ b/moses/TranslationModel/UG/sapt_pscore_unaligned.h
@@ -0,0 +1,67 @@
+// -*- c++ -*-
+// Phrase scorer that counts the number of unaligned words in the phrase
+// written by Ulrich Germann 
+#pragma once
+#include "sapt_pscore_base.h"
+#include <boost/dynamic_bitset.hpp>
+
+namespace Moses {
+  namespace bitext
+  {
+    template<typename Token>
+    class
+    PScoreUnaligned : public PhraseScorer<Token>
+    {
+      typedef boost::dynamic_bitset<uint64_t> bitvector;
+    public:
+      PScoreUnaligned(string const spec) 
+      {
+	this->m_index = -1;
+	int f = this->m_num_feats = atoi(spec.c_str()); // spec is the number of features: 1 or 2
+	UTIL_THROW_IF2(f != 1 && f != 2,"unal parameter must be 1 or 2 at "<<HERE);
+	this->m_feature_names.resize(f);
+	if (f == 1)
+	  this->m_feature_names[0] = "unal";
+	else
+	  {
+	    this->m_feature_names[0] = "unal-s";
+	    this->m_feature_names[1] = "unal-t";
+	  }
+      }
+    
+      bool
+      isLogVal(int i) const { return false; } 
+      
+      bool
+      isIntegerValued(int i) const { return true; } 
+
+      void 
+      operator()(Bitext<Token> const& bt, 
+		 PhrasePair<Token>& pp, 
+		 vector<float> * dest = NULL) const
+      {
+	if (!dest) dest = &pp.fvals;
+	// uint32_t sid1=0,sid2=0,off1=0,off2=0,len1=0,len2=0;
+	// parse_pid(pp.p1, sid1, off1, len1);
+	// parse_pid(pp.p2, sid2, off2, len2);
+	bitvector check1(pp.len1),check2(pp.len2);
+	for (size_t i = 0; i < pp.aln.size(); ) // pp.aln is read in pairs: first entry -> check1, second -> check2
+	  { 
+	    check1.set(pp.aln[i++]); 
+	    check2.set(pp.aln.at(i++)); 
+	  }
+
+	if (this->m_num_feats == 1) // one feature: sum of the unset positions on both sides
+	  {
+	    (*dest)[this->m_index]  = pp.len1 - check1.count();
+	    (*dest)[this->m_index] += pp.len2 - check2.count();
+	  }
+	else
+	  {
+	    (*dest)[this->m_index]   = pp.len1 - check1.count();
+	    (*dest)[this->m_index+1] = pp.len2 - check2.count();
+	  }
+      }
+    };
+  } // namespace bitext
+} // namespace Moses
diff --git a/moses/TranslationModel/UG/sim-pe.cc b/moses/TranslationModel/UG/sim-pe.cc
new file mode 100644
index 000000000..58a70cab4
--- /dev/null
+++ b/moses/TranslationModel/UG/sim-pe.cc
@@ -0,0 +1,83 @@
+#include "mmsapt.h"
+#include "moses/Manager.h"
+#include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
+#include <boost/foreach.hpp>
+#include <boost/format.hpp>
+#include <boost/tokenizer.hpp>
+#include <boost/shared_ptr.hpp>
+#include <algorithm>
+#include <iostream>
+
+using namespace Moses;
+using namespace bitext;
+using namespace std;
+using namespace boost;
+
+vector<FactorType> fo(1,FactorType(0));
+
+ostream& 
+operator<<(ostream& out, Hypothesis const* x)
+{
+  vector<const Hypothesis*> H; // x followed by its predecessors, as reached via GetPrevHypo()
+  for (const Hypothesis* h = x; h; h = h->GetPrevHypo())
+    H.push_back(h);
+  for (; H.size(); H.pop_back()) // consume from the back, i.e. in reverse order of collection
+    {
+      Phrase const& p = H.back()->GetCurrTargetPhrase();
+      for (size_t pos = 0 ; pos < p.GetSize() ; pos++) 
+	out << *p.GetFactor(pos, 0) << (H.size() ? " " : ""); // H is never empty here, so " " is always written
+    }
+  return out;
+}
+
+vector<FactorType> ifo; // factor order used when reading the input sentence; assigned in main()
+size_t lineNumber;      // first constructor argument of each Manager; set to 0 in main()
+// runs the decoder on one source sentence and returns the best hypothesis as printed by operator<<
+string 
+translate(string const& source)
+{
+  StaticData const& global = StaticData::Instance();
+
+  Sentence sentence; 
+  istringstream ibuf(source+"\n"); 
+  sentence.Read(ibuf,ifo);
+
+  Manager manager(lineNumber, sentence, global.GetSearchAlgorithm());
+  manager.ProcessSentence();
+  
+  ostringstream obuf;
+  const Hypothesis* h = manager.GetBestHypothesis();
+  obuf << h; // operator<< above prints nothing for a null pointer
+  return obuf.str();
+
+}
+
+int main(int argc, char* argv[])
+{
+  Parameter params;
+  if (!params.LoadParam(argc,argv) || !StaticData::LoadDataStatic(&params, argv[0]))
+    exit(1);
+  
+  StaticData const& global = StaticData::Instance();
+  global.SetVerboseLevel(0);
+  ifo = global.GetInputFactorOrder();
+
+  lineNumber = 0; // TODO: Include sentence request number here?
+  string source, target, alignment;
+  while (getline(cin,source)) // input: records of three lines (source, target, alignment)
+    {
+      if (!getline(cin,target) || !getline(cin,alignment)) break; // incomplete record at end of input
+      cout << "[S] " << source << endl;
+      cout << "[H] " << translate(source) << endl;
+      cout << "[T] " << target << endl;
+      Mmsapt* pdsa = dynamic_cast<Mmsapt*>(PhraseDictionary::GetColl()[0]);
+      if (!pdsa) { cerr << "first phrase table is not an Mmsapt" << endl; exit(1); }
+      pdsa->add(source,target,alignment);
+      cout << "[X] " << translate(source) << endl; // same sentence again, after the update
+      cout << endl;
+    }
+  exit(0);
+}
+  
+  
+
diff --git a/moses/TranslationModel/UG/try-align.cc b/moses/TranslationModel/UG/try-align.cc
index 30c87ccab..483ad2c34 100644
--- a/moses/TranslationModel/UG/try-align.cc
+++ b/moses/TranslationModel/UG/try-align.cc
@@ -2,32 +2,33 @@
 using namespace std;
 using namespace Moses;
 
+// currently broken
 
 Mmsapt* PT;
 int main(int argc, char* argv[])
 {
-  string base = argv[1];
-  string L1   = argv[2];
-  string L2   = argv[3];
-  ostringstream buf;
-  buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
-      << base << " L1=" << L1 << " L2=" << L2;
-  string configline = buf.str();
-  PT = new Mmsapt(configline);
-  PT->Load();
-  float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856,  0.647506 };
-  vector<float> weights(w,w+5);
-  PT->setWeights(weights);
-  // these values are taken from a moses.ini file;
-  // is there a convenient way of accessing them from within mmsapt ???
-  string eline,fline;
-  // TokenIndex V; V.open("crp/trn/mm/de.tdx");
-  while (getline(cin,eline) && getline(cin,fline))
-    {
-      cout << eline << endl;
-      cout << fline << endl;
-      PT->align(eline,fline);
-    }
-  delete PT;
+  // string base = argv[1];
+  // string L1   = argv[2];
+  // string L2   = argv[3];
+  // ostringstream buf;
+  // buf << "Mmsapt name=PT0 output-factor=0 num-features=5 base="
+  //     << base << " L1=" << L1 << " L2=" << L2;
+  // string configline = buf.str();
+  // PT = new Mmsapt(configline);
+  // PT->Load();
+  // float w[] = { 0.0582634, 0.0518865, 0.0229819, 0.00640856,  0.647506 };
+  // vector<float> weights(w,w+5);
+  // PT->setWeights(weights);
+  // // these values are taken from a moses.ini file;
+  // // is there a convenient way of accessing them from within mmsapt ???
+  // string eline,fline;
+  // // TokenIndex V; V.open("crp/trn/mm/de.tdx");
+  // while (getline(cin,eline) && getline(cin,fline))
+  //   {
+  //     cout << eline << endl;
+  //     cout << fline << endl;
+  //     PT->align(eline,fline);
+  //   }
+  // delete PT;
 }
 
diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index 8766743b3..a91c58343 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -345,10 +345,10 @@ string FuzzyMatchWrapper::ExtractTM(WordIndex &wordIndex, long translationId, co
     // find the best matches according to letter sed
     string best_path = "";
     int best_match = -1;
-    int best_letter_cost;
+    unsigned int best_letter_cost;
     if (lsed_flag) {
       best_letter_cost = compute_length( input[sentenceInd] ) * min_match / 100 + 1;
-      for(int si=0; si<best_tm.size(); si++) {
+      for(size_t si=0; si<best_tm.size(); si++) {
         int s = best_tm[si];
         string path;
         unsigned int letter_cost = sed( input[sentenceInd], source[s], path, true );
diff --git a/moses/TypeDef.h b/moses/TypeDef.h
index fb9fd56cb..7852d130d 100644
--- a/moses/TypeDef.h
+++ b/moses/TypeDef.h
@@ -59,7 +59,11 @@ const size_t DEFAULT_MAX_HYPOSTACK_SIZE = 200;
 const size_t DEFAULT_MAX_TRANS_OPT_CACHE_SIZE = 10000;
 const size_t DEFAULT_MAX_TRANS_OPT_SIZE	= 5000;
 const size_t DEFAULT_MAX_PART_TRANS_OPT_SIZE = 10000;
-const size_t DEFAULT_MAX_PHRASE_LENGTH = 20;
+#ifdef PT_UG
+  const size_t DEFAULT_MAX_PHRASE_LENGTH = -1; // -1 converts to the largest size_t value; presumably "no limit" -- TODO confirm
+#else
+ const size_t DEFAULT_MAX_PHRASE_LENGTH = 20;
+#endif
 const size_t DEFAULT_MAX_CHART_SPAN			= 10;
 const size_t ARRAY_SIZE_INCR					= 10; //amount by which a phrase gets resized when necessary
 const float LOWEST_SCORE							= -100.0f;
diff --git a/moses/Util.h b/moses/Util.h
index 3bba71332..24a4e2c28 100644
--- a/moses/Util.h
+++ b/moses/Util.h
@@ -56,8 +56,12 @@ namespace Moses
 
 /** verbose macros
  * */
+
 #define VERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR(str); } }
 #define IFVERBOSE(level) if (StaticData::Instance().GetVerboseLevel() >= level)
+#define XVERBOSE(level,str) { if (StaticData::Instance().GetVerboseLevel() >= level) { TRACE_ERR("[" << __FILE__ << ":" << __LINE__ << "] ");TRACE_ERR(str); } }
+#define HERE __FILE__ << ":" << __LINE__
+// XVERBOSE is VERBOSE with a "[<file>:<line>] " prefix; HERE is a stream fragment that inserts "<file>:<line>"
 
 #if __GNUC__ == 4 && __GNUC_MINOR__ == 8 && (__GNUC_PATCHLEVEL__ == 1 || __GNUC_PATCHLEVEL__ == 2)
 // gcc nth_element() bug
diff --git a/scripts/server/moses.py b/scripts/server/moses.py
index 155458b9b..a176c473a 100644
--- a/scripts/server/moses.py
+++ b/scripts/server/moses.py
@@ -152,7 +152,7 @@ def find_free_port(p):
 
 class MosesServer(ProcessWrapper):
 
-  def __init__(self,args=["-fd", "\n"]):
+  def __init__(self,args=[]):
     self.process = None
     mserver_cmd  = moses_root+"/bin/mosesserver"
     self.cmd = [mserver_cmd] + args 
@@ -175,7 +175,10 @@ class MosesServer(ProcessWrapper):
     self.cmd.extend(["--server-port", "%d"%self.port])
     if debug:
       print >>sys.stderr,self.cmd
-      self.process = Popen(self.cmd,stderr = sys.stderr)
+      # self.stderr = open("mserver.%d.stderr"%self.port,'w')
+      # self.stdout = open("mserver.%d.stdout"%self.port,'w')
+      # self.process = Popen(self.cmd,stderr = self.stderr,stdout = self.stdout)
+      self.process = Popen(self.cmd)
     else:
       devnull = open(os.devnull,"w")
       self.process = Popen(self.cmd, stderr=devnull, stdout=devnull)
@@ -216,10 +219,13 @@ class MosesServer(ProcessWrapper):
 
         elif type(input) is list:
           return [self.translate(x) for x in input]
+
         elif type(input) is dict:
           return self.proxy.translate(input)
+
         else:
           raise Exception("Can't handle input of this type!")
+
       except:
         attempts += 1
         print >>sys.stderr, "WAITING", attempts
diff --git a/scripts/server/sim-pe.py b/scripts/server/sim-pe.py
index 340695a56..52d1e314a 100755
--- a/scripts/server/sim-pe.py
+++ b/scripts/server/sim-pe.py
@@ -127,13 +127,40 @@ def translate(proxy, args, line):
         param['nbest-distinct'] = True
         pass
     attempts = 0
-    while attempts < 120:
+    while attempts < 20:
+        t1 = time.time()
         try:
-            return proxy.translate(param)
-        except:
-            print >>sys.stderr, "Waiting", proxy
-            attempts += 1
+            return proxy.translate(param) 
+
+        # except xmlrpclib.Fault as e:
+        # except xmlrpclib.ProtocolError as e:
+        # except xmlrpclib.ResponseError as e:
+        except xmlrpclib.Error as e:
+            time.sleep(2) # give all the stderr stuff a chance to be flushed
+            print >>sys.stderr," XMLRPC error:",e
+            print >>sys.stderr, "Input was"
+            print >>sys.stderr, param
+            sys.exit(1)
+
+        except IOError as e:
+            print >>sys.stderr,"I/O error({0}): {1}".format(e.errno, e.strerror)
             time.sleep(5)
+
+        except:
+            serverstatus = mserver.process.poll()
+            if serverstatus == None:
+                print >>sys.stderr, "Connection failed after %f seconds"%(time.time()-t1)
+                attempts += 1
+                if attempts > 10:
+                    time.sleep(10)
+                else:
+                    time.sleep(5)
+                    pass
+            else:
+                # the server process has exited: retrying cannot succeed, so leave the retry loop
+                print >>sys.stderr, "Oopsidaisy, server exited with code %d (signal %d)"\
+                    %(serverstatus/256,serverstatus%256)
+                break
             pass
         pass
     raise Exception("Exception: could not reach translation server.")
@@ -210,17 +237,25 @@ if __name__ == "__main__":
             pass
         pass
 
-    if args.url:
-        mserver.connect(args.url)
-    else:
-        mserver.start(args=mo_args,port=args.port,debug=args.debug)
-        pass
-
     ref = None
     aln = None
     if args.ref: ref = read_data(args.ref)
     if args.aln: aln = read_data(args.aln)
 
+    if ref and aln:
+        try:
+            mo_args.index("--serial")
+        except:
+            mo_args.append("--serial")
+            pass
+        pass
+
+    if args.url:
+        mserver.connect(args.url)
+    else:
+        mserver.start(args=mo_args, port=args.port, debug=args.debug)
+        pass
+
     if (args.input == "-"):
         line = sys.stdin.readline()
         idx = 0