author | Hieu Hoang <hieu@hoang.co.uk> | 2013-06-28 17:19:30 +0400 |
---|---|---|
committer | Hieu Hoang <hieu@hoang.co.uk> | 2013-06-28 17:19:30 +0400 |
commit | fa4b92fc0af49079827fb6d71eae86fc97332121 (patch) | |
tree | 3f14600ca2dc8fc8de14ed7552ca4864d5c7baf8 | |
parent | 8c19c2ba8ae59e4cdfefaa442e9825ba9b8bf39f (diff) | |
parent | c963338476a40d5b1071c0c253332ed6ebd0b427 (diff) | |
Merge branch 'master' into nadir_osm
25 files changed, 1140 insertions, 435 deletions
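The bulk of this merge reworks how the dynamic suffix array phrase table scores sampled phrase pairs (see the new `GatherCands()` hunk in `BilingualDynSuffixArray.cpp` below). For orientation, here is a minimal, hypothetical sketch of how the five feature values appear to be finalized from raw sample counts in that hunk; the function name `FinalizeFeatures` and all parameter names are invented for illustration and are not part of the commit.

```cpp
// Hypothetical sketch (not from this commit): finalizing the five phrase-table
// features the way the new GatherCands() hunk does. All names are invented.
//   jointCount   - times this phrase pair was extracted from the sample
//   totalPhrases - total phrase pairs extracted from the sample
//   targetCount  - occurrences of the target phrase in the whole target corpus
//   sampleRate   - sampled occurrences / all occurrences of the source phrase
//   maxLex1/2    - best lexical weights seen for the pair (GetLexicalWeight)
#include <cmath>
#include <vector>

std::vector<float> FinalizeFeatures(float jointCount, float totalPhrases,
                                    float targetCount, float sampleRate,
                                    float maxLex1, float maxLex2)
{
  std::vector<float> feats(5);
  // target-side occurrences not explained by the (rate-scaled) joint count
  float x = targetCount - jointCount * sampleRate;
  feats[0] = std::log(jointCount) - std::log(jointCount + x); // bwd phrase prob
  feats[1] = std::log(maxLex1);                               // lex weight 1
  feats[2] = std::log(jointCount) - std::log(totalPhrases);   // fwd phrase prob
  feats[3] = std::log(maxLex2);                               // lex weight 2
  feats[4] = 1.0f;                                            // phrase penalty
  return feats;
}
```

In the actual hunk these values are written in place into the `Scores` vector held in `pstats`, and the backward-probability correction relies on the `DynSuffixArray::GetCount()` method added in the same commit.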
diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index 086623c0e..593955b4d 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1572,6 +1572,16 @@ <locationURI>virtual:/virtual</locationURI> </link> <link> + <name>TranslationModel/WordCoocTable.cpp</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.cpp</locationURI> + </link> + <link> + <name>TranslationModel/WordCoocTable.h</name> + <type>1</type> + <locationURI>PARENT-3-PROJECT_LOC/moses/TranslationModel/WordCoocTable.h</locationURI> + </link> + <link> <name>TranslationModel/fuzzy-match</name> <type>2</type> <locationURI>virtual:/virtual</locationURI> diff --git a/moses/FF/DistortionScoreProducer.cpp b/moses/FF/DistortionScoreProducer.cpp index 67a1185cb..36618b579 100644 --- a/moses/FF/DistortionScoreProducer.cpp +++ b/moses/FF/DistortionScoreProducer.cpp @@ -22,7 +22,8 @@ struct DistortionState_traditional : public FFState { }; DistortionScoreProducer::DistortionScoreProducer(const std::string &line) - : StatefulFeatureFunction("Distortion", 1, line) { + : StatefulFeatureFunction("Distortion", 1, line) +{ ReadParameters(); } diff --git a/moses/FF/FeatureFunction.cpp b/moses/FF/FeatureFunction.cpp index b35b98648..7c1c98652 100644 --- a/moses/FF/FeatureFunction.cpp +++ b/moses/FF/FeatureFunction.cpp @@ -102,8 +102,8 @@ void FeatureFunction::SetParameter(const std::string& key, const std::string& va void FeatureFunction::ReadParameters() { while (!m_args.empty()) { - const vector<string> &args = m_args[0]; - SetParameter(args[0], args[1]); + const vector<string> &args = m_args[0]; + SetParameter(args[0], args[1]); m_args.erase(m_args.begin()); } diff --git a/moses/FF/PhrasePenalty.cpp b/moses/FF/PhrasePenalty.cpp index ac1aa47b8..8b54939a7 100644 --- a/moses/FF/PhrasePenalty.cpp +++ b/moses/FF/PhrasePenalty.cpp @@ -5,14 +5,15 @@ namespace Moses { PhrasePenalty::PhrasePenalty(const std::string &line) -: StatelessFeatureFunction("PhrasePenalty",1, line) { + : StatelessFeatureFunction("PhrasePenalty",1, line) +{ ReadParameters(); } void PhrasePenalty::Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const { scoreBreakdown.Assign(this, 1.0f); } diff --git a/moses/FF/PhrasePenalty.h b/moses/FF/PhrasePenalty.h index 5263f15ef..7ebc20659 100644 --- a/moses/FF/PhrasePenalty.h +++ b/moses/FF/PhrasePenalty.h @@ -11,13 +11,13 @@ public: PhrasePenalty(const std::string &line); bool IsUseable(const FactorMask &mask) const { - return true; + return true; } virtual void Evaluate(const Phrase &source - , const TargetPhrase &targetPhrase - , ScoreComponentCollection &scoreBreakdown - , ScoreComponentCollection &estimatedFutureScore) const; + , const TargetPhrase &targetPhrase + , ScoreComponentCollection &scoreBreakdown + , ScoreComponentCollection &estimatedFutureScore) const; }; } //namespace diff --git a/moses/FF/UnknownWordPenaltyProducer.cpp b/moses/FF/UnknownWordPenaltyProducer.cpp index adc4c4d6b..4ba033e58 100644 --- a/moses/FF/UnknownWordPenaltyProducer.cpp +++ b/moses/FF/UnknownWordPenaltyProducer.cpp @@ -7,7 +7,8 @@ using namespace std; namespace Moses { UnknownWordPenaltyProducer::UnknownWordPenaltyProducer(const std::string &line) - : 
StatelessFeatureFunction("UnknownWordPenalty",1, line) { + : StatelessFeatureFunction("UnknownWordPenalty",1, line) +{ m_tuneable = false; ReadParameters(); } diff --git a/moses/FF/WordPenaltyProducer.cpp b/moses/FF/WordPenaltyProducer.cpp index 054852349..8d77d68fd 100644 --- a/moses/FF/WordPenaltyProducer.cpp +++ b/moses/FF/WordPenaltyProducer.cpp @@ -7,7 +7,8 @@ using namespace std; namespace Moses { WordPenaltyProducer::WordPenaltyProducer(const std::string &line) - : StatelessFeatureFunction("WordPenalty",1, line) { + : StatelessFeatureFunction("WordPenalty",1, line) +{ ReadParameters(); } diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp index e22a89c6d..bcd7c1519 100644 --- a/moses/Parameter.cpp +++ b/moses/Parameter.cpp @@ -275,13 +275,15 @@ bool Parameter::LoadParam(int argc, char* argv[]) } // overwrite parameters with values from switches - for(PARAM_STRING::const_iterator iterParam = m_description.begin(); iterParam != m_description.end(); iterParam++) { + for(PARAM_STRING::const_iterator iterParam = m_description.begin(); + iterParam != m_description.end(); iterParam++) { const string paramName = iterParam->first; OverwriteParam("-" + paramName, paramName, argc, argv); } // ... also shortcuts - for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); iterParam != m_abbreviation.end(); iterParam++) { + for(PARAM_STRING::const_iterator iterParam = m_abbreviation.begin(); + iterParam != m_abbreviation.end(); iterParam++) { const string paramName = iterParam->first; const string paramShortName = iterParam->second; OverwriteParam("-" + paramShortName, paramName, argc, argv); @@ -294,7 +296,8 @@ bool Parameter::LoadParam(int argc, char* argv[]) verbose = Scan<int>(m_setting["verbose"][0]); if (verbose >= 1) { // only if verbose TRACE_ERR( "Defined parameters (per moses.ini or switch):" << endl); - for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; iterParam != m_setting.end(); iterParam++) { + for(PARAM_MAP::const_iterator iterParam = m_setting.begin() ; + iterParam != m_setting.end(); iterParam++) { TRACE_ERR( "\t" << iterParam->first << ": "); for ( size_t i = 0; i < iterParam->second.size(); i++ ) TRACE_ERR( iterParam->second[i] << " "); @@ -303,7 +306,8 @@ bool Parameter::LoadParam(int argc, char* argv[]) } // convert old weights args to new format - if (!isParamSpecified("feature")) + // WHAT IS GOING ON HERE??? 
- UG + if (!isParamSpecified("feature")) // UG ConvertWeightArgs(); CreateWeightsMap(); WeightOverwrite(); @@ -331,11 +335,11 @@ std::vector<float> &Parameter::GetWeights(const std::string &name) { std::vector<float> &ret = m_weights[name]; - cerr << "WEIGHT " << name << "="; - for (size_t i = 0; i < ret.size(); ++i) { - cerr << ret[i] << ","; - } - cerr << endl; + // cerr << "WEIGHT " << name << "="; + // for (size_t i = 0; i < ret.size(); ++i) { + // cerr << ret[i] << ","; + // } + // cerr << endl; return ret; } @@ -357,7 +361,10 @@ void Parameter::SetWeight(const std::string &name, size_t ind, const vector<floa newWeights.push_back(line); } -void Parameter::AddWeight(const std::string &name, size_t ind, const std::vector<float> &weights) +void +Parameter:: +AddWeight(const std::string &name, size_t ind, + const std::vector<float> &weights) { PARAM_VEC &newWeights = m_setting["weight"]; @@ -478,6 +485,12 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) case Compact: ptType = "PhraseDictionaryCompact"; break; + case SuffixArray: + ptType = "PhraseDictionarySuffixArray"; + break; + case DSuffixArray: + ptType = "PhraseDictionaryDynSuffixArray"; + break; default: break; } @@ -502,6 +515,9 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) ++currOldInd; } + + // cerr << weights.size() << " PHRASE TABLE WEIGHTS " + // << __FILE__ << ":" << __LINE__ << endl; AddWeight(ptType, ptInd, weights); // actual pt @@ -527,7 +543,7 @@ void Parameter::ConvertWeightArgsPhraseModel(const string &oldWeightName) ptLine << "num-features=" << numScoreComponent << " "; ptLine << "table-limit=" << maxTargetPhrase[currDict] << " "; - if (implementation == SuffixArray) { + if (implementation == SuffixArray || implementation == DSuffixArray) { ptLine << "target-path=" << token[5] << " "; ptLine << "alignment-path=" << token[6] << " "; } diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp index bc4231b70..a9eb14ba3 100644 --- a/moses/StaticData.cpp +++ b/moses/StaticData.cpp @@ -700,8 +700,8 @@ bool StaticData::LoadData(Parameter *parameter) SetWeights(model, weights); } else if (feature == "PhrasePenalty") { PhrasePenalty* model = new PhrasePenalty(line); - vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); - SetWeights(model, weights); + vector<float> weights = m_parameter->GetWeights(model->GetScoreProducerDescription()); + SetWeights(model, weights); } #ifdef HAVE_SYNLM @@ -1177,7 +1177,6 @@ void StaticData::LoadFeatureFunctions() } } - // load phrase table for (size_t i = 0; i < m_phraseDictionary.size(); ++i) { PhraseDictionary *pt = m_phraseDictionary[i]; pt->Load(); diff --git a/moses/TargetPhraseCollection.cpp b/moses/TargetPhraseCollection.cpp index 73f6d0543..9a28dbd5d 100644 --- a/moses/TargetPhraseCollection.cpp +++ b/moses/TargetPhraseCollection.cpp @@ -35,11 +35,11 @@ struct CompareTargetPhrase { void TargetPhraseCollection::NthElement(size_t tableLimit) { - vector<TargetPhrase*>::iterator - iterMiddle = (tableLimit == 0 || m_collection.size() < tableLimit) ?m_collection.end() : m_collection.begin() + tableLimit; - - //std::sort(m_collection.begin(), m_collection.end(), CompareTargetPhrase()); - std::nth_element(m_collection.begin(), iterMiddle, m_collection.end(), CompareTargetPhrase()); + vector<TargetPhrase*>::iterator nth; + nth = (tableLimit && tableLimit <= m_collection.size() + ? 
m_collection.begin() + tableLimit + : m_collection.end()); + std::nth_element(m_collection.begin(), nth, m_collection.end(), CompareTargetPhrase()); } void TargetPhraseCollection::Prune(bool adhereTableLimit, size_t tableLimit) diff --git a/moses/TranslationModel/BilingualDynSuffixArray.cpp b/moses/TranslationModel/BilingualDynSuffixArray.cpp index 35b5b37e6..aea2491e9 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.cpp +++ b/moses/TranslationModel/BilingualDynSuffixArray.cpp @@ -3,6 +3,11 @@ #include "moses/FactorCollection.h" #include "moses/StaticData.h" #include "moses/TargetPhrase.h" + +#include "moses/generic/sorting/NBestList.h" +#include "moses/generic/sampling/Sampling.h" + +#include <boost/foreach.hpp> #include <iomanip> using namespace std; @@ -10,43 +15,48 @@ using namespace std; namespace Moses { -BilingualDynSuffixArray::BilingualDynSuffixArray(): +BilingualDynSuffixArray:: +BilingualDynSuffixArray(): m_maxPhraseLength(StaticData::Instance().GetMaxPhraseLength()), - m_maxSampleSize(20) + m_maxSampleSize(20), m_maxPTEntries(20) { m_srcSA = 0; m_trgSA = 0; - m_srcCorpus = new std::vector<wordID_t>(); - m_trgCorpus = new std::vector<wordID_t>(); - m_srcVocab = new Vocab(false); - m_trgVocab = new Vocab(false); + m_srcCorpus = new vector<wordID_t>(); + m_trgCorpus = new vector<wordID_t>(); + m_srcVocab = new Vocab(false); + m_trgVocab = new Vocab(false); m_scoreCmp = 0; } -BilingualDynSuffixArray::~BilingualDynSuffixArray() +BilingualDynSuffixArray:: +~BilingualDynSuffixArray() { - if(m_srcSA) delete m_srcSA; - if(m_trgSA) delete m_trgSA; - if(m_srcVocab) delete m_srcVocab; - if(m_trgVocab) delete m_trgVocab; + if(m_srcSA) delete m_srcSA; + if(m_trgSA) delete m_trgSA; + if(m_srcVocab) delete m_srcVocab; + if(m_trgVocab) delete m_trgVocab; if(m_srcCorpus) delete m_srcCorpus; if(m_trgCorpus) delete m_trgCorpus; - if(m_scoreCmp) delete m_scoreCmp; + if(m_scoreCmp) delete m_scoreCmp; } -bool BilingualDynSuffixArray::Load( - const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputFactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight) +bool +BilingualDynSuffixArray:: +Load( + const vector<FactorType>& inputFactors, + const vector<FactorType>& outputFactors, + string source, string target, string alignments, + const vector<float> &weight) { m_inputFactors = inputFactors; m_outputFactors = outputFactors; - m_scoreCmp = new ScoresComp(weight); + // m_scoreCmp = new ScoresComp(weight); InputFileStream sourceStrme(source); InputFileStream targetStrme(target); cerr << "Loading source corpus...\n"; + // Input and Output are 'Factor directions' (whatever that is) defined in Typedef.h LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); cerr << "Loading target corpus...\n"; LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); @@ -57,88 +67,69 @@ bool BilingualDynSuffixArray::Load( m_srcSA = new DynSuffixArray(m_srcCorpus); if(!m_srcSA) return false; cerr << "Building Target Suffix Array...\n"; - //m_trgSA = new DynSuffixArray(m_trgCorpus); - //if(!m_trgSA) return false; - cerr << "\t(Skipped. 
Not used)\n"; + m_trgSA = new DynSuffixArray(m_trgCorpus); + if(!m_trgSA) return false; InputFileStream alignStrme(alignments); cerr << "Loading Alignment File...\n"; LoadRawAlignments(alignStrme); + cerr << m_srcSntBreaks.size() << " " + << m_trgSntBreaks.size() << " " + << m_rawAlignments.size() << endl; //LoadAlignments(alignStrme); cerr << "Building frequent word cache...\n"; CacheFreqWords(); - return true; -} - -bool BilingualDynSuffixArray::LoadTM( - const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputFactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight) -{ - m_inputFactors = inputFactors; - m_outputFactors = outputFactors; - - m_scoreCmp = new ScoresComp(weight); - InputFileStream sourceStrme(source); - InputFileStream targetStrme(target); - - cerr << "Loading target corpus...\n"; - LoadCorpus(Output, targetStrme, m_outputFactors,*m_trgCorpus, m_trgSntBreaks, m_trgVocab); - - cerr << "Loading source corpus...\n"; - LoadCorpus(Input, sourceStrme, m_inputFactors, *m_srcCorpus, m_srcSntBreaks, m_srcVocab); - CHECK(m_srcSntBreaks.size() == m_trgSntBreaks.size()); - - // build suffix arrays and auxilliary arrays - cerr << "Building Source Suffix Array...\n"; - m_srcSA = new DynSuffixArray(m_srcCorpus); - if(!m_srcSA) return false; - cerr << "Building Target Suffix Array...\n"; - //m_trgSA = new DynSuffixArray(m_trgCorpus); - //if(!m_trgSA) return false; - cerr << "\t(Skipped. Not used)\n"; - - InputFileStream alignStrme(alignments); - cerr << "Loading Alignment File...\n"; - LoadRawAlignments(alignStrme); - //LoadAlignments(alignStrme); - cerr << "Building frequent word cache...\n"; - CacheFreqWords(); + wordID_t const* s = &(*m_srcCorpus)[0]; + wordID_t const* t = &(*m_trgCorpus)[0]; + for (size_t sid = 0; sid < m_srcSntBreaks.size(); ++sid) { + wordID_t const* se = s + GetSourceSentenceSize(sid); + wordID_t const* te = t + GetTargetSentenceSize(sid); + vector<short> const& a = m_rawAlignments[sid]; + m_wrd_cooc.Count(vector<wordID_t>(s,se), + vector<wordID_t>(t,te), a, + m_srcVocab->GetkOOVWordID(), + m_trgVocab->GetkOOVWordID()); + s = se; + t = te; + } + if (m_srcSntBreaks.size() != m_trgSntBreaks.size() || + m_rawAlignments.size() != m_trgSntBreaks.size()) { + cerr << "FATAL ERROR: Line counts don't match!\n" + << "Source side text corpus: " << m_srcSntBreaks.size() << "\n" + << "Target side text corpus: " << m_trgSntBreaks.size() << "\n" + << "Word alignments: " << m_rawAlignments.size() << endl; + exit(1); + } return true; - } -int BilingualDynSuffixArray::LoadRawAlignments(InputFileStream& align) +int +BilingualDynSuffixArray:: +LoadRawAlignments(InputFileStream& align) { // stores the alignments in the raw file format - std::string line; - std::vector<int> vtmp; - int lineNum = 1; + string line; + // vector<int> vtmp; + // int lineNum = 0; while(getline(align, line)) { - if (lineNum % 10000 == 0) - cerr << lineNum; - Utils::splitToInt(line, vtmp, "- "); - CHECK(vtmp.size() % 2 == 0); - std::vector<short> vAlgn; // store as short ints for memory - for (std::vector<int>::const_iterator itr = vtmp.begin(); - itr != vtmp.end(); ++itr) { - vAlgn.push_back(short(*itr)); - } - m_rawAlignments.push_back(vAlgn); - ++lineNum; + // if (++lineNum % 10000 == 0) cerr << lineNum << endl; + LoadRawAlignments(line); } return m_rawAlignments.size(); } -int BilingualDynSuffixArray::LoadRawAlignments(string& align) + + +int +BilingualDynSuffixArray:: +LoadRawAlignments(string& align) { // stores the alignments 
in the raw file format vector<int> vtmp; Utils::splitToInt(align, vtmp, "- "); CHECK(vtmp.size() % 2 == 0); vector<short> vAlgn; // store as short ints for memory - for (std::vector<int>::const_iterator itr = vtmp.begin(); + for (vector<int>::const_iterator itr = vtmp.begin(); itr != vtmp.end(); ++itr) { vAlgn.push_back(short(*itr)); } @@ -146,48 +137,56 @@ int BilingualDynSuffixArray::LoadRawAlignments(string& align) return m_rawAlignments.size(); } -int BilingualDynSuffixArray::LoadAlignments(InputFileStream& align) -{ - std::string line; - std::vector<int> vtmp; - int sntIndex(0); - - while(getline(align, line)) { - Utils::splitToInt(line, vtmp, "- "); - CHECK(vtmp.size() % 2 == 0); - - int sourceSize = GetSourceSentenceSize(sntIndex); - int targetSize = GetTargetSentenceSize(sntIndex); - - SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence - for(int i=0; i < (int)vtmp.size(); i+=2) { - int sourcePos = vtmp[i]; - int targetPos = vtmp[i+1]; - CHECK(sourcePos < sourceSize); - CHECK(targetPos < targetSize); - - curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word - curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word - } - curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence - curSnt.trgSnt = m_trgCorpus + sntIndex; - m_alignments.push_back(curSnt); - - sntIndex++; - } - return m_alignments.size(); -} - -SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntIndex, bool trg2Src) const +// int +// BilingualDynSuffixArray:: +// LoadAlignments(InputFileStream& align) +// { +// string line; +// vector<int> vtmp; +// int sntIndex = 0; + +// while(getline(align, line)) +// { +// Utils::splitToInt(line, vtmp, "- "); +// CHECK(vtmp.size() % 2 == 0); + +// int sourceSize = GetSourceSentenceSize(sntIndex); +// int targetSize = GetTargetSentenceSize(sntIndex); + +// SentenceAlignment curSnt(sntIndex, sourceSize, targetSize); // initialize empty sentence +// for(int i=0; i < (int)vtmp.size(); i+=2) +// { +// int sourcePos = vtmp[i]; +// int targetPos = vtmp[i+1]; +// CHECK(sourcePos < sourceSize); +// CHECK(targetPos < targetSize); + +// curSnt.alignedList[sourcePos].push_back(targetPos); // list of target nodes for each source word +// curSnt.numberAligned[targetPos]++; // cnt of how many source words connect to this target word +// } +// curSnt.srcSnt = m_srcCorpus + sntIndex; // point source and target sentence +// curSnt.trgSnt = m_trgCorpus + sntIndex; +// m_alignments.push_back(curSnt); + +// sntIndex++; +// } +// return m_alignments.size(); +// } + +SentenceAlignment +BilingualDynSuffixArray:: +GetSentenceAlignment(const int sntIndex, bool trg2Src) const { // retrieves the alignments in the format used by SentenceAlignment.Extract() - int sntGiven = trg2Src ? GetTargetSentenceSize(sntIndex) : GetSourceSentenceSize(sntIndex); - int sntExtract = trg2Src ? GetSourceSentenceSize(sntIndex) : GetTargetSentenceSize(sntIndex); - std::vector<short> alignment = m_rawAlignments.at(sntIndex); + int t = GetTargetSentenceSize(sntIndex); + int s = GetSourceSentenceSize(sntIndex); + int sntGiven = trg2Src ? t : s; + int sntExtract = trg2Src ? 
s : t; SentenceAlignment curSnt(sntIndex, sntGiven, sntExtract); // initialize empty sentence - for(size_t i=0; i < alignment.size(); i+=2) { - int sourcePos = alignment[i]; - int targetPos = alignment[i+1]; + vector<short> const& a = m_rawAlignments.at(sntIndex); + for(size_t i=0; i < a.size(); i+=2) { + int sourcePos = a[i]; + int targetPos = a[i+1]; if(trg2Src) { curSnt.alignedList[targetPos].push_back(sourcePos); // list of target nodes for each source word curSnt.numberAligned[sourcePos]++; // cnt of how many source words connect to this target word @@ -202,32 +201,50 @@ SentenceAlignment BilingualDynSuffixArray::GetSentenceAlignment(const int sntInd return curSnt; } -bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex, - const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const +bool +BilingualDynSuffixArray:: +ExtractPhrases(const int& sntIndex, + const int& wordIndex, + const int& sourceSize, + vector<PhrasePair*>& phrasePairs, + bool trg2Src) const { /* ExtractPhrases() can extract the matching phrases for both directions by using the trg2Src * parameter */ SentenceAlignment curSnt = GetSentenceAlignment(sntIndex, trg2Src); // get span of phrase in source sentence int beginSentence = m_srcSntBreaks[sntIndex]; - int rightIdx = wordIndex - beginSentence - ,leftIdx = rightIdx - sourceSize + 1; + int rightIdx = wordIndex - beginSentence; + int leftIdx = rightIdx - sourceSize + 1; return curSnt.Extract(m_maxPhraseLength, phrasePairs, leftIdx, rightIdx); // extract all phrase Alignments in sentence } -int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStream& corpus, const FactorList& factors, - std::vector<wordID_t>& cArray, std::vector<wordID_t>& sntArray, - Vocab* vocab) +void +BilingualDynSuffixArray:: +CleanUp(const InputType& source) { - std::string line, word; + //m_wordPairCache.clear(); +} + +int +BilingualDynSuffixArray:: +LoadCorpus(FactorDirection direction, + InputFileStream & corpus, + const FactorList & factors, + vector<wordID_t> & cArray, + vector<wordID_t> & sntArray, + Vocab* vocab) +{ + string line, word; int sntIdx(0); -// corpus.seekg(0); Seems needless -> commented out to allow loading of gzipped corpora (gzfilebuf doesn't support seeking). - const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); + // corpus.seekg(0); Seems needless -> commented out to allow + // loading of gzipped corpora (gzfilebuf doesn't support seeking). 
+ const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); while(getline(corpus, line)) { sntArray.push_back(sntIdx); Phrase phrase(ARRAY_SIZE_INCR); // parse phrase - phrase.CreateFromString(direction, factors, line, factorDelimiter, NULL); + phrase.CreateFromString( direction, factors, line, factorDelimiter, NULL); // store words in vocabulary and corpus for( size_t i = 0; i < phrase.GetSize(); ++i) { cArray.push_back( vocab->GetWordID(phrase.GetWord(i)) ); @@ -239,7 +256,9 @@ int BilingualDynSuffixArray::LoadCorpus(FactorDirection direction, InputFileStre return cArray.size(); } -bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const +bool +BilingualDynSuffixArray:: +GetLocalVocabIDs(const Phrase& src, SAPhrase &output) const { // looks up the SA vocab ids for the current src phrase size_t phraseSize = src.GetSize(); @@ -251,87 +270,131 @@ bool BilingualDynSuffixArray::GetLocalVocabIDs(const Phrase& src, SAPhrase &outp return false; } else { output.SetId(pos, arrayId); - //cerr << arrayId << " "; } } return true; } -pair<float, float> BilingualDynSuffixArray::GetLexicalWeight(const PhrasePair& phrasepair) const +pair<float, float> +BilingualDynSuffixArray:: +GetLexicalWeight(const PhrasePair& pp) const { - //return pair<float, float>(1, 1); - float srcLexWeight(1.0), trgLexWeight(1.0); - std::map<pair<wordID_t, wordID_t>, float> targetProbs; // collect sum of target probs given source words - //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex]; - const SentenceAlignment& alignment = GetSentenceAlignment(phrasepair.m_sntIndex); - std::map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache; - // for each source word - for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) { - float srcSumPairProbs(0); - wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs - const std::vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx); - // for each target word aligned to this source word in this alignment - if(srcWordAlignments.size() == 0) { // get p(NULL|src) - pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID()); - itrCache = m_wordPairCache.find(wordpair); - if(itrCache == m_wordPairCache.end()) { // if not in cache - CacheWordProbs(srcWord); - itrCache = m_wordPairCache.find(wordpair); // search cache again - } - CHECK(itrCache != m_wordPairCache.end()); - srcSumPairProbs += itrCache->second.first; - targetProbs[wordpair] = itrCache->second.second; - } else { // extract p(trg|src) - for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word - int trgIdx = srcWordAlignments[i]; - wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); - // get probability of this source->target word pair - pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord); - itrCache = m_wordPairCache.find(wordpair); - if(itrCache == m_wordPairCache.end()) { // if not in cache - CacheWordProbs(srcWord); - itrCache = m_wordPairCache.find(wordpair); // search cache again - } - CHECK(itrCache != m_wordPairCache.end()); - srcSumPairProbs += itrCache->second.first; - targetProbs[wordpair] = itrCache->second.second; - } - } - float srcNormalizer = srcWordAlignments.size() < 2 ? 
1.0 : 1.0 / float(srcWordAlignments.size()); - srcLexWeight *= (srcNormalizer * srcSumPairProbs); - } // end for each source word - for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) { - float trgSumPairProbs(0); - wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); - for (std::map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr - = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) { - if(trgItr->first.second == trgWord) - trgSumPairProbs += trgItr->second; - } - if(trgSumPairProbs == 0) continue; // currently don't store target-side SA - int noAligned = alignment.numberAligned.at(trgIdx); - float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned); - trgLexWeight *= (trgNormalizer * trgSumPairProbs); + // sp,tp: sum of link probabilities + // sc,tc: count of links + int src_size = pp.GetSourceSize(); + int trg_size = pp.GetTargetSize(); + vector<float> sp(src_size, 0), tp(trg_size, 0); + vector<int> sc(src_size,0), tc(trg_size,0); + wordID_t const* sw = &(m_srcCorpus->at(m_srcSntBreaks.at(pp.m_sntIndex))); + wordID_t const* tw = &(m_trgCorpus->at(m_trgSntBreaks.at(pp.m_sntIndex))); + vector<short> const & a = m_rawAlignments.at(pp.m_sntIndex); + for (size_t i = 0; i < a.size(); i += 2) { + int s = a[i], t = a.at(i+1), sx, tx; + // sx, tx: local positions within phrase pair + + if (s < pp.m_startSource || t < pp.m_startTarget) continue; + if ((sx = s - pp.m_startSource) >= src_size) continue; + if ((tx = t - pp.m_startTarget) >= trg_size) continue; + sp[sx] += m_wrd_cooc.pbwd(sw[s],tw[t]); + tp[tx] += m_wrd_cooc.pfwd(sw[s],tw[t]); + ++sc[sx]; + ++tc[tx]; + } + pair<float,float> ret(1,1); + wordID_t null_trg = m_trgVocab->GetkOOVWordID(); + wordID_t null_src = m_srcVocab->GetkOOVWordID(); + for (size_t i = 0, k = pp.m_startSource; i < sp.size(); ++i, ++k) { + if (sc[i]) ret.first *= sp[i]/sc[i]; + else ret.first *= m_wrd_cooc.pbwd(sw[k], null_trg); + } + for (size_t i = 0, k = pp.m_startTarget; i < tp.size(); ++i, ++k) { + if (tc[i]) ret.second *= tp[i]/tc[i]; + else ret.second *= m_wrd_cooc.pfwd(null_src,tw[k]); } - // TODO::Need to get p(NULL|trg) - return pair<float, float>(srcLexWeight, trgLexWeight); + return ret; + + // //return pair<float, float>(1, 1); + // float srcLexWeight(1.0), trgLexWeight(1.0); + // map<pair<wordID_t, wordID_t>, float> targetProbs; + // // collects sum of target probs given source words + + // //const SentenceAlignment& alignment = m_alignments[phrasepair.m_sntIndex]; + // const SentenceAlignment alignment = GetSentenceAlignment(phrasepair.m_sntIndex); + // map<pair<wordID_t, wordID_t>, pair<float, float> >::const_iterator itrCache; + // // for each source word + // for(int srcIdx = phrasepair.m_startSource; srcIdx <= phrasepair.m_endSource; ++srcIdx) { + // float srcSumPairProbs(0); + // wordID_t srcWord = m_srcCorpus->at(srcIdx + m_srcSntBreaks[phrasepair.m_sntIndex]); // localIDs + // const vector<int>& srcWordAlignments = alignment.alignedList.at(srcIdx); + // // for each target word aligned to this source word in this alignment + // if(srcWordAlignments.size() == 0) { // get p(NULL|src) + // pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, m_srcVocab->GetkOOVWordID()); + // itrCache = m_wordPairCache.find(wordpair); + // if(itrCache == m_wordPairCache.end()) { // if not in cache + // CacheWordProbs(srcWord); + // itrCache = m_wordPairCache.find(wordpair); // search cache again + // } + // CHECK(itrCache != m_wordPairCache.end()); + // srcSumPairProbs += 
itrCache->second.first; + // targetProbs[wordpair] = itrCache->second.second; + // } + // else { // extract p(trg|src) + // for(size_t i = 0; i < srcWordAlignments.size(); ++i) { // for each aligned word + // int trgIdx = srcWordAlignments[i]; + // wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); + // // get probability of this source->target word pair + // pair<wordID_t, wordID_t> wordpair = make_pair(srcWord, trgWord); + // itrCache = m_wordPairCache.find(wordpair); + // if(itrCache == m_wordPairCache.end()) { // if not in cache + // CacheWordProbs(srcWord); + // itrCache = m_wordPairCache.find(wordpair); // search cache again + // } + // CHECK(itrCache != m_wordPairCache.end()); + // srcSumPairProbs += itrCache->second.first; + // targetProbs[wordpair] = itrCache->second.second; + // } + // } + // float srcNormalizer = srcWordAlignments.size() < 2 ? 1.0 : 1.0 / float(srcWordAlignments.size()); + // srcLexWeight *= (srcNormalizer * srcSumPairProbs); + // } // end for each source word + + // for(int trgIdx = phrasepair.m_startTarget; trgIdx <= phrasepair.m_endTarget; ++trgIdx) { + // float trgSumPairProbs(0); + // wordID_t trgWord = m_trgCorpus->at(trgIdx + m_trgSntBreaks[phrasepair.m_sntIndex]); + // for (map<pair<wordID_t, wordID_t>, float>::const_iterator trgItr + // = targetProbs.begin(); trgItr != targetProbs.end(); ++trgItr) { + // if(trgItr->first.second == trgWord) + // trgSumPairProbs += trgItr->second; + // } + // if(trgSumPairProbs == 0) continue; // currently don't store target-side SA + // int noAligned = alignment.numberAligned.at(trgIdx); + // float trgNormalizer = noAligned < 2 ? 1.0 : 1.0 / float(noAligned); + // trgLexWeight *= (trgNormalizer * trgSumPairProbs); + // } + + // // TODO::Need to get p(NULL|trg) + + // return pair<float, float>(srcLexWeight, trgLexWeight); } -void BilingualDynSuffixArray::CacheFreqWords() const + +void +BilingualDynSuffixArray:: +CacheFreqWords() const { - std::multimap<int, wordID_t> wordCnts; + multimap<int, wordID_t> wordCnts; // for each source word in vocab Vocab::Word2Id::const_iterator it; for(it = m_srcVocab->VocabStart(); it != m_srcVocab->VocabEnd(); ++it) { // get its frequency wordID_t srcWord = it->second; - std::vector<wordID_t> sword(1, srcWord), wrdIndices; + vector<wordID_t> sword(1, srcWord), wrdIndices; m_srcSA->GetCorpusIndex(&sword, &wrdIndices); if(wrdIndices.size() >= 1000) { // min count wordCnts.insert(make_pair(wrdIndices.size(), srcWord)); } } int numSoFar(0); - std::multimap<int, wordID_t>::reverse_iterator ritr; + multimap<int, wordID_t>::reverse_iterator ritr; for(ritr = wordCnts.rbegin(); ritr != wordCnts.rend(); ++ritr) { m_freqWordsCached.insert(ritr->second); CacheWordProbs(ritr->second); @@ -339,20 +402,23 @@ void BilingualDynSuffixArray::CacheFreqWords() const } cerr << "\tCached " << m_freqWordsCached.size() << " source words\n"; } -void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const + +void +BilingualDynSuffixArray:: +CacheWordProbs(wordID_t srcWord) const { - std::map<wordID_t, int> counts; - std::vector<wordID_t> sword(1, srcWord), wrdIndices; + map<wordID_t, int> counts; + vector<wordID_t> sword(1, srcWord), wrdIndices; bool ret = m_srcSA->GetCorpusIndex(&sword, &wrdIndices); CHECK(ret); - std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks); + vector<int> sntIndexes = GetSntIndexes(wrdIndices, 1, m_srcSntBreaks); float denom(0); // for each occurrence of this word for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { int sntIdx 
= sntIndexes.at(snt); // get corpus index for sentence CHECK(sntIdx != -1); int srcWrdSntIdx = wrdIndices.at(snt) - m_srcSntBreaks.at(sntIdx); // get word index in sentence - const std::vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word + const vector<int> srcAlg = GetSentenceAlignment(sntIdx).alignedList.at(srcWrdSntIdx); // list of target words for this source word if(srcAlg.size() == 0) { ++counts[m_srcVocab->GetkOOVWordID()]; // if not alligned then align to NULL word ++denom; @@ -366,7 +432,7 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const } // now we've gotten counts of all target words aligned to this source word // get probs and cache all pairs - for(std::map<wordID_t, int>::const_iterator itrCnt = counts.begin(); + for(map<wordID_t, int>::const_iterator itrCnt = counts.begin(); itrCnt != counts.end(); ++itrCnt) { pair<wordID_t, wordID_t> wordPair = make_pair(srcWord, itrCnt->first); float srcTrgPrb = float(itrCnt->second) / float(denom); // gives p(src->trg) @@ -375,7 +441,9 @@ void BilingualDynSuffixArray::CacheWordProbs(wordID_t srcWord) const } } -SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const +SAPhrase +BilingualDynSuffixArray:: +TrgPhraseFromSntIdx(const PhrasePair& phrasepair) const { // takes sentence indexes and looks up vocab IDs SAPhrase phraseIds(phrasepair.GetTargetSize()); @@ -388,7 +456,9 @@ SAPhrase BilingualDynSuffixArray::TrgPhraseFromSntIdx(const PhrasePair& phrasepa return phraseIds; } -TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const +TargetPhrase* +BilingualDynSuffixArray:: +GetMosesFactorIDs(const SAPhrase& phrase, const Phrase& sourcePhrase) const { TargetPhrase* targetPhrase = new TargetPhrase(); for(size_t i=0; i < phrase.words.size(); ++i) { // look up trg words @@ -401,106 +471,211 @@ TargetPhrase* BilingualDynSuffixArray::GetMosesFactorIDs(const SAPhrase& phrase, return targetPhrase; } -void BilingualDynSuffixArray::GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> > & target) const +/// Gather translation candidates for source phrase /src/ and store raw +// phrase pair statistics in /pstats/. 
Return the sample rate +// (number of samples considered / total number of hits) and total number of +// phrase pairs +pair<float,float> +BilingualDynSuffixArray:: +GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const { - //cerr << "phrase is \"" << src << endl; - size_t sourceSize = src.GetSize(); - SAPhrase localIDs(sourceSize); - if(!GetLocalVocabIDs(src, localIDs)) return; - float totalTrgPhrases(0); - std::map<SAPhrase, int> phraseCounts; - //std::map<SAPhrase, PhrasePair> phraseColl; // (one of) the word indexes this phrase was taken from - std::map<SAPhrase, pair<float, float> > lexicalWeights; - std::map<SAPhrase, pair<float, float> >::iterator itrLexW; - std::vector<unsigned> wrdIndices; - // extract sentence IDs from SA and return rightmost index of phrases - if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) return; - SampleSelection(wrdIndices); - std::vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks); - // for each sentence with this phrase - for(size_t snt = 0; snt < sntIndexes.size(); ++snt) { - std::vector<PhrasePair*> phrasePairs; // to store all phrases possible from current sentence - int sntIndex = sntIndexes.at(snt); // get corpus index for sentence - if(sntIndex == -1) continue; // bad flag set by GetSntIndexes() - ExtractPhrases(sntIndex, wrdIndices[snt], sourceSize, phrasePairs); - //cerr << "extracted " << phrasePairs.size() << endl; - totalTrgPhrases += phrasePairs.size(); // keep track of count of each extracted phrase pair - std::vector<PhrasePair*>::iterator iterPhrasePair; - for (iterPhrasePair = phrasePairs.begin(); iterPhrasePair != phrasePairs.end(); ++iterPhrasePair) { - SAPhrase phrase = TrgPhraseFromSntIdx(**iterPhrasePair); - phraseCounts[phrase]++; // count each unique phrase - // NOTE::Correct but slow to extract lexical weight here. could do - // it later for only the top phrases chosen by phrase prob p(e|f) - pair<float, float> lexWeight = GetLexicalWeight(**iterPhrasePair); // get lexical weighting for this phrase pair - itrLexW = lexicalWeights.find(phrase); // check if phrase already has lexical weight attached - if((itrLexW != lexicalWeights.end()) && (itrLexW->second.first < lexWeight.first)) - itrLexW->second = lexWeight; // if this lex weight is greater save it - else lexicalWeights[phrase] = lexWeight; // else save + typedef map<SAPhrase, vector<float> >::iterator pstat_iter; + typedef map<SAPhrase, vector<float> >::value_type pstat_entry; + pair<float,float> ret(0,0); + float& sampleRate = ret.first; + float& totalPhrases = ret.second; + size_t srcSize = src.GetSize(); + SAPhrase localIDs(srcSize); + vector<unsigned> wrdIndices; + if(!GetLocalVocabIDs(src, localIDs) || + !m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) + return ret; // source phrase contains OOVs + + // select a sample of the occurrences for phrase extraction + size_t m1 = wrdIndices.size(); + SampleSelection(wrdIndices); // careful! SampleSelection alters wrdIndices! 
+ sampleRate = float(wrdIndices.size())/m1; + + // determine the sentences in which these phrases occur + vector<int> sntIndices = GetSntIndexes(wrdIndices, srcSize, m_srcSntBreaks); + for(size_t s = 0; s < sntIndices.size(); ++s) { + int sntStart = sntIndices.at(s); + if(sntStart == -1) continue; // marked as bad by GetSntIndexes() + vector<PhrasePair*> phrasePairs; + ExtractPhrases(sntStart, wrdIndices[s], srcSize, phrasePairs); + totalPhrases += phrasePairs.size(); + vector<PhrasePair*>::iterator p; + for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) { + assert(*p); + pair<float, float> lex = GetLexicalWeight(**p); + pstat_entry entry(TrgPhraseFromSntIdx(**p), Scores(5)); + pair<pstat_iter, bool> foo = pstats.insert(entry); + Scores& feats = foo.first->second; + if (foo.second) { + feats[0] = 1; // count + feats[1] = lex.first; + feats[3] = lex.second; + } else { + feats[0] += 1; + feats[1] = max(feats[1],lex.first); + feats[3] = max(feats[3],lex.second); + } + delete *p; } - // done with sentence. delete SA phrase pairs - RemoveAllInColl(phrasePairs); } // done with all sentences - // convert to moses phrase pairs - std::map<SAPhrase, int>::const_iterator iterPhrases; - std::multimap<Scores, const SAPhrase*, ScoresComp> phraseScores (*m_scoreCmp); - // get scores of all phrases - for(iterPhrases = phraseCounts.begin(); iterPhrases != phraseCounts.end(); ++iterPhrases) { - float trg2SrcMLE = float(iterPhrases->second) / totalTrgPhrases; - itrLexW = lexicalWeights.find(iterPhrases->first); - CHECK(itrLexW != lexicalWeights.end()); - Scores scoreVector(3); - scoreVector[0] = std::log(trg2SrcMLE); - scoreVector[1] = std::log(itrLexW->second.first); - scoreVector[2] = 1; // exp(1); - phraseScores.insert(make_pair(scoreVector, &iterPhrases->first)); - } - // return top scoring phrases - std::multimap<Scores, const SAPhrase*, ScoresComp>::reverse_iterator ritr; - for(ritr = phraseScores.rbegin(); ritr != phraseScores.rend(); ++ritr) { - Scores scoreVector = ritr->first; - TargetPhrase *targetPhrase = GetMosesFactorIDs(*ritr->second, src); - target.push_back(make_pair( scoreVector, targetPhrase)); - if(target.size() == m_maxSampleSize) break; + BOOST_FOREACH(pstat_entry & e, pstats) { + Scores& feats = e.second; + // 0: bwd phrase prob + // 1: lex 1 + // 2: fwd phrase prob + // 3: lex 2 + // 4: phrase penalty + float x = m_trgSA->GetCount(e.first.words)-feats[0] * sampleRate; + feats[4] = 1; + feats[3] = log(feats[3]); + feats[2] = log(feats[0]) - log(totalPhrases); + feats[1] = log(feats[1]); + feats[0] = log(feats[0]) - log(feats[0] + x); } + return ret; } -std::vector<int> BilingualDynSuffixArray::GetSntIndexes(std::vector<unsigned>& wrdIndices, - const int sourceSize, const std::vector<unsigned>& sntBreaks) const +// void +// BilingualDynSuffixArray:: +// GetTargetPhrasesByLexicalWeight +// (const Phrase& src, vector< pair<Scores, TargetPhrase*> > & target) +// const +// { +// size_t sourceSize = src.GetSize(); +// SAPhrase localIDs(sourceSize); +// if(!GetLocalVocabIDs(src, localIDs)) +// return; // source phrase contains OOVs + +// float totalTrgPhrases(0); +// map<SAPhrase, int> phraseCounts; +// map<SAPhrase, pair<float, float> > lexicalWeights; +// map<SAPhrase, pair<float, float> >::iterator itrLexW; + +// // find all occurrences of the phrase in the corpus; +// // wrdIndices stores the rightmost position +// vector<unsigned> wrdIndices; +// if(!m_srcSA->GetCorpusIndex(&(localIDs.words), &wrdIndices)) +// return; // none found + +// // select a sample of the occurrences for 
phrase extraction +// size_t m1 = wrdIndices.size(); +// SampleSelection(wrdIndices); +// float sampleRate = float(wrdIndices.size())/m1; + +// // determine the sentences in which these phrases occur +// vector<int> sntIndexes = GetSntIndexes(wrdIndices, sourceSize, m_srcSntBreaks); + +// // for each sentence with this phrase +// for(size_t s = 0; s < sntIndexes.size(); ++s) +// { +// vector<PhrasePair*> phrasePairs; +// int sntIndex = sntIndexes.at(s); +// if(sntIndex == -1) continue; // bad flag set by GetSntIndexes() +// ExtractPhrases(sntIndex, wrdIndices[s], sourceSize, phrasePairs); +// totalTrgPhrases += phrasePairs.size(); +// vector<PhrasePair*>::iterator p; +// for (p = phrasePairs.begin(); p != phrasePairs.end(); ++p) +// { +// PhrasePair* px = *p; +// assert(px); +// SAPhrase phrase = TrgPhraseFromSntIdx(*px); +// phraseCounts[phrase]++; // count each unique phrase +// // NOTE::Correct but slow to extract lexical weight here. could do +// // it later for only the top phrases chosen by phrase prob p(e|f) +// pair<float, float> lexWeight = GetLexicalWeight(**p); +// itrLexW = lexicalWeights.find(phrase); +// if((itrLexW != lexicalWeights.end()) && +// (itrLexW->second.first < lexWeight.first)) +// itrLexW->second = lexWeight; // if this lex weight is greater save it +// else lexicalWeights[phrase] = lexWeight; // else save +// } +// // done with sentence. delete SA phrase pairs +// BOOST_FOREACH(PhrasePair* p, phrasePairs) delete p; +// } // done with all sentences + +// cerr << "Done extracting ... " << endl; + +// // convert to moses phrase pairs +// map<SAPhrase, int>::iterator pcnt; +// BetterPhrase better(*m_scoreCmp); +// NBestList<pair<Scores,SAPhrase const*>,BetterPhrase> nbest(m_maxPTEntries,better); +// for(pcnt = phraseCounts.begin(); pcnt != phraseCounts.end(); ++pcnt) +// { +// float tmarginal = (m_trgSA->GetCount(pcnt->first.words) * sampleRate); +// float pfwd = pcnt->second / totalTrgPhrases; +// float pbwd = pcnt->second / tmarginal; +// pair<float, float> lexWeight = lexicalWeights[pcnt->first]; +// pair<Scores, SAPhrase const*> entry; +// entry.first.resize(5); +// entry.first[0] = pbwd; +// entry.first[1] = itrLexW->second.first; +// entry.first[2] = pfwd; +// entry.first[3] = itrLexW->second.second; +// entry.first[4] = 2.718; // exp(1); +// entry.second = &pcnt->first; +// nbest.add(entry); +// } + +// // return top scoring phrases +// for (size_t n = 0; n < nbest.size(); ++n) +// { +// pair<Scores, SAPhrase const*> e = nbest[n]; +// target.push_back(make_pair(e.first,GetMosesFactorIDs(*e.second, src))); +// } +// } + +vector<int> +BilingualDynSuffixArray:: +GetSntIndexes(vector<unsigned>& wrdIndices, + const int sourceSize, + const vector<unsigned>& sntBreaks) const { - std::vector<unsigned>::const_iterator vit; - std::vector<int> sntIndexes; + vector<unsigned>::const_iterator vit; + vector<int> sntIndices; for(size_t i=0; i < wrdIndices.size(); ++i) { - vit = std::upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); + vit = upper_bound(sntBreaks.begin(), sntBreaks.end(), wrdIndices[i]); int index = int(vit - sntBreaks.begin()) - 1; // check for phrases that cross sentence boundaries if(wrdIndices[i] - sourceSize + 1 < sntBreaks.at(index)) - sntIndexes.push_back(-1); // set bad flag + sntIndices.push_back(-1); // set bad flag else - sntIndexes.push_back(index); // store the index of the sentence in the corpus + sntIndices.push_back(index); // store the index of the sentence in the corpus } - return sntIndexes; + return sntIndices; } -int 
BilingualDynSuffixArray::SampleSelection(std::vector<unsigned>& sample, - int sampleSize) const +int +BilingualDynSuffixArray:: +SampleSelection(vector<unsigned>& sample, int sampleSize) const { // only use top 'sampleSize' number of samples - if(sample.size() > (size_t)sampleSize) - sample.erase(sample.begin()+sampleSize, sample.end()); + vector<unsigned> s; + randomSample<unsigned>(s,sampleSize,sample.size()); + for (size_t i = 0; i < s.size(); ++i) + s[i] = sample[s[i]]; + sample.swap(s); return sample.size(); } -void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& alignment) +void +BilingualDynSuffixArray:: +addSntPair(string& source, string& target, string& alignment) { vuint_t srcFactor, trgFactor; - cerr << "source, target, alignment = " << source << ", " << target << ", " << alignment << endl; - const std::string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); + cerr << "source, target, alignment = " << source << ", " + << target << ", " << alignment << endl; + const string& factorDelimiter = StaticData::Instance().GetFactorDelimiter(); const unsigned oldSrcCrpSize = m_srcCorpus->size(), oldTrgCrpSize = m_trgCorpus->size(); cerr << "old source corpus size = " << oldSrcCrpSize << "\told target size = " << oldTrgCrpSize << endl; Phrase sphrase(ARRAY_SIZE_INCR); sphrase.CreateFromString(Input, m_inputFactors, source, factorDelimiter, NULL); m_srcVocab->MakeOpen(); - std::vector<wordID_t> sIDs(sphrase.GetSize()); + vector<wordID_t> sIDs(sphrase.GetSize()); // store words in vocabulary and corpus for(int i = sphrase.GetSize()-1; i >= 0; --i) { sIDs[i] = m_srcVocab->GetWordID(sphrase.GetWord(i)); // get vocab id backwards @@ -515,7 +690,7 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& Phrase tphrase(ARRAY_SIZE_INCR); tphrase.CreateFromString(Output, m_outputFactors, target, factorDelimiter, NULL); m_trgVocab->MakeOpen(); - std::vector<wordID_t> tIDs(tphrase.GetSize()); + vector<wordID_t> tIDs(tphrase.GetSize()); for(int i = tphrase.GetSize()-1; i >= 0; --i) { tIDs[i] = m_trgVocab->GetWordID(tphrase.GetWord(i)); // get vocab id } @@ -529,18 +704,26 @@ void BilingualDynSuffixArray::addSntPair(string& source, string& target, string& cerr << "gets to 2\n"; m_srcSA->Insert(&srcFactor, oldSrcCrpSize); cerr << "gets to 3\n"; - //m_trgSA->Insert(&trgFactor, oldTrgCrpSize); + m_trgSA->Insert(&trgFactor, oldTrgCrpSize); LoadRawAlignments(alignment); m_trgVocab->MakeClosed(); + + m_wrd_cooc.Count(sIDs,tIDs, m_rawAlignments.back(), + m_srcVocab->GetkOOVWordID(), + m_trgVocab->GetkOOVWordID()); + //for(size_t i=0; i < sphrase.GetSize(); ++i) //ClearWordInCache(sIDs[i]); } -void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) + +void +BilingualDynSuffixArray:: +ClearWordInCache(wordID_t srcWord) { if(m_freqWordsCached.find(srcWord) != m_freqWordsCached.end()) return; - std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> >::iterator it, + map<pair<wordID_t, wordID_t>, pair<float, float> >::iterator it, first, last; for(it = m_wordPairCache.begin(); it != m_wordPairCache.end(); ++it) { if(it->first.first == srcWord) { // all source words grouped @@ -553,18 +736,23 @@ void BilingualDynSuffixArray::ClearWordInCache(wordID_t srcWord) m_wordPairCache.erase(first, last); } } -SentenceAlignment::SentenceAlignment(int sntIndex, int sourceSize, int targetSize) - :m_sntIndex(sntIndex) - ,numberAligned(targetSize, 0) - ,alignedList(sourceSize) + +SentenceAlignment:: +SentenceAlignment(int sntIndex, int 
sourceSize, int targetSize) + : m_sntIndex(sntIndex) + , numberAligned(targetSize, 0) + , alignedList(sourceSize) { - for(int i=0; i < sourceSize; ++i) { - std::vector<int> trgWrd; - alignedList[i] = trgWrd; - } + // What is the code below supposed to accomplish??? UG. + // for(int i=0; i < sourceSize; ++i) { + // vector<int> trgWrd; + // alignedList[i] = trgWrd; + // } } -bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const +bool +SentenceAlignment:: +Extract(int maxPhraseLength, vector<PhrasePair*> &ret, int startSource, int endSource) const { // foreign = target, F=T // english = source, E=S @@ -572,7 +760,7 @@ bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &r int minTarget = 9999; int maxTarget = -1; - std::vector< int > usedTarget = numberAligned; + vector< int > usedTarget = numberAligned; for(int sourcePos = startSource; sourcePos <= endSource; sourcePos++) { for(int ind=0; ind < (int)alignedList[sourcePos].size(); ind++) { int targetPos = alignedList[sourcePos][ind]; @@ -626,4 +814,42 @@ bool SentenceAlignment::Extract(int maxPhraseLength, std::vector<PhrasePair*> &r } +int +BilingualDynSuffixArray:: +GetSourceSentenceSize(size_t sentenceId) const +{ + return (sentenceId==m_srcSntBreaks.size()-1) ? + m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : + m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); +} + +int +BilingualDynSuffixArray:: +GetTargetSentenceSize(size_t sentenceId) const +{ + return (sentenceId==m_trgSntBreaks.size()-1) ? + m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : + m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); +} + +BetterPhrase:: +BetterPhrase(ScoresComp const& sc) + : cmp(sc) {} + +// bool +// BetterPhrase:: +// operator()(pair<Scores, TargetPhrase const*> const& a, +// pair<Scores, TargetPhrase const*> const& b) const +// { +// return cmp(b.first,a.first); +// } + +bool +BetterPhrase:: +operator()(pair<Scores, SAPhrase const*> const& a, + pair<Scores, SAPhrase const*> const& b) const +{ + return cmp(b.first,a.first); +} + }// end namepsace diff --git a/moses/TranslationModel/BilingualDynSuffixArray.h b/moses/TranslationModel/BilingualDynSuffixArray.h index 7a4d47eea..71f032507 100644 --- a/moses/TranslationModel/BilingualDynSuffixArray.h +++ b/moses/TranslationModel/BilingualDynSuffixArray.h @@ -5,23 +5,29 @@ #include "moses/TranslationModel/DynSAInclude/vocab.h" #include "moses/TranslationModel/DynSAInclude/types.h" #include "moses/TranslationModel/DynSAInclude/utils.h" +#include "moses/TranslationModel/WordCoocTable.h" #include "moses/InputFileStream.h" #include "moses/FactorTypeSet.h" #include "moses/TargetPhrase.h" +#include <boost/dynamic_bitset.hpp> +#include "moses/TargetPhraseCollection.h" +#include <map> +using namespace std; namespace Moses { +class PhraseDictionaryDynSuffixArray; /** @todo ask Abbey Levenberg */ class SAPhrase { public: - std::vector<wordID_t> words; + vector<wordID_t> words; SAPhrase(size_t phraseSize) - :words(phraseSize) { - } + :words(phraseSize) + {} void SetId(size_t pos, wordID_t id) { CHECK(pos < words.size()); @@ -43,12 +49,16 @@ public: , m_endTarget(endTarget) , m_startSource(startSource) , m_endSource(endSource) - , m_sntIndex(sntIndex) { - } + , m_sntIndex(sntIndex) + {} size_t GetTargetSize() const { return m_endTarget - m_startTarget + 1; } + + size_t GetSourceSize() const { + return m_endSource - m_startSource + 1; + } }; /** @todo ask Abbey Levenberg @@ -58,32 +68,43 @@ 
class SentenceAlignment public: SentenceAlignment(int sntIndex, int sourceSize, int targetSize); int m_sntIndex; - std::vector<wordID_t>* trgSnt; - std::vector<wordID_t>* srcSnt; - std::vector<int> numberAligned; - std::vector< std::vector<int> > alignedList; - bool Extract(int maxPhraseLength, std::vector<PhrasePair*> &ret, int startSource, int endSource) const; + vector<wordID_t>* trgSnt; + vector<wordID_t>* srcSnt; + vector<int> numberAligned; + vector< vector<int> > alignedList; + bool Extract(int maxPhraseLength, vector<PhrasePair*> &ret, + int startSource, int endSource) const; }; + class ScoresComp { public: - ScoresComp(const std::vector<float>& weights): m_weights(weights) {} + ScoresComp(const vector<float>& weights): m_weights(weights) {} bool operator()(const Scores& s1, const Scores& s2) const { return s1[0] < s2[0]; // just p(e|f) as approximation - /*float score1(0), score2(0); - int idx1(0), idx2(0); - for (Scores::const_iterator itr = s1.begin(); - itr != s1.end(); ++itr) { - score1 += log(*itr * m_weights.at(idx1++)); - } - for (Scores::const_iterator itr = s2.begin(); - itr != s2.end(); ++itr) { - score2 += log(*itr * m_weights.at(idx2++)); - } - return score1 < score2;*/ + // float score1(0), score2(0); + // int idx1(0), idx2(0); + // for (Scores::const_iterator itr = s1.begin(); + // itr != s1.end(); ++itr) { + // score1 += log(*itr * m_weights.at(idx1++)); + // } + // for (Scores::const_iterator itr = s2.begin(); + // itr != s2.end(); ++itr) { + // score2 += log(*itr * m_weights.at(idx2++)); + // } + // return score1 < score2; } private: - const std::vector<float>& m_weights; + const vector<float>& m_weights; +}; + +struct BetterPhrase { + ScoresComp const& cmp; + BetterPhrase(ScoresComp const& sc); + // bool operator()(pair<Scores, TargetPhrase const*> const& a, + // pair<Scores, TargetPhrase const*> const& b) const; + bool operator()(pair<Scores, SAPhrase const*> const& a, + pair<Scores, SAPhrase const*> const& b) const; }; /** @todo ask Abbey Levenberg @@ -93,66 +114,70 @@ class BilingualDynSuffixArray public: BilingualDynSuffixArray(); ~BilingualDynSuffixArray(); - bool Load( const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputTactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight); - bool LoadTM( const std::vector<FactorType>& inputFactors, - const std::vector<FactorType>& outputTactors, - std::string source, std::string target, std::string alignments, - const std::vector<float> &weight); - void GetTargetPhrasesByLexicalWeight(const Phrase& src, std::vector< std::pair<Scores, TargetPhrase*> >& target) const; + bool Load( const vector<FactorType>& inputFactors, + const vector<FactorType>& outputTactors, + string source, string target, string alignments, + const vector<float> &weight); + // bool LoadTM( const vector<FactorType>& inputFactors, + // const vector<FactorType>& outputTactors, + // string source, string target, string alignments, + // const vector<float> &weight); + void GetTargetPhrasesByLexicalWeight(const Phrase& src, vector< pair<Scores, TargetPhrase*> >& target) const; + + void CleanUp(const InputType& source); void addSntPair(string& source, string& target, string& alignment); + pair<float,float> + GatherCands(Phrase const& src, map<SAPhrase, vector<float> >& pstats) const; + + TargetPhrase* + GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const; + private: - DynSuffixArray* m_srcSA; - DynSuffixArray* m_trgSA; - std::vector<wordID_t>* m_srcCorpus; 
- std::vector<wordID_t>* m_trgCorpus; - std::vector<FactorType> m_inputFactors; - std::vector<FactorType> m_outputFactors; - std::vector<unsigned> m_srcSntBreaks, m_trgSntBreaks; + + mutable WordCoocTable m_wrd_cooc; + DynSuffixArray * m_srcSA; + DynSuffixArray * m_trgSA; + vector<wordID_t>* m_srcCorpus; + vector<wordID_t>* m_trgCorpus; + vector<FactorType> m_inputFactors; + vector<FactorType> m_outputFactors; + + vector<unsigned> m_srcSntBreaks, m_trgSntBreaks; Vocab* m_srcVocab, *m_trgVocab; ScoresComp* m_scoreCmp; - std::vector<SentenceAlignment> m_alignments; - std::vector<std::vector<short> > m_rawAlignments; + vector<SentenceAlignment> m_alignments; + vector<vector<short> > m_rawAlignments; - mutable std::map<std::pair<wordID_t, wordID_t>, std::pair<float, float> > m_wordPairCache; - mutable std::set<wordID_t> m_freqWordsCached; + mutable map<pair<wordID_t, wordID_t>, pair<float, float> > m_wordPairCache; + mutable set<wordID_t> m_freqWordsCached; const size_t m_maxPhraseLength, m_maxSampleSize; - - int LoadCorpus(FactorDirection direction, InputFileStream&, const std::vector<FactorType>& factors, - std::vector<wordID_t>&, std::vector<wordID_t>&, + const size_t m_maxPTEntries; + int LoadCorpus(FactorDirection direction, + InputFileStream&, const vector<FactorType>& factors, + vector<wordID_t>&, vector<wordID_t>&, Vocab*); int LoadAlignments(InputFileStream& aligs); int LoadRawAlignments(InputFileStream& aligs); int LoadRawAlignments(string& aligs); - bool ExtractPhrases(const int&, const int&, const int&, std::vector<PhrasePair*>&, bool=false) const; + bool ExtractPhrases(const int&, const int&, const int&, vector<PhrasePair*>&, bool=false) const; SentenceAlignment GetSentenceAlignment(const int, bool=false) const; - int SampleSelection(std::vector<unsigned>&, int = 300) const; + int SampleSelection(vector<unsigned>&, int = 300) const; - std::vector<int> GetSntIndexes(std::vector<unsigned>&, int, const std::vector<unsigned>&) const; - TargetPhrase* GetMosesFactorIDs(const SAPhrase&, const Phrase& sourcePhrase) const; + vector<int> GetSntIndexes(vector<unsigned>&, int, const vector<unsigned>&) const; SAPhrase TrgPhraseFromSntIdx(const PhrasePair&) const; bool GetLocalVocabIDs(const Phrase&, SAPhrase &) const; void CacheWordProbs(wordID_t) const; void CacheFreqWords() const; void ClearWordInCache(wordID_t); - std::pair<float, float> GetLexicalWeight(const PhrasePair&) const; + pair<float, float> GetLexicalWeight(const PhrasePair&) const; + + int GetSourceSentenceSize(size_t sentenceId) const; + int GetTargetSentenceSize(size_t sentenceId) const; - int GetSourceSentenceSize(size_t sentenceId) const { - return (sentenceId==m_srcSntBreaks.size()-1) ? - m_srcCorpus->size() - m_srcSntBreaks.at(sentenceId) : - m_srcSntBreaks.at(sentenceId+1) - m_srcSntBreaks.at(sentenceId); - } - int GetTargetSentenceSize(size_t sentenceId) const { - return (sentenceId==m_trgSntBreaks.size()-1) ? 
- m_trgCorpus->size() - m_trgSntBreaks.at(sentenceId) : - m_trgSntBreaks.at(sentenceId+1) - m_trgSntBreaks.at(sentenceId); - } }; } // end namespace #endif diff --git a/moses/TranslationModel/DynSuffixArray.cpp b/moses/TranslationModel/DynSuffixArray.cpp index c5fddf3f0..a5fc32dcc 100644 --- a/moses/TranslationModel/DynSuffixArray.cpp +++ b/moses/TranslationModel/DynSuffixArray.cpp @@ -1,5 +1,6 @@ #include "DynSuffixArray.h" #include <iostream> +#include <boost/foreach.hpp> using namespace std; @@ -215,8 +216,37 @@ void DynSuffixArray::Substitute(vuint_t* /* newSents */, unsigned /* newIndex */ return; } +ComparePosition:: +ComparePosition(vuint_t const& crp, vuint_t const& sfa) + : m_crp(crp), m_sfa(sfa) { } + +bool +ComparePosition:: +operator()(unsigned const& i, vector<wordID_t> const& phrase) const +{ + unsigned const* x = &m_crp.at(i); + unsigned const* e = &m_crp.back(); + size_t k = 0; + for (; k < phrase.size() && x < e; ++k, ++x) + if (*x != phrase[k]) return *x < phrase[k]; + return (x == e && k < phrase.size()); +} + +bool +ComparePosition:: +operator()(vector<wordID_t> const& phrase, unsigned const& i) const +{ + unsigned const* x = &m_crp.at(i); + unsigned const* e = &m_crp.back(); + size_t k = 0; + for (; k < phrase.size() && x < e; ++k, ++x) + if (*x != phrase[k]) return phrase[k] < *x; + return false; // (k == phrase.size() && x < e); +} + bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices) { + // DOES THIS EVEN WORK WHEN A DynSuffixArray has been saved and reloaded???? pair<vuint_t::iterator,vuint_t::iterator> bounds; indices->clear(); size_t phrasesize = phrase->size(); @@ -251,6 +281,16 @@ bool DynSuffixArray::GetCorpusIndex(const vuint_t* phrase, vuint_t* indices) return (indices->size() > 0); } +size_t +DynSuffixArray:: +GetCount(vuint_t const& phrase) const +{ + ComparePosition cmp(*m_corpus, *m_SA); + vuint_t::const_iterator lb = lower_bound(m_SA->begin(), m_SA->end(), phrase, cmp); + vuint_t::const_iterator ub = upper_bound(m_SA->begin(), m_SA->end(), phrase, cmp); + return ub-lb; +} + void DynSuffixArray::Save(FILE* fout) { fWriteVector(fout, *m_SA); diff --git a/moses/TranslationModel/DynSuffixArray.h b/moses/TranslationModel/DynSuffixArray.h index 024d6daf1..62f719d57 100644 --- a/moses/TranslationModel/DynSuffixArray.h +++ b/moses/TranslationModel/DynSuffixArray.h @@ -11,9 +11,25 @@ namespace Moses { - +using namespace std; typedef std::vector<unsigned> vuint_t; + +/// compare position /i/ in the suffix array /m_sfa/ into corpus /m_crp/ +/// against reference phrase /phrase/ +// added by Ulrich Germann +class ComparePosition +{ + vuint_t const& m_crp; + vuint_t const& m_sfa; + +public: + ComparePosition(vuint_t const& crp, vuint_t const& sfa); + bool operator()(unsigned const& i, vector<wordID_t> const& phrase) const; + bool operator()(vector<wordID_t> const& phrase, unsigned const& i) const; +}; + + /** @todo ask Abbey Levenberg */ class DynSuffixArray @@ -30,6 +46,8 @@ public: void Delete(unsigned, unsigned); void Substitute(vuint_t*, unsigned); + size_t GetCount(vuint_t const& phrase) const; + private: vuint_t* m_SA; vuint_t* m_ISA; @@ -46,10 +64,10 @@ private: void PrintAuxArrays() { std::cerr << "SA\tISA\tF\tL\n"; for(size_t i=0; i < m_SA->size(); ++i) - std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" << m_F->at(i) << "\t" << m_L->at(i) << std::endl; + std::cerr << m_SA->at(i) << "\t" << m_ISA->at(i) << "\t" + << m_F->at(i) << "\t" << m_L->at(i) << std::endl; } }; - } //end namespace #endif diff --git 
a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README new file mode 100644 index 000000000..19a7f8779 --- /dev/null +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.README @@ -0,0 +1,4 @@ +Specifying Dynamic Suffix Array-based Phrase Tables in moses.ini + +[ttable-file] +14 0 0 5 <source language text file> <target language text file> <file with alignment info in symal format> diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp index 61305b958..160362f4c 100644 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.cpp @@ -3,83 +3,103 @@ #include "moses/StaticData.h" #include "moses/TargetPhrase.h" #include <iomanip> - +#include <boost/foreach.hpp> using namespace std; namespace Moses { -PhraseDictionaryDynSuffixArray::PhraseDictionaryDynSuffixArray(const std::string &line) - :PhraseDictionary("PhraseDictionaryDynSuffixArray", line) +PhraseDictionaryDynSuffixArray:: +PhraseDictionaryDynSuffixArray(const std::string &line) + : PhraseDictionary("PhraseDictionaryDynSuffixArray", line) ,m_biSA(new BilingualDynSuffixArray()) { ReadParameters(); } -PhraseDictionaryDynSuffixArray::~PhraseDictionaryDynSuffixArray() -{ - delete m_biSA; -} void PhraseDictionaryDynSuffixArray::Load() { SetFeaturesToApply(); - const StaticData &staticData = StaticData::Instance(); - vector<float> weight = staticData.GetWeights(this); - - m_biSA->Load( m_input, m_output, m_source, m_target, m_alignments, weight); + vector<float> weight = StaticData::Instance().GetWeights(this); + m_biSA->Load(m_input, m_output, m_source, m_target, m_alignments, weight); } -const TargetPhraseCollection *PhraseDictionaryDynSuffixArray::GetTargetPhraseCollection(const Phrase& src) const +PhraseDictionaryDynSuffixArray:: +~PhraseDictionaryDynSuffixArray() { - TargetPhraseCollection *ret = new TargetPhraseCollection(); - std::vector< std::pair< Scores, TargetPhrase*> > trg; - // extract target phrases and their scores from suffix array - m_biSA->GetTargetPhrasesByLexicalWeight( src, trg); + delete m_biSA; +} - std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr; - for(itr = trg.begin(); itr != trg.end(); ++itr) { - Scores scoreVector = itr->first; - TargetPhrase *targetPhrase = itr->second; - //std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),NegateScore); - std::transform(scoreVector.begin(),scoreVector.end(),scoreVector.begin(),FloorScore); +void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value) +{ + if (key == "source") { + m_source = value; + } else if (key == "target") { + m_target = value; + } else if (key == "alignment") { + m_alignments = value; + } else { + PhraseDictionary::SetParameter(key, value); + } +} - targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); - targetPhrase->Evaluate(src); +const TargetPhraseCollection* +PhraseDictionaryDynSuffixArray:: +GetTargetPhraseCollection(const Phrase& src) const +{ + typedef map<SAPhrase, vector<float> >::value_type pstat_entry; + map<SAPhrase, vector<float> > pstats; // phrase (pair) statistics + m_biSA->GatherCands(src,pstats); - //cout << *targetPhrase << "\t" << std::setprecision(8) << scoreVector[2] << endl; - ret->Add(targetPhrase); + TargetPhraseCollection *ret = new TargetPhraseCollection(); + BOOST_FOREACH(pstat_entry & e, pstats) { + TargetPhrase* tp = 
m_biSA->GetMosesFactorIDs(e.first, src); + tp->GetScoreBreakdown().Assign(this,e.second); + ret->Add(tp); } - ret->NthElement(m_tableLimit); // sort the phrases for the dcoder + // return ret; + // TargetPhraseCollection *ret = new TargetPhraseCollection(); + // std::vector< std::pair< Scores, TargetPhrase*> > trg; + // + // // extract target phrases and their scores from suffix array + // m_biSA->GetTargetPhrasesByLexicalWeight(src, trg); + // + // std::vector< std::pair< Scores, TargetPhrase*> >::iterator itr; + // for(itr = trg.begin(); itr != trg.end(); ++itr) { + // Scores scoreVector = itr->first; + // TargetPhrase *targetPhrase = itr->second; + // std::transform(scoreVector.begin(),scoreVector.end(), + // scoreVector.begin(),FloorScore); + // targetPhrase->GetScoreBreakdown().Assign(this, scoreVector); + // targetPhrase->Evaluate(); + // ret->Add(targetPhrase); + // } + ret->NthElement(m_tableLimit); // sort the phrases for the decoder return ret; } -void PhraseDictionaryDynSuffixArray::insertSnt(string& source, string& target, string& alignment) +void +PhraseDictionaryDynSuffixArray:: +insertSnt(string& source, string& target, string& alignment) { m_biSA->addSntPair(source, target, alignment); // insert sentence pair into suffix arrays //StaticData::Instance().ClearTransOptionCache(); // clear translation option cache } -void PhraseDictionaryDynSuffixArray::deleteSnt(unsigned /* idx */, unsigned /* num2Del */) -{ - // need to implement -- -} -ChartRuleLookupManager *PhraseDictionaryDynSuffixArray::CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&) +void +PhraseDictionaryDynSuffixArray:: +deleteSnt(unsigned /* idx */, unsigned /* num2Del */) { - throw "Chart decoding not supported by PhraseDictionaryDynSuffixArray"; + // need to implement -- } -void PhraseDictionaryDynSuffixArray::SetParameter(const std::string& key, const std::string& value) +ChartRuleLookupManager* +PhraseDictionaryDynSuffixArray:: +CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&) { - if (key == "source") { - m_source = value; - } else if (key == "target") { - m_target = value; - } else if (key == "alignment") { - m_alignments = value; - } else { - PhraseDictionary::SetParameter(key, value); - } + CHECK(false); + return 0; } }// end namepsace diff --git a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h index 2c9a9ceca..8cd761350 100644 --- a/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h +++ b/moses/TranslationModel/PhraseDictionaryDynSuffixArray.h @@ -17,21 +17,19 @@ class PhraseDictionaryDynSuffixArray: public PhraseDictionary public: PhraseDictionaryDynSuffixArray(const std::string &line); ~PhraseDictionaryDynSuffixArray(); - + bool InitDictionary(); void Load(); - // functions below required by base class const TargetPhraseCollection* GetTargetPhraseCollection(const Phrase& src) const; void insertSnt(string&, string&, string&); void deleteSnt(unsigned, unsigned); ChartRuleLookupManager *CreateRuleLookupManager(const InputType&, const ChartCellCollectionBase&); - void SetParameter(const std::string& key, const std::string& value); - private: BilingualDynSuffixArray *m_biSA; std::string m_source, m_target, m_alignments; + std::vector<float> m_weight; }; } // end namespace diff --git a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp index 2f05e92b6..06e931fab 100644 --- 
a/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp +++ b/moses/TranslationModel/RuleTable/PhraseDictionaryOnDisk.cpp @@ -31,7 +31,8 @@ using namespace std; namespace Moses { PhraseDictionaryOnDisk::PhraseDictionaryOnDisk(const std::string &line) - : MyBase("PhraseDictionaryOnDisk", line) { + : MyBase("PhraseDictionaryOnDisk", line) +{ ReadParameters(); } diff --git a/moses/TranslationModel/RuleTable/Trie.h b/moses/TranslationModel/RuleTable/Trie.h index 265747260..0d363f747 100644 --- a/moses/TranslationModel/RuleTable/Trie.h +++ b/moses/TranslationModel/RuleTable/Trie.h @@ -48,12 +48,6 @@ public: void Load(); - // Required by PhraseDictionary. - virtual const TargetPhraseCollection *GetTargetPhraseCollection(const Phrase &) const { - CHECK(false); - return NULL; - } - private: friend class RuleTableLoader; diff --git a/moses/TranslationModel/WordCoocTable.cpp b/moses/TranslationModel/WordCoocTable.cpp new file mode 100644 index 000000000..c3223942e --- /dev/null +++ b/moses/TranslationModel/WordCoocTable.cpp @@ -0,0 +1,72 @@ +#include "moses/TranslationModel/WordCoocTable.h" +using namespace std; +namespace Moses +{ + +WordCoocTable:: +WordCoocTable() +{ + m_cooc.reserve(1000000); + m_marg1.reserve(1000000); + m_marg2.reserve(1000000); +} + +WordCoocTable:: +WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2) + : m_cooc(VocabSize1), m_marg1(VocabSize1,0), m_marg2(VocabSize2, 0) +{} + +void +WordCoocTable:: +Count(size_t const a, size_t const b) +{ + while (a >= m_marg1.size()) { + m_cooc.push_back(my_map_t()); + m_marg1.push_back(0); + } + while (b >= m_marg2.size()) + m_marg2.push_back(0); + ++m_marg1[a]; + ++m_marg2[b]; + ++m_cooc[a][b]; +} + +uint32_t +WordCoocTable:: +GetJoint(size_t const a, size_t const b) const +{ + if (a >= m_marg1.size() || b >= m_marg2.size()) return 0; + my_map_t::const_iterator m = m_cooc.at(a).find(b); + if (m == m_cooc[a].end()) return 0; + return m->second; +} + +uint32_t +WordCoocTable:: +GetMarg1(size_t const x) const +{ + return x >= m_marg1.size() ? 0 : m_marg1[x]; +} + +uint32_t +WordCoocTable:: +GetMarg2(size_t const x) const +{ + return x >= m_marg2.size() ? 
0 : m_marg2[x]; +} + +float +WordCoocTable:: +pfwd(size_t const a, size_t const b) const +{ + return float(GetJoint(a,b))/GetMarg1(a); +} + +float +WordCoocTable:: +pbwd(size_t const a, size_t const b) const +{ + // cerr << "at " << __FILE__ << ":" << __LINE__ << endl; + return float(GetJoint(a,b))/GetMarg2(b); +} +} diff --git a/moses/TranslationModel/WordCoocTable.h b/moses/TranslationModel/WordCoocTable.h new file mode 100644 index 000000000..60d788bd3 --- /dev/null +++ b/moses/TranslationModel/WordCoocTable.h @@ -0,0 +1,72 @@ +#ifndef moses_WordCoocTable_h +#define moses_WordCoocTable_h + +#include "moses/TranslationModel/DynSAInclude/vocab.h" +#include "moses/TranslationModel/DynSAInclude/types.h" +#include "moses/TranslationModel/DynSAInclude/utils.h" +#include "moses/InputFileStream.h" +#include "moses/FactorTypeSet.h" +#include "moses/TargetPhrase.h" +#include <boost/dynamic_bitset.hpp> +#include <map> + +namespace Moses +{ + +using namespace std; + +#ifndef bitvector +typedef boost::dynamic_bitset<uint64_t> bitvector; +#endif + + +/** + * Stores word cooccurrence counts + * @todo ask Uli Germann + */ +class WordCoocTable +{ + typedef map<wordID_t,uint32_t> my_map_t; + vector<my_map_t> m_cooc; + vector<uint32_t> m_marg1; + vector<uint32_t> m_marg2; +public: + WordCoocTable(); + WordCoocTable(wordID_t const VocabSize1, wordID_t const VocabSize2); + uint32_t GetJoint(size_t const a, size_t const b) const; + uint32_t GetMarg1(size_t const x) const; + uint32_t GetMarg2(size_t const x) const; + float pfwd(size_t const a, size_t const b) const; + float pbwd(size_t const a, size_t const b) const; + void + Count(size_t const a, size_t const b); + + template<typename idvec, typename alnvec> + void + Count(idvec const& s1, idvec const& s2, alnvec const& aln, + wordID_t const NULL1, wordID_t const NULL2); + +}; + +template<typename idvec, typename alnvec> +void +WordCoocTable:: +Count(idvec const& s1, idvec const& s2, alnvec const& aln, + wordID_t const NULL1, wordID_t const NULL2) +{ + boost::dynamic_bitset<uint64_t> check1(s1.size()), check2(s2.size()); + check1.set(); + check2.set(); + for (size_t i = 0; i < aln.size(); i += 2) { + Count(s1[aln[i]], s2[aln[i+1]]); + check1.reset(aln[i]); + check2.reset(aln[i+1]); + } + for (size_t i = check1.find_first(); i < check1.size(); i = check1.find_next(i)) + Count(s1[i], NULL2); + for (size_t i = check2.find_first(); i < check2.size(); i = check2.find_next(i)) + Count(NULL1, s2[i]); +} + +} +#endif diff --git a/moses/TypeDef.h b/moses/TypeDef.h index 2b98b5bc3..af3a47b23 100644 --- a/moses/TypeDef.h +++ b/moses/TypeDef.h @@ -121,6 +121,7 @@ enum PhraseTableImplementation { ,FuzzyMatch = 11 ,Compact = 12 ,Interpolated = 13 + ,DSuffixArray = 14 }; enum InputTypeEnum { diff --git a/moses/generic/sampling/Sampling.h b/moses/generic/sampling/Sampling.h new file mode 100644 index 000000000..69796c96e --- /dev/null +++ b/moses/generic/sampling/Sampling.h @@ -0,0 +1,51 @@ +#ifndef __sampling_h +#define __sampling_h + +// Utility functions for proper sub-sampling. +// (c) 2007-2012 Ulrich Germann + + +namespace Moses +{ + +inline +size_t +randInt(size_t N) +{ + return N*(rand()/(RAND_MAX+1.)); +} + +// select a random sample of size /s/ without restitution from the range of +// integers [0,N); +template<typename idx_t> +void +randomSample(vector<idx_t>& v, size_t s, size_t N) +{ + // see also Knuth: Art of Computer Programming Vol. 2, p. 
142 + + s = min(s,N); + v.resize(s); + + // the first option tries to be a bit more efficient than O(N) in picking + // the samples. The threshold is an ad-hoc, off-the-cuff guess. I still + // need to figure out the optimal break-even point between a linear sweep + // and repeatedly picking random numbers with the risk of hitting the same + // number many times. + if (s*10<N) { + boost::dynamic_bitset<uint64_t> check(N,0); + for (size_t i = 0; i < v.size(); i++) { + size_t x = randInt(N); + while (check[x]) x = randInt(N); + check[x]=true; + v[i] = x; + } + } else { + size_t m=0; + for (size_t t = 0; m <= s && t < N; t++) + if (s==N || randInt(N-t) < s-m) v[m++] = t; + } +} + +}; + +#endif diff --git a/moses/generic/sorting/NBestList.h b/moses/generic/sorting/NBestList.h new file mode 100644 index 000000000..9c58baa88 --- /dev/null +++ b/moses/generic/sorting/NBestList.h @@ -0,0 +1,85 @@ +#ifndef __n_best_list_h +#define __n_best_list_h +#include <algorithm> +#include "moses/generic/sorting/VectorIndexSorter.h" + +// NBest List; (c) 2007-2012 Ulrich Germann +// +// The 'trick' used in this implementation is to maintain a heap of size <= N +// such that the lowest-scoring item is on top of the heap. For each incoming +// item we can then determine easily if it is in the top N. + +namespace Moses +{ +using namespace std; + +template<typename THINGY, typename CMP> +class + NBestList +{ + vector<uint32_t> m_heap; + vector<THINGY> m_list; + VectorIndexSorter<THINGY, CMP, uint32_t> m_better; + mutable vector<uint32_t> m_order; + mutable bool m_changed; +public: + NBestList(size_t const max_size, CMP const& cmp); + NBestList(size_t const max_size); + bool add(THINGY const& item); + THINGY const& operator[](int i) const; + size_t size() const { + return m_heap.size(); + } +}; + +template<typename THINGY, typename CMP> +NBestList<THINGY,CMP>:: +NBestList(size_t const max_size, CMP const& cmp) + : m_better(m_list, cmp), m_changed(false) +{ + m_heap.reserve(max_size); +} + +template<typename THINGY, typename CMP> +NBestList<THINGY,CMP>:: +NBestList(size_t const max_size) + : m_better(m_heap), m_changed(false) +{ + m_heap.reserve(max_size); +} + +template<typename THINGY, typename CMP> +bool +NBestList<THINGY,CMP>:: +add(THINGY const& item) +{ + if (m_heap.size() == m_heap.capacity()) { + if (m_better.Compare(item, m_list[m_heap.at(0)])) { + pop_heap(m_heap.begin(),m_heap.end(),m_better); + m_list[m_heap.back()] = item; + } else return false; + } else { + m_list.push_back(item); + m_heap.push_back(m_heap.size()); + } + push_heap(m_heap.begin(),m_heap.end(),m_better); + return m_changed = true; +} + +template<typename THINGY, typename CMP> +THINGY const& +NBestList<THINGY,CMP>:: +operator[](int i) const +{ + if (m_changed) { + m_order.assign(m_heap.begin(),m_heap.end()); + for (size_t k = m_heap.size(); k != 0; --k) + pop_heap(m_order.begin(), m_order.begin()+k); + m_changed = false; + } + if (i < 0) i += m_order.size(); + return m_list[m_order.at(i)]; +} + +} +#endif diff --git a/moses/generic/sorting/VectorIndexSorter.h b/moses/generic/sorting/VectorIndexSorter.h new file mode 100644 index 000000000..a689c417b --- /dev/null +++ b/moses/generic/sorting/VectorIndexSorter.h @@ -0,0 +1,69 @@ +#ifndef __vector_index_sorter_h +#define __vector_index_sorter_h + +// VectorIndexSorter; (c) 2007-2012 Ulrich Germann + +// A VectorIndexSorter is a function object for sorting indices into a vector +// of objects (instead of sorting the vector itself). 
+//
+// typical use:
+// vector<thingy> my_vector;
+// VectorIndexSorter<thingy,less<thingy>,int> sorter(my_vector);
+// vector<int> order;
+// sorter.GetOrder(order);
+
+namespace Moses
+{
+template<typename VAL, typename COMP = greater<VAL>, typename IDX_T=size_t>
+class
+  VectorIndexSorter : public binary_function<IDX_T const&, IDX_T const&, bool>
+{
+  vector<VAL> const& m_vecref;
+  boost::shared_ptr<COMP> m_comp;
+public:
+
+  COMP const& Compare;
+  VectorIndexSorter(vector<VAL> const& v, COMP const& comp)
+    : m_vecref(v), Compare(comp)
+  { }
+
+  VectorIndexSorter(vector<VAL> const& v)
+    : m_vecref(v), m_comp(new COMP()), Compare(*m_comp)
+  { }
+
+  bool operator()(IDX_T const& a, IDX_T const& b) const {
+    bool fwd = Compare(m_vecref.at(a), m_vecref.at(b));
+    bool bwd = Compare(m_vecref[b], m_vecref[a]);
+    return (fwd == bwd ? a < b : fwd);
+  }
+
+  boost::shared_ptr<vector<IDX_T> >
+  GetOrder() const;
+
+  void
+  GetOrder(vector<IDX_T> & order) const;
+
+};
+
+template<typename VAL, typename COMP, typename IDX_T>
+boost::shared_ptr<vector<IDX_T> >
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder() const
+{
+  boost::shared_ptr<vector<IDX_T> > ret(new vector<IDX_T>(m_vecref.size()));
+  GetOrder(*ret);
+  return ret;
+}
+
+template<typename VAL, typename COMP, typename IDX_T>
+void
+VectorIndexSorter<VAL,COMP,IDX_T>::
+GetOrder(vector<IDX_T> & order) const
+{
+  order.resize(m_vecref.size());
+  for (IDX_T i = 0; i < IDX_T(m_vecref.size()); ++i) order[i] = i;
+  sort(order.begin(), order.end(), *this);
+}
+
+}
+#endif
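
The new ComparePosition comparator and DynSuffixArray::GetCount in this patch count the occurrences of a phrase by binary-searching the suffix array with lower_bound/upper_bound rather than enumerating positions. The standalone sketch below illustrates the same idea on plain vectors of word IDs; it does not use the Moses classes, the names SuffixVsPhrase and CountOccurrences are illustrative only, and the suffix array is assumed to be fully sorted over the corpus.

#include <algorithm>
#include <cstddef>
#include <vector>

typedef unsigned wid_t;

// Compare a suffix (starting at corpus position i) against a query phrase,
// mirroring the two asymmetric comparisons ComparePosition provides.
struct SuffixVsPhrase {
  const std::vector<wid_t>& crp;   // the corpus as one flat sequence of word IDs
  explicit SuffixVsPhrase(const std::vector<wid_t>& c) : crp(c) {}

  // suffix < phrase ?
  bool operator()(unsigned i, const std::vector<wid_t>& p) const {
    for (size_t k = 0; k < p.size(); ++k) {
      if (i + k >= crp.size()) return true;        // suffix is a proper prefix
      if (crp[i + k] != p[k]) return crp[i + k] < p[k];
    }
    return false;                                   // phrase is a prefix of the suffix
  }
  // phrase < suffix ?
  bool operator()(const std::vector<wid_t>& p, unsigned i) const {
    for (size_t k = 0; k < p.size(); ++k) {
      if (i + k >= crp.size()) return false;
      if (crp[i + k] != p[k]) return p[k] < crp[i + k];
    }
    return false;
  }
};

// Number of occurrences of phrase in corpus, given a sorted suffix array sa.
size_t CountOccurrences(const std::vector<wid_t>& corpus,
                        const std::vector<unsigned>& sa,
                        const std::vector<wid_t>& phrase) {
  SuffixVsPhrase cmp(corpus);
  std::vector<unsigned>::const_iterator lb =
      std::lower_bound(sa.begin(), sa.end(), phrase, cmp);
  std::vector<unsigned>::const_iterator ub =
      std::upper_bound(sa.begin(), sa.end(), phrase, cmp);
  return ub - lb;                                   // matches form one contiguous block
}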
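
PhraseDictionaryDynSuffixArray.README and the new DSuffixArray = 14 value in TypeDef.h together describe how the suffix-array phrase table is requested through the legacy [ttable-file] syntax. A hypothetical moses.ini fragment following that format might look like the lines below; the three file names are placeholders, not files shipped with Moses, and the leading 14 selects the DSuffixArray implementation while the trailing entries name the source text, target text, and symal-format alignment file as stated in the README.

[ttable-file]
14 0 0 5 corpus.fr corpus.en corpus.fr-en.symal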
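
WordCoocTable accumulates joint counts m_cooc[a][b] plus the two marginals from symal-style alignments (unaligned words are paired with a NULL token), and pfwd/pbwd then return joint-over-marginal ratios. The following minimal sketch of the same bookkeeping is self-contained and assumes small integer word IDs; the names CoocCounts, src, trg, and aln are illustrative, not the Moses API.

#include <cstddef>
#include <iostream>
#include <map>
#include <vector>

typedef unsigned wid_t;

// Minimal stand-in for WordCoocTable: joint counts plus the two marginals.
struct CoocCounts {
  std::vector<std::map<wid_t, unsigned> > joint;  // joint[a][b]
  std::vector<unsigned> marg1, marg2;             // source / target marginals

  void Count(wid_t a, wid_t b) {
    if (a >= marg1.size()) { joint.resize(a + 1); marg1.resize(a + 1, 0); }
    if (b >= marg2.size()) marg2.resize(b + 1, 0);
    ++marg1[a]; ++marg2[b]; ++joint[a][b];
  }
  // Forward lexical probability estimate, joint(a,b) / marg1(a), like pfwd().
  float pfwd(wid_t a, wid_t b) const {
    std::map<wid_t, unsigned>::const_iterator m = joint[a].find(b);
    return m == joint[a].end() ? 0.f : float(m->second) / marg1[a];
  }
};

int main() {
  // Hypothetical sentence pair of word IDs with a symal-style alignment given
  // as flattened (source index, target index) pairs, as in
  // WordCoocTable::Count(s1, s2, aln, NULL1, NULL2).
  wid_t src[] = {3, 7, 5};
  wid_t trg[] = {11, 9};
  unsigned aln[] = {0, 0, 1, 1};                  // 0-0 1-1; src word 2 is unaligned
  const wid_t NULL2 = 0;                          // ID reserved for the NULL token

  CoocCounts c;
  for (size_t i = 0; i + 1 < sizeof(aln) / sizeof(*aln); i += 2)
    c.Count(src[aln[i]], trg[aln[i + 1]]);
  c.Count(src[2], NULL2);                         // unaligned source word -> NULL

  std::cout << c.pfwd(3, 11) << std::endl;        // prints 1: ID 3 always co-occurs with 11
}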
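
randomSample in the new Sampling.h mixes two strategies: when the requested sample is much smaller than the population it repeatedly draws random indices and rejects duplicates, otherwise it makes a single linear sweep (Knuth's selection sampling, TAOCP Vol. 2). A standalone sketch of the same two-regime idea follows; it uses a std::set where the original uses a boost::dynamic_bitset, rand() is assumed to be an acceptable randomness source for illustration, and the names RandInt and RandomSample are illustrative only.

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <set>
#include <vector>

// Uniform integer in [0, N), mirroring randInt() in Sampling.h.
inline size_t RandInt(size_t N) {
  return size_t(N * (std::rand() / (RAND_MAX + 1.0)));
}

// Draw s distinct indices from [0, N) without replacement.
void RandomSample(std::vector<size_t>& v, size_t s, size_t N) {
  s = std::min(s, N);
  v.resize(s);
  if (s * 10 < N) {
    // Small sample: rejection sampling against a set of already-drawn indices.
    std::set<size_t> seen;
    for (size_t i = 0; i < s; ++i) {
      size_t x = RandInt(N);
      while (!seen.insert(x).second) x = RandInt(N);
      v[i] = x;
    }
  } else {
    // Otherwise sweep 0..N-1 once, keeping index t with probability
    // (s - m) / (N - t), where m indices have been kept so far (Knuth, Alg. S).
    size_t m = 0;
    for (size_t t = 0; m < s && t < N; ++t)
      if (s == N || RandInt(N - t) < s - m) v[m++] = t;
  }
}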
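
NBestList keeps at most N items by maintaining a heap whose top is the current worst entry; a new candidate only displaces that entry when the comparator ranks it higher, and VectorIndexSorter lets the heap hold indices instead of the items themselves. The self-contained sketch below shows the same bounded-heap trick on raw float scores (higher is better) rather than the Moses templates; the class name TopN and the sample scores are illustrative only.

#include <algorithm>
#include <functional>
#include <iostream>
#include <vector>

// Keep the N largest scores seen so far; the smallest kept score sits on top
// of a min-heap so it can be evicted cheaply, as described in NBestList.h.
class TopN {
  size_t m_max;
  std::vector<float> m_heap;   // min-heap via std::greater
public:
  explicit TopN(size_t n) : m_max(n) {}

  bool Add(float score) {
    if (m_heap.size() < m_max) {
      m_heap.push_back(score);
      std::push_heap(m_heap.begin(), m_heap.end(), std::greater<float>());
      return true;
    }
    if (score <= m_heap.front()) return false;    // not better than current worst
    std::pop_heap(m_heap.begin(), m_heap.end(), std::greater<float>());
    m_heap.back() = score;                        // replace the evicted worst
    std::push_heap(m_heap.begin(), m_heap.end(), std::greater<float>());
    return true;
  }

  // Kept scores in descending order, like iterating an n-best list.
  std::vector<float> Sorted() const {
    std::vector<float> out(m_heap);
    std::sort(out.begin(), out.end(), std::greater<float>());
    return out;
  }
};

int main() {
  TopN best(3);
  float scores[] = {0.2f, 0.9f, 0.1f, 0.5f, 0.7f};
  for (size_t i = 0; i < 5; ++i) best.Add(scores[i]);
  std::vector<float> top = best.Sorted();
  for (size_t i = 0; i < top.size(); ++i) std::cout << top[i] << " ";  // 0.9 0.7 0.5
  std::cout << std::endl;
}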