From 3a72b4958a3fc468b6bd6102e67e24007c9b2d9b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Mon, 3 Oct 2016 19:02:06 +0100 Subject: update Moses::ProbingPT with Moses2::ProbingPT. Does not compile --- contrib/other-builds/moses/.project | 32 +- misc/CreateProbingPT.cpp | 108 ++++- misc/Jamfile | 4 +- misc/QueryProbingPT.cpp | 2 +- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 93 +++-- moses/TranslationModel/ProbingPT/ProbingPT.h | 19 +- moses/TranslationModel/ProbingPT/hash.cpp | 36 +- moses/TranslationModel/ProbingPT/hash.hh | 7 +- moses/TranslationModel/ProbingPT/huffmanish.cpp | 451 --------------------- moses/TranslationModel/ProbingPT/huffmanish.hh | 112 ----- moses/TranslationModel/ProbingPT/line_splitter.cpp | 59 ++- moses/TranslationModel/ProbingPT/line_splitter.hh | 36 +- .../ProbingPT/probing_hash_utils.cpp | 28 +- .../ProbingPT/probing_hash_utils.hh | 38 +- moses/TranslationModel/ProbingPT/quering.cpp | 221 ++++------ moses/TranslationModel/ProbingPT/quering.hh | 62 ++- moses/TranslationModel/ProbingPT/storing.cpp | 322 +++++++++++---- moses/TranslationModel/ProbingPT/storing.hh | 91 ++++- moses/TranslationModel/ProbingPT/vocabid.cpp | 53 ++- moses/TranslationModel/ProbingPT/vocabid.hh | 15 +- 20 files changed, 837 insertions(+), 952 deletions(-) delete mode 100644 moses/TranslationModel/ProbingPT/huffmanish.cpp delete mode 100644 moses/TranslationModel/ProbingPT/huffmanish.hh diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index b59f28e08..c25eb5225 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -1319,7 +1319,7 @@ FF/PhraseBoundaryFeature.h 1 PARENT-3-PROJECT_LOC/moses/FF/PhraseBoundaryFeature.h - + FF/PhraseDistanceFeature.cpp 1 @@ -3341,24 +3341,34 @@ PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/ProbingPT.h - TranslationModel/ProbingPT/hash.cpp + TranslationModel/ProbingPT/StoreTarget.cpp 1 - 
PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.cpp - TranslationModel/ProbingPT/hash.hh + TranslationModel/ProbingPT/StoreTarget.h 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreTarget.h - TranslationModel/ProbingPT/huffmanish.cpp + TranslationModel/ProbingPT/StoreVocab.cpp 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.cpp - TranslationModel/ProbingPT/huffmanish.hh + TranslationModel/ProbingPT/StoreVocab.h 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/huffmanish.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/StoreVocab.h + + + TranslationModel/ProbingPT/hash.cpp + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.cpp + + + TranslationModel/ProbingPT/hash.hh + 1 + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/hash.hh TranslationModel/ProbingPT/line_splitter.cpp @@ -3664,7 +3674,7 @@ TranslationModel/UG/sapt_pscore_coherence.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_coherence.h - + TranslationModel/UG/sapt_pscore_lex1.h 1 @@ -3709,7 +3719,7 @@ TranslationModel/UG/sapt_pscore_wordcount.h 1 PARENT-3-PROJECT_LOC/moses/TranslationModel/UG/sapt_pscore_wordcount.h - + TranslationModel/UG/sim-pe.cc 1 diff --git a/misc/CreateProbingPT.cpp b/misc/CreateProbingPT.cpp index b23427f30..dff916660 100644 --- a/misc/CreateProbingPT.cpp +++ b/misc/CreateProbingPT.cpp @@ -1,29 +1,113 @@ +#include +#include #include "util/usage.hh" #include "moses/TranslationModel/ProbingPT/storing.hh" +#include "moses/InputFileStream.h" +#include "moses/OutputFileStream.h" +#include "moses/Util.h" +using namespace std; +std::string ReformatSCFGFile(const std::string &path); int main(int argc, char* argv[]) { + string inPath, outPath; + int num_scores = 4; + int num_lex_scores = 0; + 
bool log_prob = false; + bool scfg = false; + int max_cache_size = 50000; - const char * is_reordering = "false"; + namespace po = boost::program_options; + po::options_description desc("Options"); + desc.add_options() + ("help", "Print help messages") + ("input-pt", po::value()->required(), "Text pt") + ("output-dir", po::value()->required(), "Directory when binary files will be written") + ("num-scores", po::value()->default_value(num_scores), "Number of pt scores") + ("num-lex-scores", po::value()->default_value(num_lex_scores), "Number of lexicalized reordering scores") + ("log-prob", "log (and floor) probabilities before storing") + ("max-cache-size", po::value()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit") + ("scfg", "Rules are SCFG in Moses format (ie. with non-terms and LHS") - if (!(argc == 5 || argc == 4)) { - // Tell the user how to run the program - std::cerr << "Provided " << argc << " arguments, needed 4 or 5." << std::endl; - std::cerr << "Usage: " << argv[0] << " path_to_phrasetable output_dir num_scores is_reordering" << std::endl; - std::cerr << "is_reordering should be either true or false, but it is currently a stub feature." 
<< std::endl; - //std::cerr << "Usage: " << argv[0] << " path_to_phrasetable number_of_uniq_lines output_bin_file output_hash_table output_vocab_id" << std::endl; - return 1; + ; + + po::variables_map vm; + try { + po::store(po::parse_command_line(argc, argv, desc), + vm); // can throw + + /** --help option + */ + if ( vm.count("help")) { + std::cout << desc << std::endl; + return EXIT_SUCCESS; + } + + po::notify(vm); // throws on error, so do after help in case + // there are any problems + } catch(po::error& e) { + std::cerr << "ERROR: " << e.what() << std::endl << std::endl; + std::cerr << desc << std::endl; + return EXIT_FAILURE; } - if (argc == 5) { - is_reordering = argv[4]; + if (vm.count("input-pt")) inPath = vm["input-pt"].as(); + if (vm.count("output-dir")) outPath = vm["output-dir"].as(); + if (vm.count("num-scores")) num_scores = vm["num-scores"].as(); + if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as(); + if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as(); + if (vm.count("log-prob")) log_prob = true; + if (vm.count("scfg")) scfg = true; + + + if (scfg) { + inPath = ReformatSCFGFile(inPath); } - createProbingPT(argv[1], argv[2], argv[3], is_reordering); + Moses::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg); - util::PrintUsage(std::cout); + //util::PrintUsage(std::cout); return 0; } +std::string ReformatSCFGFile(const std::string &path) +{ + Moses::InputFileStream inFile(path); + string reformattedPath = path + ".reformat.gz"; + Moses::OutputFileStream outFile(reformattedPath); + + string line; + while (getline(inFile, line)) { + vector toks = Moses::TokenizeMultiCharSeparator(line, "|||"); + assert(toks.size() >= 3); + + // source + vector sourceToks = Moses::Tokenize(toks[0], " "); + for (size_t i = 0; i < sourceToks.size() - 1; ++i) { + outFile << sourceToks[i] << " "; + } + + // other columns + for (size_t i = 1; i < toks.size(); ++i) { + outFile << "|||" 
<< toks[i]; + } + outFile << endl; + } + + inFile.Close(); + outFile.Close(); + + string sortedPath = path + ".reformat.sorted.gz"; + string tmpPath = path + ".tmp "; + string cmd = "mkdir " + tmpPath + + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + tmpPath + " | gzip -c > " + sortedPath; + system(cmd.c_str()); + + cmd = "rm -rf " + tmpPath + " " + reformattedPath; + system(cmd.c_str()); + + return sortedPath; +} + diff --git a/misc/Jamfile b/misc/Jamfile index f1599aca8..135490a46 100644 --- a/misc/Jamfile +++ b/misc/Jamfile @@ -31,9 +31,9 @@ else { } exe CreateProbingPT : CreateProbingPT.cpp ..//boost_filesystem ../moses//moses ; -exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ; +#exe QueryProbingPT : QueryProbingPT.cpp ..//boost_filesystem ../moses//moses ; -alias programsProbing : CreateProbingPT QueryProbingPT ; +alias programsProbing : CreateProbingPT ; #QueryProbingPT exe merge-sorted : merge-sorted.cc diff --git a/misc/QueryProbingPT.cpp b/misc/QueryProbingPT.cpp index 72fd0be11..5047d4d47 100644 --- a/misc/QueryProbingPT.cpp +++ b/misc/QueryProbingPT.cpp @@ -34,7 +34,7 @@ int main(int argc, char* argv[]) return 1; } - QueryEngine queries(argv[1]); + Moses::QueryEngine queries(argv[1]); //Interactive search std::cout << "Please enter a string to be searched, or exit to exit." 
<< std::endl; diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index cbfd2c1a4..bb3f26e22 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -3,6 +3,7 @@ #include "moses/StaticData.h" #include "moses/FactorCollection.h" #include "moses/TargetPhraseCollection.h" +#include "moses/InputFileStream.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" #include "quering.hh" @@ -34,44 +35,94 @@ void ProbingPT::Load(AllOptions::ptr const& opts) m_unkId = 456456546456; + FactorCollection &vocab = FactorCollection::Instance(); + // source vocab - const std::map &sourceVocab = m_engine->getSourceVocab(); + const std::map &sourceVocab = + m_engine->getSourceVocab(); std::map::const_iterator iterSource; - for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); ++iterSource) { - const string &wordStr = iterSource->second; - const Factor *factor = FactorCollection::Instance().AddFactor(wordStr); + for (iterSource = sourceVocab.begin(); iterSource != sourceVocab.end(); + ++iterSource) { + string wordStr = iterSource->second; + //cerr << "wordStr=" << wordStr << endl; - uint64_t probingId = iterSource->first; + const Factor *factor = vocab.AddFactor(wordStr); - SourceVocabMap::value_type entry(factor, probingId); - m_sourceVocabMap.insert(entry); + uint64_t probingId = iterSource->first; + size_t factorId = factor->GetId(); + if (factorId >= m_sourceVocab.size()) { + m_sourceVocab.resize(factorId + 1, m_unkId); + } + m_sourceVocab[factorId] = probingId; } // target vocab - const std::map &probingVocab = m_engine->getVocab(); - std::map::const_iterator iter; - for (iter = probingVocab.begin(); iter != probingVocab.end(); ++iter) { - const string &wordStr = iter->second; - const Factor *factor = FactorCollection::Instance().AddFactor(wordStr); + InputFileStream targetVocabStrme(m_filePath + "/TargetVocab.dat"); + 
string line; + while (getline(targetVocabStrme, line)) { + vector toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, string("Incorrect format:") + line + "\n"); + + //cerr << "wordStr=" << toks[0] << endl; + + const Factor *factor = vocab.AddFactor(toks[0]); + uint32_t probingId = Scan(toks[1]); + + if (probingId >= m_targetVocab.size()) { + m_targetVocab.resize(probingId + 1); + } + + m_targetVocab[probingId] = factor; + } + + // alignments + CreateAlignmentMap(m_filePath + "/Alignments.dat"); - unsigned int probingId = iter->first; + // memory mapped file to tps + string filePath = m_filePath + "/TargetColl.dat"; + file.open(filePath.c_str()); + if (!file.is_open()) { + throw "Couldn't open file "; + } + + data = file.data(); + //size_t size = file.size(); + + // cache + //CreateCache(system); - TargetVocabMap::value_type entry(factor, probingId); - m_vocabMap.insert(entry); +} +void ProbingPT::CreateAlignmentMap(const std::string path) +{ + const std::vector< std::vector > &probingAlignColl = m_engine->getAlignments(); + m_aligns.resize(probingAlignColl.size(), NULL); + + for (size_t i = 0; i < probingAlignColl.size(); ++i) { + AlignmentInfo::CollType aligns; + + const std::vector &probingAligns = probingAlignColl[i]; + for (size_t j = 0; j < probingAligns.size(); j += 2) { + size_t startPos = probingAligns[j]; + size_t endPos = probingAligns[j+1]; + //cerr << "startPos=" << startPos << " " << endPos << endl; + aligns.insert(std::pair(startPos, endPos)); + } + + const AlignmentInfo *align = AlignmentInfoCollection::Instance().Add(aligns); + m_aligns[i] = align; + //cerr << "align=" << align->Debug(system) << endl; } } void ProbingPT::InitializeForInput(ttasksptr const& ttask) { - ReduceCache(); + } void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQueue) const { - CacheColl &cache = GetCache(); - InputPathList::const_iterator iter; for (iter = inputPathQueue.begin(); iter != inputPathQueue.end(); ++iter) { InputPath 
&inputPath = **iter; @@ -82,12 +133,6 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue } TargetPhraseCollection::shared_ptr tpColl = CreateTargetPhrase(sourcePhrase); - - // add target phrase to phrase-table cache - size_t hash = hash_value(sourcePhrase); - std::pair value(tpColl, clock()); - cache[hash] = value; - inputPath.SetTargetPhrases(*this, tpColl, NULL); } } diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h index 4e7ab02c6..3b5dfc895 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT/ProbingPT.h @@ -1,17 +1,17 @@ #pragma once - +#include #include #include "../PhraseDictionary.h" -class QueryEngine; -class target_text; namespace Moses { class ChartParser; class ChartCellCollectionBase; class ChartRuleLookupManager; +class QueryEngine; +class target_text; class ProbingPT : public PhraseDictionary { @@ -39,12 +39,16 @@ public: protected: QueryEngine *m_engine; + uint64_t m_unkId; - typedef boost::bimap SourceVocabMap; - mutable SourceVocabMap m_sourceVocabMap; + std::vector m_sourceVocab; // factor id -> pt id + std::vector m_targetVocab; // pt id -> factor* + std::vector m_aligns; - typedef boost::bimap TargetVocabMap; - mutable TargetVocabMap m_vocabMap; + boost::iostreams::mapped_file_source file; + const char *data; + + void CreateAlignmentMap(const std::string path); TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const; TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const; @@ -53,7 +57,6 @@ protected: std::vector ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const; - uint64_t m_unkId; }; } // namespace Moses diff --git a/moses/TranslationModel/ProbingPT/hash.cpp b/moses/TranslationModel/ProbingPT/hash.cpp index 8945649ef..27a64b129 100644 --- a/moses/TranslationModel/ProbingPT/hash.cpp +++ 
b/moses/TranslationModel/ProbingPT/hash.cpp @@ -1,5 +1,11 @@ +#include #include "hash.hh" +using namespace std; + +namespace Moses +{ + uint64_t getHash(StringPiece text) { std::size_t len = text.size(); @@ -7,24 +13,32 @@ uint64_t getHash(StringPiece text) return key; } -std::vector getVocabIDs(StringPiece textin) +std::vector getVocabIDs(const StringPiece &textin) { //Tokenize std::vector output; - util::TokenIter it(textin, util::SingleCharacter(' ')); + util::TokenIter itWord(textin, util::SingleCharacter(' ')); + + while (itWord) { + StringPiece word = *itWord; + uint64_t id = 0; + + util::TokenIter itFactor(word, util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + //cerr << "factor=" << factor << endl; - while(it) { - output.push_back(getHash(*it)); - it++; + id += getHash(factor); + itFactor++; + } + + output.push_back(id); + itWord++; } return output; } -uint64_t getVocabID(std::string candidate) -{ - std::size_t len = candidate.length(); - uint64_t key = util::MurmurHashNative(candidate.c_str(), len); - return key; -} \ No newline at end of file +} + diff --git a/moses/TranslationModel/ProbingPT/hash.hh b/moses/TranslationModel/ProbingPT/hash.hh index 607238ae1..f218ad9da 100644 --- a/moses/TranslationModel/ProbingPT/hash.hh +++ b/moses/TranslationModel/ProbingPT/hash.hh @@ -6,9 +6,12 @@ #include "util/tokenize_piece.hh" #include +namespace Moses +{ + //Gets the MurmurmurHash for give string uint64_t getHash(StringPiece text); -std::vector getVocabIDs(StringPiece textin); +std::vector getVocabIDs(const StringPiece &textin); -uint64_t getVocabID(std::string candidate); \ No newline at end of file +} diff --git a/moses/TranslationModel/ProbingPT/huffmanish.cpp b/moses/TranslationModel/ProbingPT/huffmanish.cpp deleted file mode 100644 index 534fd04d1..000000000 --- a/moses/TranslationModel/ProbingPT/huffmanish.cpp +++ /dev/null @@ -1,451 +0,0 @@ -#include "huffmanish.hh" - -Huffman::Huffman (const char * filepath) -{ - //Read 
the file - util::FilePiece filein(filepath); - - //Init uniq_lines to zero; - uniq_lines = 0; - - line_text prev_line; //Check for unique lines. - int num_lines = 0 ; - - while (true) { - line_text new_line; - - num_lines++; - - try { - //Process line read - new_line = splitLine(filein.ReadLine()); - count_elements(new_line); //Counts the number of elements, adds new and increments counters. - - } catch (util::EndOfFileException e) { - std::cerr << "Unique entries counted: "; - break; - } - - if (new_line.source_phrase == prev_line.source_phrase) { - continue; - } else { - uniq_lines++; - prev_line = new_line; - } - } - - std::cerr << uniq_lines << std::endl; -} - -void Huffman::count_elements(line_text linein) -{ - //For target phrase: - util::TokenIter it(linein.target_phrase, util::SingleCharacter(' ')); - while (it) { - //Check if we have that entry - std::map::iterator mapiter; - mapiter = target_phrase_words.find(it->as_string()); - - if (mapiter != target_phrase_words.end()) { - //If the element is found, increment the count. - mapiter->second++; - } else { - //Else create a new entry; - target_phrase_words.insert(std::pair(it->as_string(), 1)); - } - it++; - } - - //For word allignment 1 - std::map, unsigned int>::iterator mapiter3; - std::vector numbers = splitWordAll1(linein.word_align); - mapiter3 = word_all1.find(numbers); - - if (mapiter3 != word_all1.end()) { - //If the element is found, increment the count. - mapiter3->second++; - } else { - //Else create a new entry; - word_all1.insert(std::pair, unsigned int>(numbers, 1)); - } - -} - -//Assigns huffman values for each unique element -void Huffman::assign_values() -{ - //First create vectors for all maps so that we could sort them later. 
- - //Create a vector for target phrases - for(std::map::iterator it = target_phrase_words.begin(); it != target_phrase_words.end(); it++ ) { - target_phrase_words_counts.push_back(*it); - } - //Sort it - std::sort(target_phrase_words_counts.begin(), target_phrase_words_counts.end(), sort_pair()); - - //Create a vector for word allignments 1 - for(std::map, unsigned int>::iterator it = word_all1.begin(); it != word_all1.end(); it++ ) { - word_all1_counts.push_back(*it); - } - //Sort it - std::sort(word_all1_counts.begin(), word_all1_counts.end(), sort_pair_vec()); - - - //Afterwards we assign a value for each phrase, starting from 1, as zero is reserved for delimiter - unsigned int i = 1; //huffman code - for(std::vector >::iterator it = target_phrase_words_counts.begin(); - it != target_phrase_words_counts.end(); it++) { - target_phrase_huffman.insert(std::pair(it->first, i)); - i++; //Go to the next huffman code - } - - i = 1; //Reset i for the next map - for(std::vector, unsigned int> >::iterator it = word_all1_counts.begin(); - it != word_all1_counts.end(); it++) { - word_all1_huffman.insert(std::pair, unsigned int>(it->first, i)); - i++; //Go to the next huffman code - } - - //After lookups are produced, clear some memory usage of objects not needed anymore. - target_phrase_words.clear(); - word_all1.clear(); - - target_phrase_words_counts.clear(); - word_all1_counts.clear(); - - std::cerr << "Finished generating huffman codes." << std::endl; - -} - -void Huffman::serialize_maps(const char * dirname) -{ - //Note that directory name should exist. 
- std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string probabilities_path(basedir + "/probs"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrase - std::ofstream os (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch(os); - oarch << lookup_target_phrase; - os.close(); - - //Word all1 - std::ofstream os2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_oarchive oarch2(os2); - oarch2 << lookup_word_all1; - os2.close(); -} - -std::vector Huffman::full_encode_line(line_text line) -{ - return vbyte_encode_line((encode_line(line))); -} - -std::vector Huffman::encode_line(line_text line) -{ - std::vector retvector; - - //Get target_phrase first. - util::TokenIter it(line.target_phrase, util::SingleCharacter(' ')); - while (it) { - retvector.push_back(target_phrase_huffman.find(it->as_string())->second); - it++; - } - //Add a zero; - retvector.push_back(0); - - //Get probabilities. Reinterpreting the float bytes as unsgined int. 
- util::TokenIter probit(line.prob, util::SingleCharacter(' ')); - while (probit) { - //Sometimes we have too big floats to handle, so first convert to double - double tempnum = atof(probit->data()); - float num = (float)tempnum; - retvector.push_back(reinterpret_float(&num)); - probit++; - } - //Add a zero; - retvector.push_back(0); - - - //Get Word allignments - retvector.push_back(word_all1_huffman.find(splitWordAll1(line.word_align))->second); - retvector.push_back(0); - - return retvector; -} - -void Huffman::produce_lookups() -{ - //basically invert every map that we have - for(std::map::iterator it = target_phrase_huffman.begin(); it != target_phrase_huffman.end(); it++ ) { - lookup_target_phrase.insert(std::pair(it->second, it->first)); - } - - for(std::map, unsigned int>::iterator it = word_all1_huffman.begin(); it != word_all1_huffman.end(); it++ ) { - lookup_word_all1.insert(std::pair >(it->second, it->first)); - } - -} - -HuffmanDecoder::HuffmanDecoder (const char * dirname) -{ - //Read the maps from disk - - //Note that directory name should exist. 
- std::string basedir(dirname); - std::string target_phrase_path(basedir + "/target_phrases"); - std::string word_all1_path(basedir + "/Wall1"); - - //Target phrases - std::ifstream is (target_phrase_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch(is); - iarch >> lookup_target_phrase; - is.close(); - - //Word allignment 1 - std::ifstream is2 (word_all1_path.c_str(), std::ios::binary); - boost::archive::text_iarchive iarch2(is2); - iarch2 >> lookup_word_all1; - is2.close(); - -} - -HuffmanDecoder::HuffmanDecoder (std::map * lookup_target, - std::map > * lookup_word1) -{ - lookup_target_phrase = *lookup_target; - lookup_word_all1 = *lookup_word1; -} - -std::vector HuffmanDecoder::full_decode_line (std::vector lines, int num_scores) -{ - std::vector retvector; //All target phrases - std::vector decoded_lines = vbyte_decode_line(lines); //All decoded lines - std::vector::iterator it = decoded_lines.begin(); //Iterator for them - std::vector current_target_phrase; //Current target phrase decoded - - short zero_count = 0; //Count home many zeroes we have met. so far. Every 3 zeroes mean a new target phrase. - while(it != decoded_lines.end()) { - if (zero_count == 1) { - //We are extracting scores. we know how many scores there are so we can push them - //to the vector. This is done in case any of the scores is 0, because it would mess - //up the state machine. - for (int i = 0; i < num_scores; i++) { - current_target_phrase.push_back(*it); - it++; - } - } - - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. - retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - //Add to the next target_phrase, number by number. 
- current_target_phrase.push_back(*it); - if (*it == 0) { - zero_count++; - } - it++; //Go to the next word/symbol - } - //Don't forget the last remaining line! - if (zero_count == 3) { - //We have finished with this entry, decode it, and add it to the retvector. - retvector.push_back(decode_line(current_target_phrase, num_scores)); - current_target_phrase.clear(); //Clear the current target phrase and the zero_count - zero_count = 0; //So that we can reuse them for the next target phrase - } - - return retvector; - -} - -target_text HuffmanDecoder::decode_line (std::vector input, int num_scores) -{ - //demo decoder - target_text ret; - //Split everything - std::vector target_phrase; - std::vector probs; - unsigned int wAll; - - //Split the line into the proper arrays - short num_zeroes = 0; - int counter = 0; - while (num_zeroes < 3) { - unsigned int num = input[counter]; - if (num == 0) { - num_zeroes++; - } else if (num_zeroes == 0) { - target_phrase.push_back(num); - } else if (num_zeroes == 1) { - //Push exactly num_scores scores - for (int i = 0; i < num_scores; i++) { - probs.push_back(num); - counter++; - num = input[counter]; - } - continue; - } else if (num_zeroes == 2) { - wAll = num; - } - counter++; - } - - ret.target_phrase = target_phrase; - ret.word_all1 = lookup_word_all1.find(wAll)->second; - - //Decode probabilities - for (std::vector::iterator it = probs.begin(); it != probs.end(); it++) { - ret.prob.push_back(reinterpret_uint(&(*it))); - } - - return ret; - -} - -inline std::string HuffmanDecoder::getTargetWordFromID(unsigned int id) -{ - return lookup_target_phrase.find(id)->second; -} - -std::string HuffmanDecoder::getTargetWordsFromIDs(std::vector ids) -{ - std::string returnstring; - for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it) + " "); - } - - return returnstring; -} - -inline std::string getTargetWordFromID(unsigned int id, std::map * lookup_target_phrase) -{ - return 
lookup_target_phrase->find(id)->second; -} - -std::string getTargetWordsFromIDs(std::vector ids, std::map * lookup_target_phrase) -{ - std::string returnstring; - for (std::vector::iterator it = ids.begin(); it != ids.end(); it++) { - returnstring.append(getTargetWordFromID(*it, lookup_target_phrase) + " "); - } - - return returnstring; -} - -/*Those functions are used to more easily store the floats in the binary phrase table - We convert the float unsinged int so that it is the same as our other values and we can - apply variable byte encoding on top of it.*/ - -inline unsigned int reinterpret_float(float * num) -{ - unsigned int * converted_num; - converted_num = reinterpret_cast(num); - return *converted_num; -} - -inline float reinterpret_uint(unsigned int * num) -{ - float * converted_num; - converted_num = reinterpret_cast(num); - return *converted_num; -} - -/*Mostly taken from stackoverflow, http://stackoverflow.com/questions/5858646/optimizing-variable-length-encoding -and modified in order to return a vector of chars. Implements ULEB128 or variable byte encoding. -This is highly optimized version with unrolled loop */ -inline std::vector vbyte_encode(unsigned int num) -{ - //Determine how many bytes we are going to take. - short size; - std::vector byte_vector; - - if (num < 0x00000080U) { - size = 1; - byte_vector.reserve(size); - goto b1; - } - if (num < 0x00004000U) { - size = 2; - byte_vector.reserve(size); - goto b2; - } - if (num < 0x00200000U) { - size = 3; - byte_vector.reserve(size); - goto b3; - } - if (num < 0x10000000U) { - size = 4; - byte_vector.reserve(size); - goto b4; - } - size = 5; - byte_vector.reserve(size); - - - //Now proceed with the encoding. 
- byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b4: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b3: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b2: - byte_vector.push_back((num & 0x7f) | 0x80); - num >>= 7; -b1: - byte_vector.push_back(num); - - return byte_vector; -} - -std::vector vbyte_decode_line(std::vector line) -{ - std::vector huffman_line; - std::vector current_num; - - for (std::vector::iterator it = line.begin(); it != line.end(); it++) { - current_num.push_back(*it); - if ((*it >> 7) != 1) { - //We don't have continuation in the next bit - huffman_line.push_back(bytes_to_int(current_num)); - current_num.clear(); - } - } - return huffman_line; -} - -inline unsigned int bytes_to_int(std::vector number) -{ - unsigned int retvalue = 0; - std::vector::iterator it = number.begin(); - unsigned char shift = 0; //By how many bits to shift - - while (it != number.end()) { - retvalue |= (*it & 0x7f) << shift; - shift += 7; - it++; - } - - return retvalue; -} - -std::vector vbyte_encode_line(std::vector line) -{ - std::vector retvec; - - //For each unsigned int in the line, vbyte encode it and add it to a vector of unsigned chars. 
- for (std::vector::iterator it = line.begin(); it != line.end(); it++) { - std::vector vbyte_encoded = vbyte_encode(*it); - retvec.insert(retvec.end(), vbyte_encoded.begin(), vbyte_encoded.end()); - } - - return retvec; -} diff --git a/moses/TranslationModel/ProbingPT/huffmanish.hh b/moses/TranslationModel/ProbingPT/huffmanish.hh deleted file mode 100644 index 0970a9e68..000000000 --- a/moses/TranslationModel/ProbingPT/huffmanish.hh +++ /dev/null @@ -1,112 +0,0 @@ -#pragma once - -//Huffman encodes a line and also produces the vocabulary ids -#include "hash.hh" -#include "line_splitter.hh" -#include -#include -#include -#include -#include -#include -#include -#include -#include - -//Sorting for the second -struct sort_pair { - bool operator()(const std::pair &left, const std::pair &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -struct sort_pair_vec { - bool operator()(const std::pair, unsigned int> &left, const std::pair, unsigned int> &right) { - return left.second > right.second; //This puts biggest numbers first. - } -}; - -class Huffman -{ - unsigned long uniq_lines; //Unique lines in the file. 
- - //Containers used when counting the occurence of a given phrase - std::map target_phrase_words; - std::map, unsigned int> word_all1; - - //Same containers as vectors, for sorting - std::vector > target_phrase_words_counts; - std::vector, unsigned int> > word_all1_counts; - - //Huffman maps - std::map target_phrase_huffman; - std::map, unsigned int> word_all1_huffman; - - //inverted maps - std::map lookup_target_phrase; - std::map > lookup_word_all1; - -public: - Huffman (const char *); - void count_elements (line_text line); - void assign_values(); - void serialize_maps(const char * dirname); - void produce_lookups(); - - std::vector encode_line(line_text line); - - //encode line + variable byte ontop - std::vector full_encode_line(line_text line); - - //Getters - const std::map get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - unsigned long getUniqLines() { - return uniq_lines; - } -}; - -class HuffmanDecoder -{ - std::map lookup_target_phrase; - std::map > lookup_word_all1; - -public: - HuffmanDecoder (const char *); - HuffmanDecoder (std::map *, std::map > *); - - //Getters - const std::map get_target_lookup_map() const { - return lookup_target_phrase; - } - const std::map > get_word_all1_lookup_map() const { - return lookup_word_all1; - } - - inline std::string getTargetWordFromID(unsigned int id); - - std::string getTargetWordsFromIDs(std::vector ids); - - target_text decode_line (std::vector input, int num_scores); - - //Variable byte decodes a all target phrases contained here and then passes them to decode_line - std::vector full_decode_line (std::vector lines, int num_scores); -}; - -std::string getTargetWordsFromIDs(std::vector ids, std::map * lookup_target_phrase); - -inline std::string getTargetWordFromID(unsigned int id, std::map * lookup_target_phrase); - -inline unsigned int reinterpret_float(float * num); - -inline float 
reinterpret_uint(unsigned int * num); - -std::vector vbyte_encode_line(std::vector line); -inline std::vector vbyte_encode(unsigned int num); -std::vector vbyte_decode_line(std::vector line); -inline unsigned int bytes_to_int(std::vector number); diff --git a/moses/TranslationModel/ProbingPT/line_splitter.cpp b/moses/TranslationModel/ProbingPT/line_splitter.cpp index 1eeeb1899..cb9e47fec 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.cpp +++ b/moses/TranslationModel/ProbingPT/line_splitter.cpp @@ -1,66 +1,92 @@ #include "line_splitter.hh" -line_text splitLine(StringPiece textin) +namespace Moses { - const char delim[] = " ||| "; + +line_text splitLine(const StringPiece &textin, bool scfg) +{ + const char delim[] = "|||"; line_text output; //Tokenize util::TokenIter it(textin, util::MultiCharacter(delim)); //Get source phrase - output.source_phrase = *it; + output.source_phrase = Trim(*it); + //std::cerr << "output.source_phrase=" << output.source_phrase << "AAAA" << std::endl; //Get target_phrase it++; - output.target_phrase = *it; + output.target_phrase = Trim(*it); + //std::cerr << "output.target_phrase=" << output.target_phrase << "AAAA" << std::endl; + + if (scfg) { + /* + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + reformatSCFG(output); + std::cerr << "output.source_phrase=" << output.source_phrase << std::endl; + std::cerr << "output.target_phrase=" << output.target_phrase << std::endl; + */ + } //Get probabilities it++; - output.prob = *it; + output.prob = Trim(*it); + //std::cerr << "output.prob=" << output.prob << "AAAA" << std::endl; //Get WordAllignment it++; if (it == util::TokenIter::end()) return output; - output.word_align = *it; + output.word_align = Trim(*it); + //std::cerr << "output.word_align=" << output.word_align << "AAAA" << std::endl; //Get count it++; if (it == util::TokenIter::end()) return output; - output.counts = 
*it; + output.counts = Trim(*it); + //std::cerr << "output.counts=" << output.counts << "AAAA" << std::endl; //Get sparse_score it++; if (it == util::TokenIter::end()) return output; - output.sparse_score = *it; + output.sparse_score = Trim(*it); + //std::cerr << "output.sparse_score=" << output.sparse_score << "AAAA" << std::endl; //Get property it++; if (it == util::TokenIter::end()) return output; - output.property = *it; + output.property = Trim(*it); + //std::cerr << "output.property=" << output.property << "AAAA" << std::endl; return output; } -std::vector splitWordAll1(StringPiece textin) +std::vector splitWordAll1(const StringPiece &textin) { const char delim[] = " "; const char delim2[] = "-"; std::vector output; + //Case with no word alignments. + if (textin.size() == 0) { + return output; + } + //Split on space util::TokenIter it(textin, util::MultiCharacter(delim)); //For each int while (it) { //Split on dash (-) - util::TokenIter itInner(*it, util::MultiCharacter(delim2)); + util::TokenIter itInner(*it, + util::MultiCharacter(delim2)); //Insert the two entries in the vector. User will read entry 0 and 1 to get the first, //2 and 3 for second etc. 
Use unsigned char instead of int to save space, as //word allignments are all very small numbers that fit in a single byte - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); itInner++; - output.push_back((unsigned char)(atoi(itInner->data()))); + output.push_back((unsigned char) (atoi(itInner->data()))); it++; } @@ -68,3 +94,10 @@ std::vector splitWordAll1(StringPiece textin) } +void reformatSCFG(line_text &output) +{ + +} + +} + diff --git a/moses/TranslationModel/ProbingPT/line_splitter.hh b/moses/TranslationModel/ProbingPT/line_splitter.hh index 2cb9a3c8c..cec0a5f45 100644 --- a/moses/TranslationModel/ProbingPT/line_splitter.hh +++ b/moses/TranslationModel/ProbingPT/line_splitter.hh @@ -9,8 +9,12 @@ #include "util/tokenize_piece.hh" #include +namespace Moses +{ + //Struct for holding processed line -struct line_text { +struct line_text +{ StringPiece source_phrase; StringPiece target_phrase; StringPiece prob; @@ -18,16 +22,38 @@ struct line_text { StringPiece counts; StringPiece sparse_score; StringPiece property; + std::string property_to_be_binarized; }; //Struct for holding processed line -struct target_text { +struct target_text +{ std::vector target_phrase; std::vector prob; - std::vector word_all1; + std::vector word_align_term; + std::vector word_align_non_term; + std::vector counts; + std::vector sparse_score; + std::vector property; + + /* + void Reset() + { + target_phrase.clear(); + prob.clear(); + word_all1.clear(); + counts.clear(); + sparse_score.clear(); + property.clear(); + } + */ }; //Ask if it's better to have it receive a pointer to a line_text struct -line_text splitLine(StringPiece textin); +line_text splitLine(const StringPiece &textin, bool scfg); +void reformatSCFG(line_text &output); + +std::vector splitWordAll1(const StringPiece &textin); + +} -std::vector splitWordAll1(StringPiece textin); diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp 
b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp index ca3e8f69f..f23f57d66 100644 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp +++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.cpp @@ -1,5 +1,8 @@ #include "probing_hash_utils.hh" +namespace Moses +{ + //Read table from disk, return memory map location char * readTable(const char * filename, size_t size) { @@ -13,7 +16,7 @@ char * readTable(const char * filename, size_t size) exit(EXIT_FAILURE); } - map = (char *)mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); + map = (char *) mmap(0, size, PROT_READ, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { close(fd); @@ -24,11 +27,24 @@ char * readTable(const char * filename, size_t size) return map; } - -void serialize_table(char *mem, size_t size, const char * filename) +void serialize_table(char *mem, size_t size, const std::string &filename) { - std::ofstream os (filename, std::ios::binary); - os.write((const char*)&mem[0], size); + std::ofstream os(filename.c_str(), std::ios::binary); + os.write((const char*) &mem[0], size); os.close(); -} \ No newline at end of file +} + +uint64_t getKey(const uint64_t source_phrase[], size_t size) +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + uint64_t key = 0; + for (size_t i = 0; i < size; i++) { + key += (source_phrase[i] << i); + } + return key; +} + +} + diff --git a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh index de96e87a0..dcf0dbe25 100644 --- a/moses/TranslationModel/ProbingPT/probing_hash_utils.hh +++ b/moses/TranslationModel/ProbingPT/probing_hash_utils.hh @@ -7,31 +7,49 @@ #include #include +namespace Moses +{ + +#define API_VERSION 15 //Hash table entry -struct Entry { - uint64_t key; +struct Entry +{ typedef uint64_t Key; - unsigned int bytes_toread; + Key key; - uint64_t GetKey() const { + Key GetKey() const + { return key; } - void SetKey(uint64_t to) { + void 
SetKey(Key to) + { key = to; } - uint64_t GetValue() const { - return value; - } - uint64_t value; }; +#define NONE std::numeric_limits::max() + //Define table typedef util::ProbingHashTable > Table; -void serialize_table(char *mem, size_t size, const char * filename); +void serialize_table(char *mem, size_t size, const std::string &filename); char * readTable(const char * filename, size_t size); + +uint64_t getKey(const uint64_t source_phrase[], size_t size); + +struct TargetPhraseInfo +{ + uint32_t alignTerm; + uint32_t alignNonTerm; + uint16_t numWords; + uint16_t propLength; + uint16_t filler; +}; + +} + diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp index bd1d61a1e..ef980ef06 100644 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ b/moses/TranslationModel/ProbingPT/quering.cpp @@ -1,73 +1,80 @@ #include "quering.hh" +#include "util/exception.hh" -unsigned char * read_binary_file(const char * filename, size_t filesize) -{ - //Get filesize - int fd; - unsigned char * map; - - fd = open(filename, O_RDONLY); - - if (fd == -1) { - perror("Error opening file for reading"); - exit(EXIT_FAILURE); - } - - map = (unsigned char *)mmap(0, filesize, PROT_READ, MAP_SHARED, fd, 0); - if (map == MAP_FAILED) { - close(fd); - perror("Error mmapping the file"); - exit(EXIT_FAILURE); - } +using namespace std; - return map; -} +namespace Moses +{ -QueryEngine::QueryEngine(const char * filepath) : decoder(filepath) +QueryEngine::QueryEngine(const char * filepath) { //Create filepaths std::string basepath(filepath); std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_data_bin = basepath + "/binfile.dat"; std::string path_to_source_vocabid = basepath + "/source_vocabids"; + std::string alignPath = basepath + "/Alignments.dat"; ///Source phrase vocabids - read_map(&source_vocabids, path_to_source_vocabid.c_str()); + read_map(source_vocabids, path_to_source_vocabid.c_str()); - //Target 
phrase vocabIDs - vocabids = decoder.get_target_lookup_map(); + // alignments + read_alignments(alignPath); //Read config file + boost::unordered_map keyValue; + + std::ifstream config((basepath + "/config").c_str()); std::string line; - std::ifstream config ((basepath + "/config").c_str()); + while (getline(config, line)) { + std::vector toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); + keyValue[ toks[0] ] = toks[1]; + } + + bool found; //Check API version: - getline(config, line); - if (atoi(line.c_str()) != API_VERSION) { - std::cerr << "The ProbingPT API has changed, please rebinarize your phrase tables." << std::endl; + int version; + found = Get(keyValue, "API_VERSION", version); + if (!found) { + std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; + } + else if (version != API_VERSION) { + std::cerr << "The ProbingPT API has changed. " << version << "!=" + << API_VERSION << " Please rebinarize your phrase tables." << std::endl; exit(EXIT_FAILURE); } + //Get tablesize. - getline(config, line); - int tablesize = atoi(line.c_str()); + int tablesize; + found = Get(keyValue, "uniq_entries", tablesize); + if (!found) { + std::cerr << "uniq_entries not found" << std::endl; + exit(EXIT_FAILURE); + } + //Number of scores - getline(config, line); - num_scores = atoi(line.c_str()); - //do we have a reordering table - getline(config, line); - std::transform(line.begin(), line.end(), line.begin(), ::tolower); //Get the boolean in lowercase - is_reordering = false; - if (line == "true") { - is_reordering = true; - std::cerr << "WARNING. REORDERING TABLES NOT SUPPORTED YET." 
<< std::endl; + found = Get(keyValue, "num_scores", num_scores); + if (!found) { + std::cerr << "num_scores not found" << std::endl; + exit(EXIT_FAILURE); } - config.close(); - //Mmap binary table - struct stat filestatus; - stat(path_to_data_bin.c_str(), &filestatus); - binary_filesize = filestatus.st_size; - binary_mmaped = read_binary_file(path_to_data_bin.c_str(), binary_filesize); + //How may scores from lex reordering models + found = Get(keyValue, "num_lex_scores", num_lex_scores); + if (!found) { + std::cerr << "num_lex_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + // have the scores been log() and FloorScore()? + found = Get(keyValue, "log_prob", logProb); + if (!found) { + std::cerr << "logProb not found" << std::endl; + exit(EXIT_FAILURE); + } + + config.close(); //Read hashtable table_filesize = Table::Size(tablesize, 1.2); @@ -81,118 +88,50 @@ QueryEngine::QueryEngine(const char * filepath) : decoder(filepath) QueryEngine::~QueryEngine() { //Clear mmap content from memory. - munmap(binary_mmaped, binary_filesize); munmap(mem, table_filesize); } -std::pair > QueryEngine::query(std::vector source_phrase) +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const { - bool found; - std::vector translation_entries; - const Entry * entry; //TOO SLOW //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase.size(); i++) { - key += (source_phrase[i] << i); - } - - - found = table.Find(key, entry); - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector encoded_text; //Assign to the vector the relevant portion of the array. 
- encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); - - } - - std::pair > output (found, translation_entries); - - return output; - + return getKey(source_phrase, size); } -std::pair > QueryEngine::query(StringPiece source_phrase) +std::pair QueryEngine::query(uint64_t key) { - bool found; - std::vector translation_entries; - const Entry * entry; - //Convert source frase to VID - std::vector source_phrase_vid = getVocabIDs(source_phrase); - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase_vid[0], source_phrase_vid.size()); - uint64_t key = 0; - for (int i = 0; i < source_phrase_vid.size(); i++) { - key += (source_phrase_vid[i] << i); - } - - found = table.Find(key, entry); - - - if (found) { - //The phrase that was searched for was found! We need to get the translation entries. - //We will read the largest entry in bytes and then filter the unnecesarry with functions - //from line_splitter - uint64_t initial_index = entry -> GetValue(); - unsigned int bytes_toread = entry -> bytes_toread; - //At the end of the file we can't readd + largest_entry cause we get a segfault. - std::cerr << "Entry size is bytes is: " << bytes_toread << std::endl; - - //ASK HIEU FOR MORE EFFICIENT WAY TO DO THIS! - std::vector encoded_text; //Assign to the vector the relevant portion of the array. 
- encoded_text.reserve(bytes_toread); - for (int i = 0; i < bytes_toread; i++) { - encoded_text.push_back(binary_mmaped[i+initial_index]); - } - - //Get only the translation entries necessary - translation_entries = decoder.full_decode_line(encoded_text, num_scores); + std::pair ret; + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; } - - std::pair > output (found, translation_entries); - - return output; - + return ret; } -void QueryEngine::printTargetInfo(std::vector target_phrases) +void QueryEngine::read_alignments(const std::string &alignPath) { - int entries = target_phrases.size(); + std::ifstream strm(alignPath.c_str()); - for (int i = 0; i toks = Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - //Print probabilities: - for (int j = 0; j(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); } - std::cout << "\t"; - - //Print word_all1 - for (int j = 0; j(toks[i]); + aligns.push_back(pos); } - std::cout << std::endl; } } + +} + diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh index e574d1f8f..c43c7f3b9 100644 --- a/moses/TranslationModel/ProbingPT/quering.hh +++ b/moses/TranslationModel/ProbingPT/quering.hh @@ -1,45 +1,65 @@ #pragma once -#include "probing_hash_utils.hh" -#include "huffmanish.hh" -#include "hash.hh" //Includes line splitter +#include #include //For finding size of file #include "vocabid.hh" #include //toLower -#define API_VERSION 3 - +#include +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "moses//Util.h" -char * read_binary_file(char * filename); +namespace Moses +{ class QueryEngine { - unsigned char * binary_mmaped; //The binari phrase table file - std::map vocabids; std::map source_vocabids; + typedef std::vector Alignments; + std::vector alignColl; + Table table; char *mem; //Memory for the 
table, necessary so that we can correctly destroy the object - HuffmanDecoder decoder; - - size_t binary_filesize; size_t table_filesize; - int num_scores; bool is_reordering; + + void read_alignments(const std::string &alignPath); + public: - QueryEngine (const char *); + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); ~QueryEngine(); - std::pair > query(StringPiece source_phrase); - std::pair > query(std::vector source_phrase); - void printTargetInfo(std::vector target_phrases); - const std::map getVocab() const { - return decoder.get_target_lookup_map(); - } - const std::map getSourceVocab() const { - return source_vocabids; + std::pair query(uint64_t key); + + const std::map &getSourceVocab() const + { return source_vocabids; } + + const std::vector &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template + inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan(foundStr); + return true; } }; +} diff --git a/moses/TranslationModel/ProbingPT/storing.cpp b/moses/TranslationModel/ProbingPT/storing.cpp index 01128c1e4..98dcfd5dc 100644 --- a/moses/TranslationModel/ProbingPT/storing.cpp +++ b/moses/TranslationModel/ProbingPT/storing.cpp @@ -1,161 +1,303 @@ +#include +#include +#include "line_splitter.hh" #include "storing.hh" +#include "StoreTarget.h" +#include "StoreVocab.h" +#include "moses/Util.h" +#include "moses/InputFileStream.h" -BinaryFileWriter::BinaryFileWriter (std::string basepath) : os ((basepath + "/binfile.dat").c_str(), std::ios::binary) +using namespace std; + +namespace Moses { - binfile.reserve(10000); //Reserve part of the vector to avoid realocation - it = binfile.begin(); - dist_from_start = 0; //Initialize 
variables - extra_counter = 0; -} -void BinaryFileWriter::write (std::vector * bytes) +/////////////////////////////////////////////////////////////////////// +void Node::Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos) { - binfile.insert(it, bytes->begin(), bytes->end()); //Insert the bytes - //Keep track of the offsets - it += bytes->size(); - dist_from_start = distance(binfile.begin(),it); - //Flush the vector to disk every once in a while so that we don't consume too much ram - if (dist_from_start > 9000) { - flush(); + if (pos < sourcePhrase.size()) { + uint64_t vocabId = sourcePhrase[pos]; + + Node *child; + Children::iterator iter = m_children.find(vocabId); + if (iter == m_children.end()) { + // New node. Write other children then discard them + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &otherChild = valPair.second; + otherChild.Write(table); + } + m_children.clear(); + + // create new node + child = &m_children[vocabId]; + assert(!child->done); + child->key = key + (vocabId << pos); + } + else { + child = &iter->second; + } + + child->Add(table, sourcePhrase, pos + 1); + } + else { + // this node was written previously 'cos it has rules + done = true; } } -void BinaryFileWriter::flush () +void Node::Write(Table &table) { - //Cast unsigned char to char before writing... - os.write((char *)&binfile[0], dist_from_start); - //Clear the vector: - binfile.clear(); - binfile.reserve(10000); - extra_counter += dist_from_start; //Keep track of the total number of bytes. 
- it = binfile.begin(); //Reset iterator - dist_from_start = distance(binfile.begin(),it); //Reset dist from start -} + //cerr << "START write " << done << " " << key << endl; + BOOST_FOREACH(Children::value_type &valPair, m_children) { + Node &child = valPair.second; + child.Write(table); + } -BinaryFileWriter::~BinaryFileWriter () -{ - os.close(); - binfile.clear(); + if (!done) { + // save + Entry sourceEntry; + sourceEntry.value = NONE; + sourceEntry.key = key; + + //Put into table + table.Insert(sourceEntry); + } } -void createProbingPT(const char * phrasetable_path, const char * target_path, - const char * num_scores, const char * is_reordering) +/////////////////////////////////////////////////////////////////////// +void createProbingPT(const std::string &phrasetable_path, + const std::string &basepath, int num_scores, int num_lex_scores, + bool log_prob, int max_cache_size, bool scfg) { + std::cerr << "Starting..." << std::endl; + //Get basepath and create directory if missing - std::string basepath(target_path); mkdir(basepath.c_str(), S_IRWXU | S_IRWXG | S_IROTH | S_IXOTH); - //Set up huffman and serialize decoder maps. - Huffman huffmanEncoder(phrasetable_path); //initialize - huffmanEncoder.assign_values(); - huffmanEncoder.produce_lookups(); - huffmanEncoder.serialize_maps(target_path); + StoreTarget storeTarget(basepath); //Get uniq lines: - unsigned long uniq_entries = huffmanEncoder.getUniqLines(); + unsigned long uniq_entries = countUniqueSource(phrasetable_path); //Source phrase vocabids - std::map source_vocabids; + StoreVocab sourceVocab(basepath + "/source_vocabids"); //Read the file - util::FilePiece filein(phrasetable_path); + util::FilePiece filein(phrasetable_path.c_str()); //Init the probing hash table size_t size = Table::Size(uniq_entries, 1.2); char * mem = new char[size]; memset(mem, 0, size); - Table table(mem, size); + Table sourceEntries(mem, size); - BinaryFileWriter binfile(basepath); //Init the binary file writer. 
- - line_text prev_line; //Check if the source phrase of the previous line is the same + std::priority_queue, CacheItemOrderer> cache; + float totalSourceCount = 0; //Keep track of the size of each group of target phrases - uint64_t entrystartidx = 0; - //uint64_t line_num = 0; - + size_t line_num = 0; //Read everything and processs - while(true) { + std::string prevSource; + + Node sourcePhrases; + sourcePhrases.done = true; + sourcePhrases.key = 0; + + while (true) { try { //Process line read line_text line; - line = splitLine(filein.ReadLine()); - //Add source phrases to vocabularyIDs - add_to_map(&source_vocabids, line.source_phrase); + line = splitLine(filein.ReadLine(), scfg); + //cerr << "line=" << line.source_phrase << endl; - if ((binfile.dist_from_start + binfile.extra_counter) == 0) { - prev_line = line; //For the first iteration assume the previous line is - } //The same as this one. + ++line_num; + if (line_num % 1000000 == 0) { + std::cerr << line_num << " " << std::flush; + } - if (line.source_phrase != prev_line.source_phrase) { + //Add source phrases to vocabularyIDs + add_to_map(sourceVocab, line.source_phrase); + + if (prevSource.empty()) { + // 1st line + prevSource = line.source_phrase.as_string(); + storeTarget.Append(line, log_prob, scfg); + } + else if (prevSource == line.source_phrase) { + //If we still have the same line, just append to it: + storeTarget.Append(line, log_prob, scfg); + } + else { + assert(prevSource != line.source_phrase); //Create a new entry even + // save + uint64_t targetInd = storeTarget.Save(); + + // next line + storeTarget.Append(line, log_prob, scfg); + //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + Entry sourceEntry; + sourceEntry.value = targetInd; //The key is the sum of hashes of individual words bitshifted by their position in the phrase. //Probably not entirerly correct, but fast and seems to work fine in practise. 
- pesho.key = 0; - std::vector vocabid_source = getVocabIDs(prev_line.source_phrase); - for (int i = 0; i < vocabid_source.size(); i++) { - pesho.key += (vocabid_source[i] << i); + std::vector vocabid_source = getVocabIDs(prevSource); + if (scfg) { + // storing prefixes? + sourcePhrases.Add(sourceEntries, vocabid_source); } - pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx; + sourceEntry.key = getKey(vocabid_source); + /* + cerr << "prevSource=" << prevSource << flush + << " vocabids=" << Debug(vocabid_source) << flush + << " key=" << sourceEntry.key << endl; + */ //Put into table - table.Insert(pesho); + sourceEntries.Insert(sourceEntry); - entrystartidx = binfile.dist_from_start + binfile.extra_counter; //Designate start idx for new entry + // update cache - CURRENT source phrase, not prev + if (max_cache_size) { + std::string countStr = line.counts.as_string(); + countStr = Trim(countStr); + if (!countStr.empty()) { + std::vector toks = Tokenize(countStr); + //cerr << "CACHE:" << line.source_phrase << " " << countStr << " " << toks[1] << endl; - //Encode a line and write it to disk. 
- std::vector encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + if (toks.size() >= 2) { + totalSourceCount += toks[1]; - //Set prevLine - prev_line = line; + // compute key for CURRENT source + std::vector currVocabidSource = getVocabIDs(line.source_phrase.as_string()); + uint64_t currKey = getKey(currVocabidSource); - } else { - //If we still have the same line, just append to it: - std::vector encoded_line = huffmanEncoder.full_encode_line(line); - binfile.write(&encoded_line); + CacheItem *item = new CacheItem( + Trim(line.source_phrase.as_string()), + currKey, + toks[1]); + cache.push(item); + + if (max_cache_size > 0 && cache.size() > max_cache_size) { + cache.pop(); + } + } + } + } + + //Set prevLine + prevSource = line.source_phrase.as_string(); } - } catch (util::EndOfFileException e) { - std::cerr << "Reading phrase table finished, writing remaining files to disk." << std::endl; - binfile.flush(); + } + catch (util::EndOfFileException e) { + std::cerr + << "Reading phrase table finished, writing remaining files to disk." + << std::endl; //After the final entry is constructed we need to add it to the phrase_table //Create an entry for the previous source phrase: - Entry pesho; - pesho.value = entrystartidx; + uint64_t targetInd = storeTarget.Save(); + + Entry sourceEntry; + sourceEntry.value = targetInd; + //The key is the sum of hashes of individual words. 
Probably not entirerly correct, but fast - pesho.key = 0; - std::vector vocabid_source = getVocabIDs(prev_line.source_phrase); - for (int i = 0; i < vocabid_source.size(); i++) { - pesho.key += (vocabid_source[i] << i); - } - pesho.bytes_toread = binfile.dist_from_start + binfile.extra_counter - entrystartidx; + std::vector vocabid_source = getVocabIDs(prevSource); + sourceEntry.key = getKey(vocabid_source); + //Put into table - table.Insert(pesho); + sourceEntries.Insert(sourceEntry); break; } } - serialize_table(mem, size, (basepath + "/probing_hash.dat").c_str()); + sourcePhrases.Write(sourceEntries); + + storeTarget.SaveAlignment(); - serialize_map(&source_vocabids, (basepath + "/source_vocabids").c_str()); + serialize_table(mem, size, (basepath + "/probing_hash.dat")); + + sourceVocab.Save(); + + serialize_cache(cache, (basepath + "/cache"), totalSourceCount); delete[] mem; //Write configfile std::ofstream configfile; configfile.open((basepath + "/config").c_str()); - configfile << API_VERSION << '\n'; - configfile << uniq_entries << '\n'; - configfile << num_scores << '\n'; - configfile << is_reordering << '\n'; + configfile << "API_VERSION\t" << API_VERSION << '\n'; + configfile << "uniq_entries\t" << uniq_entries << '\n'; + configfile << "num_scores\t" << num_scores << '\n'; + configfile << "num_lex_scores\t" << num_lex_scores << '\n'; + configfile << "log_prob\t" << log_prob << '\n'; configfile.close(); } + +size_t countUniqueSource(const std::string &path) +{ + size_t ret = 0; + InputFileStream strme(path); + + std::string line, prevSource; + while (std::getline(strme, line)) { + std::vector toks = TokenizeMultiCharSeparator(line, "|||"); + assert(toks.size() != 0); + + if (prevSource != toks[0]) { + prevSource = toks[0]; + ++ret; + } + } + + return ret; +} + +void serialize_cache( + std::priority_queue, CacheItemOrderer> &cache, + const std::string &path, float totalSourceCount) +{ + std::vector vec(cache.size()); + + size_t ind = cache.size() - 1; + 
while (!cache.empty()) { + const CacheItem *item = cache.top(); + vec[ind] = item; + cache.pop(); + --ind; + } + + std::ofstream os(path.c_str()); + + os << totalSourceCount << std::endl; + for (size_t i = 0; i < vec.size(); ++i) { + const CacheItem *item = vec[i]; + os << item->count << "\t" << item->sourceKey << "\t" << item->source << std::endl; + delete item; + } + + os.close(); +} + +uint64_t getKey(const std::vector &vocabid_source) +{ + return getKey(vocabid_source.data(), vocabid_source.size()); +} + +std::vector CreatePrefix(const std::vector &vocabid_source, size_t endPos) +{ + assert(endPos < vocabid_source.size()); + + std::vector ret(endPos + 1); + for (size_t i = 0; i <= endPos; ++i) { + ret[i] = vocabid_source[i]; + } + return ret; +} + +} + diff --git a/moses/TranslationModel/ProbingPT/storing.hh b/moses/TranslationModel/ProbingPT/storing.hh index 8554d614f..957c73491 100644 --- a/moses/TranslationModel/ProbingPT/storing.hh +++ b/moses/TranslationModel/ProbingPT/storing.hh @@ -1,36 +1,95 @@ #pragma once +#include +#include #include +#include #include #include +#include +#include +#include //mkdir #include "hash.hh" //Includes line_splitter #include "probing_hash_utils.hh" -#include "huffmanish.hh" -#include //mkdir #include "util/file_piece.hh" #include "util/file.hh" #include "vocabid.hh" -#define API_VERSION 3 -void createProbingPT(const char * phrasetable_path, const char * target_path, - const char * num_scores, const char * is_reordering); +namespace Moses +{ +typedef std::vector SourcePhrase; + + +class Node +{ + typedef boost::unordered_map Children; + Children m_children; + +public: + uint64_t key; + bool done; + + Node() + :done(false) + {} + + void Add(Table &table, const SourcePhrase &sourcePhrase, size_t pos = 0); + void Write(Table &table); +}; + + +void createProbingPT(const std::string &phrasetable_path, + const std::string &basepath, int num_scores, int num_lex_scores, + bool log_prob, int max_cache_size, bool scfg); +uint64_t 
getKey(const std::vector &source_phrase); + +std::vector CreatePrefix(const std::vector &vocabid_source, size_t endPos); -class BinaryFileWriter +template +std::string Debug(const std::vector &vec) { - std::vector binfile; - std::vector::iterator it; - //Output binary - std::ofstream os; + std::stringstream strm; + for (size_t i = 0; i < vec.size(); ++i) { + strm << vec[i] << " "; + } + return strm.str(); +} +size_t countUniqueSource(const std::string &path); + +class CacheItem +{ public: - unsigned int dist_from_start; //Distance from the start of the vector. - uint64_t extra_counter; //After we reset the counter, we still want to keep track of the correct offset, so + std::string source; + uint64_t sourceKey; + float count; + CacheItem(const std::string &vSource, uint64_t vSourceKey, float vCount) + :source(vSource) + ,sourceKey(vSourceKey) + ,count(vCount) + { + } - BinaryFileWriter (std::string); - ~BinaryFileWriter (); - void write (std::vector * bytes); - void flush (); //Flush to disk + bool operator<(const CacheItem &other) const + { + return count > other.count; + } +}; +class CacheItemOrderer +{ +public: + bool operator()(const CacheItem* a, const CacheItem* b) const + { + return (*a) < (*b); + } }; + +void serialize_cache( + std::priority_queue, CacheItemOrderer> &cache, + const std::string &path, float totalSourceCount); + +} + diff --git a/moses/TranslationModel/ProbingPT/vocabid.cpp b/moses/TranslationModel/ProbingPT/vocabid.cpp index 1452f299d..3d6efe841 100644 --- a/moses/TranslationModel/ProbingPT/vocabid.cpp +++ b/moses/TranslationModel/ProbingPT/vocabid.cpp @@ -1,32 +1,59 @@ +#include #include "vocabid.hh" +#include "StoreVocab.h" +#include "moses/Util.h" -void add_to_map(std::map *karta, StringPiece textin) +namespace Moses +{ + +void add_to_map(StoreVocab &sourceVocab, + const StringPiece &textin) { //Tokenize - util::TokenIter it(textin, util::SingleCharacter(' ')); + util::TokenIter itWord(textin, util::SingleCharacter(' ')); + + while 
(itWord) { + StringPiece word = *itWord; - while(it) { - karta->insert(std::pair(getHash(*it), it->as_string())); - it++; + util::TokenIter itFactor(word, util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + + sourceVocab.Insert(getHash(factor), factor.as_string()); + itFactor++; + } + itWord++; } } -void serialize_map(std::map *karta, const char* filename) +void serialize_map(const std::map &karta, + const std::string &filename) { - std::ofstream os (filename, std::ios::binary); - boost::archive::text_oarchive oarch(os); + std::ofstream os(filename.c_str()); + + std::map::const_iterator iter; + for (iter = karta.begin(); iter != karta.end(); ++iter) { + os << iter->first << '\t' << iter->second << std::endl; + } - oarch << *karta; //Serialise map os.close(); } -void read_map(std::map *karta, const char* filename) +void read_map(std::map &karta, const char* filename) { - std::ifstream is (filename, std::ios::binary); - boost::archive::text_iarchive iarch(is); + std::ifstream is(filename); - iarch >> *karta; + std::string line; + while (getline(is, line)) { + std::vector toks = Tokenize(line, "\t"); + assert(toks.size() == 2); + uint64_t ind = Scan(toks[1]); + karta[ind] = toks[0]; + } //Close the stream after we are done. 
is.close(); } + +} + diff --git a/moses/TranslationModel/ProbingPT/vocabid.hh b/moses/TranslationModel/ProbingPT/vocabid.hh index 491c53439..f9c9b2dff 100644 --- a/moses/TranslationModel/ProbingPT/vocabid.hh +++ b/moses/TranslationModel/ProbingPT/vocabid.hh @@ -13,8 +13,17 @@ #include "util/string_piece.hh" //Tokenization and work with StringPiece #include "util/tokenize_piece.hh" -void add_to_map(std::map *karta, StringPiece textin); +namespace Moses +{ +template +class StoreVocab; -void serialize_map(std::map *karta, const char* filename); +void add_to_map(StoreVocab &sourceVocab, + const StringPiece &textin); -void read_map(std::map *karta, const char* filename); +void serialize_map(const std::map &karta, + const std::string &filename); + +void read_map(std::map &karta, const char* filename); + +} -- cgit v1.2.3 From 2eea4dd5e0e369a43300298190c4b860c17d19ad Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Tue, 4 Oct 2016 16:48:52 +0100 Subject: compiles --- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 131 ++++++++++++------------- moses/TranslationModel/ProbingPT/ProbingPT.h | 15 ++- 2 files changed, 76 insertions(+), 70 deletions(-) diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index bb3f26e22..1298f8149 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -161,99 +161,94 @@ TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &s // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' assert(sourcePhrase.GetSize()); - TargetPhraseCollection::shared_ptr tpColl; - bool ok; - vector probingSource = ConvertToProbingSourcePhrase(sourcePhrase, ok); - if (!ok) { - // source phrase contains a word unknown in the pt. 
- // We know immediately there's no translation for it - return tpColl; + std::pair keyStruct = GetKey(sourcePhrase); + if (!keyStruct.first) { + return TargetPhraseCollection::shared_ptr(); } - std::pair > query_result; - - //Actual lookup - query_result = m_engine->query(probingSource); + // check in cache + CachePb::const_iterator iter = m_cachePb.find(keyStruct.second); + if (iter != m_cachePb.end()) { + //cerr << "FOUND IN CACHE " << keyStruct.second << " " << sourcePhrase.Debug(mgr.system) << endl; + TargetPhraseCollection *tps = iter->second; + return TargetPhraseCollection::shared_ptr(tps); + } - if (query_result.first) { - //m_engine->printTargetInfo(query_result.second); - tpColl.reset(new TargetPhraseCollection()); + // query pt + TargetPhraseCollection *tps = CreateTargetPhrases(sourcePhrase, + keyStruct.second); + return TargetPhraseCollection::shared_ptr(tps); +} - const std::vector &probingTargetPhrases = query_result.second; - for (size_t i = 0; i < probingTargetPhrases.size(); ++i) { - const target_text &probingTargetPhrase = probingTargetPhrases[i]; - TargetPhrase *tp = CreateTargetPhrase(sourcePhrase, probingTargetPhrase); +std::pair ProbingPT::GetKey(const Phrase &sourcePhrase) const +{ + std::pair ret; - tpColl->Add(tp); - } + // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' + size_t sourceSize = sourcePhrase.GetSize(); + assert(sourceSize); - tpColl->Prune(true, m_tableLimit); + uint64_t probingSource[sourceSize]; + GetSourceProbingIds(sourcePhrase, ret.first, probingSource); + if (!ret.first) { + // source phrase contains a word unknown in the pt. 
+ // We know immediately there's no translation for it + } + else { + ret.second = m_engine->getKey(probingSource, sourceSize); } - return tpColl; + return ret; + } -TargetPhrase *ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const +void ProbingPT::GetSourceProbingIds(const Phrase &sourcePhrase, + bool &ok, uint64_t probingSource[]) const { - const std::vector &probingPhrase = probingTargetPhrase.target_phrase; - size_t size = probingPhrase.size(); - - TargetPhrase *tp = new TargetPhrase(this); - // words + size_t size = sourcePhrase.GetSize(); for (size_t i = 0; i < size; ++i) { - uint64_t probingId = probingPhrase[i]; - const Factor *factor = GetTargetFactor(probingId); - assert(factor); - - Word &word = tp->AddWord(); - word.SetFactor(m_output[0], factor); + const Word &word = sourcePhrase.GetWord(i); + uint64_t probingId = GetSourceProbingId(word); + if (probingId == m_unkId) { + ok = false; + return; + } + else { + probingSource[i] = probingId; + } } - // score for this phrase table - vector scores = probingTargetPhrase.prob; - std::transform(scores.begin(), scores.end(), scores.begin(),TransformScore); - tp->GetScoreBreakdown().PlusEquals(this, scores); + ok = true; +} - // alignment - /* - const std::vector &alignments = probingTargetPhrase.word_all1; +uint64_t ProbingPT::GetSourceProbingId(const Word &word) const +{ + uint64_t ret = 0; - AlignmentInfo &aligns = tp->GetAlignTerm(); - for (size_t i = 0; i < alignS.size(); i += 2 ) { - aligns.Add((size_t) alignments[i], (size_t) alignments[i+1]); + for (size_t i = 0; i < m_input.size(); ++i) { + FactorType factorType = m_input[i]; + const Factor *factor = word[factorType]; + + size_t factorId = factor->GetId(); + if (factorId >= m_sourceVocab.size()) { + return m_unkId; + } + ret += m_sourceVocab[factorId]; } - */ - // score of all other ff when this rule is being loaded - tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); - return tp; + return ret; 
} -const Factor *ProbingPT::GetTargetFactor(uint64_t probingId) const +TargetPhraseCollection *ProbingPT::CreateTargetPhrases( + const Phrase &sourcePhrase, uint64_t key) const { - TargetVocabMap::right_map::const_iterator iter; - iter = m_vocabMap.right.find(probingId); - if (iter != m_vocabMap.right.end()) { - return iter->second; - } else { - // not in mapping. Must be UNK - return NULL; - } -} -uint64_t ProbingPT::GetSourceProbingId(const Factor *factor) const -{ - SourceVocabMap::left_map::const_iterator iter; - iter = m_sourceVocabMap.left.find(factor); - if (iter != m_sourceVocabMap.left.end()) { - return iter->second; - } else { - // not in mapping. Must be UNK - return m_unkId; - } } +////////////////////////////////////////////////////////////////// + + ChartRuleLookupManager *ProbingPT::CreateRuleLookupManager( const ChartParser &, const ChartCellCollectionBase &, diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h index 3b5dfc895..98d052e07 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT/ProbingPT.h @@ -2,6 +2,7 @@ #pragma once #include #include +#include #include "../PhraseDictionary.h" @@ -48,15 +49,25 @@ protected: boost::iostreams::mapped_file_source file; const char *data; + // caching + typedef boost::unordered_map CachePb; + CachePb m_cachePb; + void CreateAlignmentMap(const std::string path); TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const; - TargetPhrase *CreateTargetPhrase(const Phrase &sourcePhrase, const target_text &probingTargetPhrase) const; - const Factor *GetTargetFactor(uint64_t probingId) const; uint64_t GetSourceProbingId(const Factor *factor) const; std::vector ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const; + std::pair GetKey(const Phrase &sourcePhrase) const; + void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok, + uint64_t probingSource[]) const; + 
uint64_t GetSourceProbingId(const Word &word) const; + + TargetPhraseCollection *CreateTargetPhrases( + const Phrase &sourcePhrase, uint64_t key) const; + }; } // namespace Moses -- cgit v1.2.3 From 041b13eb19f364b79809a7efa08c4552d41d4e75 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 15:15:47 +0100 Subject: compiles but segfault --- moses/ScoreComponentCollection.h | 9 ++ moses/TranslationModel/ProbingPT/ProbingPT.cpp | 122 +++++++++++++++++++++---- moses/TranslationModel/ProbingPT/ProbingPT.h | 14 ++- 3 files changed, 123 insertions(+), 22 deletions(-) diff --git a/moses/ScoreComponentCollection.h b/moses/ScoreComponentCollection.h index 1305e9c16..0ab57a73a 100644 --- a/moses/ScoreComponentCollection.h +++ b/moses/ScoreComponentCollection.h @@ -247,6 +247,15 @@ public: } } + void PlusEquals(const FeatureFunction* sp, float scores[]) + { + size_t numScores = sp->GetNumScoreComponents(); + size_t offset = sp->GetIndex(); + for (size_t i = 0; i < numScores; ++i) { + m_scores[i + offset] += scores[i]; + } + } + //! Special version PlusEquals(ScoreProducer, vector) //! to add the score from a single ScoreProducer that produces //! 
a single value diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index 1298f8149..1fd982f0e 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -137,25 +137,6 @@ void ProbingPT::GetTargetPhraseCollectionBatch(const InputPathList &inputPathQue } } -std::vector ProbingPT::ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const -{ - size_t size = sourcePhrase.GetSize(); - std::vector ret(size); - for (size_t i = 0; i < size; ++i) { - const Factor *factor = sourcePhrase.GetFactor(i, m_input[0]); - uint64_t probingId = GetSourceProbingId(factor); - if (probingId == m_unkId) { - ok = false; - return ret; - } else { - ret[i] = probingId; - } - } - - ok = true; - return ret; -} - TargetPhraseCollection::shared_ptr ProbingPT::CreateTargetPhrase(const Phrase &sourcePhrase) const { // create a target phrase from the 1st word of the source, prefix with 'ProbingPT:' @@ -243,7 +224,110 @@ uint64_t ProbingPT::GetSourceProbingId(const Word &word) const TargetPhraseCollection *ProbingPT::CreateTargetPhrases( const Phrase &sourcePhrase, uint64_t key) const { + TargetPhraseCollection *tps = NULL; + + //Actual lookup + std::pair query_result; // 1st=found, 2nd=target file offset + query_result = m_engine->query(key); + //cerr << "key2=" << query_result.second << endl; + + if (query_result.first) { + const char *offset = data + query_result.second; + uint64_t *numTP = (uint64_t*) offset; + + tps = new TargetPhraseCollection(); + + offset += sizeof(uint64_t); + for (size_t i = 0; i < *numTP; ++i) { + TargetPhrase *tp = CreateTargetPhrase(offset); + assert(tp); + tp->EvaluateInIsolation(sourcePhrase, GetFeaturesToApply()); + + tps->Add(tp); + + } + + tps->Prune(true, m_tableLimit); + //cerr << *tps << endl; + } + + return tps; + +} + +TargetPhrase *ProbingPT::CreateTargetPhrase( + const char *&offset) const +{ + TargetPhraseInfo *tpInfo = 
(TargetPhraseInfo*) offset; + size_t numRealWords = tpInfo->numWords / m_output.size(); + + TargetPhrase *tp = new TargetPhrase(this); + + offset += sizeof(TargetPhraseInfo); + + // scores + float *scores = (float*) offset; + + size_t totalNumScores = m_engine->num_scores + m_engine->num_lex_scores; + + if (m_engine->logProb) { + // set pt score for rule + tp->GetScoreBreakdown().PlusEquals(this, scores); + + // save scores for other FF, eg. lex RO. Just give the offset + /* + if (m_engine->num_lex_scores) { + tp->scoreProperties = scores + m_engine->num_scores; + } + */ + } + else { + // log score 1st + float logScores[totalNumScores]; + for (size_t i = 0; i < totalNumScores; ++i) { + logScores[i] = FloorScore(TransformScore(scores[i])); + } + + // set pt score for rule + tp->GetScoreBreakdown().PlusEquals(this, logScores); + + // save scores for other FF, eg. lex RO. + /* + tp->scoreProperties = pool.Allocate(m_engine->num_lex_scores); + for (size_t i = 0; i < m_engine->num_lex_scores; ++i) { + tp->scoreProperties[i] = logScores[i + m_engine->num_scores]; + } + */ + } + + offset += sizeof(float) * totalNumScores; + + // words + for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) { + for (size_t i = 0; i < m_output.size(); ++i) { + FactorType factorType = m_output[i]; + + uint32_t *probingId = (uint32_t*) offset; + + const Factor *factor = GetTargetFactor(*probingId); + assert(factor); + + Word &word = tp->GetWord(targetPos); + word[factorType] = factor; + + offset += sizeof(uint32_t); + } + } + + // align + uint32_t alignTerm = tpInfo->alignTerm; + //cerr << "alignTerm=" << alignTerm << endl; + UTIL_THROW_IF2(alignTerm >= m_aligns.size(), "Unknown alignInd"); + tp->SetAlignTerm(m_aligns[alignTerm]); + + // properties TODO + return tp; } ////////////////////////////////////////////////////////////////// diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.h b/moses/TranslationModel/ProbingPT/ProbingPT.h index 98d052e07..21c01df28 100644 --- 
a/moses/TranslationModel/ProbingPT/ProbingPT.h +++ b/moses/TranslationModel/ProbingPT/ProbingPT.h @@ -56,17 +56,25 @@ protected: void CreateAlignmentMap(const std::string path); TargetPhraseCollection::shared_ptr CreateTargetPhrase(const Phrase &sourcePhrase) const; - uint64_t GetSourceProbingId(const Factor *factor) const; - - std::vector ConvertToProbingSourcePhrase(const Phrase &sourcePhrase, bool &ok) const; std::pair GetKey(const Phrase &sourcePhrase) const; void GetSourceProbingIds(const Phrase &sourcePhrase, bool &ok, uint64_t probingSource[]) const; uint64_t GetSourceProbingId(const Word &word) const; + uint64_t GetSourceProbingId(const Factor *factor) const; TargetPhraseCollection *CreateTargetPhrases( const Phrase &sourcePhrase, uint64_t key) const; + TargetPhrase *CreateTargetPhrase( + const char *&offset) const; + + inline const Factor *GetTargetFactor(uint32_t probingId) const + { + if (probingId >= m_targetVocab.size()) { + return NULL; + } + return m_targetVocab[probingId]; + } }; -- cgit v1.2.3 From 0e4e64b26dd3b82a0dfbfe2445f89e1dcbbdf61a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:26:33 +0100 Subject: getKey() -> Moses::getKey() --- moses/TranslationModel/ProbingPT/quering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp index ef980ef06..d616e1f25 100644 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ b/moses/TranslationModel/ProbingPT/quering.cpp @@ -96,7 +96,7 @@ uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const { //TOO SLOW //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - return getKey(source_phrase, size); + return Moses::getKey(source_phrase, size); } std::pair QueryEngine::query(uint64_t key) -- cgit v1.2.3 From 7d7ae1b72ca6487cd50dba6d20d0ba4a4b08b782 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:43:04 +0100 Subject: add 
StoreVocab --- moses/TranslationModel/ProbingPT/StoreTarget.cpp | 266 +++++++++++++++++++++++ moses/TranslationModel/ProbingPT/StoreTarget.h | 51 +++++ moses/TranslationModel/ProbingPT/StoreVocab.cpp | 13 ++ moses/TranslationModel/ProbingPT/StoreVocab.h | 64 ++++++ 4 files changed, 394 insertions(+) create mode 100644 moses/TranslationModel/ProbingPT/StoreTarget.cpp create mode 100644 moses/TranslationModel/ProbingPT/StoreTarget.h create mode 100644 moses/TranslationModel/ProbingPT/StoreVocab.cpp create mode 100644 moses/TranslationModel/ProbingPT/StoreVocab.h diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.cpp b/moses/TranslationModel/ProbingPT/StoreTarget.cpp new file mode 100644 index 000000000..8072f408b --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.cpp @@ -0,0 +1,266 @@ +/* + * StoreTarget.cpp + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#include +#include "StoreTarget.h" +#include "line_splitter.hh" +#include "probing_hash_utils.hh" +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +using namespace std; + +namespace Moses +{ + +StoreTarget::StoreTarget(const std::string &basepath) +:m_basePath(basepath) +,m_vocab(basepath + "/TargetVocab.dat") +{ + std::string path = basepath + "/TargetColl.dat"; + m_fileTargetColl.open(path.c_str(), + std::ios::out | std::ios::binary | std::ios::ate | std::ios::trunc); + if (!m_fileTargetColl.is_open()) { + throw "can't create file "; + } + +} + +StoreTarget::~StoreTarget() +{ + assert(m_coll.empty()); + m_fileTargetColl.close(); + + // vocab + m_vocab.Save(); +} + +uint64_t StoreTarget::Save() +{ + uint64_t ret = m_fileTargetColl.tellp(); + + // save to disk + uint64_t numTP = m_coll.size(); + m_fileTargetColl.write((char*) &numTP, sizeof(uint64_t)); + + for (size_t i = 0; i < m_coll.size(); ++i) { + Save(*m_coll[i]); + } + + // clear coll + RemoveAllInColl(m_coll); + m_coll.clear(); + + // starting position of coll + return ret; +} + +void StoreTarget::Save(const 
target_text &rule) +{ + // metadata for each tp + TargetPhraseInfo tpInfo; + tpInfo.alignTerm = GetAlignId(rule.word_align_term); + tpInfo.alignNonTerm = GetAlignId(rule.word_align_non_term); + tpInfo.numWords = rule.target_phrase.size(); + tpInfo.propLength = rule.property.size(); + + //cerr << "TPInfo=" << sizeof(TPInfo); + m_fileTargetColl.write((char*) &tpInfo, sizeof(TargetPhraseInfo)); + + // scores + for (size_t i = 0; i < rule.prob.size(); ++i) { + float prob = rule.prob[i]; + m_fileTargetColl.write((char*) &prob, sizeof(prob)); + } + + // tp + for (size_t i = 0; i < rule.target_phrase.size(); ++i) { + uint32_t vocabId = rule.target_phrase[i]; + m_fileTargetColl.write((char*) &vocabId, sizeof(vocabId)); + } + + // prop TODO + +} + +void StoreTarget::SaveAlignment() +{ + std::string path = m_basePath + "/Alignments.dat"; + OutputFileStream file(path); + + BOOST_FOREACH(Alignments::value_type &valPair, m_aligns) { + file << valPair.second << "\t"; + + const std::vector &aligns = valPair.first; + BOOST_FOREACH(size_t align, aligns) { + file << align << " "; + } + file << endl; + } + +} + +void StoreTarget::Append(const line_text &line, bool log_prob, bool scfg) +{ + target_text *rule = new target_text; + //cerr << "line.target_phrase=" << line.target_phrase << endl; + + // target_phrase + vector nonTerms; + util::TokenIter it; + it = util::TokenIter(line.target_phrase, + util::SingleCharacter(' ')); + while (it) { + StringPiece word = *it; + //cerr << "word=" << word << endl; + + bool nonTerm = false; + if (scfg) { + // not really sure how to handle factored SCFG and NT + if (scfg && word[0] == '[' && word[word.size() - 1] == ']') { + //cerr << "NON-TERM=" << tok << " " << nonTerms.size() << endl; + nonTerm = true; + } + nonTerms.push_back(nonTerm); + } + + util::TokenIter itFactor; + itFactor = util::TokenIter(word, + util::SingleCharacter('|')); + while (itFactor) { + StringPiece factor = *itFactor; + + string factorStr = factor.as_string(); + uint32_t 
vocabId = m_vocab.GetVocabId(factorStr); + + rule->target_phrase.push_back(vocabId); + + itFactor++; + } + + it++; + } + + // probs + it = util::TokenIter(line.prob, + util::SingleCharacter(' ')); + while (it) { + string tok = it->as_string(); + float prob = Scan(tok); + + if (log_prob) { + prob = FloorScore(log(prob)); + if (prob == 0.0f) prob = 0.0000000001; + } + + rule->prob.push_back(prob); + it++; + } + + /* + cerr << "nonTerms="; + for (size_t i = 0; i < nonTerms.size(); ++i) { + cerr << nonTerms[i] << " "; + } + cerr << endl; + */ + + // alignment + it = util::TokenIter(line.word_align, + util::SingleCharacter(' ')); + while (it) { + string tokPair = Trim(it->as_string()); + if (tokPair.empty()) { + break; + } + + vector alignPair = Tokenize(tokPair, "-"); + assert(alignPair.size() == 2); + + bool nonTerm = false; + size_t sourcePos = alignPair[0]; + size_t targetPos = alignPair[1]; + if (scfg) { + nonTerm = nonTerms[targetPos]; + } + + //cerr << targetPos << "=" << nonTerm << endl; + + if (nonTerm) { + rule->word_align_non_term.push_back(sourcePos); + rule->word_align_non_term.push_back(targetPos); + //cerr << (int) rule->word_all1.back() << " "; + } + else { + rule->word_align_term.push_back(sourcePos); + rule->word_align_term.push_back(targetPos); + } + + it++; + } + + // extra scores + string prop = line.property.as_string(); + AppendLexRO(prop, rule->prob, log_prob); + + //cerr << "line.property=" << line.property << endl; + //cerr << "prop=" << prop << endl; + + // properties + /* + for (size_t i = 0; i < prop.size(); ++i) { + rule->property.push_back(prop[i]); + } + */ + m_coll.push_back(rule); +} + +uint32_t StoreTarget::GetAlignId(const std::vector &align) +{ + boost::unordered_map, uint32_t>::iterator iter = + m_aligns.find(align); + if (iter == m_aligns.end()) { + uint32_t ind = m_aligns.size(); + m_aligns[align] = ind; + return ind; + } + else { + return iter->second; + } +} + +void StoreTarget::AppendLexRO(std::string &prop, std::vector 
&retvector, + bool log_prob) const +{ + size_t startPos = prop.find("{{LexRO "); + + if (startPos != string::npos) { + size_t endPos = prop.find("}}", startPos + 8); + string lexProb = prop.substr(startPos + 8, endPos - startPos - 8); + //cerr << "lexProb=" << lexProb << endl; + + // append lex probs to pt probs + vector scores = Tokenize(lexProb); + + if (log_prob) { + for (size_t i = 0; i < scores.size(); ++i) { + scores[i] = FloorScore(log(scores[i])); + if (scores[i] == 0.0f) scores[i] = 0.0000000001; + } + } + + for (size_t i = 0; i < scores.size(); ++i) { + retvector.push_back(scores[i]); + } + + // exclude LexRO property from property column + prop = prop.substr(0, startPos) + + prop.substr(endPos + 2, prop.size() - endPos - 2); + //cerr << "line.property_to_be_binarized=" << line.property_to_be_binarized << "AAAA" << endl; + } +} + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreTarget.h b/moses/TranslationModel/ProbingPT/StoreTarget.h new file mode 100644 index 000000000..5c7d9e1b7 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreTarget.h @@ -0,0 +1,51 @@ +/* + * StoreTarget.h + * + * Created on: 19 Jan 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include +#include +#include +#include +#include "StoreVocab.h" + +namespace Moses +{ + +class line_text; +class target_text; + +class StoreTarget +{ +public: + StoreTarget(const std::string &basepath); + virtual ~StoreTarget(); + + uint64_t Save(); + void SaveAlignment(); + + void Append(const line_text &line, bool log_prob, bool scfg); +protected: + std::string m_basePath; + std::fstream m_fileTargetColl; + StoreVocab m_vocab; + + typedef boost::unordered_map, uint32_t> Alignments; + Alignments m_aligns; + + std::vector m_coll; + + uint32_t GetAlignId(const std::vector &align); + void Save(const target_text &rule); + + void AppendLexRO(std::string &prop, std::vector &retvector, + bool log_prob) const; + +}; + +} /* namespace Moses2 */ + diff --git 
a/moses/TranslationModel/ProbingPT/StoreVocab.cpp b/moses/TranslationModel/ProbingPT/StoreVocab.cpp new file mode 100644 index 000000000..6515bac63 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.cpp @@ -0,0 +1,13 @@ +/* + * StoreVocab.cpp + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#include +#include "StoreVocab.h" + +namespace Moses +{ + +} /* namespace Moses2 */ diff --git a/moses/TranslationModel/ProbingPT/StoreVocab.h b/moses/TranslationModel/ProbingPT/StoreVocab.h new file mode 100644 index 000000000..05d279f4c --- /dev/null +++ b/moses/TranslationModel/ProbingPT/StoreVocab.h @@ -0,0 +1,64 @@ +/* + * StoreVocab.h + * + * Created on: 15 Jun 2016 + * Author: hieu + */ +#pragma once +#include +#include +#include "moses/OutputFileStream.h" +#include "moses/Util.h" + +namespace Moses +{ + +template +class StoreVocab +{ +protected: + std::string m_path; + + typedef boost::unordered_map Coll; + Coll m_vocab; + +public: + StoreVocab(const std::string &path) + :m_path(path) + {} + + virtual ~StoreVocab() {} + + VOCABID GetVocabId(const std::string &word) + { + typename Coll::iterator iter = m_vocab.find(word); + if (iter == m_vocab.end()) { + VOCABID ind = m_vocab.size() + 1; + m_vocab[word] = ind; + return ind; + } + else { + return iter->second; + } + } + + void Insert(VOCABID id, const std::string &word) + { + m_vocab[word] = id; + } + + void Save() + { + OutputFileStream strme(m_path); + + typename Coll::const_iterator iter; + for (iter = m_vocab.begin(); iter != m_vocab.end(); ++iter) { + strme << iter->first << "\t" << iter->second << std::endl; + } + + strme.Close(); + } +}; + +} /* namespace Moses2 */ + -- cgit v1.2.3 From 43ece9b1fff4ef94117e946f24269d2dbf17b20e Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:53:16 +0100 Subject: compile error with gcc 4.6.3 --- contrib/moses2/MemPool.h | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/contrib/moses2/MemPool.h b/contrib/moses2/MemPool.h index 
5820ce2be..8160aa5a3 100644 --- a/contrib/moses2/MemPool.h +++ b/contrib/moses2/MemPool.h @@ -230,6 +230,14 @@ public: //std::cerr << "destroy " << p << " " << n << std::endl; } + // return address of values + pointer address (reference value) const { + return &value; + } + const_pointer address (const_reference value) const { + return &value; + } + MemPool &m_pool; protected: }; -- cgit v1.2.3 From cb348f159adae8208ded7042f51c5d9dd739ccb0 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Wed, 5 Oct 2016 16:57:12 +0100 Subject: add --scfg arg --- scripts/generic/binarize4moses2.perl | 41 +++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/scripts/generic/binarize4moses2.perl b/scripts/generic/binarize4moses2.perl index 5b9f08e50..5765c3705 100755 --- a/scripts/generic/binarize4moses2.perl +++ b/scripts/generic/binarize4moses2.perl @@ -12,22 +12,23 @@ my $mosesDir = "$RealBin/../.."; my $ptPath; my $lexRoPath; my $outPath; -my $numScores = 4; my $numLexScores; my $pruneNum = 0; +my $scfg = 0; GetOptions("phrase-table=s" => \$ptPath, "lex-ro=s" => \$lexRoPath, "output-dir=s" => \$outPath, - "num-scores=s" => \$numScores, "num-lex-scores=i" => \$numLexScores, - "prune=i" => \$pruneNum + "prune=i" => \$pruneNum, + "scfg" => \$scfg ) or exit 1; +#print STDERR "scfg=$scfg \n"; die("ERROR: please set --phrase-table") unless defined($ptPath); -die("ERROR: please set --lex-ro") unless defined($lexRoPath); +#die("ERROR: please set --lex-ro") unless defined($lexRoPath); die("ERROR: please set --output-dir") unless defined($outPath); -die("ERROR: please set --num-lex-scores") unless defined($numLexScores); +#die("ERROR: please set --num-lex-scores") unless defined($numLexScores); my $cmd; @@ -37,13 +38,33 @@ my $tempPath = dirname($outPath) ."/tmp.$$"; $cmd = "gzip -dc $ptPath | $mosesDir/contrib/sigtest-filter/filter-pt -n $pruneNum | gzip -c > $tempPath/pt.gz"; systemCheck($cmd); -$cmd = "$mosesDir/bin/processLexicalTableMin -in 
$lexRoPath -out $tempPath/lex-ro -T . -threads all"; -systemCheck($cmd); +if (defined($lexRoPath)) { + die("ERROR: please set --num-lex-scores") unless defined($numLexScores); -$cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz"; -systemCheck($cmd); + $cmd = "$mosesDir/bin/processLexicalTableMin -in $lexRoPath -out $tempPath/lex-ro -T . -threads all"; + systemCheck($cmd); + + $cmd = "$mosesDir/bin/addLexROtoPT $tempPath/pt.gz $tempPath/lex-ro.minlexr | gzip -c > $tempPath/pt.withLexRO.gz"; + systemCheck($cmd); + + $cmd = "ln -s pt.withLexRO.gz $tempPath/pt.txt.gz"; + systemCheck($cmd); +} +else { + $cmd = "ln -s pt.gz $tempPath/pt.txt.gz"; + systemCheck($cmd); +} + +$cmd = "$mosesDir/bin/CreateProbingPT2 --log-prob --input-pt $tempPath/pt.txt.gz --output-dir $outPath"; + +if (defined($lexRoPath)) { + $cmd .= " --num-lex-scores $numLexScores"; +} + +if ($scfg) { + $cmd .= " --scfg"; +} -$cmd = "$mosesDir/bin/CreateProbingPT2 --num-scores $numScores --num-lex-scores $numLexScores --log-prob --input-pt $tempPath/pt.withLexRO.gz --output-dir $outPath"; systemCheck($cmd); exit(0); -- cgit v1.2.3 From fa888166c00d266c09de6f22d123901aae15d73a Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:10:55 +0100 Subject: no segfault. 
yay --- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index 1fd982f0e..06b1360cd 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -304,6 +304,7 @@ TargetPhrase *ProbingPT::CreateTargetPhrase( // words for (size_t targetPos = 0; targetPos < numRealWords; ++targetPos) { + Word &word = tp->AddWord(); for (size_t i = 0; i < m_output.size(); ++i) { FactorType factorType = m_output[i]; @@ -312,7 +313,6 @@ TargetPhrase *ProbingPT::CreateTargetPhrase( const Factor *factor = GetTargetFactor(*probingId); assert(factor); - Word &word = tp->GetWord(targetPos); word[factorType] = factor; offset += sizeof(uint32_t); -- cgit v1.2.3 From d03991acec06b10bdd6fe213aac64012978ae90b Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:31:18 +0100 Subject: delete CreateProbingPT2 --- Jamroot | 2 + contrib/moses2/CreateProbingPT2.cpp | 113 ------------------------------------ contrib/moses2/Jamfile | 3 +- 3 files changed, 3 insertions(+), 115 deletions(-) delete mode 100644 contrib/moses2/CreateProbingPT2.cpp diff --git a/Jamroot b/Jamroot index efafa0122..7a7be5c93 100644 --- a/Jamroot +++ b/Jamroot @@ -341,3 +341,5 @@ if [ path.exists $(TOP)/dist ] && $(prefix) != dist { local temp = [ _shell "mkdir -p $(TOP)/bin" ] ; local temp = [ _shell "rm -f $(TOP)/bin/moses_chart" ] ; local temp = [ _shell "cd $(TOP)/bin && ln -s moses moses_chart" ] ; +local temp = [ _shell "cd $(TOP)/bin && ln -s CreateProbingPT CreateProbingPT2" ] ; + diff --git a/contrib/moses2/CreateProbingPT2.cpp b/contrib/moses2/CreateProbingPT2.cpp deleted file mode 100644 index 24b0e2fd1..000000000 --- a/contrib/moses2/CreateProbingPT2.cpp +++ /dev/null @@ -1,113 +0,0 @@ -#include -#include -#include "util/usage.hh" -#include "TranslationModel/ProbingPT/storing.hh" -#include 
"legacy/InputFileStream.h" -#include "legacy/OutputFileStream.h" -#include "legacy/Util2.h" - -using namespace std; - -std::string ReformatSCFGFile(const std::string &path); - -int main(int argc, char* argv[]) -{ - string inPath, outPath; - int num_scores = 4; - int num_lex_scores = 0; - bool log_prob = false; - bool scfg = false; - int max_cache_size = 50000; - - namespace po = boost::program_options; - po::options_description desc("Options"); - desc.add_options() - ("help", "Print help messages") - ("input-pt", po::value()->required(), "Text pt") - ("output-dir", po::value()->required(), "Directory when binary files will be written") - ("num-scores", po::value()->default_value(num_scores), "Number of pt scores") - ("num-lex-scores", po::value()->default_value(num_lex_scores), "Number of lexicalized reordering scores") - ("log-prob", "log (and floor) probabilities before storing") - ("max-cache-size", po::value()->default_value(max_cache_size), "Maximum number of high-count source lines to write to cache file. 0=no cache, negative=no limit") - ("scfg", "Rules are SCFG in Moses format (ie. 
with non-terms and LHS") - - ; - - po::variables_map vm; - try { - po::store(po::parse_command_line(argc, argv, desc), - vm); // can throw - - /** --help option - */ - if ( vm.count("help")) { - std::cout << desc << std::endl; - return EXIT_SUCCESS; - } - - po::notify(vm); // throws on error, so do after help in case - // there are any problems - } catch(po::error& e) { - std::cerr << "ERROR: " << e.what() << std::endl << std::endl; - std::cerr << desc << std::endl; - return EXIT_FAILURE; - } - - if (vm.count("input-pt")) inPath = vm["input-pt"].as(); - if (vm.count("output-dir")) outPath = vm["output-dir"].as(); - if (vm.count("num-scores")) num_scores = vm["num-scores"].as(); - if (vm.count("num-lex-scores")) num_lex_scores = vm["num-lex-scores"].as(); - if (vm.count("max-cache-size")) max_cache_size = vm["max-cache-size"].as(); - if (vm.count("log-prob")) log_prob = true; - if (vm.count("scfg")) scfg = true; - - - if (scfg) { - inPath = ReformatSCFGFile(inPath); - } - - Moses2::createProbingPT(inPath, outPath, num_scores, num_lex_scores, log_prob, max_cache_size, scfg); - - //util::PrintUsage(std::cout); - return 0; -} - -std::string ReformatSCFGFile(const std::string &path) -{ - Moses2::InputFileStream inFile(path); - string reformattedPath = path + ".reformat.gz"; - Moses2::OutputFileStream outFile(reformattedPath); - - string line; - while (getline(inFile, line)) { - vector toks = Moses2::TokenizeMultiCharSeparator(line, "|||"); - assert(toks.size() >= 3); - - // source - vector sourceToks = Moses2::Tokenize(toks[0], " "); - for (size_t i = 0; i < sourceToks.size() - 1; ++i) { - outFile << sourceToks[i] << " "; - } - - // other columns - for (size_t i = 1; i < toks.size(); ++i) { - outFile << "|||" << toks[i]; - } - outFile << endl; - } - - inFile.Close(); - outFile.Close(); - - string sortedPath = path + ".reformat.sorted.gz"; - string tmpPath = path + ".tmp "; - string cmd = "mkdir " + tmpPath - + " && gzip -dc " + reformattedPath + " | LC_ALL=C sort -T " + 
tmpPath + " | gzip -c > " + sortedPath; - system(cmd.c_str()); - - cmd = "rm -rf " + tmpPath + " " + reformattedPath; - system(cmd.c_str()); - - return sortedPath; -} - diff --git a/contrib/moses2/Jamfile b/contrib/moses2/Jamfile index 193ac8db5..8791e3cf9 100644 --- a/contrib/moses2/Jamfile +++ b/contrib/moses2/Jamfile @@ -173,11 +173,10 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose deps ; exe moses2 : Main.cpp moses2_lib ; -exe CreateProbingPT2 : CreateProbingPT2.cpp moses2_lib ; if [ xmlrpc ] { echo "Building Moses2" ; - alias programs : moses2 CreateProbingPT2 ; + alias programs : moses2 ; } else { echo "Not building Moses2" ; -- cgit v1.2.3 From babc5acf70730bd9fd28d0d4deede8fe0fb23c29 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:48:06 +0100 Subject: check that pt exists --- contrib/moses2/TranslationModel/ProbingPT/quering.cpp | 7 ++++++- moses/TranslationModel/ProbingPT/quering.cpp | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp b/contrib/moses2/TranslationModel/ProbingPT/quering.cpp index f26439442..36e384e73 100644 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp +++ b/contrib/moses2/TranslationModel/ProbingPT/quering.cpp @@ -12,10 +12,15 @@ QueryEngine::QueryEngine(const char * filepath) //Create filepaths std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; std::string path_to_hashtable = basepath + "/probing_hash.dat"; std::string path_to_source_vocabid = basepath + "/source_vocabids"; std::string alignPath = basepath + "/Alignments.dat"; + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + ///Source phrase vocabids read_map(source_vocabids, path_to_source_vocabid.c_str()); @@ -25,7 +30,7 @@ QueryEngine::QueryEngine(const char * filepath) //Read config file boost::unordered_map keyValue; - 
std::ifstream config((basepath + "/config").c_str()); + std::ifstream config(path_to_config.c_str()); std::string line; while (getline(config, line)) { std::vector toks = Moses2::Tokenize(line, "\t"); diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp index d616e1f25..52cd7f516 100644 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ b/moses/TranslationModel/ProbingPT/quering.cpp @@ -11,10 +11,15 @@ QueryEngine::QueryEngine(const char * filepath) //Create filepaths std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; std::string path_to_hashtable = basepath + "/probing_hash.dat"; std::string path_to_source_vocabid = basepath + "/source_vocabids"; std::string alignPath = basepath + "/Alignments.dat"; + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + ///Source phrase vocabids read_map(source_vocabids, path_to_source_vocabid.c_str()); @@ -24,7 +29,7 @@ QueryEngine::QueryEngine(const char * filepath) //Read config file boost::unordered_map keyValue; - std::ifstream config((basepath + "/config").c_str()); + std::ifstream config(path_to_config.c_str()); std::string line; while (getline(config, line)) { std::vector toks = Tokenize(line, "\t"); -- cgit v1.2.3 From a2fd8d5b2c43f0008a050aa850ed387b2289c9c9 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Thu, 6 Oct 2016 13:57:33 +0100 Subject: quering -> querying --- contrib/moses2/Jamfile | 2 +- .../TranslationModel/ProbingPT/ProbingPT.cpp | 2 +- .../moses2/TranslationModel/ProbingPT/quering.cpp | 143 --------------------- .../moses2/TranslationModel/ProbingPT/quering.hh | 65 ---------- .../moses2/TranslationModel/ProbingPT/querying.cpp | 143 +++++++++++++++++++++ .../moses2/TranslationModel/ProbingPT/querying.hh | 65 ++++++++++ contrib/other-builds/moses/.project | 8 +- moses/TranslationModel/ProbingPT/ProbingPT.cpp | 2 +- 
moses/TranslationModel/ProbingPT/quering.cpp | 142 -------------------- moses/TranslationModel/ProbingPT/quering.hh | 65 ---------- moses/TranslationModel/ProbingPT/querying.cpp | 142 ++++++++++++++++++++ moses/TranslationModel/ProbingPT/querying.hh | 65 ++++++++++ 12 files changed, 422 insertions(+), 422 deletions(-) delete mode 100644 contrib/moses2/TranslationModel/ProbingPT/quering.cpp delete mode 100644 contrib/moses2/TranslationModel/ProbingPT/quering.hh create mode 100644 contrib/moses2/TranslationModel/ProbingPT/querying.cpp create mode 100644 contrib/moses2/TranslationModel/ProbingPT/querying.hh delete mode 100644 moses/TranslationModel/ProbingPT/quering.cpp delete mode 100644 moses/TranslationModel/ProbingPT/quering.hh create mode 100644 moses/TranslationModel/ProbingPT/querying.cpp create mode 100644 moses/TranslationModel/ProbingPT/querying.hh diff --git a/contrib/moses2/Jamfile b/contrib/moses2/Jamfile index 8791e3cf9..850dbcd1f 100644 --- a/contrib/moses2/Jamfile +++ b/contrib/moses2/Jamfile @@ -72,7 +72,7 @@ alias deps : ../..//z ../..//boost_iostreams ../..//boost_filesystem ../../mose TranslationModel/ProbingPT/hash.cpp TranslationModel/ProbingPT/line_splitter.cpp TranslationModel/ProbingPT/probing_hash_utils.cpp - TranslationModel/ProbingPT/quering.cpp + TranslationModel/ProbingPT/querying.cpp TranslationModel/ProbingPT/storing.cpp TranslationModel/ProbingPT/StoreVocab.cpp TranslationModel/ProbingPT/StoreTarget.cpp diff --git a/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp b/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp index 829906b55..2c9a5f31a 100644 --- a/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/contrib/moses2/TranslationModel/ProbingPT/ProbingPT.cpp @@ -6,7 +6,7 @@ */ #include #include "ProbingPT.h" -#include "quering.hh" +#include "querying.hh" #include "probing_hash_utils.hh" #include "util/exception.hh" #include "../../System.h" diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp 
b/contrib/moses2/TranslationModel/ProbingPT/quering.cpp deleted file mode 100644 index 36e384e73..000000000 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.cpp +++ /dev/null @@ -1,143 +0,0 @@ -#include "quering.hh" -#include "util/exception.hh" -#include "../../legacy/Util2.h" - -using namespace std; - -namespace Moses2 -{ - -QueryEngine::QueryEngine(const char * filepath) -{ - - //Create filepaths - std::string basepath(filepath); - std::string path_to_config = basepath + "/config"; - std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_source_vocabid = basepath + "/source_vocabids"; - std::string alignPath = basepath + "/Alignments.dat"; - - if (!FileExists(path_to_config)) { - UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); - } - - ///Source phrase vocabids - read_map(source_vocabids, path_to_source_vocabid.c_str()); - - // alignments - read_alignments(alignPath); - - //Read config file - boost::unordered_map keyValue; - - std::ifstream config(path_to_config.c_str()); - std::string line; - while (getline(config, line)) { - std::vector toks = Moses2::Tokenize(line, "\t"); - UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); - keyValue[ toks[0] ] = toks[1]; - } - - bool found; - //Check API version: - int version; - found = Get(keyValue, "API_VERSION", version); - if (!found) { - std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; - } - else if (version != API_VERSION) { - std::cerr << "The ProbingPT API has changed. " << version << "!=" - << API_VERSION << " Please rebinarize your phrase tables." << std::endl; - exit(EXIT_FAILURE); - } - - //Get tablesize. 
- int tablesize; - found = Get(keyValue, "uniq_entries", tablesize); - if (!found) { - std::cerr << "uniq_entries not found" << std::endl; - exit(EXIT_FAILURE); - } - - //Number of scores - found = Get(keyValue, "num_scores", num_scores); - if (!found) { - std::cerr << "num_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - //How may scores from lex reordering models - found = Get(keyValue, "num_lex_scores", num_lex_scores); - if (!found) { - std::cerr << "num_lex_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - // have the scores been log() and FloorScore()? - found = Get(keyValue, "log_prob", logProb); - if (!found) { - std::cerr << "logProb not found" << std::endl; - exit(EXIT_FAILURE); - } - - config.close(); - - //Read hashtable - table_filesize = Table::Size(tablesize, 1.2); - mem = readTable(path_to_hashtable.c_str(), table_filesize); - Table table_init(mem, table_filesize); - table = table_init; - - std::cerr << "Initialized successfully! " << std::endl; -} - -QueryEngine::~QueryEngine() -{ - //Clear mmap content from memory. 
- munmap(mem, table_filesize); - -} - -uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const -{ - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - return Moses2::getKey(source_phrase, size); -} - -std::pair QueryEngine::query(uint64_t key) -{ - std::pair ret; - - const Entry * entry; - ret.first = table.Find(key, entry); - if (ret.first) { - ret.second = entry->value; - } - return ret; -} - -void QueryEngine::read_alignments(const std::string &alignPath) -{ - std::ifstream strm(alignPath.c_str()); - - string line; - while (getline(strm, line)) { - vector toks = Moses2::Tokenize(line, "\t "); - UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - - uint32_t alignInd = Scan(toks[0]); - if (alignInd >= alignColl.size()) { - alignColl.resize(alignInd + 1); - } - - Alignments &aligns = alignColl[alignInd]; - for (size_t i = 1; i < toks.size(); ++i) { - size_t pos = Scan(toks[i]); - aligns.push_back(pos); - } - } -} - -} - diff --git a/contrib/moses2/TranslationModel/ProbingPT/quering.hh b/contrib/moses2/TranslationModel/ProbingPT/quering.hh deleted file mode 100644 index aae4b4f09..000000000 --- a/contrib/moses2/TranslationModel/ProbingPT/quering.hh +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include //For finding size of file -#include "vocabid.hh" -#include //toLower -#include -#include "probing_hash_utils.hh" -#include "hash.hh" //Includes line splitter -#include "line_splitter.hh" -#include "../../legacy/Util2.h" - -namespace Moses2 -{ - -class QueryEngine -{ - std::map source_vocabids; - - typedef std::vector Alignments; - std::vector alignColl; - - Table table; - char *mem; //Memory for the table, necessary so that we can correctly destroy the object - - size_t table_filesize; - bool is_reordering; - - void read_alignments(const std::string &alignPath); - -public: - int num_scores; - int num_lex_scores; - bool logProb; - - QueryEngine(const char *); - ~QueryEngine(); - - 
std::pair query(uint64_t key); - - const std::map &getSourceVocab() const - { return source_vocabids; } - - const std::vector &getAlignments() const - { return alignColl; } - - uint64_t getKey(uint64_t source_phrase[], size_t size) const; - - template - inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const - { - boost::unordered_map::const_iterator iter = keyValue.find(sought); - if (iter == keyValue.end()) { - return false; - } - - const std::string &foundStr = iter->second; - found = Scan(foundStr); - return true; - } - -}; - -} - diff --git a/contrib/moses2/TranslationModel/ProbingPT/querying.cpp b/contrib/moses2/TranslationModel/ProbingPT/querying.cpp new file mode 100644 index 000000000..fb8ccef9a --- /dev/null +++ b/contrib/moses2/TranslationModel/ProbingPT/querying.cpp @@ -0,0 +1,143 @@ +#include "querying.hh" +#include "util/exception.hh" +#include "../../legacy/Util2.h" + +using namespace std; + +namespace Moses2 +{ + +QueryEngine::QueryEngine(const char * filepath) +{ + + //Create filepaths + std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; + std::string path_to_hashtable = basepath + "/probing_hash.dat"; + std::string path_to_source_vocabid = basepath + "/source_vocabids"; + std::string alignPath = basepath + "/Alignments.dat"; + + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + + ///Source phrase vocabids + read_map(source_vocabids, path_to_source_vocabid.c_str()); + + // alignments + read_alignments(alignPath); + + //Read config file + boost::unordered_map keyValue; + + std::ifstream config(path_to_config.c_str()); + std::string line; + while (getline(config, line)) { + std::vector toks = Moses2::Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); + keyValue[ toks[0] ] = toks[1]; + } + + bool found; + //Check API version: + int version; + found = 
Get(keyValue, "API_VERSION", version); + if (!found) { + std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; + } + else if (version != API_VERSION) { + std::cerr << "The ProbingPT API has changed. " << version << "!=" + << API_VERSION << " Please rebinarize your phrase tables." << std::endl; + exit(EXIT_FAILURE); + } + + //Get tablesize. + int tablesize; + found = Get(keyValue, "uniq_entries", tablesize); + if (!found) { + std::cerr << "uniq_entries not found" << std::endl; + exit(EXIT_FAILURE); + } + + //Number of scores + found = Get(keyValue, "num_scores", num_scores); + if (!found) { + std::cerr << "num_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + //How may scores from lex reordering models + found = Get(keyValue, "num_lex_scores", num_lex_scores); + if (!found) { + std::cerr << "num_lex_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + // have the scores been log() and FloorScore()? + found = Get(keyValue, "log_prob", logProb); + if (!found) { + std::cerr << "logProb not found" << std::endl; + exit(EXIT_FAILURE); + } + + config.close(); + + //Read hashtable + table_filesize = Table::Size(tablesize, 1.2); + mem = readTable(path_to_hashtable.c_str(), table_filesize); + Table table_init(mem, table_filesize); + table = table_init; + + std::cerr << "Initialized successfully! " << std::endl; +} + +QueryEngine::~QueryEngine() +{ + //Clear mmap content from memory. 
+ munmap(mem, table_filesize); + +} + +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + return Moses2::getKey(source_phrase, size); +} + +std::pair QueryEngine::query(uint64_t key) +{ + std::pair ret; + + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; + } + return ret; +} + +void QueryEngine::read_alignments(const std::string &alignPath) +{ + std::ifstream strm(alignPath.c_str()); + + string line; + while (getline(strm, line)) { + vector toks = Moses2::Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); + + uint32_t alignInd = Scan(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); + } + + Alignments &aligns = alignColl[alignInd]; + for (size_t i = 1; i < toks.size(); ++i) { + size_t pos = Scan(toks[i]); + aligns.push_back(pos); + } + } +} + +} + diff --git a/contrib/moses2/TranslationModel/ProbingPT/querying.hh b/contrib/moses2/TranslationModel/ProbingPT/querying.hh new file mode 100644 index 000000000..aae4b4f09 --- /dev/null +++ b/contrib/moses2/TranslationModel/ProbingPT/querying.hh @@ -0,0 +1,65 @@ +#pragma once + +#include +#include //For finding size of file +#include "vocabid.hh" +#include //toLower +#include +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "../../legacy/Util2.h" + +namespace Moses2 +{ + +class QueryEngine +{ + std::map source_vocabids; + + typedef std::vector Alignments; + std::vector alignColl; + + Table table; + char *mem; //Memory for the table, necessary so that we can correctly destroy the object + + size_t table_filesize; + bool is_reordering; + + void read_alignments(const std::string &alignPath); + +public: + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); + ~QueryEngine(); + + 
std::pair query(uint64_t key); + + const std::map &getSourceVocab() const + { return source_vocabids; } + + const std::vector &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template + inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan(foundStr); + return true; + } + +}; + +} + diff --git a/contrib/other-builds/moses/.project b/contrib/other-builds/moses/.project index c25eb5225..c6b7de6f7 100644 --- a/contrib/other-builds/moses/.project +++ b/contrib/other-builds/moses/.project @@ -3391,14 +3391,14 @@ PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/probing_hash_utils.hh - TranslationModel/ProbingPT/quering.cpp + TranslationModel/ProbingPT/querying.cpp 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.cpp + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.cpp - TranslationModel/ProbingPT/quering.hh + TranslationModel/ProbingPT/querying.hh 1 - PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/quering.hh + PARENT-3-PROJECT_LOC/moses/TranslationModel/ProbingPT/querying.hh TranslationModel/ProbingPT/storing.cpp diff --git a/moses/TranslationModel/ProbingPT/ProbingPT.cpp b/moses/TranslationModel/ProbingPT/ProbingPT.cpp index 06b1360cd..8b4505985 100644 --- a/moses/TranslationModel/ProbingPT/ProbingPT.cpp +++ b/moses/TranslationModel/ProbingPT/ProbingPT.cpp @@ -5,7 +5,7 @@ #include "moses/TargetPhraseCollection.h" #include "moses/InputFileStream.h" #include "moses/TranslationModel/CYKPlusParser/ChartRuleLookupManagerSkeleton.h" -#include "quering.hh" +#include "querying.hh" using namespace std; diff --git a/moses/TranslationModel/ProbingPT/quering.cpp b/moses/TranslationModel/ProbingPT/quering.cpp deleted file mode 100644 index 
52cd7f516..000000000 --- a/moses/TranslationModel/ProbingPT/quering.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include "quering.hh" -#include "util/exception.hh" - -using namespace std; - -namespace Moses -{ - -QueryEngine::QueryEngine(const char * filepath) -{ - - //Create filepaths - std::string basepath(filepath); - std::string path_to_config = basepath + "/config"; - std::string path_to_hashtable = basepath + "/probing_hash.dat"; - std::string path_to_source_vocabid = basepath + "/source_vocabids"; - std::string alignPath = basepath + "/Alignments.dat"; - - if (!FileExists(path_to_config)) { - UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); - } - - ///Source phrase vocabids - read_map(source_vocabids, path_to_source_vocabid.c_str()); - - // alignments - read_alignments(alignPath); - - //Read config file - boost::unordered_map keyValue; - - std::ifstream config(path_to_config.c_str()); - std::string line; - while (getline(config, line)) { - std::vector toks = Tokenize(line, "\t"); - UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); - keyValue[ toks[0] ] = toks[1]; - } - - bool found; - //Check API version: - int version; - found = Get(keyValue, "API_VERSION", version); - if (!found) { - std::cerr << "Old or corrupted version of ProbingPT. Please rebinarize your phrase tables." << std::endl; - } - else if (version != API_VERSION) { - std::cerr << "The ProbingPT API has changed. " << version << "!=" - << API_VERSION << " Please rebinarize your phrase tables." << std::endl; - exit(EXIT_FAILURE); - } - - //Get tablesize. 
- int tablesize; - found = Get(keyValue, "uniq_entries", tablesize); - if (!found) { - std::cerr << "uniq_entries not found" << std::endl; - exit(EXIT_FAILURE); - } - - //Number of scores - found = Get(keyValue, "num_scores", num_scores); - if (!found) { - std::cerr << "num_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - //How may scores from lex reordering models - found = Get(keyValue, "num_lex_scores", num_lex_scores); - if (!found) { - std::cerr << "num_lex_scores not found" << std::endl; - exit(EXIT_FAILURE); - } - - // have the scores been log() and FloorScore()? - found = Get(keyValue, "log_prob", logProb); - if (!found) { - std::cerr << "logProb not found" << std::endl; - exit(EXIT_FAILURE); - } - - config.close(); - - //Read hashtable - table_filesize = Table::Size(tablesize, 1.2); - mem = readTable(path_to_hashtable.c_str(), table_filesize); - Table table_init(mem, table_filesize); - table = table_init; - - std::cerr << "Initialized successfully! " << std::endl; -} - -QueryEngine::~QueryEngine() -{ - //Clear mmap content from memory. 
- munmap(mem, table_filesize); - -} - -uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const -{ - //TOO SLOW - //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); - return Moses::getKey(source_phrase, size); -} - -std::pair QueryEngine::query(uint64_t key) -{ - std::pair ret; - - const Entry * entry; - ret.first = table.Find(key, entry); - if (ret.first) { - ret.second = entry->value; - } - return ret; -} - -void QueryEngine::read_alignments(const std::string &alignPath) -{ - std::ifstream strm(alignPath.c_str()); - - string line; - while (getline(strm, line)) { - vector toks = Tokenize(line, "\t "); - UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); - - uint32_t alignInd = Scan(toks[0]); - if (alignInd >= alignColl.size()) { - alignColl.resize(alignInd + 1); - } - - Alignments &aligns = alignColl[alignInd]; - for (size_t i = 1; i < toks.size(); ++i) { - size_t pos = Scan(toks[i]); - aligns.push_back(pos); - } - } -} - -} - diff --git a/moses/TranslationModel/ProbingPT/quering.hh b/moses/TranslationModel/ProbingPT/quering.hh deleted file mode 100644 index c43c7f3b9..000000000 --- a/moses/TranslationModel/ProbingPT/quering.hh +++ /dev/null @@ -1,65 +0,0 @@ -#pragma once - -#include -#include //For finding size of file -#include "vocabid.hh" -#include //toLower -#include -#include "probing_hash_utils.hh" -#include "hash.hh" //Includes line splitter -#include "line_splitter.hh" -#include "moses//Util.h" - -namespace Moses -{ - -class QueryEngine -{ - std::map source_vocabids; - - typedef std::vector Alignments; - std::vector alignColl; - - Table table; - char *mem; //Memory for the table, necessary so that we can correctly destroy the object - - size_t table_filesize; - bool is_reordering; - - void read_alignments(const std::string &alignPath); - -public: - int num_scores; - int num_lex_scores; - bool logProb; - - QueryEngine(const char *); - ~QueryEngine(); - - std::pair query(uint64_t key); - - const 
std::map &getSourceVocab() const - { return source_vocabids; } - - const std::vector &getAlignments() const - { return alignColl; } - - uint64_t getKey(uint64_t source_phrase[], size_t size) const; - - template - inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const - { - boost::unordered_map::const_iterator iter = keyValue.find(sought); - if (iter == keyValue.end()) { - return false; - } - - const std::string &foundStr = iter->second; - found = Scan(foundStr); - return true; - } - -}; - -} - diff --git a/moses/TranslationModel/ProbingPT/querying.cpp b/moses/TranslationModel/ProbingPT/querying.cpp new file mode 100644 index 000000000..52cd7f516 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/querying.cpp @@ -0,0 +1,142 @@ +#include "querying.hh" +#include "util/exception.hh" + +using namespace std; + +namespace Moses +{ + +QueryEngine::QueryEngine(const char * filepath) +{ + + //Create filepaths + std::string basepath(filepath); + std::string path_to_config = basepath + "/config"; + std::string path_to_hashtable = basepath + "/probing_hash.dat"; + std::string path_to_source_vocabid = basepath + "/source_vocabids"; + std::string alignPath = basepath + "/Alignments.dat"; + + if (!FileExists(path_to_config)) { + UTIL_THROW2("Binary table doesn't exist is didn't finish binarizing: " << path_to_config); + } + + ///Source phrase vocabids + read_map(source_vocabids, path_to_source_vocabid.c_str()); + + // alignments + read_alignments(alignPath); + + //Read config file + boost::unordered_map keyValue; + + std::ifstream config(path_to_config.c_str()); + std::string line; + while (getline(config, line)) { + std::vector toks = Tokenize(line, "\t"); + UTIL_THROW_IF2(toks.size() != 2, "Wrong config format:" << line); + keyValue[ toks[0] ] = toks[1]; + } + + bool found; + //Check API version: + int version; + found = Get(keyValue, "API_VERSION", version); + if (!found) { + std::cerr << "Old or corrupted version of ProbingPT. 
Please rebinarize your phrase tables." << std::endl; + } + else if (version != API_VERSION) { + std::cerr << "The ProbingPT API has changed. " << version << "!=" + << API_VERSION << " Please rebinarize your phrase tables." << std::endl; + exit(EXIT_FAILURE); + } + + //Get tablesize. + int tablesize; + found = Get(keyValue, "uniq_entries", tablesize); + if (!found) { + std::cerr << "uniq_entries not found" << std::endl; + exit(EXIT_FAILURE); + } + + //Number of scores + found = Get(keyValue, "num_scores", num_scores); + if (!found) { + std::cerr << "num_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + //How may scores from lex reordering models + found = Get(keyValue, "num_lex_scores", num_lex_scores); + if (!found) { + std::cerr << "num_lex_scores not found" << std::endl; + exit(EXIT_FAILURE); + } + + // have the scores been log() and FloorScore()? + found = Get(keyValue, "log_prob", logProb); + if (!found) { + std::cerr << "logProb not found" << std::endl; + exit(EXIT_FAILURE); + } + + config.close(); + + //Read hashtable + table_filesize = Table::Size(tablesize, 1.2); + mem = readTable(path_to_hashtable.c_str(), table_filesize); + Table table_init(mem, table_filesize); + table = table_init; + + std::cerr << "Initialized successfully! " << std::endl; +} + +QueryEngine::~QueryEngine() +{ + //Clear mmap content from memory. 
+ munmap(mem, table_filesize); + +} + +uint64_t QueryEngine::getKey(uint64_t source_phrase[], size_t size) const +{ + //TOO SLOW + //uint64_t key = util::MurmurHashNative(&source_phrase[0], source_phrase.size()); + return Moses::getKey(source_phrase, size); +} + +std::pair QueryEngine::query(uint64_t key) +{ + std::pair ret; + + const Entry * entry; + ret.first = table.Find(key, entry); + if (ret.first) { + ret.second = entry->value; + } + return ret; +} + +void QueryEngine::read_alignments(const std::string &alignPath) +{ + std::ifstream strm(alignPath.c_str()); + + string line; + while (getline(strm, line)) { + vector toks = Tokenize(line, "\t "); + UTIL_THROW_IF2(toks.size() == 0, "Corrupt alignment file"); + + uint32_t alignInd = Scan(toks[0]); + if (alignInd >= alignColl.size()) { + alignColl.resize(alignInd + 1); + } + + Alignments &aligns = alignColl[alignInd]; + for (size_t i = 1; i < toks.size(); ++i) { + size_t pos = Scan(toks[i]); + aligns.push_back(pos); + } + } +} + +} + diff --git a/moses/TranslationModel/ProbingPT/querying.hh b/moses/TranslationModel/ProbingPT/querying.hh new file mode 100644 index 000000000..c43c7f3b9 --- /dev/null +++ b/moses/TranslationModel/ProbingPT/querying.hh @@ -0,0 +1,65 @@ +#pragma once + +#include +#include //For finding size of file +#include "vocabid.hh" +#include //toLower +#include +#include "probing_hash_utils.hh" +#include "hash.hh" //Includes line splitter +#include "line_splitter.hh" +#include "moses//Util.h" + +namespace Moses +{ + +class QueryEngine +{ + std::map source_vocabids; + + typedef std::vector Alignments; + std::vector alignColl; + + Table table; + char *mem; //Memory for the table, necessary so that we can correctly destroy the object + + size_t table_filesize; + bool is_reordering; + + void read_alignments(const std::string &alignPath); + +public: + int num_scores; + int num_lex_scores; + bool logProb; + + QueryEngine(const char *); + ~QueryEngine(); + + std::pair query(uint64_t key); + + const 
std::map &getSourceVocab() const + { return source_vocabids; } + + const std::vector &getAlignments() const + { return alignColl; } + + uint64_t getKey(uint64_t source_phrase[], size_t size) const; + + template + inline bool Get(const boost::unordered_map &keyValue, const std::string &sought, T &found) const + { + boost::unordered_map::const_iterator iter = keyValue.find(sought); + if (iter == keyValue.end()) { + return false; + } + + const std::string &foundStr = iter->second; + found = Scan(foundStr); + return true; + } + +}; + +} + -- cgit v1.2.3