From 774ed64f2e06c39d95f73192754c208d7bb53599 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Fri, 15 Feb 2013 13:06:54 -0500
Subject: Work to allow output search graph in HTK standard lattice format

---
 moses-cmd/IOWrapper.cpp |  9 ++++++
 moses-cmd/IOWrapper.h   |  2 ++
 moses-cmd/Main.cpp      | 31 ++++++++++++++++++--
 moses/Manager.cpp       | 77 ++++++++++++++++++++++++++++++++++++++++++++++++-
 moses/Manager.h         |  1 +
 moses/Parameter.cpp     |  1 +
 moses/StaticData.cpp    |  6 +++-
 moses/StaticData.h      |  4 +++
 8 files changed, 126 insertions(+), 5 deletions(-)

diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index f11516839..451c53ae0 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -189,6 +189,15 @@ InputType*IOWrapper::GetInput(InputType* inputType)
   }
 }
 
+  ofstream* IOWrapper::GetOutputSearchGraphSLFStream(size_t sentenceNumber) {
+    // Opens "<output-search-graph-slf value>/<sentenceNumber>.slf" for writing. The stream is
+    // heap-allocated and returned even if the open failed; the caller must check and delete it.
+    stringstream slfPath;
+    slfPath << StaticData::Instance().GetParam("output-search-graph-slf")[0] << "/" << sentenceNumber << ".slf";
+    std::ofstream *file = new std::ofstream(slfPath.str().c_str());
+    return file;
+  }
+
 /***
  * print surface factor only for the given phrase
  */
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index 8f164dfb3..1fdc1c6e4 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -117,6 +117,8 @@ public:
     return *m_outputSearchGraphStream;
   }
 
+  std::ofstream *GetOutputSearchGraphSLFStream(size_t sentenceNumber);
+
   std::ostream &GetDetailedTranslationReportingStream() {
     assert (m_detailedTranslationReportingStream);
     return *m_detailedTranslationReportingStream;
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index ac4527aae..16754b2fb 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -83,14 +83,16 @@ public:
                   OutputCollector* wordGraphCollector, OutputCollector* searchGraphCollector,
                   OutputCollector* detailedTranslationCollector,
                   OutputCollector* alignmentInfoCollector,
-                  OutputCollector* unknownsCollector) :
+                  OutputCollector* unknownsCollector,
+                  std::ofstream* searchGraphSLFStream) :
     m_source(source), m_lineNumber(lineNumber),
     m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
     m_latticeSamplesCollector(latticeSamplesCollector),
     m_wordGraphCollector(wordGraphCollector), m_searchGraphCollector(searchGraphCollector),
     m_detailedTranslationCollector(detailedTranslationCollector),
     m_alignmentInfoCollector(alignmentInfoCollector),
-    m_unknownsCollector(unknownsCollector) {}
+    m_unknownsCollector(unknownsCollector),
+    m_searchGraphSLFStream(searchGraphSLFStream) {}
 
 	/** Translate one sentence
    * gets called by main function implemented at end of this source file */
@@ -143,6 +145,19 @@ public:
 #endif
     }		
 
+    // Output search graph in HTK standard lattice format (SLF)
+    if (m_searchGraphSLFStream) {
+      if (m_searchGraphSLFStream->is_open() && m_searchGraphSLFStream->good()) {
+	ostringstream out;
+	fix(out,PRECISION);
+	manager.OutputSearchGraphAsSLF(m_lineNumber, out);
+	*m_searchGraphSLFStream << out.str();
+	m_searchGraphSLFStream -> flush();
+      } else {
+	TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
+      }
+    }
+
     // apply decision rule and output best translation(s)
     if (m_outputCollector) {
       ostringstream out;
@@ -297,7 +312,14 @@ public:
   }
 
   ~TranslationTask() {
+   
+    if (m_searchGraphSLFStream) {
+      m_searchGraphSLFStream->close();
+    }
+
+    delete m_searchGraphSLFStream;
     delete m_source;
+    
   }
 
 private:
@@ -311,6 +333,7 @@ private:
   OutputCollector* m_detailedTranslationCollector;
   OutputCollector* m_alignmentInfoCollector;
   OutputCollector* m_unknownsCollector;
+  std::ofstream  *m_searchGraphSLFStream;
   std::ofstream *m_alignmentStream;
 
 
@@ -533,7 +556,9 @@ int main(int argc, char** argv)
                             searchGraphCollector.get(),
                             detailedTranslationCollector.get(),
                             alignmentInfoCollector.get(),
-                            unknownsCollector.get() );
+                            unknownsCollector.get(),
+			    staticData.GetOutputSearchGraphSLF() ? 
+			    ioWrapper->GetOutputSearchGraphSLFStream(lineCount) : NULL);
       // execute task
 #ifdef WITH_THREADS
     pool.Submit(task);
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 468db0de3..c80bd59e4 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -26,8 +26,10 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #endif
 
 #include <algorithm>
-#include <limits>
 #include <cmath>
+#include <limits>
+#include <map>
+#include <set>
 #include "Manager.h"
 #include "TypeDef.h"
 #include "Util.h"
@@ -628,6 +630,79 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
 
 }
 
+/**! Output search graph in HTK standard lattice format (SLF) */
+void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
+{
+  // Writes an SLF header (UTTERANCE, VERSION, base, NODES, LINKS), then one "J=" link line for every search-graph entry that has a previous hypothesis.
+  vector<SearchGraphNode> searchGraph;
+  GetSearchGraph(searchGraph);
+  // Two passes over searchGraph follow: the first numbers the nodes and counts the links, the second writes the link lines.
+  long numArcs = 0; // printed as LINKS=
+  long numNodes = 0; // next unused node number; printed as NODES=
+  // NOTE(review): the end node is numbered numNodes, so node numbers run 0..numNodes while NODES= prints numNodes -- confirm the intended count.
+  map<int,int> nodes; // hypothesis id -> SLF node number (id 0 is the start node)
+  set<int> terminalNodes; // node numbers that already have a link to the end node
+
+  // Unique start node
+  nodes[0] = 0;
+  numNodes += 1;
+
+  for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
+
+    numArcs += 1;
+
+    int hypothesisID = searchGraph[arcNumber].hypo->GetId();
+    if (nodes.count(hypothesisID) == 0) {
+      nodes[hypothesisID] = numNodes;
+      numNodes += 1;
+
+      bool terminalNode = (searchGraph[arcNumber].forward == -1);
+      if (terminalNode) {
+	numArcs += 1; // the extra link from this terminal node to the end node
+      }
+    }
+
+  }
+
+  // Unique end node
+  nodes[numNodes] = numNodes;
+
+  outputSearchGraphStream << "UTTERANCE=\"Sentence " << translationId << "\"" << endl;
+  outputSearchGraphStream << "VERSION=1.1" << endl;
+  outputSearchGraphStream << "base=e" << endl;
+  outputSearchGraphStream << "NODES=" << numNodes << endl;
+  outputSearchGraphStream << "LINKS=" << numArcs  << endl;
+
+  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+
+  for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
+    const Hypothesis *thisHypo = searchGraph[arcNumber].hypo;
+    const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
+    if (prevHypo) {
+
+      int startNode = nodes[prevHypo->GetId()];
+      int endNode   = nodes[thisHypo->GetId()];
+      bool terminalNode = (searchGraph[arcNumber].forward == -1);
+
+      outputSearchGraphStream <<  "J="   << arcNumber 
+			      << " S="   << startNode
+			      << " E="   << endNode
+			      << " W=\"" << thisHypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\""
+			      << endl;
+
+      if (terminalNode && terminalNodes.count(endNode) == 0) {
+	terminalNodes.insert(endNode);
+	outputSearchGraphStream <<  "J="   << arcNumber // NOTE(review): same J value as the link written just above -- confirm link ids may repeat
+				<< " S="   << endNode
+				<< " E="   << numNodes
+				<< endl;
+
+      }
+    }	    
+  }
+
+}
+
 void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
                       const SearchGraphNode& searchNode)
 {
diff --git a/moses/Manager.h b/moses/Manager.h
index dd011bc84..0ae7cd6f1 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -137,6 +137,7 @@ public:
 #endif
 
   void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
+  void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
   void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
   const InputType& GetSource() const {
     return m_source;
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 103277d34..876cbd224 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -130,6 +130,7 @@ Parameter::Parameter()
   AddParam("output-search-graph", "osg", "Output connected hypotheses of search into specified filename");
   AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
   AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
+  AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
   AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
 #ifdef HAVE_PROTOBUF
   AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index d056dc78e..1d9d4907c 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -235,8 +235,12 @@ bool StaticData::LoadData(Parameter *parameter)
     }
     m_outputSearchGraph = true;
     m_outputSearchGraphExtended = true;
-  } else
+  } else {
     m_outputSearchGraph = false;
+  }
+  if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
+    m_outputSearchGraphSLF = true;
+  }
 #ifdef HAVE_PROTOBUF
   if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
     if (m_parameter->GetParam("output-search-graph-pb").size() != 1) {
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 448f1a4e7..d644e59f7 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -216,6 +216,7 @@ protected:
   bool m_outputWordGraph; //! whether to output word graph
   bool m_outputSearchGraph; //! whether to output search graph
   bool m_outputSearchGraphExtended; //! ... in extended format
+  bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF)
 #ifdef HAVE_PROTOBUF
   bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
 #endif
@@ -631,6 +632,9 @@ public:
   bool GetOutputSearchGraphExtended() const {
     return m_outputSearchGraphExtended;
   }
+  bool GetOutputSearchGraphSLF() const {
+    return m_outputSearchGraphSLF;
+  }
 #ifdef HAVE_PROTOBUF
   bool GetOutputSearchGraphPB() const {
     return m_outputSearchGraphPB;
-- 
cgit v1.2.3


From e106e04dc3c3fe609f82780cbd8286d042a5e47d Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Fri, 15 Feb 2013 15:49:26 -0500
Subject: More work on outputting HTK SLF.

Now, each arc emits exactly one word.
---
 moses/Manager.cpp | 55 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 40 insertions(+), 15 deletions(-)

diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index c80bd59e4..39eb7f917 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -645,16 +645,19 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
 
   // Unique start node
   nodes[0] = 0;
-  numNodes += 1;
+  //  numNodes += 1;
 
   for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
 
-    numArcs += 1;
+    int targetWordCount = searchGraph[arcNumber].hypo->GetCurrTargetPhrase().GetSize();
+    numArcs += targetWordCount;
 
     int hypothesisID = searchGraph[arcNumber].hypo->GetId();
     if (nodes.count(hypothesisID) == 0) {
+      
+      numNodes += targetWordCount;
       nodes[hypothesisID] = numNodes;
-      numNodes += 1;
+      //numNodes += 1;
 
       bool terminalNode = (searchGraph[arcNumber].forward == -1);
       if (terminalNode) {
@@ -663,32 +666,54 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
     }
 
   }
+  numNodes += 1;
 
   // Unique end node
   nodes[numNodes] = numNodes;
 
-  outputSearchGraphStream << "UTTERANCE=\"Sentence " << translationId << "\"" << endl;
+  outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
   outputSearchGraphStream << "VERSION=1.1" << endl;
   outputSearchGraphStream << "base=e" << endl;
-  outputSearchGraphStream << "NODES=" << numNodes << endl;
+  outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
   outputSearchGraphStream << "LINKS=" << numArcs  << endl;
 
-  const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
+  // const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
 
-  for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
-    const Hypothesis *thisHypo = searchGraph[arcNumber].hypo;
+  for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
+    const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
     const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
     if (prevHypo) {
 
       int startNode = nodes[prevHypo->GetId()];
       int endNode   = nodes[thisHypo->GetId()];
-      bool terminalNode = (searchGraph[arcNumber].forward == -1);
+      bool terminalNode = (searchGraph[lineNumber].forward == -1);
+      const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
+      int targetWordCount = targetPhrase.GetSize();
+
+      for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
+      // for (int startNode = nodes[prevHypo->GetId()] - targetWordCount + 1,
+      // 	     nextNode = startNode + 1;
+      // 	   nextNode < endNode; startNode+=1, nextNode+=1) {
+	int x = (targetWordCount-targetWordIndex);
+
+	outputSearchGraphStream <<  "J=" << arcNumber;
+	// outputSearchGraphStream <<  " startNode=" << startNode;
+	// outputSearchGraphStream <<  " endNode=" << endNode;
+	// outputSearchGraphStream <<  " targetWordCount=" << targetWordCount;
+	// outputSearchGraphStream <<  " targetWordIndex=" << targetWordIndex;
+
+	if (targetWordIndex==0) {
+	  outputSearchGraphStream << " S=" << startNode;
+	} else {
+	  outputSearchGraphStream << " S=" << endNode - x;
+	}
+
+	outputSearchGraphStream << " E=" << endNode - (x-1) //(startNode + targetWordIndex + 1)
+				<< " W=" << targetPhrase.GetWord(targetWordIndex)
+				<< endl;
 
-      outputSearchGraphStream <<  "J="   << arcNumber 
-			      << " S="   << startNode
-			      << " E="   << endNode
-			      << " W=\"" << thisHypo->GetCurrTargetPhrase().GetStringRep(outputFactorOrder) << "\""
-			      << endl;
+	arcNumber += 1;
+      }
 
       if (terminalNode && terminalNodes.count(endNode) == 0) {
 	terminalNodes.insert(endNode);
@@ -696,7 +721,7 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
 				<< " S="   << endNode
 				<< " E="   << numNodes
 				<< endl;
-
+	arcNumber += 1;
       }
     }	    
   }
-- 
cgit v1.2.3


From e7563111de02c5e39ff297e58641b612ff02fb4b Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Wed, 20 Feb 2013 11:03:23 -0500
Subject: More work on outputting HTK lattice format

---
 moses/Manager.cpp | 149 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 moses/Manager.h   |   7 ++-
 2 files changed, 150 insertions(+), 6 deletions(-)

diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 39eb7f917..ce214c414 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -53,12 +53,12 @@ using namespace std;
 namespace Moses
 {
 Manager::Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system)
-  :m_lineNumber(lineNumber)
-  ,m_system(system)
+  :m_system(system)
   ,m_transOptColl(source.CreateTranslationOptionCollection(system))
   ,m_search(Search::CreateSearch(*this, source, searchAlgorithm, *m_transOptColl))
   ,interrupted_flag(0)
   ,m_hypoId(0)
+  ,m_lineNumber(lineNumber)
   ,m_source(source)
 {
   m_system->InitializeBeforeSentenceProcessing(source);
@@ -630,6 +630,140 @@ void Manager::GetSearchGraph(vector<SearchGraphNode>& searchGraph) const
 
 }
 
+void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const
+{
+  outputSearchGraphStream.setf(std::ios::fixed); // fixed notation and 6 decimals; both settings
+  outputSearchGraphStream.precision(6);          // remain on the caller's stream after return
+  // Writes the weights of the default translation system's feature functions, one producer at a time.
+  const StaticData& staticData = StaticData::Instance();
+  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+  const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
+  const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+  size_t featureIndex = 1; // number for the next weight; every call below returns the following free number
+  for (size_t i = 0; i < sff.size(); ++i) { // stateful feature functions come first
+    featureIndex = OutputFeatureWeightsForSLF(featureIndex, sff[i], outputSearchGraphStream);
+  }
+  for (size_t i = 0; i < slf.size(); ++i) { // then stateless ones, except weight short names u, tm, I, g
+    if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+          slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+          slf[i]->GetScoreProducerWeightShortName() != "I" &&
+          slf[i]->GetScoreProducerWeightShortName() != "g")
+    { // NOTE(review): presumably tm and g are skipped because the dictionary loops below cover them -- confirm
+      featureIndex = OutputFeatureWeightsForSLF(featureIndex, slf[i], outputSearchGraphStream);
+    }
+  }
+  const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+  for( size_t i=0; i<pds.size(); i++ ) {
+    featureIndex = OutputFeatureWeightsForSLF(featureIndex, pds[i], outputSearchGraphStream);
+  }
+  const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+  for( size_t i=0; i<gds.size(); i++ ) {
+    featureIndex = OutputFeatureWeightsForSLF(featureIndex, gds[i], outputSearchGraphStream);
+  }
+
+}
+
+
+void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
+{
+  outputSearchGraphStream.setf(std::ios::fixed); // fixed notation and 6 decimals; both settings
+  outputSearchGraphStream.precision(6);          // remain on the caller's stream after return
+
+  // Appends the feature values of hypo to the stream, visiting the feature functions of the
+  // default translation system in this order: stateful, stateless (except weight short names
+  // u, tm, I, g), phrase dictionaries, generation dictionaries. When zeros is true the
+  // per-producer overload writes 0.0 for every value instead of the hypothesis' score.
+
+  const StaticData& staticData = StaticData::Instance();
+  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+  const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
+  const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+  size_t featureIndex = 1; // number for the next value; every call below returns the following free number
+  for (size_t i = 0; i < sff.size(); ++i) {
+    featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, sff[i], outputSearchGraphStream);
+  }
+  for (size_t i = 0; i < slf.size(); ++i) {
+    if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+          slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+          slf[i]->GetScoreProducerWeightShortName() != "I" &&
+          slf[i]->GetScoreProducerWeightShortName() != "g")
+    { // same four short names are skipped as in OutputFeatureWeightsForSLF, keeping both traversals in the same order
+      featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, slf[i], outputSearchGraphStream);
+    }
+  }
+  const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+  for( size_t i=0; i<pds.size(); i++ ) {
+    featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, pds[i], outputSearchGraphStream);
+  }
+  const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+  for( size_t i=0; i<gds.size(); i++ ) {
+    featureIndex = OutputFeatureValuesForSLF(featureIndex, zeros, hypo, gds[i], outputSearchGraphStream);
+  }
+
+}
+
+
+size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
+{
+  // For each of ff's score components writes a "# <description> <weight short name> <k> of <n>" line
+  // followed by "x<index+k-1>scale=<weight>"; returns the index the next producer should start at.
+  size_t numScoreComps = ff->GetNumScoreComponents();
+  if (numScoreComps != ScoreProducer::unlimited) {
+    vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+    for (size_t i = 0; i < numScoreComps; ++i) {
+      outputSearchGraphStream << "# " << ff->GetScoreProducerDescription() << " " << ff->GetScoreProducerWeightShortName()
+                              << " " << (i+1) << " of " << numScoreComps << endl << "x" << (index+i) << "scale=" << values[i] << endl;
+    }
+    return index+numScoreComps;
+  } else {
+    cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
+    assert(false); // compiled out when NDEBUG is defined, so the return below must still be sane
+    return index;  // skip this producer but keep the caller's running index (0 would restart the numbering)
+  }
+}
+
+size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
+{
+
+  // Appends the scores that hypo holds for one feature function to the stream as fields
+  // of the form "x<N>=<value> " (each field is followed by a single space).
+  //
+  //   index  - N used for the first score of ff; later scores use index+1, index+2, ...
+  //   zeros  - when true, 0.0 is written in place of every score
+  //   hypo   - hypothesis whose score breakdown is read
+  //   ff     - feature function whose scores are selected from that breakdown
+  //   outputSearchGraphStream - stream the fields are appended to
+  //
+  // Returns index plus the number of scores written.
+    
+  // The number of fields is whatever GetScoresForProducer(ff) returns for this hypothesis. The
+  // weights overload instead takes its count from ff->GetNumScoreComponents() and rejects
+  // ScoreProducer::unlimited; that check is commented out here.
+  // NOTE(review): the x<N> numbers of the two overloads only line up if both counts agree
+  // for every producer -- confirm.
+      
+  // Nothing is written for a producer with no scores; index is then returned unchanged.
+  // Number formatting is whatever the caller has already set on the stream.
+
+  // NOTE(review): the caller passes zeros=true for every word of a phrase after the first,
+  // presumably so that a hypothesis' scores are counted only once per phrase -- confirm.
+  const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); 
+
+  vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
+  size_t numScoreComps = featureValues.size(); // count comes from the hypothesis' scores, not from ff
+  // The ScoreProducer::unlimited (sparse feature) guard is disabled in this overload;
+  // its else-branch is kept, commented out, below the return.
+  for (size_t i = 0; i < numScoreComps; ++i) {
+    outputSearchGraphStream << "x"  << (index+i) << "=" << ((zeros) ? 0.0 : featureValues[i]) << " ";
+    }
+    return index+numScoreComps;
+  // } else {
+  //   cerr << "Sparse features are not supported when outputting HTK standard lattice format" << endl;
+  //   assert(false);
+  //   return 0;
+  // }
+}
+
 /**! Output search graph in HTK standard lattice format (SLF) */
 void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
 {
@@ -673,10 +807,12 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
 
   outputSearchGraphStream << "UTTERANCE=Sentence_" << translationId << endl;
   outputSearchGraphStream << "VERSION=1.1" << endl;
-  outputSearchGraphStream << "base=e" << endl;
+  outputSearchGraphStream << "base=2.71828182845905" << endl;
   outputSearchGraphStream << "NODES=" << (numNodes+1) << endl;
   outputSearchGraphStream << "LINKS=" << numArcs  << endl;
 
+  OutputFeatureWeightsForSLF(outputSearchGraphStream);
+
   // const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
 
   for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
@@ -709,8 +845,11 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
 	}
 
 	outputSearchGraphStream << " E=" << endNode - (x-1) //(startNode + targetWordIndex + 1)
-				<< " W=" << targetPhrase.GetWord(targetWordIndex)
-				<< endl;
+				<< " W=" << targetPhrase.GetWord(targetWordIndex);
+
+	OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
+
+	outputSearchGraphStream  << endl;
 
 	arcNumber += 1;
       }
diff --git a/moses/Manager.h b/moses/Manager.h
index 0ae7cd6f1..c5f54847b 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -93,6 +93,11 @@ class Manager
   Manager(Manager const&);
   void operator=(Manager const&);
   const TranslationSystem* m_system;
+private:
+  void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const;
+  size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+  void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const;
+  size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
 protected:
   // data
 //	InputType const& m_source; /**< source sentence to be translated */
@@ -103,6 +108,7 @@ protected:
   size_t interrupted_flag;
   std::auto_ptr<SentenceStats> m_sentenceStats;
   int m_hypoId; //used to number the hypos as they are created.
+  size_t m_lineNumber;
 
   void GetConnectedGraph(
     std::map< int, bool >* pConnected,
@@ -113,7 +119,6 @@ protected:
 
 
 public:
-  size_t m_lineNumber;
   InputType const& m_source; /**< source sentence to be translated */
   Manager(size_t lineNumber, InputType const& source, SearchAlgorithm searchAlgorithm, const TranslationSystem* system);
   ~Manager();
-- 
cgit v1.2.3


From 04f107fbb02442638928c190dd3fa2f13225d570 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Fri, 22 Feb 2013 12:24:35 -0500
Subject: Add flag to output search graph in Kenneth's hypergraph format.

---
 moses-cmd/IOWrapper.cpp |   9 +++
 moses-cmd/IOWrapper.h   |   1 +
 moses-cmd/Main.cpp      |  24 +++++-
 moses/Manager.cpp       | 204 ++++++++++++++++++++++++++++++++++++++++++++++++
 moses/Manager.h         |  11 +++
 moses/Parameter.cpp     |   1 +
 moses/StaticData.cpp    |   7 ++
 moses/StaticData.h      |   4 +
 8 files changed, 258 insertions(+), 3 deletions(-)

diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index 451c53ae0..7c27476d1 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -198,6 +198,15 @@ InputType*IOWrapper::GetInput(InputType* inputType)
     return file;
   }
 
+  ofstream* IOWrapper::GetOutputSearchGraphHypergraphStream(size_t sentenceNumber) {
+    // Opens "<output-search-graph-hypergraph value>/<sentenceNumber>" for writing. The stream is
+    // heap-allocated and returned even if the open failed; the caller must check and delete it.
+    stringstream hypergraphPath;
+    hypergraphPath << StaticData::Instance().GetParam("output-search-graph-hypergraph")[0] << "/" << sentenceNumber;
+    std::ofstream *file = new std::ofstream(hypergraphPath.str().c_str());
+    return file;
+  }
+
 /***
  * print surface factor only for the given phrase
  */
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index 1fdc1c6e4..044e71491 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -118,6 +118,7 @@ public:
   }
 
   std::ofstream *GetOutputSearchGraphSLFStream(size_t sentenceNumber);
+  std::ofstream *GetOutputSearchGraphHypergraphStream(size_t sentenceNumber);
 
   std::ostream &GetDetailedTranslationReportingStream() {
     assert (m_detailedTranslationReportingStream);
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 16754b2fb..afadddabf 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -84,7 +84,8 @@ public:
                   OutputCollector* detailedTranslationCollector,
                   OutputCollector* alignmentInfoCollector,
                   OutputCollector* unknownsCollector,
-                  std::ofstream* searchGraphSLFStream) :
+                  std::ofstream* searchGraphSLFStream,
+		  std::ofstream* searchGraphHypergraphStream) :
     m_source(source), m_lineNumber(lineNumber),
     m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
     m_latticeSamplesCollector(latticeSamplesCollector),
@@ -92,7 +93,8 @@ public:
     m_detailedTranslationCollector(detailedTranslationCollector),
     m_alignmentInfoCollector(alignmentInfoCollector),
     m_unknownsCollector(unknownsCollector),
-    m_searchGraphSLFStream(searchGraphSLFStream) {}
+    m_searchGraphSLFStream(searchGraphSLFStream),
+    m_searchGraphHypergraphStream(searchGraphHypergraphStream) {}
 
 	/** Translate one sentence
    * gets called by main function implemented at end of this source file */
@@ -158,6 +160,19 @@ public:
       }
     }
 
+    // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
+    if (m_searchGraphHypergraphStream) {
+      if (m_searchGraphHypergraphStream->is_open() && m_searchGraphHypergraphStream->good()) {
+	ostringstream out;
+	fix(out,PRECISION);
+	manager.OutputSearchGraphAsHypergraph(m_lineNumber, out);
+	*m_searchGraphHypergraphStream << out.str();
+	m_searchGraphHypergraphStream -> flush();
+      } else {
+	TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
+      }
+    }
+
     // apply decision rule and output best translation(s)
     if (m_outputCollector) {
       ostringstream out;
@@ -334,6 +349,7 @@ private:
   OutputCollector* m_alignmentInfoCollector;
   OutputCollector* m_unknownsCollector;
   std::ofstream  *m_searchGraphSLFStream;
+  std::ofstream  *m_searchGraphHypergraphStream;
   std::ofstream *m_alignmentStream;
 
 
@@ -558,7 +574,9 @@ int main(int argc, char** argv)
                             alignmentInfoCollector.get(),
                             unknownsCollector.get(),
 			    staticData.GetOutputSearchGraphSLF() ? 
-			    ioWrapper->GetOutputSearchGraphSLFStream(lineCount) : NULL);
+			    ioWrapper->GetOutputSearchGraphSLFStream(lineCount) : NULL,
+			    staticData.GetOutputSearchGraphHypergraph() ? 
+			    ioWrapper->GetOutputSearchGraphHypergraphStream(lineCount) : NULL);
       // execute task
 #ifdef WITH_THREADS
     pool.Submit(task);
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index ce214c414..21f116f42 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -663,6 +663,39 @@ void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream)
 
 }
 
+void Manager::OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const
+{
+  outputSearchGraphStream.setf(std::ios::fixed); // fixed notation and 6 decimals; both settings
+  outputSearchGraphStream.precision(6);          // remain on the caller's stream after return
+  // Hands every feature function of the default translation system to the per-producer overload, in a fixed order.
+  const StaticData& staticData = StaticData::Instance();
+  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+  const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
+  const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+  size_t featureIndex = 1; // starting index; each call below returns the index to pass to the next call
+  for (size_t i = 0; i < sff.size(); ++i) { // stateful feature functions come first
+    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
+  }
+  for (size_t i = 0; i < slf.size(); ++i) { // then stateless ones, except weight short names u, tm, I, g
+    if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+          slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+          slf[i]->GetScoreProducerWeightShortName() != "I" &&
+          slf[i]->GetScoreProducerWeightShortName() != "g")
+    { // NOTE(review): presumably tm and g are skipped because the dictionary loops below cover them -- confirm
+      featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
+    }
+  }
+  const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+  for( size_t i=0; i<pds.size(); i++ ) {
+    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
+  }
+  const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+  for( size_t i=0; i<gds.size(); i++ ) {
+    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
+  }
+
+}
+
 
 void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
 {
@@ -702,6 +735,39 @@ void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std:
 
 }
 
+void Manager::OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const
+{
+  outputSearchGraphStream.setf(std::ios::fixed);
+  outputSearchGraphStream.precision(6);
+  // Writes hypo's feature scores in fixed notation with 6 decimals: stateful features, then stateless ones, then phrase and generation dictionaries.
+  const StaticData& staticData = StaticData::Instance();
+  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+  const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
+  const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+  size_t featureIndex = 1; // running index handed to, and returned by, the per-feature overload
+  for (size_t i = 0; i < sff.size(); ++i) {
+    featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, sff[i], outputSearchGraphStream);
+  }
+  for (size_t i = 0; i < slf.size(); ++i) {
+    if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+          slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+          slf[i]->GetScoreProducerWeightShortName() != "I" &&
+          slf[i]->GetScoreProducerWeightShortName() != "g") // presumably "tm"/"g" are skipped because the dictionary loops below write them -- TODO confirm
+    {
+      featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, slf[i], outputSearchGraphStream);
+    }
+  }
+  const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+  for( size_t i=0; i<pds.size(); i++ ) {
+    featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, pds[i], outputSearchGraphStream);
+  }
+  const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+  for( size_t i=0; i<gds.size(); i++ ) {
+    featureIndex = OutputFeatureValuesForHypergraph(featureIndex, hypo, gds[i], outputSearchGraphStream);
+  }
+
+}
+
 
 size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
 {
@@ -722,6 +788,30 @@ size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction*
   }
 }
 
+size_t Manager::OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
+{
+  size_t numScoreComps = ff->GetNumScoreComponents();
+  if (numScoreComps != ScoreProducer::unlimited) {
+    vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+    if (numScoreComps > 1) {
+      for (size_t i = 0; i < numScoreComps; ++i) {
+	outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
+				<< i
+				<< "=" << values[i] << endl;
+      }
+    } else {
+	outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
+				<< "=" << values[0] << endl;
+    }
+    return index+numScoreComps;
+  } else {
+    cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
+    assert(false);
+    return 0;
+  }
+}
+
+
 size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
 {
 
@@ -764,6 +854,120 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
   // }
 }
 
+size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
+{
+  // Write the scores hypo holds for ff as "name=value " (or "nameI=value " per component) and return index advanced by the number of scores.
+  const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); 
+
+  vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
+  size_t numScoreComps = featureValues.size();
+  // An empty score vector writes nothing; reading featureValues[0] in that case would index past the end.
+  if (numScoreComps > 1) {
+    for (size_t i = 0; i < numScoreComps; ++i) {
+      outputSearchGraphStream << ff->GetScoreProducerWeightShortName()  << i << "=" << featureValues[i] << " ";
+    }
+  } else if (numScoreComps == 1) {
+    outputSearchGraphStream << ff->GetScoreProducerWeightShortName()  << "=" << featureValues[0] << " ";
+  }
+
+  return index+numScoreComps;
+}
+
+void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
+                      const SearchGraphNode& searchNode);
+/**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
+void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
+{
+  vector<SearchGraphNode> searchGraph;
+  GetSearchGraph(searchGraph);
+outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl;
+  //  long numArcs = 0;
+  long numNodes = 0;
+
+  map<int,int> nodes;
+  set<int> terminalNodes;
+  multimap<int,int> nodeToLines;
+
+  // Unique start node
+  //  nodes[0] = 0;
+  //numNodes += 1;
+  for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
+OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]);
+    // Record that this arc ends at this node
+    //    numArcs += 1;
+    nodeToLines.insert(pair<int,int>(numNodes,arcNumber));
+
+    int hypothesisID = searchGraph[arcNumber].hypo->GetId();
+    if (nodes.count(hypothesisID) == 0) {
+      
+      nodes[hypothesisID] = numNodes;
+      numNodes += 1;
+
+      bool terminalNode = (searchGraph[arcNumber].forward == -1);
+      if (terminalNode) {
+	terminalNodes.insert(numNodes);
+	//	numArcs += 1; // Final arc to end node, representing the end of the sentence </s>
+      }
+    }
+
+  }
+  
+  // Unique end node
+  nodes[numNodes] = numNodes;
+  numNodes += 1;
+
+  long numArcs = searchGraph.size() + terminalNodes.size();
+  // Unique start node
+  // numNodes += 1;
+
+  // Print number of nodes and arcs
+  outputSearchGraphStream << numNodes << " " << numArcs << "(" << searchGraph.size() << ", " << terminalNodes.size() << ")" << endl;
+
+  // Print node and arc for beginning of sentence <s>
+  //  outputSearchGraphStream << 1 << endl;
+  //  outputSearchGraphStream << "<s> ||| " << endl;
+
+  for (int nodeNumber=0; nodeNumber <= numNodes; nodeNumber+=1) {
+
+    size_t count = nodeToLines.count(nodeNumber);
+    if (count > 0) {
+      outputSearchGraphStream << count << endl;
+
+      pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range = nodeToLines.equal_range(nodeNumber);
+      for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
+	int lineNumber = (*it).second;
+	const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
+	const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
+	if (prevHypo==NULL) {
+	  outputSearchGraphStream << "<s> ||| " << endl;
+	} else {
+	int startNode = nodes[prevHypo->GetId()];
+      
+	const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
+	int targetWordCount = targetPhrase.GetSize();
+
+	outputSearchGraphStream << "[" << startNode << "]";
+	for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
+	  outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
+	}
+	outputSearchGraphStream << " ||| ";
+	OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
+	outputSearchGraphStream << endl;
+	}
+
+      }
+    }
+  }
+
+  // Print node and arc(s) for end of sentence </s>
+  outputSearchGraphStream << terminalNodes.size() << endl;
+  for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
+    outputSearchGraphStream << "[" << (*it) << "] </s> ||| " << endl;
+  }
+
+}
+
+
 /**! Output search graph in HTK standard lattice format (SLF) */
 void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const
 {
diff --git a/moses/Manager.h b/moses/Manager.h
index c5f54847b..d580674b4 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -94,10 +94,20 @@ class Manager
   void operator=(Manager const&);
   const TranslationSystem* m_system;
 private:
+
+  // Helper functions to output search graph in HTK standard lattice format
   void OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream) const;
   size_t OutputFeatureWeightsForSLF(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
   void OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const;
   size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+
+  // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
+  void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const;
+  size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+  void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
+  size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
+  
+
 protected:
   // data
 //	InputType const& m_source; /**< source sentence to be translated */
@@ -143,6 +153,7 @@ public:
 
   void OutputSearchGraph(long translationId, std::ostream &outputSearchGraphStream) const;
   void OutputSearchGraphAsSLF(long translationId, std::ostream &outputSearchGraphStream) const;
+  void OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const;
   void GetSearchGraph(std::vector<SearchGraphNode>& searchGraph) const;
   const InputType& GetSource() const {
     return m_source;
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 876cbd224..359174280 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -131,6 +131,7 @@ Parameter::Parameter()
   AddParam("output-search-graph-extended", "osgx", "Output connected hypotheses of search into specified filename, in extended format");
   AddParam("unpruned-search-graph", "usg", "When outputting chart search graph, do not exclude dead ends. Note: stack pruning may have eliminated some hypotheses");
   AddParam("output-search-graph-slf", "slf", "Output connected hypotheses of search into specified directory, one file per sentence, in HTK standard lattice format (SLF)");
+  AddParam("output-search-graph-hypergraph", "Output connected hypotheses of search into specified directory, one file per sentence, in a hypergraph format (see Kenneth Heafield's lazy hypergraph decoder)");
   AddParam("include-lhs-in-search-graph", "lhssg", "When outputting chart search graph, include the label of the LHS of the rule (useful when using syntax)");
 #ifdef HAVE_PROTOBUF
   AddParam("output-search-graph-pb", "pb", "Write phrase lattice to protocol buffer objects in the specified path.");
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index 1d9d4907c..cf797582b 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -240,6 +240,13 @@ bool StaticData::LoadData(Parameter *parameter)
   }
   if (m_parameter->GetParam("output-search-graph-slf").size() > 0) {
     m_outputSearchGraphSLF = true;
+  } else {
+    m_outputSearchGraphSLF = false;
+  }
+  if (m_parameter->GetParam("output-search-graph-hypergraph").size() > 0) {
+    m_outputSearchGraphHypergraph = true;
+  } else {
+    m_outputSearchGraphHypergraph = false;
   }
 #ifdef HAVE_PROTOBUF
   if (m_parameter->GetParam("output-search-graph-pb").size() > 0) {
diff --git a/moses/StaticData.h b/moses/StaticData.h
index d644e59f7..8a9e65162 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -217,6 +217,7 @@ protected:
   bool m_outputSearchGraph; //! whether to output search graph
   bool m_outputSearchGraphExtended; //! ... in extended format
   bool m_outputSearchGraphSLF; //! whether to output search graph in HTK standard lattice format (SLF)
+  bool m_outputSearchGraphHypergraph; //! whether to output search graph in hypergraph format
 #ifdef HAVE_PROTOBUF
   bool m_outputSearchGraphPB; //! whether to output search graph as a protobuf
 #endif
@@ -635,6 +636,9 @@ public:
   bool GetOutputSearchGraphSLF() const {
     return m_outputSearchGraphSLF;
   }
+  bool GetOutputSearchGraphHypergraph() const {
+    return m_outputSearchGraphHypergraph;
+  }
 #ifdef HAVE_PROTOBUF
   bool GetOutputSearchGraphPB() const {
     return m_outputSearchGraphPB;
-- 
cgit v1.2.3


From 764ce067266bb3373d5fdd8cdd528484301907c5 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Fri, 22 Feb 2013 15:48:40 -0500
Subject: More work on outputting search graph as hypergraph

---
 moses/Hypothesis.cpp | 2 +-
 moses/Manager.cpp    | 6 +++---
 moses/StaticData.h   | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/moses/Hypothesis.cpp b/moses/Hypothesis.cpp
index 506193d5b..5bd3a4e2b 100644
--- a/moses/Hypothesis.cpp
+++ b/moses/Hypothesis.cpp
@@ -462,7 +462,7 @@ void Hypothesis::CleanupArcList()
    */
   const StaticData &staticData = StaticData::Instance();
   size_t nBestSize = staticData.GetNBestSize();
-  bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.UseLatticeMBR() ;
+  bool distinctNBest = staticData.GetDistinctNBest() || staticData.UseMBR() || staticData.GetOutputSearchGraph() || staticData.GetOutputSearchGraphSLF() || staticData.GetOutputSearchGraphHypergraph() || staticData.UseLatticeMBR() ;
 
   if (!distinctNBest && m_arcList->size() > nBestSize * 5) {
     // prune arc list only if there too many arcs
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 21f116f42..0e72d90e6 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -880,7 +880,7 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 {
   vector<SearchGraphNode> searchGraph;
   GetSearchGraph(searchGraph);
-outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl;
+  //outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl;
   //  long numArcs = 0;
   long numNodes = 0;
 
@@ -892,7 +892,7 @@ outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << end
   //  nodes[0] = 0;
   //numNodes += 1;
   for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
-OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]);
+    //OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]);
     // Record that this arc ends at this node
     //    numArcs += 1;
     nodeToLines.insert(pair<int,int>(numNodes,arcNumber));
@@ -921,7 +921,7 @@ OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]);
   // numNodes += 1;
 
   // Print number of nodes and arcs
-  outputSearchGraphStream << numNodes << " " << numArcs << "(" << searchGraph.size() << ", " << terminalNodes.size() << ")" << endl;
+  outputSearchGraphStream << numNodes << " " << numArcs /*<< "(" << searchGraph.size() << ", " << terminalNodes.size() << ")"*/ << endl;
 
   // Print node and arc for beginning of sentence <s>
   //  outputSearchGraphStream << 1 << endl;
diff --git a/moses/StaticData.h b/moses/StaticData.h
index 8a9e65162..ce93a5629 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -460,7 +460,7 @@ public:
     return m_nBestFilePath;
   }
   bool IsNBestEnabled() const {
-    return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
+    return (!m_nBestFilePath.empty()) || m_mbr || m_useLatticeMBR || m_mira || m_outputSearchGraph || m_outputSearchGraphSLF || m_outputSearchGraphHypergraph || m_useConsensusDecoding || !m_latticeSamplesFilePath.empty()
 #ifdef HAVE_PROTOBUF
            || m_outputSearchGraphPB
 #endif
-- 
cgit v1.2.3


From 285661fec7fc24c8d328f1442c6b59edd508649f Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Fri, 22 Feb 2013 15:51:56 -0500
Subject: Deleted stale commented code

---
 moses/Manager.cpp | 33 +++++----------------------------
 1 file changed, 5 insertions(+), 28 deletions(-)

diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 0e72d90e6..0c7471b88 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -873,28 +873,21 @@ size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis*
   return index+numScoreComps;
 }
 
-void OutputSearchNode(long translationId, std::ostream &outputSearchGraphStream,
-                      const SearchGraphNode& searchNode);
 /**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
 void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
 {
   vector<SearchGraphNode> searchGraph;
   GetSearchGraph(searchGraph);
-  //outputSearchGraphStream << "searchGraph.size() == " << searchGraph.size() << endl;
-  //  long numArcs = 0;
+
   long numNodes = 0;
 
   map<int,int> nodes;
   set<int> terminalNodes;
   multimap<int,int> nodeToLines;
 
-  // Unique start node
-  //  nodes[0] = 0;
-  //numNodes += 1;
   for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
-    //OutputSearchNode(translationId,outputSearchGraphStream,searchGraph[arcNumber]);
+
     // Record that this arc ends at this node
-    //    numArcs += 1;
     nodeToLines.insert(pair<int,int>(numNodes,arcNumber));
 
     int hypothesisID = searchGraph[arcNumber].hypo->GetId();
@@ -905,8 +898,8 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 
       bool terminalNode = (searchGraph[arcNumber].forward == -1);
       if (terminalNode) {
+	// Final arc to end node, representing the end of the sentence </s>
 	terminalNodes.insert(numNodes);
-	//	numArcs += 1; // Final arc to end node, representing the end of the sentence </s>
       }
     }
 
@@ -917,15 +910,9 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
   numNodes += 1;
 
   long numArcs = searchGraph.size() + terminalNodes.size();
-  // Unique start node
-  // numNodes += 1;
 
   // Print number of nodes and arcs
-  outputSearchGraphStream << numNodes << " " << numArcs /*<< "(" << searchGraph.size() << ", " << terminalNodes.size() << ")"*/ << endl;
-
-  // Print node and arc for beginning of sentence <s>
-  //  outputSearchGraphStream << 1 << endl;
-  //  outputSearchGraphStream << "<s> ||| " << endl;
+  outputSearchGraphStream << numNodes << " " << numArcs << endl;
 
   for (int nodeNumber=0; nodeNumber <= numNodes; nodeNumber+=1) {
 
@@ -983,7 +970,6 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
 
   // Unique start node
   nodes[0] = 0;
-  //  numNodes += 1;
 
   for (size_t arcNumber = 0; arcNumber < searchGraph.size(); ++arcNumber) {
 
@@ -1017,8 +1003,6 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
 
   OutputFeatureWeightsForSLF(outputSearchGraphStream);
 
-  // const vector<FactorType> &outputFactorOrder = StaticData::Instance().GetOutputFactorOrder();
-
   for (size_t arcNumber = 0, lineNumber = 0; lineNumber < searchGraph.size(); ++lineNumber) {
     const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
     const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
@@ -1031,16 +1015,9 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
       int targetWordCount = targetPhrase.GetSize();
 
       for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
-      // for (int startNode = nodes[prevHypo->GetId()] - targetWordCount + 1,
-      // 	     nextNode = startNode + 1;
-      // 	   nextNode < endNode; startNode+=1, nextNode+=1) {
 	int x = (targetWordCount-targetWordIndex);
 
 	outputSearchGraphStream <<  "J=" << arcNumber;
-	// outputSearchGraphStream <<  " startNode=" << startNode;
-	// outputSearchGraphStream <<  " endNode=" << endNode;
-	// outputSearchGraphStream <<  " targetWordCount=" << targetWordCount;
-	// outputSearchGraphStream <<  " targetWordIndex=" << targetWordIndex;
 
 	if (targetWordIndex==0) {
 	  outputSearchGraphStream << " S=" << startNode;
@@ -1048,7 +1025,7 @@ void Manager::OutputSearchGraphAsSLF(long translationId, std::ostream &outputSea
 	  outputSearchGraphStream << " S=" << endNode - x;
 	}
 
-	outputSearchGraphStream << " E=" << endNode - (x-1) //(startNode + targetWordIndex + 1)
+	outputSearchGraphStream << " E=" << endNode - (x-1)
 				<< " W=" << targetPhrase.GetWord(targetWordIndex);
 
 	OutputFeatureValuesForSLF(thisHypo, (targetWordIndex>0), outputSearchGraphStream);
-- 
cgit v1.2.3


From 4adeb7e33868dfc44875af9534047da6cc6bfee0 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Fri, 22 Feb 2013 16:20:03 -0500
Subject: Output feature weights to a separate file when producing hypergraph

---
 moses-cmd/IOWrapper.cpp |  9 +++++++
 moses-cmd/IOWrapper.h   |  1 +
 moses-cmd/Main.cpp      | 66 +++++++++++++++++++++++++++++++++++++++++++++++++
 moses/Manager.cpp       | 58 -------------------------------------------
 moses/Manager.h         |  2 --
 5 files changed, 76 insertions(+), 60 deletions(-)

diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index 7c27476d1..6fffead46 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -207,6 +207,15 @@ InputType*IOWrapper::GetInput(InputType* inputType)
     return file;
   }
 
+  ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() {
+    const StaticData &staticData = StaticData::Instance();
+    stringstream fileName;
+    fileName << staticData.GetParam("output-search-graph-hypergraph")[1];
+    std::ofstream *file = new std::ofstream;
+    file->open(fileName.str().c_str());
+    return file;
+  }
+
 /***
  * print surface factor only for the given phrase
  */
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index 044e71491..0376eff6f 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -119,6 +119,7 @@ public:
 
   std::ofstream *GetOutputSearchGraphSLFStream(size_t sentenceNumber);
   std::ofstream *GetOutputSearchGraphHypergraphStream(size_t sentenceNumber);
+  std::ofstream *GetOutputSearchGraphHypergraphWeightsStream();
 
   std::ostream &GetDetailedTranslationReportingStream() {
     assert (m_detailedTranslationReportingStream);
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index afadddabf..0e48ae64f 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -333,6 +333,7 @@ public:
     }
 
     delete m_searchGraphSLFStream;
+    delete m_searchGraphHypergraphStream;
     delete m_source;
     
   }
@@ -406,6 +407,63 @@ static void ShowWeights()
 
 }
 
+size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream)
+{
+  size_t numScoreComps = ff->GetNumScoreComponents();
+  if (numScoreComps != ScoreProducer::unlimited) {
+    vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
+    if (numScoreComps > 1) {
+      for (size_t i = 0; i < numScoreComps; ++i) {
+	outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
+				<< i
+				<< "=" << values[i] << endl;
+      }
+    } else {
+	outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
+				<< "=" << values[0] << endl;
+    }
+    return index+numScoreComps;
+  } else {
+    cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
+    assert(false);
+    return 0;
+  }
+}
+
+void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream)
+{
+  outputSearchGraphStream.setf(std::ios::fixed);
+  outputSearchGraphStream.precision(6);
+  // Writes the feature weights in fixed notation with 6 decimals: stateful features, then stateless ones, then phrase and generation dictionaries.
+  const StaticData& staticData = StaticData::Instance();
+  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
+  const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
+  const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
+  size_t featureIndex = 1; // running index handed to, and returned by, the per-feature overload
+  for (size_t i = 0; i < sff.size(); ++i) {
+    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
+  }
+  for (size_t i = 0; i < slf.size(); ++i) {
+    if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
+          slf[i]->GetScoreProducerWeightShortName() != "tm" &&
+          slf[i]->GetScoreProducerWeightShortName() != "I" &&
+          slf[i]->GetScoreProducerWeightShortName() != "g") // presumably "tm"/"g" are skipped because the dictionary loops below write them -- TODO confirm
+    {
+      featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
+    }
+  }
+  const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
+  for( size_t i=0; i<pds.size(); i++ ) {
+    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
+  }
+  const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
+  for( size_t i=0; i<gds.size(); i++ ) {
+    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
+  }
+
+}
+
+
 } //namespace
 
 /** main function of the command line version of the decoder **/
@@ -469,6 +527,14 @@ int main(int argc, char** argv)
       TRACE_ERR(weights);
       TRACE_ERR("\n");
     }
+    if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) {
+      ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream();
+      OutputFeatureWeightsForHypergraph(*weightsOut);
+      weightsOut->flush();
+      weightsOut->close();
+      delete weightsOut;
+    }
+
 
     // initialize output streams
     // note: we can't just write to STDOUT or files
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 0c7471b88..760587a55 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -663,40 +663,6 @@ void Manager::OutputFeatureWeightsForSLF(std::ostream &outputSearchGraphStream)
 
 }
 
-void Manager::OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const
-{
-  outputSearchGraphStream.setf(std::ios::fixed);
-  outputSearchGraphStream.precision(6);
-
-  const StaticData& staticData = StaticData::Instance();
-  const TranslationSystem& system = staticData.GetTranslationSystem(TranslationSystem::DEFAULT);
-  const vector<const StatelessFeatureFunction*>& slf =system.GetStatelessFeatureFunctions();
-  const vector<const StatefulFeatureFunction*>& sff = system.GetStatefulFeatureFunctions();
-  size_t featureIndex = 1;
-  for (size_t i = 0; i < sff.size(); ++i) {
-    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, sff[i], outputSearchGraphStream);
-  }
-  for (size_t i = 0; i < slf.size(); ++i) {
-    if (slf[i]->GetScoreProducerWeightShortName() != "u" &&
-          slf[i]->GetScoreProducerWeightShortName() != "tm" &&
-          slf[i]->GetScoreProducerWeightShortName() != "I" &&
-          slf[i]->GetScoreProducerWeightShortName() != "g")
-    {
-      featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, slf[i], outputSearchGraphStream);
-    }
-  }
-  const vector<PhraseDictionaryFeature*>& pds = system.GetPhraseDictionaries();
-  for( size_t i=0; i<pds.size(); i++ ) {
-    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, pds[i], outputSearchGraphStream);
-  }
-  const vector<GenerationDictionary*>& gds = system.GetGenerationDictionaries();
-  for( size_t i=0; i<gds.size(); i++ ) {
-    featureIndex = OutputFeatureWeightsForHypergraph(featureIndex, gds[i], outputSearchGraphStream);
-  }
-
-}
-
-
 void Manager::OutputFeatureValuesForSLF(const Hypothesis* hypo, bool zeros, std::ostream &outputSearchGraphStream) const
 {
   outputSearchGraphStream.setf(std::ios::fixed);
@@ -788,30 +754,6 @@ size_t Manager::OutputFeatureWeightsForSLF(size_t index, const FeatureFunction*
   }
 }
 
-size_t Manager::OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
-{
-  size_t numScoreComps = ff->GetNumScoreComponents();
-  if (numScoreComps != ScoreProducer::unlimited) {
-    vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
-    if (numScoreComps > 1) {
-      for (size_t i = 0; i < numScoreComps; ++i) {
-	outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
-				<< i
-				<< "=" << values[i] << endl;
-      }
-    } else {
-	outputSearchGraphStream << ff->GetScoreProducerWeightShortName()
-				<< "=" << values[0] << endl;
-    }
-    return index+numScoreComps;
-  } else {
-    cerr << "Sparse features are not yet supported when outputting hypergraph format" << endl;
-    assert(false);
-    return 0;
-  }
-}
-
-
 size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
 {
 
diff --git a/moses/Manager.h b/moses/Manager.h
index d580674b4..e2f8ed8e5 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -102,8 +102,6 @@ private:
   size_t OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
 
   // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
-  void OutputFeatureWeightsForHypergraph(std::ostream &outputSearchGraphStream) const;
-  size_t OutputFeatureWeightsForHypergraph(size_t index, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
   void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
   size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
   
-- 
cgit v1.2.3


From 2eb0c5e11da9b00f8675688841ba118b91aa7471 Mon Sep 17 00:00:00 2001
From: amittai <amittai@foo.bar>
Date: Sun, 24 Feb 2013 18:07:11 -0800
Subject: let's be consistently case-insensitive with respect to the xml tags

---
 scripts/ems/support/wrap-xml.perl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl
index e941aa95b..4ef6a1de6 100755
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@@ -13,10 +13,10 @@ chomp(@OUT);
 while(<SRC>) {
     chomp;
     if (/^<srcset/) {
-	s/<srcset/<tstset trglang="$language"/;
+	s/<srcset/<tstset trglang="$language"/i;
     }
     elsif (/^<\/srcset/) {
-	s/<\/srcset/<\/tstset/;
+	s/<\/srcset/<\/tstset/i;
     }
     elsif (/^<doc/i) {
   s/ *sysid="[^\"]+"//;
@@ -26,10 +26,10 @@ while(<SRC>) {
 	my $line = shift(@OUT);
         $line = "" if $line =~ /NO BEST TRANSLATION/;
         if (/<\/seg>/) {
-	  s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
+	  s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
         }
         else {
-	  s/(<seg[^>]+> *)[^<]*/$1$line/;
+	  s/(<seg[^>]+> *)[^<]*/$1$line/i;
         }
     }
     print $_."\n";
-- 
cgit v1.2.3


From 8b6e98c633695f05190f66af644d9fc9295a52d9 Mon Sep 17 00:00:00 2001
From: amittai <amittai@foo.bar>
Date: Sun, 24 Feb 2013 18:10:19 -0800
Subject: Revert "let's be consistently case-insensitive with respect to the
 xml tags"

This reverts commit 2eb0c5e11da9b00f8675688841ba118b91aa7471.
---
 scripts/ems/support/wrap-xml.perl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl
index 4ef6a1de6..e941aa95b 100755
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@@ -13,10 +13,10 @@ chomp(@OUT);
 while(<SRC>) {
     chomp;
     if (/^<srcset/) {
-	s/<srcset/<tstset trglang="$language"/i;
+	s/<srcset/<tstset trglang="$language"/;
     }
     elsif (/^<\/srcset/) {
-	s/<\/srcset/<\/tstset/i;
+	s/<\/srcset/<\/tstset/;
     }
     elsif (/^<doc/i) {
   s/ *sysid="[^\"]+"//;
@@ -26,10 +26,10 @@ while(<SRC>) {
 	my $line = shift(@OUT);
         $line = "" if $line =~ /NO BEST TRANSLATION/;
         if (/<\/seg>/) {
-	  s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
+	  s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
         }
         else {
-	  s/(<seg[^>]+> *)[^<]*/$1$line/i;
+	  s/(<seg[^>]+> *)[^<]*/$1$line/;
         }
     }
     print $_."\n";
-- 
cgit v1.2.3


From 1bba58b134a67e7fcb8d65f941a0570d4ed75a43 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 26 Feb 2013 11:01:01 +0000
Subject: eclipse project files

---
 contrib/other-builds/lm/.project               | 20 --------------------
 contrib/other-builds/moses-chart-cmd/.cproject | 10 ++++++++--
 contrib/other-builds/moses-cmd/.cproject       |  2 ++
 3 files changed, 10 insertions(+), 22 deletions(-)

diff --git a/contrib/other-builds/lm/.project b/contrib/other-builds/lm/.project
index e75388ac1..a1bde37c2 100644
--- a/contrib/other-builds/lm/.project
+++ b/contrib/other-builds/lm/.project
@@ -141,11 +141,6 @@
 			<type>1</type>
 			<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary</locationURI>
 		</link>
-		<link>
-			<name>build_binary.cc</name>
-			<type>1</type>
-			<locationURI>PARENT-3-PROJECT_LOC/lm/build_binary.cc</locationURI>
-		</link>
 		<link>
 			<name>clean.sh</name>
 			<type>1</type>
@@ -176,11 +171,6 @@
 			<type>1</type>
 			<locationURI>PARENT-3-PROJECT_LOC/lm/facade.hh</locationURI>
 		</link>
-		<link>
-			<name>fragment.cc</name>
-			<type>1</type>
-			<locationURI>PARENT-3-PROJECT_LOC/lm/fragment.cc</locationURI>
-		</link>
 		<link>
 			<name>left.hh</name>
 			<type>1</type>
@@ -211,11 +201,6 @@
 			<type>1</type>
 			<locationURI>PARENT-3-PROJECT_LOC/lm/lm_exception.hh</locationURI>
 		</link>
-		<link>
-			<name>max_order.cc</name>
-			<type>1</type>
-			<locationURI>PARENT-3-PROJECT_LOC/lm/max_order.cc</locationURI>
-		</link>
 		<link>
 			<name>max_order.hh</name>
 			<type>1</type>
@@ -241,11 +226,6 @@
 			<type>1</type>
 			<locationURI>PARENT-3-PROJECT_LOC/lm/model_type.hh</locationURI>
 		</link>
-		<link>
-			<name>ngram_query.cc</name>
-			<type>1</type>
-			<locationURI>PARENT-3-PROJECT_LOC/lm/ngram_query.cc</locationURI>
-		</link>
 		<link>
 			<name>ngram_query.hh</name>
 			<type>1</type>
diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject
index fedda926b..7120f0b71 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/moses-chart-cmd/.cproject
@@ -46,6 +46,7 @@
 							<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.816413868" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
 								<option id="gnu.cpp.link.option.paths.330225535" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
@@ -154,8 +155,13 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope" versionNumber="1">
-		<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/moses-chart-cmd"/>
+		</configuration>
 	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 10b6784d4..0dd08a220 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -46,6 +46,8 @@
 							<tool id="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug.1546774818" name="GCC C++ Linker" superClass="cdt.managedbuild.tool.gnu.cpp.linker.exe.debug">
 								<option id="gnu.cpp.link.option.paths.523170942" name="Library search path (-L)" superClass="gnu.cpp.link.option.paths" valueType="libPaths">
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../irstlm/lib&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../boost/lib64&quot;"/>
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/macosx&quot;"/>
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686-m64&quot;"/>
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc:}/../../srilm/lib/i686&quot;"/>
-- 
cgit v1.2.3


From 1ee365e0d7a96f70d64396cb5854bb3d1a2abbd7 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 26 Feb 2013 18:27:55 +0000
Subject: WIP. Fix for incorrect alignment from moses_chart

---
 moses-chart-cmd/IOWrapper.cpp | 53 +++++++++++++++++++++++++++++++------------
 1 file changed, 38 insertions(+), 15 deletions(-)

diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp
index 09e06fcf6..308787270 100644
--- a/moses-chart-cmd/IOWrapper.cpp
+++ b/moses-chart-cmd/IOWrapper.cpp
@@ -620,9 +620,15 @@ void IOWrapper::FixPrecision(std::ostream &stream, size_t size)
 template <class T>
 void ShiftOffsets(vector<T> &offsets, T shift)
 {
+  T currPos = shift;
   for (size_t i = 0; i < offsets.size(); ++i) {
-    shift += offsets[i];
-    offsets[i] += shift;
+    if (offsets[i] == 0) {
+	  offsets[i] = currPos;
+	  ++currPos;
+	}
+	else {
+	  currPos += offsets[i];
+	}
   }
 }
 
@@ -716,6 +722,17 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
   m_alignmentInfoCollector->Write(translationId, out.str());
 }
 
+size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
+{
+  size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
+  const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+  for (size_t i = 0; i < prevHypos.size(); ++i) {
+    size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
+    ret -= (childSize - 1);
+  }
+  return ret;
+}
+
 size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget)
 {
   size_t totalTargetSize = 0;
@@ -723,7 +740,14 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
 
   const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
 
-  vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
+  if (hypo->GetCurrSourceRange().GetStartPos() == 12
+	  && hypo->GetCurrSourceRange().GetEndPos() == 18)
+  {
+	  cerr << "stop" << endl;
+  }
+
+  size_t thisSourceSize = CalcSourceSize(hypo);
+  vector<size_t> sourceOffsets(thisSourceSize, 0);
   vector<size_t> targetOffsets(tp.GetSize(), 0);
 
   const vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
@@ -765,22 +789,21 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
   ShiftOffsets(targetOffsets, startTarget);
 
   // get alignments from this hypo
-  vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
   const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
-  OutputAlignment(retAlignmentsS2T, aiTerm);
 
   // add to output arg, offsetting by source & target
-  for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
-    const set<size_t> &targets = retAlignmentsS2T[source];
-    set<size_t>::const_iterator iter;
-    for (iter = targets.begin(); iter != targets.end(); ++iter) {
-      size_t target = *iter;
-      pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
-                                     ,target + targetOffsets[target]);
-      pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
-      CHECK(ret.second);
+  AlignmentInfo::const_iterator iter;
+  for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+    const std::pair<size_t,size_t> &align = *iter;
+    size_t relSource = align.first;
+    size_t relTarget = align.second;
+    size_t absSource = sourceOffsets[relSource];
+    size_t absTarget = targetOffsets[relTarget];
+
+    pair<size_t, size_t> alignPoint(absSource, absTarget);
+    pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+    CHECK(ret.second);
 
-    }
   }
 
   return totalTargetSize;
-- 
cgit v1.2.3


From 1fb51dc67484967053fcb7c78d379c7ca00f95e5 Mon Sep 17 00:00:00 2001
From: amittai <amittai@foo.bar>
Date: Tue, 26 Feb 2013 11:19:33 -0800
Subject: use 'gunzip -c' instead of 'zcat' for better cross-platform
 compatibility

zcat is identical to "gunzip -c", but Mac OS X doesn't ship with zcat.
---
 scripts/training/clean-corpus-n.perl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index bea32052a..dad339070 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -47,7 +47,7 @@ my $l1input = "$corpus.$l1";
 if (-e $l1input) {
   $opn = $l1input;
 } elsif (-e $l1input.".gz") {
-  $opn = "zcat $l1input.gz |";
+  $opn = "gunzip -c $l1input.gz |";
 } else {
     die "Error: $l1input does not exist";
 }
@@ -57,7 +57,7 @@ my $l2input = "$corpus.$l2";
 if (-e $l2input) {
   $opn = $l2input;
 } elsif (-e $l2input.".gz") {
-  $opn = "zcat $l2input.gz |";
+  $opn = "gunzip -c $l2input.gz |";
 } else  {
  die "Error: $l2input does not exist";
 }
-- 
cgit v1.2.3


From 5cdf65ba330033e7f98d213612f0c160f469f62f Mon Sep 17 00:00:00 2001
From: amittai <amittai@foo.bar>
Date: Tue, 26 Feb 2013 11:32:29 -0800
Subject: Revert "Revert "let's be consistently case-insensitive with respect
 to the xml tags""

This reverts commit 8b6e98c633695f05190f66af644d9fc9295a52d9.
---
 scripts/ems/support/wrap-xml.perl | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/ems/support/wrap-xml.perl b/scripts/ems/support/wrap-xml.perl
index e941aa95b..4ef6a1de6 100755
--- a/scripts/ems/support/wrap-xml.perl
+++ b/scripts/ems/support/wrap-xml.perl
@@ -13,10 +13,10 @@ chomp(@OUT);
 while(<SRC>) {
     chomp;
     if (/^<srcset/) {
-	s/<srcset/<tstset trglang="$language"/;
+	s/<srcset/<tstset trglang="$language"/i;
     }
     elsif (/^<\/srcset/) {
-	s/<\/srcset/<\/tstset/;
+	s/<\/srcset/<\/tstset/i;
     }
     elsif (/^<doc/i) {
   s/ *sysid="[^\"]+"//;
@@ -26,10 +26,10 @@ while(<SRC>) {
 	my $line = shift(@OUT);
         $line = "" if $line =~ /NO BEST TRANSLATION/;
         if (/<\/seg>/) {
-	  s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/;
+	  s/(<seg[^>]+> *).*(<\/seg>)/$1$line$2/i;
         }
         else {
-	  s/(<seg[^>]+> *)[^<]*/$1$line/;
+	  s/(<seg[^>]+> *)[^<]*/$1$line/i;
         }
     }
     print $_."\n";
-- 
cgit v1.2.3


From 1f82a438377e03b71697f195494a9ae4f618fe63 Mon Sep 17 00:00:00 2001
From: amittai <amittai@foo.bar>
Date: Tue, 26 Feb 2013 11:37:31 -0800
Subject: where'd the edit go?

---
 scripts/training/clean-corpus-n.perl | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/training/clean-corpus-n.perl b/scripts/training/clean-corpus-n.perl
index dad339070..2865fe391 100755
--- a/scripts/training/clean-corpus-n.perl
+++ b/scripts/training/clean-corpus-n.perl
@@ -160,3 +160,4 @@ sub word_count {
   my @w = split(/ /,$line);
   return scalar @w;
 }
+
-- 
cgit v1.2.3


From 7ca271b200c7fabef2a1697924173ecc8470a66e Mon Sep 17 00:00:00 2001
From: amittai <amittai@foo.bar>
Date: Tue, 26 Feb 2013 19:47:44 -0800
Subject: fixed typo

---
 scripts/tokenizer/tokenizer.perl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/tokenizer/tokenizer.perl b/scripts/tokenizer/tokenizer.perl
index f59cd5f86..986a2dfb5 100755
--- a/scripts/tokenizer/tokenizer.perl
+++ b/scripts/tokenizer/tokenizer.perl
@@ -171,7 +171,7 @@ if ($TIMING)
 
 # tokenize a batch of texts saved in an array
 # input: an array containing a batch of texts
-# return: another array cotaining a batch of tokenized texts for the input array
+# return: another array containing a batch of tokenized texts for the input array
 sub tokenize_batch
 {
     my(@text_list) = @_;
-- 
cgit v1.2.3


From 349dffd75037a86938d2f3d33a10dce5be55d0fe Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sat, 2 Mar 2013 17:58:33 +0000
Subject: Fix for incorrect alignment from moses_chart

---
 moses-chart-cmd/IOWrapper.cpp | 75 ++++++++++++++++++++++---------------------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp
index 308787270..d0ef97c42 100644
--- a/moses-chart-cmd/IOWrapper.cpp
+++ b/moses-chart-cmd/IOWrapper.cpp
@@ -632,6 +632,17 @@ void ShiftOffsets(vector<T> &offsets, T shift)
   }
 }
 
+size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
+{
+  size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
+  const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
+  for (size_t i = 0; i < prevHypos.size(); ++i) {
+    size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
+    ret -= (childSize - 1);
+  }
+  return ret;
+}
+
 size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartTrellisNode &node, size_t startTarget)
 {
   const ChartHypothesis *hypo = &node.GetHypothesis();
@@ -641,7 +652,11 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
 
   const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
 
-  vector<size_t> sourceOffsets(hypo->GetCurrSourceRange().GetNumWordsCovered(), 0);
+  size_t thisSourceSize = CalcSourceSize(hypo);
+
+  // position of each terminal word in translation rule, irrespective of alignment
+  // if non-term, number is undefined
+  vector<size_t> sourceOffsets(thisSourceSize, 0);
   vector<size_t> targetOffsets(tp.GetSize(), 0);
 
   const ChartTrellisNode::NodeChildren &prevNodes = node.GetChildren();
@@ -661,11 +676,12 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
 
       const ChartTrellisNode &prevNode = *prevNodes[sourceInd];
 
-      // 1st. calc source size
+      // calc source size
       size_t sourceSize = prevNode.GetHypothesis().GetCurrSourceRange().GetNumWordsCovered();
       sourceOffsets[sourcePos] = sourceSize;
 
-      // 2nd. calc target size. Recursively look thru child hypos
+      // calc target size.
+      // Recursively look thru child hypos
       size_t currStartTarget = startTarget + totalTargetSize;
       size_t targetSize = OutputAlignmentNBest(retAlign, prevNode, currStartTarget);
       targetOffsets[targetPos] = targetSize;
@@ -678,27 +694,26 @@ size_t IOWrapper::OutputAlignmentNBest(Alignments &retAlign, const Moses::ChartT
     }
   }
 
-  // 3rd. shift offsets
+  // convert position within translation rule to absolute position within
+  // source sentence / output sentence
   ShiftOffsets(sourceOffsets, startSource);
   ShiftOffsets(targetOffsets, startTarget);
 
   // get alignments from this hypo
-  vector< set<size_t> > retAlignmentsS2T(hypo->GetCurrSourceRange().GetNumWordsCovered());
   const AlignmentInfo &aiTerm = hypo->GetCurrTargetPhrase().GetAlignTerm();
-  OutputAlignment(retAlignmentsS2T, aiTerm);
 
   // add to output arg, offsetting by source & target
-  for (size_t source = 0; source < retAlignmentsS2T.size(); ++source) {
-    const set<size_t> &targets = retAlignmentsS2T[source];
-    set<size_t>::const_iterator iter;
-    for (iter = targets.begin(); iter != targets.end(); ++iter) {
-      size_t target = *iter;
-      pair<size_t, size_t> alignPoint(source + sourceOffsets[source]
-                                     ,target + targetOffsets[target]);
-      pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
-      CHECK(ret.second);
+  AlignmentInfo::const_iterator iter;
+  for (iter = aiTerm.begin(); iter != aiTerm.end(); ++iter) {
+    const std::pair<size_t,size_t> &align = *iter;
+    size_t relSource = align.first;
+    size_t relTarget = align.second;
+    size_t absSource = sourceOffsets[relSource];
+    size_t absTarget = targetOffsets[relTarget];
 
-    }
+    pair<size_t, size_t> alignPoint(absSource, absTarget);
+    pair<Alignments::iterator, bool> ret = retAlign.insert(alignPoint);
+    CHECK(ret.second);
   }
 
   return totalTargetSize;
@@ -722,17 +737,6 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
   m_alignmentInfoCollector->Write(translationId, out.str());
 }
 
-size_t CalcSourceSize(const Moses::ChartHypothesis *hypo)
-{
-  size_t ret = hypo->GetCurrSourceRange().GetNumWordsCovered();
-  const std::vector<const ChartHypothesis*> &prevHypos = hypo->GetPrevHypos();
-  for (size_t i = 0; i < prevHypos.size(); ++i) {
-    size_t childSize = prevHypos[i]->GetCurrSourceRange().GetNumWordsCovered();
-    ret -= (childSize - 1);
-  }
-  return ret;
-}
-
 size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypothesis *hypo, size_t startTarget)
 {
   size_t totalTargetSize = 0;
@@ -740,13 +744,10 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
 
   const TargetPhrase &tp = hypo->GetCurrTargetPhrase();
 
-  if (hypo->GetCurrSourceRange().GetStartPos() == 12
-	  && hypo->GetCurrSourceRange().GetEndPos() == 18)
-  {
-	  cerr << "stop" << endl;
-  }
-
   size_t thisSourceSize = CalcSourceSize(hypo);
+
+  // position of each terminal word in translation rule, irrespective of alignment
+  // if non-term, number is undefined
   vector<size_t> sourceOffsets(thisSourceSize, 0);
   vector<size_t> targetOffsets(tp.GetSize(), 0);
 
@@ -767,11 +768,12 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
 
       const ChartHypothesis *prevHypo = prevHypos[sourceInd];
 
-      // 1st. calc source size
+      // calc source size
       size_t sourceSize = prevHypo->GetCurrSourceRange().GetNumWordsCovered();
       sourceOffsets[sourcePos] = sourceSize;
 
-      // 2nd. calc target size. Recursively look thru child hypos
+      // calc target size.
+      // Recursively look thru child hypos
       size_t currStartTarget = startTarget + totalTargetSize;
       size_t targetSize = OutputAlignment(retAlign, prevHypo, currStartTarget);
       targetOffsets[targetPos] = targetSize;
@@ -784,7 +786,8 @@ size_t IOWrapper::OutputAlignment(Alignments &retAlign, const Moses::ChartHypoth
     }
   }
 
-  // 3rd. shift offsets
+  // convert position within translation rule to absolute position within
+  // source sentence / output sentence
   ShiftOffsets(sourceOffsets, startSource);
   ShiftOffsets(targetOffsets, startTarget);
 
-- 
cgit v1.2.3


From 3013227385ac6db1aa07702c115d2c512967e461 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Mon, 4 Mar 2013 10:19:50 +0000
Subject: output an empty line in align file even if no best hypo

---
 contrib/other-builds/mert_lib/.cproject | 23 ++++++++++++++++-------
 moses-chart-cmd/IOWrapper.cpp           | 18 ++++++++++--------
 2 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject
index 41a471cd1..79dffb294 100644
--- a/contrib/other-builds/mert_lib/.cproject
+++ b/contrib/other-builds/mert_lib/.cproject
@@ -7,7 +7,7 @@
 					<externalSetting>
 						<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
 						<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Debug"/>
-						<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+						<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
 					</externalSetting>
 				</externalSettings>
 				<extensions>
@@ -23,13 +23,14 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
 							<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
-							<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
+							<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="4" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug">
 								<option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
 								<option id="gnu.cpp.compiler.lib.debug.option.debugging.level.1050747398" name="Debug Level" superClass="gnu.cpp.compiler.lib.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
 								<option id="gnu.cpp.compiler.option.include.paths.1565260476" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
 									<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1183866856" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
 							</tool>
@@ -45,11 +46,11 @@
 							</tool>
 						</toolChain>
 					</folderInfo>
-					<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.626295813" name="extractor.cpp" rcbsApplicability="disable" resourcePath="mert/extractor.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460">
-						<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.1550378460" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
+					<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.646822372" name="UtilTest.cpp" rcbsApplicability="disable" resourcePath="mert/UtilTest.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.967030373">
+						<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.967030373" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
 					</fileInfo>
 					<sourceEntries>
-						<entry excluding="mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
+						<entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
 					</sourceEntries>
 				</configuration>
 			</storageModule>
@@ -61,7 +62,7 @@
 					<externalSetting>
 						<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/mert_lib"/>
 						<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/mert_lib/Release"/>
-						<entry flags="RESOLVED" kind="libraryFile" name="mert_lib"/>
+						<entry flags="RESOLVED" kind="libraryFile" name="mert_lib" srcPrefixMapping="" srcRootPath=""/>
 					</externalSetting>
 				</externalSettings>
 				<extensions>
@@ -119,5 +120,13 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/mert_lib"/>
+		</configuration>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
diff --git a/moses-chart-cmd/IOWrapper.cpp b/moses-chart-cmd/IOWrapper.cpp
index d0ef97c42..b65873881 100644
--- a/moses-chart-cmd/IOWrapper.cpp
+++ b/moses-chart-cmd/IOWrapper.cpp
@@ -723,14 +723,16 @@ void IOWrapper::OutputAlignment(size_t translationId , const Moses::ChartHypothe
 {
   ostringstream out;
 
-  Alignments retAlign;
-  OutputAlignment(retAlign, hypo, 0);
-
-  // output alignments
-  Alignments::const_iterator iter;
-  for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
-    const pair<size_t, size_t> &alignPoint = *iter;
-    out << alignPoint.first << "-" << alignPoint.second << " ";
+  if (hypo) {
+	Alignments retAlign;
+	OutputAlignment(retAlign, hypo, 0);
+
+	// output alignments
+	Alignments::const_iterator iter;
+	for (iter = retAlign.begin(); iter != retAlign.end(); ++iter) {
+	  const pair<size_t, size_t> &alignPoint = *iter;
+	  out << alignPoint.first << "-" << alignPoint.second << " ";
+	}
   }
   out << endl;
 
-- 
cgit v1.2.3


From 8573a66da0363f7b60cba8abdf3fb57f702289e8 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Mon, 4 Mar 2013 09:15:46 -0500
Subject: Close hypergraph output files.

---
 moses-cmd/Main.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 0e48ae64f..f93bc4fce 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -332,6 +332,10 @@ public:
       m_searchGraphSLFStream->close();
     }
 
+    if (m_searchGraphHypergraphStream) {
+      m_searchGraphHypergraphStream->close();
+    }
+
     delete m_searchGraphSLFStream;
     delete m_searchGraphHypergraphStream;
     delete m_source;
-- 
cgit v1.2.3


From 26bf04df5da3c756a37c93cbb4bcc0647df73aca Mon Sep 17 00:00:00 2001
From: Christian Buck <cbuck@lantis.de>
Date: Mon, 4 Mar 2013 15:29:13 +0000
Subject: added unbuffered mode for casers (using -b)

---
 scripts/recaser/detruecase.perl | 7 ++++---
 scripts/recaser/recase.perl     | 6 ++++--
 scripts/recaser/truecase.perl   | 8 +++++---
 3 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/scripts/recaser/detruecase.perl b/scripts/recaser/detruecase.perl
index 49c89c299..012c143ac 100755
--- a/scripts/recaser/detruecase.perl
+++ b/scripts/recaser/detruecase.perl
@@ -6,11 +6,12 @@ use Getopt::Long "GetOptions";
 binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 
-
-my ($SRC,$INFILE);
+my ($SRC,$INFILE,$UNBUFFERED);
 die("detruecase.perl < in > out")
     unless &GetOptions('headline=s' => \$SRC,
-		       'in=s' => \$INFILE);
+		       'in=s' => \$INFILE,
+                       'b|unbuffered' => \$UNBUFFERED);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 
 my %SENTENCE_END = ("."=>1,":"=>1,"?"=>1,"!"=>1);
 my %DELAYED_SENTENCE_START = ("("=>1,"["=>1,"\""=>1,"'"=>1,"&quot;"=>1,"&apos;"=>1,"&#91;"=>1,"&#93;"=>1);
diff --git a/scripts/recaser/recase.perl b/scripts/recaser/recase.perl
index c83c30daa..2858cda61 100755
--- a/scripts/recaser/recase.perl
+++ b/scripts/recaser/recase.perl
@@ -4,7 +4,7 @@
 use strict;
 use Getopt::Long "GetOptions";
 
-my ($SRC,$INFILE,$RECASE_MODEL);
+my ($SRC,$INFILE,$RECASE_MODEL,$UNBUFFERED);
 my $MOSES = "moses";
 my $LANGUAGE = "en"; # English by default;
 die("recase.perl --in file --model ini-file > out")
@@ -12,9 +12,11 @@ die("recase.perl --in file --model ini-file > out")
                        'headline=s' => \$SRC,
                        'lang=s' => \$LANGUAGE,
 		       'moses=s' => \$MOSES,
-                       'model=s' => \$RECASE_MODEL)
+                       'model=s' => \$RECASE_MODEL,
+                       'b|unbuffered' => \$UNBUFFERED)
     && defined($INFILE)
     && defined($RECASE_MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 
 my %treated_languages = map { ($_,1) } qw/en cs/;
 die "I don't know any rules for $LANGUAGE. Use 'en' as the default."
diff --git a/scripts/recaser/truecase.perl b/scripts/recaser/truecase.perl
index 0e2df27a2..517f5c7a1 100755
--- a/scripts/recaser/truecase.perl
+++ b/scripts/recaser/truecase.perl
@@ -8,9 +8,11 @@ binmode(STDIN, ":utf8");
 binmode(STDOUT, ":utf8");
 
 # apply switches
-my $MODEL;
-die("truecase.perl --model truecaser < in > out")
-    unless &GetOptions('model=s' => \$MODEL);
+my ($MODEL, $UNBUFFERED);
+die("truecase.perl --model MODEL [-b] < in > out")
+    unless &GetOptions('model=s' => \$MODEL,'b|unbuffered' => \$UNBUFFERED)
+    && defined($MODEL);
+if (defined($UNBUFFERED) && $UNBUFFERED) { $|=1; }
 
 my (%BEST,%KNOWN);
 open(MODEL,$MODEL) || die("ERROR: could not open '$MODEL'");
-- 
cgit v1.2.3


From ec69acf3d4a2a0561ae4bed147e12562fd449280 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Mon, 4 Mar 2013 12:07:37 -0500
Subject: Don't open all hypergraph output files at once.

---
 moses-cmd/IOWrapper.cpp | 18 ----------------
 moses-cmd/IOWrapper.h   |  2 --
 moses-cmd/Main.cpp      | 56 +++++++++++++++++++++++--------------------------
 3 files changed, 26 insertions(+), 50 deletions(-)

diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index 6fffead46..f7fed9998 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -189,24 +189,6 @@ InputType*IOWrapper::GetInput(InputType* inputType)
   }
 }
 
-  ofstream* IOWrapper::GetOutputSearchGraphSLFStream(size_t sentenceNumber) {
-    const StaticData &staticData = StaticData::Instance();
-    stringstream fileName;
-    fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << sentenceNumber << ".slf";
-    std::ofstream *file = new std::ofstream;
-    file->open(fileName.str().c_str());
-    return file;
-  }
-
-  ofstream* IOWrapper::GetOutputSearchGraphHypergraphStream(size_t sentenceNumber) {
-    const StaticData &staticData = StaticData::Instance();
-    stringstream fileName;
-    fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << sentenceNumber;
-    std::ofstream *file = new std::ofstream;
-    file->open(fileName.str().c_str());
-    return file;
-  }
-
   ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() {
     const StaticData &staticData = StaticData::Instance();
     stringstream fileName;
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index 0376eff6f..5decaa122 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -117,8 +117,6 @@ public:
     return *m_outputSearchGraphStream;
   }
 
-  std::ofstream *GetOutputSearchGraphSLFStream(size_t sentenceNumber);
-  std::ofstream *GetOutputSearchGraphHypergraphStream(size_t sentenceNumber);
   std::ofstream *GetOutputSearchGraphHypergraphWeightsStream();
 
   std::ostream &GetDetailedTranslationReportingStream() {
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index f93bc4fce..5a33c214c 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -84,8 +84,8 @@ public:
                   OutputCollector* detailedTranslationCollector,
                   OutputCollector* alignmentInfoCollector,
                   OutputCollector* unknownsCollector,
-                  std::ofstream* searchGraphSLFStream,
-		  std::ofstream* searchGraphHypergraphStream) :
+                  bool outputSearchGraphSLF,
+		  bool outputSearchGraphHypergraph) :
     m_source(source), m_lineNumber(lineNumber),
     m_outputCollector(outputCollector), m_nbestCollector(nbestCollector),
     m_latticeSamplesCollector(latticeSamplesCollector),
@@ -93,8 +93,8 @@ public:
     m_detailedTranslationCollector(detailedTranslationCollector),
     m_alignmentInfoCollector(alignmentInfoCollector),
     m_unknownsCollector(unknownsCollector),
-    m_searchGraphSLFStream(searchGraphSLFStream),
-    m_searchGraphHypergraphStream(searchGraphHypergraphStream) {}
+    m_outputSearchGraphSLF(outputSearchGraphSLF),
+    m_outputSearchGraphHypergraph(outputSearchGraphHypergraph) {}
 
 	/** Translate one sentence
    * gets called by main function implemented at end of this source file */
@@ -148,29 +148,39 @@ public:
     }		
 
     // Output search graph in HTK standard lattice format (SLF)
-    if (m_searchGraphSLFStream) {
-      if (m_searchGraphSLFStream->is_open() && m_searchGraphSLFStream->good()) {
+    if (m_outputSearchGraphSLF) {
+      stringstream fileName;
+      fileName << staticData.GetParam("output-search-graph-slf")[0] << "/" << m_lineNumber << ".slf";
+      std::ofstream *file = new std::ofstream;
+      file->open(fileName.str().c_str());
+      if (file->is_open() && file->good()) {
 	ostringstream out;
 	fix(out,PRECISION);
 	manager.OutputSearchGraphAsSLF(m_lineNumber, out);
-	*m_searchGraphSLFStream << out.str();
-	m_searchGraphSLFStream -> flush();
+	*file << out.str();
+	file -> flush();
       } else {
 	TRACE_ERR("Cannot output HTK standard lattice for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
       }
     }
 
     // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
-    if (m_searchGraphHypergraphStream) {
-      if (m_searchGraphHypergraphStream->is_open() && m_searchGraphHypergraphStream->good()) {
+    if (m_outputSearchGraphHypergraph) {
+      stringstream fileName;
+      fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber;
+      std::ofstream *file = new std::ofstream;
+      file->open(fileName.str().c_str());
+      if (file->is_open() && file->good()) {
 	ostringstream out;
 	fix(out,PRECISION);
 	manager.OutputSearchGraphAsHypergraph(m_lineNumber, out);
-	*m_searchGraphHypergraphStream << out.str();
-	m_searchGraphHypergraphStream -> flush();
+	*file << out.str();
+	file -> flush();
       } else {
 	TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
       }
+      file -> close();
+      delete file;
     }
 
     // apply decision rule and output best translation(s)
@@ -327,19 +337,7 @@ public:
   }
 
   ~TranslationTask() {
-   
-    if (m_searchGraphSLFStream) {
-      m_searchGraphSLFStream->close();
-    }
-
-    if (m_searchGraphHypergraphStream) {
-      m_searchGraphHypergraphStream->close();
-    }
-
-    delete m_searchGraphSLFStream;
-    delete m_searchGraphHypergraphStream;
     delete m_source;
-    
   }
 
 private:
@@ -353,8 +351,8 @@ private:
   OutputCollector* m_detailedTranslationCollector;
   OutputCollector* m_alignmentInfoCollector;
   OutputCollector* m_unknownsCollector;
-  std::ofstream  *m_searchGraphSLFStream;
-  std::ofstream  *m_searchGraphHypergraphStream;
+  bool m_outputSearchGraphSLF;
+  bool m_outputSearchGraphHypergraph;
   std::ofstream *m_alignmentStream;
 
 
@@ -643,10 +641,8 @@ int main(int argc, char** argv)
                             detailedTranslationCollector.get(),
                             alignmentInfoCollector.get(),
                             unknownsCollector.get(),
-			    staticData.GetOutputSearchGraphSLF() ? 
-			    ioWrapper->GetOutputSearchGraphSLFStream(lineCount) : NULL,
-			    staticData.GetOutputSearchGraphHypergraph() ? 
-			    ioWrapper->GetOutputSearchGraphHypergraphStream(lineCount) : NULL);
+			    staticData.GetOutputSearchGraphSLF(),
+			    staticData.GetOutputSearchGraphHypergraph());
       // execute task
 #ifdef WITH_THREADS
     pool.Submit(task);
-- 
cgit v1.2.3


From cf94b96afa9414e95de3ca26d68da6de4fe166bc Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Mon, 4 Mar 2013 14:20:01 -0500
Subject: Try to ensure that lattice nodes are output in topological order

---
 moses/Manager.cpp | 44 ++++++++++++++++++++++++++++++++------------
 1 file changed, 32 insertions(+), 12 deletions(-)

diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 760587a55..716daaee3 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -48,6 +48,8 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "rule.pb.h"
 #endif
 
+#include "util/exception.hh"
+
 using namespace std;
 
 namespace Moses
@@ -831,7 +833,18 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 
     // Record that this arc ends at this node
     nodeToLines.insert(pair<int,int>(numNodes,arcNumber));
+    
+    // Get an id number for the previous hypothesis
+    const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
+    if (prevHypo!=NULL) {
+      int prevID = prevHypo->GetId();
+      if (nodes.count(prevID) == 0) {
+	nodes[prevID] = numNodes;
+	numNodes += 1;
+      }
+    }
 
+    // Get an id number for this hypothesis
     int hypothesisID = searchGraph[arcNumber].hypo->GetId();
     if (nodes.count(hypothesisID) == 0) {
       
@@ -870,18 +883,25 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 	if (prevHypo==NULL) {
 	  outputSearchGraphStream << "<s> ||| " << endl;
 	} else {
-	int startNode = nodes[prevHypo->GetId()];
-      
-	const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
-	int targetWordCount = targetPhrase.GetSize();
-
-	outputSearchGraphStream << "[" << startNode << "]";
-	for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
-	  outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
-	}
-	outputSearchGraphStream << " ||| ";
-	OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
-	outputSearchGraphStream << endl;
+	  int startNode = nodes[prevHypo->GetId()];
+
+	  UTIL_THROW_IF(
+			(startNode >= nodeNumber),
+			util::Exception,
+			"Error while writing search lattice as hypergraph for sentence" << translationId << "." <<
+			"The nodes must be output in topological order. The code attempted to violate this restriction."
+			);
+
+	  const TargetPhrase &targetPhrase = thisHypo->GetCurrTargetPhrase();
+	  int targetWordCount = targetPhrase.GetSize();
+
+	  outputSearchGraphStream << "[" << startNode << "]";
+	  for (int targetWordIndex=0; targetWordIndex<targetWordCount; targetWordIndex+=1) {
+	    outputSearchGraphStream << " " << targetPhrase.GetWord(targetWordIndex);
+	  }
+	  outputSearchGraphStream << " ||| ";
+	  OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
+	  outputSearchGraphStream << endl;
 	}
 
       }
-- 
cgit v1.2.3


From 6fa279fadb4944618f6260dec1f1f4807de24749 Mon Sep 17 00:00:00 2001
From: Phil Williams <philip.williams@mac.com>
Date: Mon, 4 Mar 2013 21:02:50 +0000
Subject: filter-rule-table.py: change default pruning count from 1 to 0

Change the default pruning threshold from 1 to 0 to allow for
Hiero-style fractional counts.
---
 scripts/training/filter-rule-table.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/training/filter-rule-table.py b/scripts/training/filter-rule-table.py
index 8bef034de..86c8b300e 100755
--- a/scripts/training/filter-rule-table.py
+++ b/scripts/training/filter-rule-table.py
@@ -40,7 +40,8 @@ def printUsage():
 def main():
     parser = optparse.OptionParser()
     parser.add_option("-c", "--min-non-initial-rule-count",
-                      action="store", dest="minCount", type="int", default="1",
+                      action="store", dest="minCount",
+                      type="float", default="0.0",
                       help="prune non-initial rules where count is below N",
                       metavar="N")
     (options, args) = parser.parse_args()
-- 
cgit v1.2.3


From 45f0057b549eba9ddc46ca957e1c4e08fe3368c7 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Mon, 4 Mar 2013 17:02:16 -0500
Subject: Fixed bug in outputing search lattice as hypergraph

---
 moses/Manager.cpp | 91 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 56 insertions(+), 35 deletions(-)

diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 716daaee3..f8f2402a6 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -823,72 +823,93 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
   vector<SearchGraphNode> searchGraph;
   GetSearchGraph(searchGraph);
 
-  long numNodes = 0;
-
-  map<int,int> nodes;
+  map<int,int> mosesIDToHypergraphID;
+  // map<int,int> hypergraphIDToMosesID;
   set<int> terminalNodes;
-  multimap<int,int> nodeToLines;
+  multimap<int,int> hypergraphIDToArcs;
 
-  for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
-
-    // Record that this arc ends at this node
-    nodeToLines.insert(pair<int,int>(numNodes,arcNumber));
+  long numNodes = 0;
+  long endNode = 0;
+  {
+    long hypergraphHypothesisID = 0;
+    for (size_t arcNumber = 0, size=searchGraph.size(); arcNumber < size; ++arcNumber) {
     
-    // Get an id number for the previous hypothesis
-    const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
-    if (prevHypo!=NULL) {
-      int prevID = prevHypo->GetId();
-      if (nodes.count(prevID) == 0) {
-	nodes[prevID] = numNodes;
-	numNodes += 1;
+      // Get an id number for the previous hypothesis
+      const Hypothesis *prevHypo = searchGraph[arcNumber].hypo->GetPrevHypo();
+      if (prevHypo!=NULL) {
+	int mosesPrevHypothesisID = prevHypo->GetId();
+	if (mosesIDToHypergraphID.count(mosesPrevHypothesisID) == 0) {
+	  mosesIDToHypergraphID[mosesPrevHypothesisID] = hypergraphHypothesisID;
+	  //	hypergraphIDToMosesID[hypergraphHypothesisID] = mosesPrevHypothesisID;
+	  hypergraphHypothesisID += 1;
+	}
       }
-    }
 
-    // Get an id number for this hypothesis
-    int hypothesisID = searchGraph[arcNumber].hypo->GetId();
-    if (nodes.count(hypothesisID) == 0) {
+      // Record that this arc ends at this node
+      hypergraphIDToArcs.insert(pair<int,int>(hypergraphHypothesisID,arcNumber));
+
+      // Get an id number for this hypothesis
+      int mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
+      if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
       
-      nodes[hypothesisID] = numNodes;
-      numNodes += 1;
+	mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
+	//      hypergraphIDToMosesID[hypergraphHypothesisID] = mosesHypothesisID;
 
-      bool terminalNode = (searchGraph[arcNumber].forward == -1);
-      if (terminalNode) {
-	// Final arc to end node, representing the end of the sentence </s>
-	terminalNodes.insert(numNodes);
+	bool terminalNode = (searchGraph[arcNumber].forward == -1);
+	if (terminalNode) {
+	  // Final arc to end node, representing the end of the sentence </s>
+	  terminalNodes.insert(hypergraphHypothesisID);
+	}
+
+	hypergraphHypothesisID += 1;
       }
     }
+    
+    // Unique end node
+    endNode = hypergraphHypothesisID;
+    //    mosesIDToHypergraphID[hypergraphHypothesisID] = hypergraphHypothesisID;
+    numNodes = endNode + 1;
 
   }
   
-  // Unique end node
-  nodes[numNodes] = numNodes;
-  numNodes += 1;
 
   long numArcs = searchGraph.size() + terminalNodes.size();
 
   // Print number of nodes and arcs
   outputSearchGraphStream << numNodes << " " << numArcs << endl;
 
-  for (int nodeNumber=0; nodeNumber <= numNodes; nodeNumber+=1) {
-
-    size_t count = nodeToLines.count(nodeNumber);
+  for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
+    //    int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
+    size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
     if (count > 0) {
       outputSearchGraphStream << count << endl;
 
-      pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range = nodeToLines.equal_range(nodeNumber);
+      pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
+	hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
       for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
 	int lineNumber = (*it).second;
 	const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
+	int mosesHypothesisID = thisHypo->GetId();
+	//	int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
+	UTIL_THROW_IF(
+		      (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
+		      util::Exception,
+		      "Error while writing search lattice as hypergraph for sentence " << translationId << ". " <<
+		      "Moses node " << mosesHypothesisID << " was expected to have hypergraph id " << hypergraphHypothesisID <<
+		      ", but actually had hypergraph id " << mosesIDToHypergraphID[mosesHypothesisID] << 
+		      ". There are " << numNodes << " nodes in the search lattice."
+		      );
+
 	const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
 	if (prevHypo==NULL) {
 	  outputSearchGraphStream << "<s> ||| " << endl;
 	} else {
-	  int startNode = nodes[prevHypo->GetId()];
+	  int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
 
 	  UTIL_THROW_IF(
-			(startNode >= nodeNumber),
+			(startNode >= hypergraphHypothesisID),
 			util::Exception,
-			"Error while writing search lattice as hypergraph for sentence" << translationId << "." <<
+			"Error while writing search lattice as hypergraph for sentence" << translationId << ". " <<
 			"The nodes must be output in topological order. The code attempted to violate this restriction."
 			);
 
-- 
cgit v1.2.3


From f2536cddffe5dbb141387fea0a27da19e1da21e2 Mon Sep 17 00:00:00 2001
From: Ian Johnson <ian.johnson@appliedlanguage.com>
Date: Wed, 6 Mar 2013 13:37:41 +0000
Subject: Added arrow based Moses training pipeline demonstration program to
 contrib.

---
 .gitmodules                                        |   3 +
 .../training-pipeline/moses-pypeline.dia           | Bin 0 -> 3532 bytes
 contrib/arrow-pipelines/python/README              |  32 ++++
 contrib/arrow-pipelines/python/libs/pypeline       |   1 +
 contrib/arrow-pipelines/python/manager.py          | 192 +++++++++++++++++++++
 contrib/arrow-pipelines/python/test/__init__.py    |   0
 contrib/arrow-pipelines/python/test/test.py        |  11 ++
 .../arrow-pipelines/python/training/__init__.py    |   0
 .../python/training/components/__init__.py         |   0
 .../python/training/components/cleanup/__init__.py |   0
 .../python/training/components/cleanup/cleanup.py  | 125 ++++++++++++++
 .../python/training/components/cleanup/cleanup3.py | 109 ++++++++++++
 .../training/components/data_split/__init__.py     |   0
 .../training/components/data_split/data_split.py   | 146 ++++++++++++++++
 .../training/components/irstlm_build/__init__.py   |   0
 .../components/irstlm_build/irstlm_build.py        | 106 ++++++++++++
 .../python/training/components/mert/__init__.py    |   0
 .../python/training/components/mert/mert.py        |  83 +++++++++
 .../training/components/model_training/__init__.py |   0
 .../components/model_training/model_training.py    |  72 ++++++++
 .../training/components/tokenizer/__init__.py      |   0
 .../training/components/tokenizer/src_tokenizer.py |  43 +++++
 .../python/training/components/tokenizer/tmp.de    |   3 +
 .../training/components/tokenizer/tokenizer.py     |  36 ++++
 .../training/components/tokenizer/trg_tokenizer.py |  43 +++++
 25 files changed, 1005 insertions(+)
 create mode 100644 contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia
 create mode 100644 contrib/arrow-pipelines/python/README
 create mode 160000 contrib/arrow-pipelines/python/libs/pypeline
 create mode 100644 contrib/arrow-pipelines/python/manager.py
 create mode 100644 contrib/arrow-pipelines/python/test/__init__.py
 create mode 100644 contrib/arrow-pipelines/python/test/test.py
 create mode 100644 contrib/arrow-pipelines/python/training/__init__.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/__init__.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/__init__.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/__init__.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/data_split/data_split.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/mert/__init__.py
 create mode 100755 contrib/arrow-pipelines/python/training/components/mert/mert.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/model_training/__init__.py
 create mode 100755 contrib/arrow-pipelines/python/training/components/model_training/model_training.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py
 create mode 100755 contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py
 create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de
 create mode 100644 contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py
 create mode 100755 contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py

diff --git a/.gitmodules b/.gitmodules
index e69de29bb..d3a8cb4da 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -0,0 +1,3 @@
+[submodule "contrib/arrow-pipelines/python/libs/pypeline"]
+	path = contrib/arrow-pipelines/python/libs/pypeline
+	url = git://github.com/ianj-als/pypeline.git
diff --git a/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia
new file mode 100644
index 000000000..1d35a1dea
Binary files /dev/null and b/contrib/arrow-pipelines/documentation/training-pipeline/moses-pypeline.dia differ
diff --git a/contrib/arrow-pipelines/python/README b/contrib/arrow-pipelines/python/README
new file mode 100644
index 000000000..e1e12975c
--- /dev/null
+++ b/contrib/arrow-pipelines/python/README
@@ -0,0 +1,32 @@
+Arrow Based Moses Training Pipeline
+===================================
+
+To use the demonstration you must first initialise the git submodules for this clone. Return to the top level directory and issue the following command:
+
+$ git submodule update --init
+
+This will clone the Pypeline submodule that is available on GitHub (https://github.com/ianj-als/pypeline). To install Pypeline:
+
+$ cd libs/pypeline
+$ python setup.py install
+
+Alternatively, you can set the PYTHONPATH environment variable to point to the Pypeline library.
+
+This demonstration implements a training pipeline that is shown in the Dia diagram in ../documentation/training-pipeline/moses-pypeline.dia.
+
+Three environment variables need to be set before the manager.py script can be run, they are:
+
+ - MOSES_HOME : The directory where Moses has been cloned, or installed,
+ - IRSTLM : The installation directory of your IRSTLM, and
+ - GIZA_HOME : The installation directory of GIZA++.
+
+The manager.py script takes four positional command-line arguments:
+
+ - The source language code,
+ - The target language code,
+ - The source corpus file. This file *must* be cleaned prior to use, and
+ - The target corpus file. This file *must* be cleaned prior to use.
+
+For example, run the manager.py script with:
+
+$ python manager.py en lt cleantrain.en cleantrain.lt
diff --git a/contrib/arrow-pipelines/python/libs/pypeline b/contrib/arrow-pipelines/python/libs/pypeline
new file mode 160000
index 000000000..a7084b686
--- /dev/null
+++ b/contrib/arrow-pipelines/python/libs/pypeline
@@ -0,0 +1 @@
+Subproject commit a7084b686f5196f1bbac5d389b4a6cd7f15c83fb
diff --git a/contrib/arrow-pipelines/python/manager.py b/contrib/arrow-pipelines/python/manager.py
new file mode 100644
index 000000000..1c3ece111
--- /dev/null
+++ b/contrib/arrow-pipelines/python/manager.py
@@ -0,0 +1,192 @@
+import logging
+import os
+
+from concurrent.futures import Future, ThreadPoolExecutor
+from functools import partial
+from pypeline.helpers.parallel_helpers import eval_pipeline, \
+    cons_function_component, \
+    cons_wire, \
+    cons_split_wire, \
+    cons_unsplit_wire, \
+    cons_dictionary_wire
+
+
+#
+# Some logging please
+#
+FORMAT = '%(asctime)-15s : %(threadName)s : %(levelname)s - %(message)s'
+logging.basicConfig(format = FORMAT, level = logging.DEBUG)
+logger = logging.getLogger("manager")
+
+
+# Build the pipeline components
+def build_components(components, configuration, executor):
+  pipeline_components = dict()
+  pipeline_configuration = dict()
+
+  for component_id, module_name in components.items():
+    logger.info("Loading [%s] component from [%s]..." % (component_id, module_name))
+
+    module = __import__(module_name, fromlist = ['configure', 'initialise'])
+    
+    # Component builds its own configuration object
+    config_func = getattr(module, 'configure')
+    component_config = config_func(configuration)
+    pipeline_configuration.update(component_config)
+
+    # Now build the component
+    init_func = getattr(module, 'initialise')
+    component_function = init_func(component_config)
+
+    # A wrapper for the component's function that submits to the executor
+    def get_component_function_wrapper(inner_function, comp_id, mod_name):
+      def component_function_wrapper(a, s):
+        logger.info("Running component [%s], from module [%s], with value [%s] and state [%s]..." % \
+                    (comp_id, mod_name, a, s))
+        return inner_function(a, s)
+
+      return component_function_wrapper
+
+    # Arrowize the component
+    component = cons_function_component(get_component_function_wrapper(component_function, component_id, module_name))
+
+    # And store
+    pipeline_components[component_id] = component
+
+  return pipeline_components, pipeline_configuration
+
+
+# Go!
+def main(src_lang, trg_lang, src_filename, trg_filename):
+  # Global configuration
+  # One day, this configuration shall be constructed from
+  # command line options, or a properties file.
+  configuration = {
+    'moses_installation_dir': os.environ['MOSES_HOME'],
+    'irstlm_installation_dir': os.environ['IRSTLM'],
+    'giza_installation_dir': os.environ['GIZA_HOME'],
+    'src_lang': src_lang,
+    'src_tokenisation_dir': './tokenisation',
+    'trg_lang': trg_lang,
+    'trg_tokenisation_dir': './tokenisation',
+    'segment_length_limit': 60,
+    'irstlm_smoothing_method': 'improved-kneser-ney',
+    'language_model_directory': './language-model',
+    'translation_model_directory': './translation-model',
+    'mert_working_directory': './mert',
+    'evaluation_data_size': 100,
+    'development_data_size': 100
+  }
+
+  # The modules to load
+  # In the future, the components shall be specified in some kind of
+  # pipeline description file.
+  component_modules = {
+    'src_tokenizer': 'training.components.tokenizer.src_tokenizer',
+    'trg_tokenizer': 'training.components.tokenizer.trg_tokenizer',
+    'cleanup': 'training.components.cleanup.cleanup',
+    'data_split': 'training.components.data_split.data_split',
+    'irstlm_build': 'training.components.irstlm_build.irstlm_build',
+    'model_training': 'training.components.model_training.model_training',
+    'mert': 'training.components.mert.mert'
+  }
+
+  # The thread pool
+  executor = ThreadPoolExecutor(max_workers = 3)
+
+  # Phew, build the required components
+  components, component_config = build_components(component_modules, configuration, executor)
+
+  #
+  # Wire up components
+  # Description of wiring should be, in the future, alongside the component
+  # specification in some kind of configuration file. Components shall be
+  # declared then used, i.e., bind a component instance to a unique component
+  # identifier, then wire component instances together by identifier.
+  #
+
+  #
+  # Tokenisation of source and target...
+  #
+  # IRSTLM Build components
+  irstlm_build_component = cons_split_wire() >> \
+                           (cons_wire(lambda a, s: {'input_filename':  a['tokenised_trg_filename']}) >> \
+                            components['irstlm_build']).second() >> \
+                           cons_unsplit_wire(lambda t, b: {'tokenised_trg_filename': t['tokenised_trg_filename'],
+                                                           'trg_language_model_filename': b['compiled_lm_filename']})
+
+  # The complete tokenisation component
+  tokenisation_component = (components['src_tokenizer'] & components['trg_tokenizer']) >> \
+                           irstlm_build_component.second() >> \
+                           cons_unsplit_wire(lambda t, b: {'src_filename': t['tokenised_src_filename'],
+                                                           'trg_filename': b['tokenised_trg_filename'],
+                                                           'trg_language_model_filename': b['trg_language_model_filename']})
+
+  #
+  # Cleanup and Data Splitting...
+  #
+
+  #
+  # A function that clips off the last '.' delimited string
+  #
+  def clip_last_bit(filename):
+    bn = os.path.basename(filename)
+    directory = os.path.dirname(filename)
+    bits = bn.split(".")
+    bits.pop()
+    return os.path.join(directory, ".".join(bits))
+
+  cleanup_datasplit_component = components['cleanup'] >> \
+                                cons_wire(lambda a, s: {'src_filename': a['cleaned_src_filename'],
+                                                        'trg_filename': a['cleaned_trg_filename']}) >> \
+                                components['data_split'] >> \
+                                cons_wire(lambda a, s: {'training_data_filename': clip_last_bit(a['train_src_filename']),
+                                                        'eval_src_filename': a['eval_src_filename'],
+                                                        'eval_trg_filename': a['eval_trg_filename']})
+
+  #
+  # Translation model training
+  #
+  translation_model_component = cons_split_wire() >> \
+                                components['model_training'].first() >> \
+                                cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
+                                                                'development_data_filename': b['eval_src_filename']})
+
+  #
+  # The whole pipeline
+  #
+  pipeline = tokenisation_component >> \
+             cons_split_wire() >> \
+             (cleanup_datasplit_component >> translation_model_component).first() >> \
+             cons_unsplit_wire(lambda t, b: {'moses_ini_file': t['moses_ini_file'],
+                                             'development_data_filename': clip_last_bit(t['development_data_filename']),
+                                             'trg_language_model_filename': b['trg_language_model_filename'],
+                                             'trg_language_model_order': 3,
+                                             'trg_language_model_type': 9}) >> \
+             components['mert']
+
+
+  #
+  # The input to the pipeline
+  #
+  value = {'src_filename': src_filename,
+           'trg_filename': trg_filename}
+
+  #
+  # Evaluate the pipeline
+  #
+  logger.info("Evaluating pipeline with input [%s]..." % value)
+  new_value = eval_pipeline(executor, pipeline, value, component_config)
+
+  #
+  # Wait for all components to finish
+  #
+  executor.shutdown(True)
+  
+  logger.info("Pipeline evaluated to %s" % new_value)
+
+
+if __name__ == '__main__':
+  import sys
+
+  main(sys.argv[1], sys.argv[2], sys.argv[3], sys.argv[4])
diff --git a/contrib/arrow-pipelines/python/test/__init__.py b/contrib/arrow-pipelines/python/test/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/test/test.py b/contrib/arrow-pipelines/python/test/test.py
new file mode 100644
index 000000000..628796f7d
--- /dev/null
+++ b/contrib/arrow-pipelines/python/test/test.py
@@ -0,0 +1,11 @@
+import subprocess
+
+def cat(filename, content):
+  fh = open(filename, "w")
+  for line in content:
+    #print(line, file=fh)
+    print >> fh, line
+  fh.close()
+
+def diff(filename1, filename2):
+  subprocess.check_output(["diff", filename1, filename2], stderr=subprocess.STDOUT)
diff --git a/contrib/arrow-pipelines/python/training/__init__.py b/contrib/arrow-pipelines/python/training/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/__init__.py b/contrib/arrow-pipelines/python/training/components/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py b/contrib/arrow-pipelines/python/training/components/cleanup/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py
new file mode 100644
index 000000000..cb2e057ce
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup.py
@@ -0,0 +1,125 @@
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+  result = {}
+  result['segment_length'] = args['segment_length_limit']
+  return result
+
+def initialise(config):
+  def _filter(limit, ifh1, ofh1, ifh2, ofh2):
+    def _short(line):
+      n = 0
+      for c in line:
+        if c == " ":
+          n += 1
+      #print(line, ":", n)
+      return n < limit
+
+    for (l1, l2) in zip(ifh1, ifh2):
+      if _short(l1) and _short(l2):
+        print >>ofh1, l1,
+        print >>ofh2, l2,
+
+  def _make_cleaned_filename(filename):
+    bits = filename.split(".")
+    bits[-1] = "clean"
+    return ".".join(bits)
+
+  def _filter_main(value, config):
+    limit = config['segment_length']
+    (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
+    try:
+      input_src_filename = value['src_filename']
+      input_trg_filename = value['trg_filename']
+
+      print "Cleanup: Cleaning [%s] and [%s]..." % (input_src_filename, input_trg_filename)
+
+      ifh1 = open(input_src_filename, "r")
+      ifh2 = open(input_trg_filename, "r")
+
+      cleaned_src_filename = _make_cleaned_filename(input_src_filename)
+      cleaned_trg_filename = _make_cleaned_filename(input_trg_filename)
+      ofh1 = open(cleaned_src_filename, "w")
+      ofh2 = open(cleaned_trg_filename, "w")
+
+      _filter(limit, ifh1, ofh1, ifh2, ofh2)
+
+      return {'cleaned_src_filename': cleaned_src_filename,
+              'cleaned_trg_filename': cleaned_trg_filename}
+    finally:
+      def _safe_close(fh):
+        if fh is not None:
+          fh.close()
+      _safe_close(ifh1)
+      _safe_close(ifh2)
+      _safe_close(ofh1)
+      _safe_close(ofh2)
+    
+  return _filter_main
+
+
+if __name__ == '__main__':
+  import os
+  import tempfile
+  import test.test as thelp
+
+  from pypeline.helpers.helpers import eval_pipeline
+
+
+  def _test_main():
+    configuration = {'segment_length_limit': 20}
+
+    src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
+    trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
+
+    box_eval = {
+      'src_filename': src_filename[1],
+      'trg_filename': trg_filename[1],
+      'cleaned_src_file_expected': src_filename[1] + ".expected",
+      'cleaned_trg_file_expected': trg_filename[1] + ".expected"
+    }
+
+    try:
+      _prep_files(box_eval)
+      _run_test(configuration, box_eval)
+    finally:
+      _cleanup_files(box_eval)
+
+
+  def _run_test(configuration, box_eval):
+    box_config = configure(configuration)
+    box = initialise(box_config)
+    
+    output = eval_pipeline(box, box_eval, box_config)
+    try:
+      thelp.diff(box_eval['cleaned_src_file_expected'], output['cleaned_src_filename'])
+      thelp.diff(box_eval['cleaned_trg_file_expected'], output['cleaned_trg_filename'])
+    finally:
+      os.unlink(output['cleaned_src_filename'])
+      os.unlink(output['cleaned_trg_filename'])
+
+
+  def _line(line_lengths):
+    def _gen_line(tokens):
+      return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
+    return map(_gen_line, line_lengths)
+
+
+  def _prep_files(box_eval):
+    thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
+    thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
+    #expected output:
+    thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
+    thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
+
+
+  def _cleanup_files(box_eval):
+    for filename in box_eval.values():  # best-effort removal of every file named in box_eval
+      try:
+        os.unlink(filename)
+      except OSError:  # this file could not be removed; carry on with the rest
+        pass
+
+
+  _test_main()
+
diff --git a/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py
new file mode 100644
index 000000000..27625c612
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/cleanup/cleanup3.py
@@ -0,0 +1,109 @@
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+  result = {}
+  result['segment_length'] = args['segment_length_limit']
+  return result
+
+def initialise(config):
+  def _filter(limit, ifh1, ofh1, ifh2, ofh2):
+    def _short(line):
+      n = 0
+      for c in line:
+        if c == " ":
+          n += 1
+      #print(line, ":", n)
+      return n < limit
+
+    for (l1, l2) in zip(ifh1, ifh2):
+      if _short(l1) and _short(l2):
+        print(l1, end='', file=ofh1)
+        print(l2, end='', file=ofh2)
+
+  def _filter_main(config, value):
+    limit = config['segment_length']
+    (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
+    try:
+      ifh1 = open(value['src_filename'], "r")
+      ifh2 = open(value['trg_filename'], "r")
+      ofh1 = open(value['cleaned_src_filename'], "w")
+      ofh2 = open(value['cleaned_trg_filename'], "w")
+
+      _filter(limit, ifh1, ofh1, ifh2, ofh2)
+
+      return {'cleaned_src_filename': value['cleaned_src_filename'],
+              'cleaned_trg_filename': value['cleaned_trg_filename']}
+    finally:
+      def _safe_close(fh):
+        if fh is not None:
+          fh.close()
+      _safe_close(ifh1)
+      _safe_close(ifh2)
+      _safe_close(ofh1)
+      _safe_close(ofh2)
+    
+  return cons_function_component(_filter_main)
+
+
+if __name__ == '__main__':
+  import os
+  import tempfile
+  import training.components.shared.test as thelp
+
+
+  def _test_main():
+    configuration = {'segment_length_limit': 20}
+
+    src_filename = tempfile.mkstemp(suffix = "src", dir = "/tmp")
+    trg_filename = tempfile.mkstemp(suffix = "trg", dir = "/tmp")
+
+    box_eval = {
+      'src_filename': src_filename[1],
+      'trg_filename': trg_filename[1],
+      'cleaned_src_filename': src_filename[1] + ".clean",
+      'cleaned_trg_filename': trg_filename[1] + ".clean",
+      'cleaned_src_file_expected': src_filename[1] + ".expected",
+      'cleaned_trg_file_expected': trg_filename[1] + ".expected"
+    }
+
+    try:
+      _prep_files(box_eval)
+      _run_test(configuration, box_eval)
+    finally:
+      _cleanup_files(box_eval)
+
+
+  def _run_test(configuration, box_eval):
+    from pypeline.helpers.helpers import run_pipeline
+    box_config = configure(configuration)
+    box = initialise(box_config)
+    
+    run_pipeline(box, box_config, box_eval)
+    thelp.diff(box_eval['cleaned_src_file_expected'], box_eval['cleaned_src_filename'])
+    thelp.diff(box_eval['cleaned_trg_file_expected'], box_eval['cleaned_trg_filename'])
+
+
+  def _line(line_lengths):
+    def _gen_line(tokens):
+      return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
+    return map(_gen_line, line_lengths)
+
+
+  def _prep_files(box_eval):
+    thelp.cat(box_eval['src_filename'], _line([10, 20, 30, 40, 17, 21]))
+    thelp.cat(box_eval['trg_filename'], _line([40, 30, 20, 10, 20, 21]))
+    #expected output:
+    thelp.cat(box_eval['cleaned_src_file_expected'], _line([17]))
+    thelp.cat(box_eval['cleaned_trg_file_expected'], _line([20]))
+
+
+  def _cleanup_files(box_eval):
+    for filename in box_eval.values():  # best-effort removal of every file named in box_eval
+      try:
+        os.unlink(filename)
+      except OSError:  # this file could not be removed; carry on with the rest
+        pass
+
+
+  _test_main()
+
diff --git a/contrib/arrow-pipelines/python/training/components/data_split/__init__.py b/contrib/arrow-pipelines/python/training/components/data_split/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/data_split/data_split.py b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py
new file mode 100644
index 000000000..b8469cbf6
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/data_split/data_split.py
@@ -0,0 +1,146 @@
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+  result = {}
+  result['evaluate_size'] = args['evaluation_data_size']
+  result['development_size'] = args['development_data_size']
+  return result
+
+def initialise(config):
+
+  def _copy(size, inp, ofh1, ofh2):
+    try:
+      while size != 0:
+        (l1, l2) = inp.next()
+        print >>ofh1, l1,
+        print >>ofh2, l2,
+        size -= 1
+    except StopIteration:
+      pass
+
+  def _make_split_filename(filename, data_set):
+    bits = filename.split(".")
+    last = bits.pop()
+    lang_code = bits.pop()
+    
+    bits.append(last)
+    bits.append(data_set)
+    bits.append(lang_code)
+
+    new_filename = ".".join(bits)
+    return new_filename
+
+  def _splitter_main(value, config):
+    (ifh1, ifh2, ofh1, ofh2) = (None, None, None, None)
+    try:
+      input_src_filename = value['src_filename']
+      input_trg_filename = value['trg_filename']
+
+      ifh1 = open(input_src_filename, "r")
+      ifh2 = open(input_trg_filename, "r")
+      inp = iter(zip(ifh1, ifh2))
+
+      result = {}
+      for (data_set, size) in [
+        ('devel', config['development_size']),
+        ('eval', config['evaluate_size']),
+        ('train', -1)
+                ]:
+        output_src_filename = _make_split_filename(input_src_filename, data_set)
+        output_trg_filename = _make_split_filename(input_trg_filename, data_set)
+        ofh1 = open(output_src_filename, "w")
+        ofh2 = open(output_trg_filename, "w")
+
+        _copy(size, inp, ofh1, ofh2)
+        result[data_set + '_src_filename'] = output_src_filename
+        result[data_set + '_trg_filename'] = output_trg_filename
+
+      return result
+
+    finally:
+      def _safe_close(fh):
+        if fh is not None:
+          fh.close()
+      _safe_close(ifh1)
+      _safe_close(ifh2)
+      _safe_close(ofh1)
+      _safe_close(ofh2)
+    
+  return _splitter_main
+
+
+if __name__ == '__main__':
+  import os
+  import tempfile
+  import test.test as thelp
+
+  from pypeline.helpers.helpers import eval_pipeline
+
+
+  def _test_main():
+    configuration = {
+      'evaluation_data_size': 7,
+      'development_data_size': 13,
+    }
+
+    src_filename = tempfile.mkstemp(suffix = ".src", dir = "/tmp")
+    trg_filename = tempfile.mkstemp(suffix = ".trg", dir = "/tmp")
+
+    box_eval = {
+      'src_filename': src_filename[1],
+      'trg_filename': trg_filename[1],
+      'devel_src_expected': src_filename[1] + ".devel.expected",
+      'devel_trg_expected': trg_filename[1] + ".devel.expected",
+      'eval_src_expected': src_filename[1] + ".eval.expected",
+      'eval_trg_expected': trg_filename[1] + ".eval.expected",
+      'train_src_expected': src_filename[1] + ".train.expected",
+      'train_trg_expected': trg_filename[1] + ".train.expected",
+    }
+
+    try:
+      _prep_files(box_eval)
+      _run_test(configuration, box_eval)
+    finally:
+      _cleanup_files(box_eval)
+
+
+  def _run_test(configuration, box_eval):
+    box_config = configure(configuration)
+    box = initialise(box_config)
+    # Run the splitter, then compare every produced file with its expected counterpart.
+    output = eval_pipeline(box, box_eval, box_config)
+    for data_set in ['devel', 'eval', 'train']:
+      for lang in ['src', 'trg']:
+        filename = output[data_set + '_' + lang + '_filename']
+        filename_expected = box_eval[data_set + '_' + lang + '_expected']
+        thelp.diff(filename_expected, filename)
+
+
+  def _line(line_lengths):
+    def _gen_line(tokens):
+      return " ".join(map(lambda n: "tok" + str(n), range(tokens)))
+    return map(_gen_line, line_lengths)
+
+
+  def _prep_files(box_eval):
+    thelp.cat(box_eval['src_filename'], _line(range(50)))
+    thelp.cat(box_eval['trg_filename'], _line(range(50)))
+    #expected output:
+    thelp.cat(box_eval['devel_src_expected'], _line(range(0,13)))
+    thelp.cat(box_eval['devel_trg_expected'], _line(range(0,13)))
+    thelp.cat(box_eval['eval_src_expected'], _line(range(13,20)))
+    thelp.cat(box_eval['eval_trg_expected'], _line(range(13,20)))
+    thelp.cat(box_eval['train_src_expected'], _line(range(20,50)))
+    thelp.cat(box_eval['train_trg_expected'], _line(range(20,50)))
+
+
+  def _cleanup_files(box_eval):
+    for filename in box_eval.values():  # best-effort removal of every file named in box_eval
+      try:
+        os.unlink(filename)
+      except OSError:  # this file could not be removed; carry on with the rest
+        pass
+
+
+  _test_main()
+
diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py
new file mode 100644
index 000000000..f65d61973
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/irstlm_build/irstlm_build.py
@@ -0,0 +1,106 @@
+import os
+import shutil
+import subprocess
+import tempfile
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+    config = dict()
+    config['irstlm_install_directory'] = args['irstlm_installation_dir']
+    config['smoothing_method'] = args['irstlm_smoothing_method']
+    config['lm_directory'] = args['language_model_directory']
+    return config
+
+def initialise(config):  # returns the component function that runs the three IRSTLM build steps
+    def process(a, s):  # a: input dict (reads 'input_filename'); s: dict read for the directory/smoothing settings
+        # Create the LM directory if we need to
+        if not os.path.exists(s['lm_directory']):
+            os.makedirs(s['lm_directory'])
+
+        # The filename of the file to chew through
+        start_end_input_filename = a['input_filename']
+        if not os.path.exists(start_end_input_filename):
+            raise Exception("IRSTLM Build: Input file could not be found at [%s]" % start_end_input_filename)
+
+        # Derive the output file name for the add start-end marker processor (third dot-separated field is replaced)
+        filename_bits = os.path.basename(start_end_input_filename).split(".")
+        filename_bits[2] = "sb"
+        start_end_output_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+
+        # Derive the output file name of the LM build
+        filename_bits[2] = "lm"
+        lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+
+        # Derive the compiled LM file name
+        filename_bits[2] = "arpa"
+        compiled_lm_filename = os.path.join(s['lm_directory'], ".".join(filename_bits))
+
+        # First thing to do is add start and end markers
+        start_end_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "add-start-end.sh")]
+        # The with-block closes both files as soon as the script has finished
+        with open(start_end_input_filename, 'r') as infile, open(start_end_output_filename, 'w') as outfile:
+            print "IRSTLM Build: Invoking [%s]..." % " ".join(start_end_cmdline)
+            return_code = subprocess.check_call(start_end_cmdline, stdin = infile, stdout = outfile)
+        if return_code:
+            raise Exception("IRSTLM add start and end markers failed: input file = [%s], output file = [%s], return code = [%d]" % \
+                            (start_end_input_filename, start_end_output_filename, return_code))
+
+        # Next build the language model
+        tmp_dir = tempfile.mkdtemp(dir = "/tmp")
+        try:
+            build_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "build-lm.sh"),
+                                "-i", start_end_output_filename,
+                                "-t", tmp_dir,
+                                "-p",
+                                "-s", s['smoothing_method'],
+                                "-o", lm_filename]
+            print "IRSTLM Build: Invoking [%s]..." % " ".join(build_lm_cmdline)
+            return_code = subprocess.check_call(build_lm_cmdline)
+            if return_code:
+                raise Exception("IRST language model failed to build: return code = [%d]" % return_code)
+        finally:
+            if os.path.exists(tmp_dir):
+                shutil.rmtree(tmp_dir)
+
+        # Compile the LM
+        lm_filename = lm_filename + ".gz"
+        compile_lm_cmdline = [os.path.join(s['irstlm_install_directory'], "bin", "compile-lm"),
+                              "--text", "yes",
+                              lm_filename,
+                              compiled_lm_filename]
+        print "IRSTLM Build: Invoking [%s]..." % " ".join(compile_lm_cmdline)
+        return_code = subprocess.check_call(compile_lm_cmdline)
+        if return_code:
+            raise Exception("IRST language model compilation failed: return code = [%d]" % return_code)
+
+        output = {'add_start_end_filename': start_end_output_filename,
+                  'lm_filename': lm_filename,
+                  'compiled_lm_filename': compiled_lm_filename}
+
+        print "IRSTLM Build: Output = %s" % output
+
+        return output
+
+    return process
+
+
+if __name__ == '__main__':
+    from pypeline.helpers.helpers import eval_pipeline
+
+    lm_dir = os.environ["PWD"]
+    configuration = {'irstlm_installation_dir': os.environ["IRSTLM"],  # key name must match what configure() reads
+                     'irstlm_smoothing_method': 'improved-kneser-ney',
+                     'language_model_directory': lm_dir}
+    component_config = configure(configuration)
+    component = initialise(component_config)
+
+    value = eval_pipeline(component,
+                          {'input_filename': '/Users/ianjohnson/Dropbox/Documents/MTM2012/tokenised_files/news-commentary-v7.fr-en.tok.en'},
+                          component_config)
+    target = {'add_start_end_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.sb.en'),
+              'lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.lm.en.gz'),
+              'compiled_lm_filename': os.path.join(lm_dir, 'news-commentary-v7.fr-en.arpa.en')}
+    print "Target: %s" % target
+    if value != target:
+        raise Exception("Massive fail!")
diff --git a/contrib/arrow-pipelines/python/training/components/mert/__init__.py b/contrib/arrow-pipelines/python/training/components/mert/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/mert/mert.py b/contrib/arrow-pipelines/python/training/components/mert/mert.py
new file mode 100755
index 000000000..2b60b1720
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/mert/mert.py
@@ -0,0 +1,83 @@
+#!/usr/bin/env python
+
+import os, shutil, subprocess
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+    result = {}
+    result['src_lang'] = args['src_lang']
+    result['trg_lang'] = args['trg_lang']
+    result['moses_installation_dir'] = args['moses_installation_dir']
+    result['mert_working_dir'] = args['mert_working_directory']
+    return result
+
+def initialise(config):
+
+    def process(a, s):  # a: input dict with the data/LM/moses.ini settings read below; s is not used
+        infilename = os.path.abspath(a['development_data_filename'])
+        lm_file = os.path.abspath(a['trg_language_model_filename'])
+        lm_order = int(a['trg_language_model_order'])
+        lm_type = int(a['trg_language_model_type'])
+        orig_moses_ini = os.path.abspath(a['moses_ini_file'])
+        
+        if not os.path.exists(orig_moses_ini):
+            raise Exception, "Error: Input moses.ini does not exist"
+
+        workdir = os.path.abspath(config['mert_working_dir'])
+        #simply call the training perl script
+        #remove the workdir if it is already there
+        if os.path.exists(workdir):
+            shutil.rmtree(workdir)
+        os.makedirs(workdir)
+
+        #local vars (names matter: the command templates below are filled in from locals())
+        moses_install_dir = os.path.abspath(config['moses_installation_dir'])
+        mert_perl = os.path.join(moses_install_dir, 'scripts', 'training', 'mert-moses.pl')
+        bin_dir = os.path.join(moses_install_dir, 'bin')
+        moses_bin = os.path.join(moses_install_dir, 'bin', 'moses')
+        src_file = infilename + '.' + config['src_lang']
+        ref_file = infilename + '.' + config['trg_lang']
+        logfile = os.path.join(workdir, 'log')
+        #change lm configuration in moses ini
+        moses_ini = os.path.join(workdir, 'trained-moses.ini')
+        cmd = r"cat %(orig_moses_ini)s | sed '/\[lmodel-file\]/,/^[[:space:]]*$/c\[lmodel-file\]\n%(lm_type)s 0 %(lm_order)s %(lm_file)s\n' > %(moses_ini)s"
+        cmd = cmd % locals()
+        os.system(cmd)
+        
+        #the command
+        cmd = '%(mert_perl)s --mertdir %(bin_dir)s --working-dir %(workdir)s %(src_file)s %(ref_file)s %(moses_bin)s %(moses_ini)s 2> %(logfile)s'
+        cmd = cmd % locals()
+        #communicate() drains stdout while waiting; wait() alone can block forever once the pipe buffer fills
+        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+        pipe.communicate()
+
+        #check the moses ini
+        new_mosesini = os.path.join(workdir, 'moses.ini')
+        if not os.path.exists(new_mosesini):
+            raise Exception, 'Failed MERT'
+        
+        return {'moses_ini_file':new_mosesini}
+
+    return process
+
+if __name__ == '__main__':
+
+    def __test():
+        configuration = {'src_lang':'en',
+                         'trg_lang':'lt',
+                         'moses_installation_dir':os.path.abspath('../../../../'),
+                         'mert_working_directory':'../../../../../tuning'}
+        values = {'development_data_filename':'../../../../../corpus/tune',
+                  'moses_ini_file':'../../../../../model/model/moses.ini',
+                  'trg_language_model_filename':'../../../../../corpus/train.lt.lm',
+                  'trg_language_model_type':9,
+                  'trg_language_model_order':4}
+        from pypeline.helpers.helpers import run_pipeline
+        box_config = configure(configuration)
+        box = initialise(box_config)
+        print run_pipeline(box, values, None)
+
+    #do some test: configure() maps the argument names above onto the keys initialise() reads
+    __test()
+
diff --git a/contrib/arrow-pipelines/python/training/components/model_training/__init__.py b/contrib/arrow-pipelines/python/training/components/model_training/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/model_training/model_training.py b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py
new file mode 100755
index 000000000..e990307d2
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/model_training/model_training.py
@@ -0,0 +1,72 @@
+#!/usr/bin/env python
+
+import os, shutil, subprocess
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+    result = {}
+    result['src_lang'] = args['src_lang']
+    result['trg_lang'] = args['trg_lang']
+    result['moses_installation_dir'] = args['moses_installation_dir']
+    result['external_bin_dir'] = args['giza_installation_dir']
+    result['model_directory'] = args['translation_model_directory']
+    return result
+
+def initialise(config):
+
+    def process(a, s):  # a: input dict (reads 'training_data_filename'); s is not used
+        infilename = os.path.abspath(a['training_data_filename'])
+        workdir = os.path.abspath(config['model_directory'])
+        #simply call the training perl script
+        #remove the workdir if it is already there
+        if os.path.exists(workdir):
+            shutil.rmtree(workdir)
+        os.makedirs(workdir)
+        
+        #local vars (names matter: the command template below is filled in from locals())
+        train_model_perl = os.path.abspath(config['moses_installation_dir']) + os.sep + 'scripts' + os.sep + 'training' + os.sep + 'train-model.perl'
+        src_lang = config['src_lang'].lower()
+        trg_lang = config['trg_lang'].lower()
+        external_bin = os.path.abspath(config['external_bin_dir'])
+        #create a dummy lm file
+        dummy_lmfile = workdir + os.sep + 'dummy.lm'
+        f = open(dummy_lmfile, 'w')
+        print >> f, "dummy lm file"
+        f.close()
+        logfile = workdir + os.sep + 'log'
+        
+        #the command
+        cmd = '%(train_model_perl)s -root-dir %(workdir)s -corpus %(infilename)s -f %(src_lang)s -e %(trg_lang)s -alignment grow-diag-final-and -reordering msd-bidirectional-fe -lm 0:5:%(dummy_lmfile)s:0 -external-bin-dir %(external_bin)s 2> %(logfile)s'
+
+        cmd = cmd % locals()
+        #communicate() drains stdout while waiting; wait() alone can block forever once the pipe buffer fills
+        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+        pipe.communicate()
+
+        #check the moses ini
+        mosesini = workdir + os.sep + 'model' + os.sep + 'moses.ini'
+        if not os.path.exists(mosesini):
+            raise Exception, 'Failed training model'
+        
+        return {'moses_ini_file':mosesini}
+
+    return process
+
+if __name__ == '__main__':
+
+    def __test():
+        configuration = {'src_lang':'en',
+                         'trg_lang':'lt',
+                         'moses_installation_dir':os.environ['MOSES_HOME'],
+                         'giza_installation_dir':os.environ['GIZA_HOME'],
+                         'translation_model_directory':'model-dir'}
+        values = {'training_data_filename':'/Users/ianjohnson/work/MTM-2012/corpus/training/cleantrain'}
+        from pypeline.helpers.helpers import run_pipeline
+        box_config = configure(configuration)
+        box = initialise(box_config)
+        print run_pipeline(box, values, None)
+
+    #do some test
+    __test()
+
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py b/contrib/arrow-pipelines/python/training/components/tokenizer/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py
new file mode 100755
index 000000000..57f8771df
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/src_tokenizer.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import os
+
+from tokenizer import Tokenizer
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+    result = {}
+    result['src_lang'] = args['src_lang']
+    result['src_tokenisation_dir'] = args['src_tokenisation_dir']
+    result['moses_installation_dir'] = args['moses_installation_dir']
+    return result
+
+def initialise(config):
+
+    def process(a, s):
+        infilename = a['src_filename']
+        outfilename = Tokenizer.batch_tokenise(
+            config['src_lang'], 
+            config['moses_installation_dir'], 
+            infilename, 
+            config['src_tokenisation_dir'])
+        return {'tokenised_src_filename':outfilename}
+
+    return process
+
+if __name__ == '__main__':
+
+    def __test():
+        configuration = {'src_lang':'de',
+                         'src_tokenisation_dir':'tmptok',
+                         'moses_installation_dir':os.path.abspath('../../../../')}
+        values = {'src_filename':'tmp.de'}
+        from pypeline.helpers.helpers import run_pipeline
+        box_config = configure(configuration)
+        box = initialise(box_config)
+        print run_pipeline(box, values, None)
+
+    #do some test: the component is built from the output of configure(), not from the raw arguments
+    __test()
+
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de
new file mode 100644
index 000000000..c6b41edbe
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tmp.de
@@ -0,0 +1,3 @@
+asdfweoih
+awfwoeijf awefo
+what's this
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py
new file mode 100644
index 000000000..354ec1abc
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/tokenizer.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+
+import sys, os, subprocess
+
+class Tokenizer:
+    # Runs the tokenizer.perl script found under <mosesdir>/scripts/tokenizer on whole files.
+    @staticmethod
+    def batch_tokenise(lang, mosesdir, infilename, workdir):
+        print "Tokenizing [%s] in working directory [%s]..." % (infilename, workdir)
+        if not os.path.exists(workdir):
+            os.makedirs(workdir)
+        tok = Tokenizer(lang, mosesdir)
+        basefilename = os.path.basename(infilename)
+        outfilename = workdir + os.sep + basefilename + '.tok'
+        tok.file_tokenise(infilename, outfilename)
+        return outfilename
+        
+    def __init__(self, lang, mosesdir):
+        self.arrows = None
+        self.lang = lang
+        #check the perl tokenizer is here
+        #(the script itself is checked, not just its directory)
+        path = mosesdir + os.sep + 'scripts' + os.sep + 'tokenizer'
+        self.perltok = path + os.sep + 'tokenizer.perl'
+        if not os.path.exists(self.perltok):
+            raise Exception, "Perl tokenizer does not exists"
+
+    def file_tokenise(self, infilename, outfilename):
+        cmd = '%s -q -l %s < %s > %s' % (self.perltok, self.lang, infilename, outfilename)
+        pipe = subprocess.Popen(cmd, stdin = subprocess.PIPE, stdout = subprocess.PIPE, shell=True)
+        pipe.wait()
+
+if __name__ == '__main__':
+    #do some test
+    pass
+
diff --git a/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py
new file mode 100755
index 000000000..3852e296f
--- /dev/null
+++ b/contrib/arrow-pipelines/python/training/components/tokenizer/trg_tokenizer.py
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+
+import os
+
+from tokenizer import Tokenizer
+
+from pypeline.helpers.helpers import cons_function_component
+
+def configure(args):
+    result = {}
+    result['trg_lang'] = args['trg_lang']
+    result['trg_tokenisation_dir'] = args['trg_tokenisation_dir']
+    result['moses_installation_dir'] = args['moses_installation_dir']
+    return result
+
+def initialise(config):
+
+    def process(a, s):
+        infilename = a['trg_filename']
+        outfilename = Tokenizer.batch_tokenise(
+            config['trg_lang'], 
+            config['moses_installation_dir'],
+            infilename, 
+            config['trg_tokenisation_dir'])
+        return {'tokenised_trg_filename':outfilename}
+
+    return process
+
+if __name__ == '__main__':
+
+    def __test():
+        configuration = {'trg_lang':'de',
+                         'trg_tokenisation_dir':'tmptoktrg',
+                         'moses_installation_dir':os.path.abspath('../../../../')}
+        values = {'trg_filename':'tmp.de'}
+        from pypeline.helpers.helpers import run_pipeline
+        box_config = configure(configuration)
+        box = initialise(box_config)
+        print run_pipeline(box, values, None)
+
+    #do some test: the component is built from the output of configure(), not from the raw arguments
+    __test()
+
-- 
cgit v1.2.3


From 3e73ac29b2c54ac82f7553b4a17fd42ef73e84f1 Mon Sep 17 00:00:00 2001
From: Ian Johnson <ian.johnson@appliedlanguage.com>
Date: Wed, 6 Mar 2013 14:53:45 +0000
Subject: Added the RPM installer builder to contrib.

---
 contrib/rpm/README                    | 42 ++++++++++++++++++++++
 contrib/rpm/build_source.sh           | 63 +++++++++++++++++++++++++++++++++
 contrib/rpm/rpmbuild/SPECS/moses.spec | 65 +++++++++++++++++++++++++++++++++++
 3 files changed, 170 insertions(+)
 create mode 100644 contrib/rpm/README
 create mode 100755 contrib/rpm/build_source.sh
 create mode 100644 contrib/rpm/rpmbuild/SPECS/moses.spec

diff --git a/contrib/rpm/README b/contrib/rpm/README
new file mode 100644
index 000000000..8ba7ef4da
--- /dev/null
+++ b/contrib/rpm/README
@@ -0,0 +1,42 @@
+Building Moses RPM
+==================
+
+*** WARNING ***
+Before completing *any* of the tasks outlined in this README, please commit and push any changes you wish to be included in your installer.
+*** WARNING ***
+
+
+Building the RPM SPEC file
+--------------------------
+
+The first phase is to construct the RPM SPEC file in $HOME/rpmbuild. The build_source.sh script builds all the artefacts needed to build the RPM; run it from the directory that contains the rpmbuild directory (contrib/rpm). This script needs the following information:
+
+ - The Git repository from which an installer will be built,
+ - The branch in the Git repository to build, and
+ - The version of the installed Moses distribution.
+
+For example, to build the RELEASE-1.0 branch in the mosesdecoder repository (git://github.com/moses-smt/mosesdecoder.git):
+
+$ build_source.sh -r git://github.com/moses-smt/mosesdecoder.git -b RELEASE-1.0 -v 1.0
+
+This builds the source tarballs in the $HOME/rpmbuild/SOURCES directory and the moses.spec file in $HOME/rpmbuild/SPECS.
+
+
+Building the RPM
+----------------
+
+Change directory to $HOME/rpmbuild, and build the binary RPM with:
+
+$ rpmbuild -bb SPECS/moses.spec
+
+This will download IRSTLM v5.70.04 and GIZA++ (giza-pp v1.0.7), build them along with Moses, and write the RPM to $HOME/rpmbuild/RPMS/<architecture>/moses-<version>-1.<architecture>.rpm.
+
+For example, building v1.0 on a 64-bit Intel architecture produces an RPM called moses-1.0-1.x86_64.rpm.
+
+
+Building a Debian package
+-------------------------
+
+The Alien tool converts RPM packages to Debian packages. If a Debian package is required then follow the instructions on the following web-page:
+
+https://help.ubuntu.com/community/RPM/AlienHowto
diff --git a/contrib/rpm/build_source.sh b/contrib/rpm/build_source.sh
new file mode 100755
index 000000000..d0fac6a33
--- /dev/null
+++ b/contrib/rpm/build_source.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+# Clones a Moses Git branch, packs it into moses-<version>.tar.gz and fills in the RPM SPEC template.
+
+BRANCH="master"
+declare -i NO_RPM_BUILD=0
+declare -r RPM_VERSION_TAG="___RPM_VERSION__"
+
+# Print the command-line synopsis and exit with status 1.
+function usage() {
+  echo "`basename $0` -r [Moses Git repo] -b [Moses Git branch: default ${BRANCH}] -v [RPM version]"
+  exit 1
+}
+
+if [ $# -lt 4 ]; then
+  usage
+fi
+
+# -r repository, -b branch, -v version, -n leave $HOME/rpmbuild untouched, -h help.
+while getopts r:b:v:nh OPTION; do
+  case "$OPTION" in
+      r) REPO="${OPTARG}";;
+      b) BRANCH="${OPTARG}";;
+      v) VERSION="${OPTARG}";;
+      n) NO_RPM_BUILD=1;;
+      [h\?]) usage;;
+  esac
+done
+
+# The argument count alone cannot tell whether -r and -v were both supplied.
+if [ -z "${REPO}" ] || [ -z "${VERSION}" ]; then
+  usage
+fi
+
+if [ ! -d ./rpmbuild ]; then
+  echo "RPM build directory not in current working directory"
+  exit 1
+fi
+
+declare -r MOSES_DIR="moses-${VERSION}"
+if ! git clone "${REPO}" "${MOSES_DIR}"; then
+  echo "Failed to clone Git repository ${REPO}"
+  exit 3
+fi
+
+# Check out inside a subshell so a failed cd cannot run git in the caller's directory.
+if ! (cd "${MOSES_DIR}" && git checkout "${BRANCH}"); then
+  echo "Failed to checkout branch ${BRANCH}"
+  exit 3
+fi
+
+tar -cf "moses-${VERSION}.tar" "${MOSES_DIR}"
+gzip -f9 "moses-${VERSION}.tar"
+
+if [ ${NO_RPM_BUILD} -eq 0 ]; then
+  # mkdir -p is a no-op when the directory already exists.
+  mkdir -p "${HOME}/rpmbuild/SPECS" "${HOME}/rpmbuild/SOURCES"
+  # Substitute the version placeholder; no eval, so the arguments are not re-parsed by the shell.
+  sed "s/${RPM_VERSION_TAG}/${VERSION}/" ./rpmbuild/SPECS/moses.spec > "${HOME}/rpmbuild/SPECS/moses.spec"
+  mv "moses-${VERSION}.tar.gz" "${HOME}/rpmbuild/SOURCES"
+fi
+
+# Remove the temporary clone; only the tarball is kept.
+rm -Rf "${MOSES_DIR}"
diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec
new file mode 100644
index 000000000..1ae8082ef
--- /dev/null
+++ b/contrib/rpm/rpmbuild/SPECS/moses.spec
@@ -0,0 +1,65 @@
+Name: moses
+Summary: Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair.
+Version: ___RPM_VERSION__
+Release: 1
+URL: http://www.statmt.org/moses/
+Source0: %{name}-%{version}.tar.gz
+License: LGPL
+Group: Development/Tools
+Vendor: Capita Translation and Interpreting
+Packager: Ian Johnson <ian.johnson@capita-ti.com>
+Requires: boost >= 1.48, python >= 2.6, perl >= 5
+BuildRoot: /home/ian/rpmbuild/builds/%{name}-%{version}-%{release}
+%description
+Moses is a statistical machine translation system that allows you to automatically train translation models for any language pair. All you need is a collection of translated texts (parallel corpus). An efficient search algorithm finds quickly the highest probability translation among the exponential number of choices.
+%prep
+%setup -q
+
+mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v2
+
+wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz 
+wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
+
+cd $RPM_BUILD_DIR
+
+tar -zxf irstlm-5.70.04.tgz
+tar -zxf giza-pp-v1.0.7.tgz
+
+cd irstlm-5.70.04
+bash regenerate-makefiles.sh --force
+./configure --prefix $RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04
+make
+make install
+
+cd ../giza-pp
+make
+cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v2
+%build
+./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v2 -j2
+%install
+mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R bin $RPM_BUILD_ROOT/opt/moses
+cp -R scripts/analysis $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/ems $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/generic $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/other $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/recaser $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/regression-testing $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/share $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/tokenizer $RPM_BUILD_ROOT/opt/moses/scripts
+cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
+%clean
+%files
+%defattr(-,root,root)
+/opt/moses/bin/*
+/opt/moses/scripts/analysis/*
+/opt/moses/scripts/ems/*
+/opt/moses/scripts/generic/*
+/opt/moses/scripts/other/*
+/opt/moses/scripts/recaser/*
+/opt/moses/scripts/regression-testing/*
+/opt/moses/scripts/share/*
+/opt/moses/scripts/tokenizer/*
+/opt/moses/scripts/training/*
+/opt/moses/irstlm-5.70.04/*
+/opt/moses/giza++-v2/*
-- 
cgit v1.2.3


From fe5e737589b94c5a82fa3a075a5fc1372f9bab3f Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Thu, 7 Mar 2013 12:35:29 -0500
Subject: Subtract prior scores when outputting phrase-based hypergraph arcs.

---
 moses/Manager.cpp | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index f8f2402a6..4c4fa267e 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -801,8 +801,11 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
 size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
 {
 
-  const ScoreComponentCollection& scoreCollection = hypo->GetScoreBreakdown(); 
-
+  ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); 
+  const Hypothesis *prevHypo = hypo->GetPrevHypo();
+  if (prevHypo) {
+    scoreCollection.MinusEquals( prevHypo->GetScoreBreakdown() );
+  }
   vector<float> featureValues = scoreCollection.GetScoresForProducer(ff);
   size_t numScoreComps = featureValues.size();
 
-- 
cgit v1.2.3


From b968eae6291af3c6e2f4893a50dd643ebd297768 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Thu, 7 Mar 2013 15:13:08 -0500
Subject: Take hypothesis recombination into account when outputting
 phrase-based lattice as hypergraph

---
 moses/Manager.cpp | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 4c4fa267e..011187cda 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -848,11 +848,14 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 	}
       }
 
-      // Record that this arc ends at this node
-      hypergraphIDToArcs.insert(pair<int,int>(hypergraphHypothesisID,arcNumber));
-
       // Get an id number for this hypothesis
-      int mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
+      int mosesHypothesisID;
+      if (searchGraph[arcNumber].recombinationHypo) {
+	mosesHypothesisID = searchGraph[arcNumber].recombinationHypo->GetId();
+      } else {
+	mosesHypothesisID = searchGraph[arcNumber].hypo->GetId();
+      }
+
       if (mosesIDToHypergraphID.count(mosesHypothesisID) == 0) {
       
 	mosesIDToHypergraphID[mosesHypothesisID] = hypergraphHypothesisID;
@@ -866,6 +869,10 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 
 	hypergraphHypothesisID += 1;
       }
+
+      // Record that this arc ends at this node
+      hypergraphIDToArcs.insert(pair<int,int>(mosesIDToHypergraphID[mosesHypothesisID],arcNumber));
+
     }
     
     // Unique end node
@@ -892,7 +899,12 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
       for (multimap<int,int>::iterator it=range.first; it!=range.second; ++it) {
 	int lineNumber = (*it).second;
 	const Hypothesis *thisHypo = searchGraph[lineNumber].hypo;
-	int mosesHypothesisID = thisHypo->GetId();
+	int mosesHypothesisID;// = thisHypo->GetId();
+	if (searchGraph[lineNumber].recombinationHypo) {
+	  mosesHypothesisID = searchGraph[lineNumber].recombinationHypo->GetId();
+	} else {
+	  mosesHypothesisID = searchGraph[lineNumber].hypo->GetId();
+	}
 	//	int actualHypergraphHypothesisID = mosesIDToHypergraphID[mosesHypothesisID];
 	UTIL_THROW_IF(
 		      (hypergraphHypothesisID != mosesIDToHypergraphID[mosesHypothesisID]),
-- 
cgit v1.2.3


From 5f1be3217b5ec3b69fb10b098e212e940c0b855c Mon Sep 17 00:00:00 2001
From: Barry Haddow <barry.haddow@gmail.com>
Date: Thu, 7 Mar 2013 21:40:43 +0000
Subject: bugfix format of extract file for instance weighting

---
 phrase-extract/extract-main.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index 92c8a470e..cab91e92d 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -712,6 +712,10 @@ for(int fi=startF; fi<=endF; fi++) {
   if (m_options.isOrientationFlag())
     outextractstrOrientation << orientationInfo;
 
+  if (m_options.isIncludeSentenceIdFlag()) {
+    outextractstr << " ||| " << sentence.sentenceID;
+  }
+
   if (m_options.getInstanceWeightsFile().length()) {
     if (m_options.isTranslationFlag()) {
       outextractstr << " ||| " << sentence.weightString;
@@ -722,9 +726,6 @@ for(int fi=startF; fi<=endF; fi++) {
     }
   }
 
-  if (m_options.isIncludeSentenceIdFlag()) {
-    outextractstr << " ||| " << sentence.sentenceID;
-  }
 
   if (m_options.isTranslationFlag()) outextractstr << "\n";
   if (m_options.isTranslationFlag()) outextractstrInv << "\n";
-- 
cgit v1.2.3


From 13b6973ceeea4a1327d162c28093dd6cd4e9497c Mon Sep 17 00:00:00 2001
From: Ian Johnson <ian.johnson@appliedlanguage.com>
Date: Mon, 11 Mar 2013 13:04:10 +0000
Subject: Fixed GIZA++ installation directory name.

---
 contrib/rpm/rpmbuild/SPECS/moses.spec | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/contrib/rpm/rpmbuild/SPECS/moses.spec b/contrib/rpm/rpmbuild/SPECS/moses.spec
index 1ae8082ef..0f4a6c6ec 100644
--- a/contrib/rpm/rpmbuild/SPECS/moses.spec
+++ b/contrib/rpm/rpmbuild/SPECS/moses.spec
@@ -15,7 +15,7 @@ Moses is a statistical machine translation system that allows you to automatical
 %prep
 %setup -q
 
-mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v2
+mkdir -p $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
 
 wget -O $RPM_BUILD_DIR/irstlm-5.70.04.tgz http://moses-suite.googlecode.com/files/irstlm-5.70.04.tgz 
 wget -O $RPM_BUILD_DIR/giza-pp-v1.0.7.tgz http://moses-suite.googlecode.com/files/giza-pp-v1.0.7.tar.gz
@@ -33,9 +33,9 @@ make install
 
 cd ../giza-pp
 make
-cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v2
+cp $RPM_BUILD_DIR/giza-pp/GIZA++-v2/GIZA++ $RPM_BUILD_DIR/giza-pp/GIZA++-v2/snt2cooc.out $RPM_BUILD_DIR/giza-pp/mkcls-v2/mkcls $RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7
 %build
-./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v2 -j2
+./bjam --with-irstlm=$RPM_BUILD_ROOT/opt/moses/irstlm-5.70.04 --with-giza=$RPM_BUILD_ROOT/opt/moses/giza++-v1.0.7 -j2
 %install
 mkdir -p $RPM_BUILD_ROOT/opt/moses/scripts
 cp -R bin $RPM_BUILD_ROOT/opt/moses
@@ -62,4 +62,4 @@ cp -R scripts/training $RPM_BUILD_ROOT/opt/moses/scripts
 /opt/moses/scripts/tokenizer/*
 /opt/moses/scripts/training/*
 /opt/moses/irstlm-5.70.04/*
-/opt/moses/giza++-v2/*
+/opt/moses/giza++-v1.0.7/*
-- 
cgit v1.2.3


From 7b1c062cdcd04b48ce3fface6a08511d05558f41 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Tue, 12 Mar 2013 02:03:44 -0400
Subject: Update Boost install link

---
 BUILD-INSTRUCTIONS.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt
index 318956ccd..3dac64f60 100644
--- a/BUILD-INSTRUCTIONS.txt
+++ b/BUILD-INSTRUCTIONS.txt
@@ -45,7 +45,7 @@ ADVICE ON INSTALLING EXTERNAL LIBRARIES
 Generally, for trouble installing external libraries, you should get support
 directly from the library maker:
 
-Boost: http://www.boost.org/doc/libs/1_48_0/more/getting_started/unix-variants.html
+Boost: http://www.boost.org/doc/libs/release/more/getting_started/unix-variants.html
 IRSTLM: https://list.fbk.eu/sympa/subscribe/user-irstlm
 SRILM: http://www.speech.sri.com/projects/srilm/#srilm-user
 
-- 
cgit v1.2.3


From 21c51194fab91f8e79409545db98642cbac9b505 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Wed, 13 Mar 2013 12:12:33 +0000
Subject: add -print-alignment-info

---
 moses-cmd/IOWrapper.cpp | 13 +++++++++++++
 moses-cmd/IOWrapper.h   |  2 +-
 moses-cmd/Main.cpp      |  9 +++++++--
 moses/Parameter.cpp     |  1 +
 moses/StaticData.cpp    | 14 ++++++++++----
 moses/StaticData.h      |  4 ++++
 6 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index f7fed9998..2da30f380 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -271,6 +271,19 @@ void OutputAlignment(ostream &out, const vector<const Hypothesis *> &edges)
   out << std::endl;
 }
 
+void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo)
+{
+  std::vector<const Hypothesis *> edges;
+  const Hypothesis *currentHypo = hypo;
+  while (currentHypo) {
+    edges.push_back(currentHypo);
+    currentHypo = currentHypo->GetPrevHypo();
+  }
+
+  OutputAlignment(out, edges);
+
+}
+
 void OutputAlignment(OutputCollector* collector, size_t lineNo , const vector<const Hypothesis *> &edges)
 {
   ostringstream out;
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index 5decaa122..267a3a0bc 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -139,7 +139,7 @@ void OutputBestHypo(const Moses::TrellisPath &path, long /*translationId*/,bool
 void OutputInput(std::ostream& os, const Moses::Hypothesis* hypo);
 void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo, const Moses::Hypothesis *hypo);
 void OutputAlignment(Moses::OutputCollector* collector, size_t lineNo,  const Moses::TrellisPath &path);
-
+void OutputAlignment(std::ostream &out, const Moses::Hypothesis *hypo);
 
 }
 
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 5a33c214c..624c31994 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -197,7 +197,7 @@ public:
       // MAP decoding: best hypothesis
       const Hypothesis* bestHypo = NULL;
       if (!staticData.UseMBR()) 
-			{
+	  {
         bestHypo = manager.GetBestHypothesis();
         if (bestHypo) {
           if (staticData.IsPathRecoveryEnabled()) {
@@ -214,13 +214,18 @@ public:
             staticData.GetOutputFactorOrder(),
             staticData.GetReportSegmentation(),
             staticData.GetReportAllFactors());
+          if (staticData.PrintAlignmentInfo()) {
+        	out << "||| ";
+            OutputAlignment(out, bestHypo);
+          }
+
           OutputAlignment(m_alignmentInfoCollector, m_lineNumber, bestHypo);
           IFVERBOSE(1) {
             debug << "BEST TRANSLATION: " << *bestHypo << endl;
           }
         }
         out << endl;
-			}
+	  }
 
       // MBR decoding (n-best MBR, lattice MBR, consensus)
       else 
diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 359174280..356cf219b 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -179,6 +179,7 @@ Parameter::Parameter()
   AddParam("minlexr-memory", "Load lexical reordering table in minlexr format into memory");                                          
   AddParam("minphr-memory", "Load phrase table in minphr format into memory");
 
+  AddParam("print-alignment-info", "Output word-to-word alignment into the log file. Word-to-word alignments are takne from the phrase table if any. Default is false");
   AddParam("include-segmentation-in-n-best", "include phrasal segmentation in the n-best list. default is false");
   AddParam("print-alignment-info-in-n-best", "Include word-to-word alignment in the n-best list. Word-to-word alignments are takne from the phrase table if any. Default is false");
   AddParam("alignment-output-file", "print output word alignments into given file");
diff --git a/moses/StaticData.cpp b/moses/StaticData.cpp
index cf797582b..9c27d9634 100644
--- a/moses/StaticData.cpp
+++ b/moses/StaticData.cpp
@@ -162,10 +162,6 @@ bool StaticData::LoadData(Parameter *parameter)
     }
   }
 
-  if(m_parameter->GetParam("sort-word-alignment").size()) {
-    m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
-  }
-  
   // factor delimiter
   if (m_parameter->GetParam("factor-delimiter").size() > 0) {
     m_factorDelimiter = m_parameter->GetParam("factor-delimiter")[0];
@@ -175,6 +171,16 @@ bool StaticData::LoadData(Parameter *parameter)
   SetBooleanParameter( &m_outputHypoScore, "output-hypo-score", false );
 
   //word-to-word alignment
+  // alignments
+  SetBooleanParameter( &m_PrintAlignmentInfo, "print-alignment-info", false );
+  if (m_PrintAlignmentInfo) {
+    m_needAlignmentInfo = true;
+  }
+
+  if(m_parameter->GetParam("sort-word-alignment").size()) {
+    m_wordAlignmentSort = (WordAlignmentSort) Scan<size_t>(m_parameter->GetParam("sort-word-alignment")[0]);
+  }
+
   SetBooleanParameter( &m_PrintAlignmentInfoNbest, "print-alignment-info-in-n-best", false );
   if (m_PrintAlignmentInfoNbest) {
     m_needAlignmentInfo = true;
diff --git a/moses/StaticData.h b/moses/StaticData.h
index ce93a5629..20d36e4b8 100644
--- a/moses/StaticData.h
+++ b/moses/StaticData.h
@@ -171,6 +171,7 @@ protected:
   bool m_reportAllFactorsNBest;
   std::string m_detailedTranslationReportingFilePath;
   bool m_onlyDistinctNBest;
+  bool m_PrintAlignmentInfo;
   bool m_needAlignmentInfo;
   bool m_PrintAlignmentInfoNbest;
 
@@ -730,6 +731,9 @@ public:
   const std::string &GetAlignmentOutputFile() const {
     return m_alignmentOutputFile;
   }
+  bool PrintAlignmentInfo() const {
+    return m_PrintAlignmentInfo;
+  }
   bool PrintAlignmentInfoInNbest() const {
     return m_PrintAlignmentInfoNbest;
   }
-- 
cgit v1.2.3


From 5ba153806b253c8ab4768882d34ac61c32f25b62 Mon Sep 17 00:00:00 2001
From: phikoehn <pkoehn@inf.ed.ac.uk>
Date: Wed, 13 Mar 2013 17:52:24 +0000
Subject: =?UTF-8?q?fixed=20kneserNey=20phrase=20probability=20smoothing=20?=
 =?UTF-8?q?bug=20reported=20by=20=09=20=C4=8Ceslav=20Przywara=20<ceslav@pr?=
 =?UTF-8?q?zywara.cz>?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 phrase-extract/consolidate-main.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 70de9678b..fd33907de 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -256,7 +256,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
    if (kneserNeyFlag) {
      float D = kneserNey_D3;
      if (countEF < 2) D = kneserNey_D1;
-     if (countEF < 3) D = kneserNey_D2;
+     else if (countEF < 3) D = kneserNey_D2;
      if (D > countEF) D = countEF - 0.01; // sanity constraint
 
      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
-- 
cgit v1.2.3


From 3a7f4f776a34049e2dd57622451694068c604bad Mon Sep 17 00:00:00 2001
From: phikoehn <pkoehn@inf.ed.ac.uk>
Date: Wed, 13 Mar 2013 17:54:29 +0000
Subject: minor

---
 scripts/generic/compound-splitter.perl | 43 ++++++++++++++++++----------------
 scripts/generic/mteval-v13a.pl         |  2 +-
 2 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/scripts/generic/compound-splitter.perl b/scripts/generic/compound-splitter.perl
index 8f82ab8d9..beca70eb0 100755
--- a/scripts/generic/compound-splitter.perl
+++ b/scripts/generic/compound-splitter.perl
@@ -16,15 +16,15 @@ $HELP = 1
     unless &GetOptions('corpus=s' => \$CORPUS,
 		       'model=s' => \$MODEL,
 		       'filler=s' => \$FILLER,
-           'factored' => \$FACTORED,
+		       'factored' => \$FACTORED,
 		       'min-size=i' => \$MIN_SIZE,
 		       'min-count=i' => \$MIN_COUNT,
 		       'max-count=i' => \$MAX_COUNT,
 		       'help' => \$HELP,
 		       'verbose' => \$VERBOSE,
-           'syntax' => \$SYNTAX,
-           'binarize' => \$BINARIZE,
-           'mark-split' => \$MARK_SPLIT,
+		       'syntax' => \$SYNTAX,
+		       'binarize' => \$BINARIZE,
+		       'mark-split' => \$MARK_SPLIT,
 		       'train' => \$TRAIN);
 
 if ($HELP ||
@@ -155,34 +155,37 @@ sub apply {
         next if defined($COUNT{$lc}) && $COUNT{$lc} > $count;
 	$COUNT{$lc} = $count;
 	$TRUECASE{$lc} = $factored_word;
-  $LABEL{$lc} = $label if $SYNTAX;
+	$LABEL{$lc} = $label if $SYNTAX;
     }
     close(MODEL);
 
     while(<STDIN>) {
 	my $first = 1;
 	chop; s/\s+/ /g; s/^ //; s/ $//;
-  my @BUFFER; # for xml tags
+	my @BUFFER; # for xml tags
 	foreach my $factored_word (split) {
 	    print " " unless $first;	    
 	    $first = 0;
 
-      # syntax: don't split xml
-      if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
-        push @BUFFER,$factored_word;
-        $first = 1;
-        next;
-      }
-
-      # get case class
-      my $word = $factored_word;
-      $word =~ s/\|.+//g; # just first factor
-      my $lc = lc($word);
-
+	    # syntax: don't split xml
+	    if ($SYNTAX && ($factored_word =~ /^</ || $factored_word =~ />$/)) {
+		push @BUFFER,$factored_word;
+		$first = 1;
+		next;
+	    }
+	    
+	    # get case class
+	    my $word = $factored_word;
+	    $word =~ s/\|.+//g; # just first factor
+	    my $lc = lc($word);
+	    
+	    print STDERR "considering $word ($lc)...\n" if $VERBOSE;
 	    # don't split frequent words
-	    if (defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) {
-    print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
+	    if ((defined($COUNT{$lc}) && $COUNT{$lc}>=$MAX_COUNT) ||
+	        $lc !~ /[a-zA-Z]/) {; # has to have at least one letter
+		print join(" ",@BUFFER)." " if scalar(@BUFFER); @BUFFER = (); # clear buffer
 		print $factored_word;
+		print STDERR "\tfrequent word ($COUNT{$lc}>=$MAX_COUNT), skipping\n" if $VERBOSE;
 		next;
 	    }
 
diff --git a/scripts/generic/mteval-v13a.pl b/scripts/generic/mteval-v13a.pl
index 879212e6e..f1f8f9ef6 100755
--- a/scripts/generic/mteval-v13a.pl
+++ b/scripts/generic/mteval-v13a.pl
@@ -1009,7 +1009,7 @@ sub extract_sgml_tag_and_span
 sub extract_sgml_tag_attribute
 {
 	my ($name, $data) = @_;
-	($data =~ m|$name\s*=\s*\"([^\"]*)\"|si) ? ($1) : ();
+	($data =~ m|$name\s*=\s*\"?([^\"]*)\"?|si) ? ($1) : ();
 }
 
 #################################
-- 
cgit v1.2.3


From 946fbc45e1b0a4e7fda9377e8c3a847d5cce99a0 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 09:58:33 +0000
Subject: testing commit emails function

---
 NOTICE | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NOTICE b/NOTICE
index 7d631cd88..23d8b2ad1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -1,3 +1,5 @@
 This code includes data from Daniel Naber's Language Tools (czech abbreviations).
 
 This code includes data from czech wiktionary (also czech abbreviations).
+
+
-- 
cgit v1.2.3


From 08330adfe6a69d107bf67c92742034fcaf700fc9 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 10:44:00 +0000
Subject: testing commit emails function. 2

---
 NOTICE | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NOTICE b/NOTICE
index 23d8b2ad1..cea4ab1da 100644
--- a/NOTICE
+++ b/NOTICE
@@ -3,3 +3,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 This code includes data from czech wiktionary (also czech abbreviations).
 
 
+
+
-- 
cgit v1.2.3


From 763f6b84a74f3d56f418edb810a2f847a776bbcc Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 17:48:43 +0000
Subject: testing commit emails function. 3

---
 NOTICE | 2 --
 1 file changed, 2 deletions(-)

diff --git a/NOTICE b/NOTICE
index cea4ab1da..23d8b2ad1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -3,5 +3,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 This code includes data from czech wiktionary (also czech abbreviations).
 
 
-
-
-- 
cgit v1.2.3


From a3d50bab1a2f65eb07b021246d2773418c5c9a07 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 17:50:58 +0000
Subject: testing commit emails function. 4

---
 NOTICE | 1 +
 1 file changed, 1 insertion(+)

diff --git a/NOTICE b/NOTICE
index 23d8b2ad1..c2ff560fc 100644
--- a/NOTICE
+++ b/NOTICE
@@ -3,3 +3,4 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 This code includes data from czech wiktionary (also czech abbreviations).
 
 
+
-- 
cgit v1.2.3


From df3e379d2bc5af4dce66fd2b894b3ad23cda2895 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 17:58:57 +0000
Subject: testing commit emails function. 5

---
 NOTICE | 1 +
 1 file changed, 1 insertion(+)

diff --git a/NOTICE b/NOTICE
index c2ff560fc..cea4ab1da 100644
--- a/NOTICE
+++ b/NOTICE
@@ -4,3 +4,4 @@ This code includes data from czech wiktionary (also czech abbreviations).
 
 
+
-- 
cgit v1.2.3


From b639f0b2101021b86ec8d6f8dd01921f30b073fe Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 18:02:38 +0000
Subject: testing commit emails function. 6

---
 NOTICE | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/NOTICE b/NOTICE
index cea4ab1da..b0dcba070 100644
--- a/NOTICE
+++ b/NOTICE
@@ -2,6 +2,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 
 This code includes data from czech wiktionary (also czech abbreviations).
 
-
-
-
-- 
cgit v1.2.3


From d931006dbc1e7e86c39f225af2e795adc1b3d342 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 18:04:11 +0000
Subject: testing commit emails function. 7

---
 NOTICE | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NOTICE b/NOTICE
index b0dcba070..c2ff560fc 100644
--- a/NOTICE
+++ b/NOTICE
@@ -2,3 +2,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 
 This code includes data from czech wiktionary (also czech abbreviations).
 
+
+
-- 
cgit v1.2.3


From d7806b9351762342ea3e6d550c52a8f802995b62 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Thu, 14 Mar 2013 18:08:15 +0000
Subject: testing commit emails function. 8

---
 NOTICE | 2 --
 1 file changed, 2 deletions(-)

diff --git a/NOTICE b/NOTICE
index c2ff560fc..b0dcba070 100644
--- a/NOTICE
+++ b/NOTICE
@@ -2,5 +2,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 
 This code includes data from czech wiktionary (also czech abbreviations).
 
-
-
-- 
cgit v1.2.3


From e60c4f74ef0e9d1a16af4a0df398962694750d5b Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 15 Mar 2013 10:11:36 +0000
Subject: test commit. One last time on sourceforge

---
 NOTICE | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/NOTICE b/NOTICE
index b0dcba070..c2ff560fc 100644
--- a/NOTICE
+++ b/NOTICE
@@ -2,3 +2,5 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 
 This code includes data from czech wiktionary (also czech abbreviations).
 
+
+
-- 
cgit v1.2.3


From 802f5ab3b78cfb12cff05753a3677a0f998c24c0 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 15 Mar 2013 11:19:15 +0000
Subject: testing commit emails function. 9

---
 NOTICE | 1 -
 1 file changed, 1 deletion(-)

diff --git a/NOTICE b/NOTICE
index c2ff560fc..23d8b2ad1 100644
--- a/NOTICE
+++ b/NOTICE
@@ -3,4 +3,3 @@ This code includes data from Daniel Naber's Language Tools (czech abbreviations)
 This code includes data from czech wiktionary (also czech abbreviations).
 
 
-
-- 
cgit v1.2.3


From 3da09b921c8278f5a808f6e951e329d04ab186f7 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 15 Mar 2013 12:30:39 +0000
Subject: memory leak

---
 moses-cmd/Main.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 624c31994..9f610204f 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -495,20 +495,20 @@ int main(int argc, char** argv)
 
     // load all the settings into the Parameter class
     // (stores them as strings, or array of strings)
-    Parameter* params = new Parameter();
-    if (!params->LoadParam(argc,argv)) {
+    Parameter params;
+    if (!params.LoadParam(argc,argv)) {
       exit(1);
     }
 
 
     // initialize all "global" variables, which are stored in StaticData
     // note: this also loads models such as the language model, etc.
-    if (!StaticData::LoadDataStatic(params, argv[0])) {
+    if (!StaticData::LoadDataStatic(&params, argv[0])) {
       exit(1);
     }
 
     // setting "-show-weights" -> just dump out weights and exit
-    if (params->isParamSpecified("show-weights")) {
+    if (params.isParamSpecified("show-weights")) {
       ShowWeights();
       exit(0);
     }
-- 
cgit v1.2.3


From 974bdd979b4b79b4eb252665fd253a0e335e371f Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 15 Mar 2013 14:19:36 +0000
Subject: memory leak

---
 moses/LM/SingleFactor.cpp | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/moses/LM/SingleFactor.cpp b/moses/LM/SingleFactor.cpp
index 3418aefe2..c061d0fed 100644
--- a/moses/LM/SingleFactor.cpp
+++ b/moses/LM/SingleFactor.cpp
@@ -36,8 +36,9 @@ using namespace std;
 namespace Moses
 {
 
-LanguageModelSingleFactor::~LanguageModelSingleFactor() {}
-
+LanguageModelSingleFactor::~LanguageModelSingleFactor()
+{
+}
 
 struct PointerState : public FFState {
   const void* lmstate;
@@ -58,7 +59,11 @@ LanguageModelPointerState::LanguageModelPointerState()
   m_beginSentenceState = new PointerState(NULL);
 }
 
-LanguageModelPointerState::~LanguageModelPointerState() {}
+LanguageModelPointerState::~LanguageModelPointerState()
+{
+  delete m_nullContextState;
+  delete m_beginSentenceState;
+}
 
 const FFState *LanguageModelPointerState::GetNullContextState() const
 {
-- 
cgit v1.2.3


From 2d252d2dd0c97e1d1def55b9d421db4122d762f2 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 15 Mar 2013 16:11:15 +0000
Subject: memory leak

---
 moses-cmd/Main.cpp                | 2 ++
 moses/AlignmentInfoCollection.cpp | 3 +++
 moses/AlignmentInfoCollection.h   | 1 +
 3 files changed, 6 insertions(+)

diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 9f610204f..117cac3f9 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -665,6 +665,8 @@ int main(int argc, char** argv)
     pool.Stop(true); //flush remaining jobs
 #endif
 
+    delete ioWrapper;
+
   } catch (const std::exception &e) {
     std::cerr << "Exception: " << e.what() << std::endl;
     return EXIT_FAILURE;
diff --git a/moses/AlignmentInfoCollection.cpp b/moses/AlignmentInfoCollection.cpp
index 5daba9ba1..53b83d8cd 100644
--- a/moses/AlignmentInfoCollection.cpp
+++ b/moses/AlignmentInfoCollection.cpp
@@ -30,6 +30,9 @@ AlignmentInfoCollection::AlignmentInfoCollection()
   m_emptyAlignmentInfo = Add(pairs);
 }
 
+AlignmentInfoCollection::~AlignmentInfoCollection()
+{}
+
 const AlignmentInfo &AlignmentInfoCollection::GetEmptyAlignmentInfo() const
 {
   return *m_emptyAlignmentInfo;
diff --git a/moses/AlignmentInfoCollection.h b/moses/AlignmentInfoCollection.h
index 9c7f75e13..de0949f8f 100644
--- a/moses/AlignmentInfoCollection.h
+++ b/moses/AlignmentInfoCollection.h
@@ -55,6 +55,7 @@ class AlignmentInfoCollection
 
   //! Only a single static variable should be created.
   AlignmentInfoCollection();
+  ~AlignmentInfoCollection();
 
   static AlignmentInfoCollection s_instance;
 
-- 
cgit v1.2.3


From de2519fb889e2de8037e9317f1c676f1efbca475 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 15 Mar 2013 19:48:11 +0000
Subject: eclipse

---
 contrib/other-builds/OnDiskPt/.cproject        | 11 ++++++++---
 contrib/other-builds/extractor/.cproject       | 12 ++++++++++--
 contrib/other-builds/lm/.cproject              | 11 +++++++++--
 contrib/other-builds/mert_lib/.cproject        |  5 +----
 contrib/other-builds/moses-chart-cmd/.cproject |  2 +-
 contrib/other-builds/moses-cmd/.cproject       | 11 ++++++++---
 contrib/other-builds/moses/.cproject           | 18 +++++++++++-------
 contrib/other-builds/search/.cproject          | 11 +++++++++--
 contrib/other-builds/search/.project           |  5 -----
 contrib/other-builds/util/.cproject            | 11 ++++++++---
 10 files changed, 65 insertions(+), 32 deletions(-)

diff --git a/contrib/other-builds/OnDiskPt/.cproject b/contrib/other-builds/OnDiskPt/.cproject
index e135b8886..f551380fd 100644
--- a/contrib/other-builds/OnDiskPt/.cproject
+++ b/contrib/other-builds/OnDiskPt/.cproject
@@ -24,7 +24,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.846397978." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.725420545" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
 							<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1586272140" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
-							<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+							<builder buildPath="${workspace_loc:/OnDiskPt/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.1909553559" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.30521110" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.478334849" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
 								<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.1328561226" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -133,8 +133,13 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope" versionNumber="1">
-		<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/OnDiskPt"/>
+		</configuration>
 	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject
index 7529a7799..1ccfe2578 100644
--- a/contrib/other-builds/extractor/.cproject
+++ b/contrib/other-builds/extractor/.cproject
@@ -18,7 +18,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.1133345948." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1405862229" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 							<targetPlatform id="cdt.managedbuild.target.gnu.platform.exe.debug.605722566" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-							<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+							<builder buildPath="${workspace_loc:/extractor/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.238577912" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.base.1956867596" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 								<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -119,5 +119,13 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/extractor"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/extractor"/>
+		</configuration>
+	</storageModule>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
diff --git a/contrib/other-builds/lm/.cproject b/contrib/other-builds/lm/.cproject
index 2036e6b18..e3e47fd7e 100644
--- a/contrib/other-builds/lm/.cproject
+++ b/contrib/other-builds/lm/.cproject
@@ -24,7 +24,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.351042750." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.640882096" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
 							<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.793478365" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
-							<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+							<builder buildPath="${workspace_loc:/lm/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.36011795" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.1252826468" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1024598065" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
 								<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.139111896" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -131,7 +131,14 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/lm"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/lm"/>
+		</configuration>
+	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
diff --git a/contrib/other-builds/mert_lib/.cproject b/contrib/other-builds/mert_lib/.cproject
index 79dffb294..e1c19b822 100644
--- a/contrib/other-builds/mert_lib/.cproject
+++ b/contrib/other-builds/mert_lib/.cproject
@@ -23,7 +23,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.lib.debug.1932340583" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.lib.debug">
 							<targetPlatform id="cdt.managedbuild.target.gnu.platform.lib.debug.296711714" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.lib.debug"/>
-							<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="4" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
+							<builder buildPath="${workspace_loc:/mert_lib/Debug}" id="cdt.managedbuild.target.gnu.builder.lib.debug.1369910974" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.lib.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.lib.debug.89397980" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.lib.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug">
 								<option id="gnu.cpp.compiler.lib.debug.option.optimization.level.469164841" name="Optimization Level" superClass="gnu.cpp.compiler.lib.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -46,9 +46,6 @@
 							</tool>
 						</toolChain>
 					</folderInfo>
-					<fileInfo id="cdt.managedbuild.config.gnu.lib.debug.1721952013.646822372" name="UtilTest.cpp" rcbsApplicability="disable" resourcePath="mert/UtilTest.cpp" toolsToInvoke="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.967030373">
-						<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537.967030373" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.lib.debug.329920537"/>
-					</fileInfo>
 					<sourceEntries>
 						<entry excluding="mert/UtilTest.cpp|mert/TimerTest.cpp|mert/SingletonTest.cpp|mert/PointTest.cpp|mert/OptimizerFactoryTest.cpp|mert/NgramTest.cpp|mert/FeatureDataTest.cpp|mert/DataTest.cpp|mert/ReferenceTest.cpp|mert/VocabularyTest.cpp|mert/extractor.cpp" flags="VALUE_WORKSPACE_PATH|RESOLVED" kind="sourcePath" name=""/>
 					</sourceEntries>
diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject
index 7120f0b71..90a730cf7 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/moses-chart-cmd/.cproject
@@ -19,7 +19,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.162355801." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1633424067" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 							<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1437309068" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-							<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+							<builder buildPath="${workspace_loc:/moses-chart-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.1495140314" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.base.1247128100" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1087697480" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 								<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1163099464" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 0dd08a220..573fe715f 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -19,7 +19,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.461114338." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1896491482" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 							<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.2144309834" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-							<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+							<builder buildPath="${workspace_loc:/moses-cmd/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.56664170" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.base.1278274354" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.626095182" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 								<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2084031389" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -157,8 +157,13 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope" versionNumber="1">
-		<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/moses-cmd"/>
+		</configuration>
 	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject
index e54a1385b..787024533 100644
--- a/contrib/other-builds/moses/.cproject
+++ b/contrib/other-builds/moses/.cproject
@@ -1,7 +1,5 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?>
-
-<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
 	<storageModule moduleId="org.eclipse.cdt.core.settings">
 		<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.656913512">
 			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.656913512" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@@ -9,7 +7,7 @@
 					<externalSetting>
 						<entry flags="VALUE_WORKSPACE_PATH" kind="includePath" name="/moses"/>
 						<entry flags="VALUE_WORKSPACE_PATH" kind="libraryPath" name="/moses/Debug"/>
-						<entry flags="RESOLVED" kind="libraryFile" name="moses"/>
+						<entry flags="RESOLVED" kind="libraryFile" name="moses" srcPrefixMapping="" srcRootPath=""/>
 					</externalSetting>
 				</externalSettings>
 				<extensions>
@@ -26,7 +24,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.656913512." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1793369992" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 							<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.1051650049" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-							<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+							<builder buildPath="${workspace_loc:/moses/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.505583888" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.base.1976472988" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1774992327" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 								<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.1759650532" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -152,8 +150,14 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileCPP"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope" versionNumber="1">
-		<resource resourceType="PROJECT" workspacePath="/moses"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/moses"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/moses"/>
+		</configuration>
 	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
+	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
diff --git a/contrib/other-builds/search/.cproject b/contrib/other-builds/search/.cproject
index 9ccb8f8e9..2de36fecd 100644
--- a/contrib/other-builds/search/.cproject
+++ b/contrib/other-builds/search/.cproject
@@ -24,7 +24,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.exe.debug.722547278." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.exe.debug.1512691763" name="Linux GCC" superClass="cdt.managedbuild.toolchain.gnu.exe.debug">
 							<targetPlatform binaryParser="org.eclipse.cdt.core.ELF;org.eclipse.cdt.core.MachO64" id="cdt.managedbuild.target.gnu.platform.exe.debug.633526059" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.exe.debug"/>
-							<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
+							<builder buildPath="${workspace_loc:/search/Debug}" id="cdt.managedbuild.target.gnu.builder.exe.debug.164367197" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.base.854512708" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.base"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1096845166" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 								<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.240381177" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
@@ -127,6 +127,13 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/search"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/search"/>
+		</configuration>
+	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
 </cproject>
diff --git a/contrib/other-builds/search/.project b/contrib/other-builds/search/.project
index efad842ea..95f074aae 100644
--- a/contrib/other-builds/search/.project
+++ b/contrib/other-builds/search/.project
@@ -156,11 +156,6 @@
 			<type>1</type>
 			<locationURI>PARENT-3-PROJECT_LOC/search/vertex.hh</locationURI>
 		</link>
-		<link>
-			<name>vertex_generator.cc</name>
-			<type>1</type>
-			<locationURI>PARENT-3-PROJECT_LOC/search/vertex_generator.cc</locationURI>
-		</link>
 		<link>
 			<name>vertex_generator.hh</name>
 			<type>1</type>
diff --git a/contrib/other-builds/util/.cproject b/contrib/other-builds/util/.cproject
index ab37362a4..2fd4d2dfb 100644
--- a/contrib/other-builds/util/.cproject
+++ b/contrib/other-builds/util/.cproject
@@ -24,7 +24,7 @@
 					<folderInfo id="cdt.managedbuild.config.gnu.macosx.exe.debug.1869657447." name="/" resourcePath="">
 						<toolChain id="cdt.managedbuild.toolchain.gnu.macosx.exe.debug.1388624938" name="MacOSX GCC" superClass="cdt.managedbuild.toolchain.gnu.macosx.exe.debug">
 							<targetPlatform binaryParser="org.eclipse.cdt.core.MachO64;org.eclipse.cdt.core.ELF" id="cdt.managedbuild.target.gnu.platform.macosx.exe.debug.1873607607" name="Debug Platform" superClass="cdt.managedbuild.target.gnu.platform.macosx.exe.debug"/>
-							<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
+							<builder buildPath="${workspace_loc:/util/Debug}" id="cdt.managedbuild.target.gnu.builder.macosx.exe.debug.2045214944" keepEnvironmentInBuildfile="false" managedBuildOn="true" name="Gnu Make Builder" parallelBuildOn="true" parallelizationNumber="optimal" superClass="cdt.managedbuild.target.gnu.builder.macosx.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug.589471640" name="MacOS X C Linker" superClass="cdt.managedbuild.tool.macosx.c.linker.macosx.exe.debug"/>
 							<tool id="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug.1543780089" name="MacOS X C++ Linker" superClass="cdt.managedbuild.tool.macosx.cpp.linker.macosx.exe.debug">
 								<inputType id="cdt.managedbuild.tool.macosx.cpp.linker.input.635667684" superClass="cdt.managedbuild.tool.macosx.cpp.linker.input">
@@ -136,8 +136,13 @@
 			<autodiscovery enabled="true" problemReportingEnabled="true" selectedProfileId="org.eclipse.cdt.managedbuilder.core.GCCManagedMakePerProjectProfileC"/>
 		</scannerConfigBuildInfo>
 	</storageModule>
-	<storageModule moduleId="refreshScope" versionNumber="1">
-		<resource resourceType="PROJECT" workspacePath="/util"/>
+	<storageModule moduleId="refreshScope" versionNumber="2">
+		<configuration configurationName="Release">
+			<resource resourceType="PROJECT" workspacePath="/util"/>
+		</configuration>
+		<configuration configurationName="Debug">
+			<resource resourceType="PROJECT" workspacePath="/util"/>
+		</configuration>
 	</storageModule>
 	<storageModule moduleId="org.eclipse.cdt.make.core.buildtargets"/>
 	<storageModule moduleId="org.eclipse.cdt.core.LanguageSettingsProviders"/>
-- 
cgit v1.2.3


From df5f0934be559418177ffa9a68c2e561918a310f Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Fri, 15 Mar 2013 19:48:51 +0000
Subject: eclipse

---
 contrib/other-builds/extractor/.cproject | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/contrib/other-builds/extractor/.cproject b/contrib/other-builds/extractor/.cproject
index 1ccfe2578..fc08b4c3d 100644
--- a/contrib/other-builds/extractor/.cproject
+++ b/contrib/other-builds/extractor/.cproject
@@ -23,6 +23,9 @@
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug.1512268277" name="GCC C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.exe.debug">
 								<option id="gnu.cpp.compiler.exe.debug.option.optimization.level.2143789149" name="Optimization Level" superClass="gnu.cpp.compiler.exe.debug.option.optimization.level" value="gnu.cpp.compiler.optimization.level.none" valueType="enumerated"/>
 								<option id="gnu.cpp.compiler.exe.debug.option.debugging.level.285958391" name="Debug Level" superClass="gnu.cpp.compiler.exe.debug.option.debugging.level" value="gnu.cpp.compiler.debugging.level.max" valueType="enumerated"/>
+								<option id="gnu.cpp.compiler.option.include.paths.966722418" superClass="gnu.cpp.compiler.option.include.paths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${workspace_loc}/../../boost/include&quot;"/>
+								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.1839105433" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input"/>
 							</tool>
 							<tool id="cdt.managedbuild.tool.gnu.c.compiler.exe.debug.554846982" name="GCC C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.exe.debug">
-- 
cgit v1.2.3


From 8523a27768a44acbd35a04573fee90c17a753056 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <s0565741@crom.inf.ed.ac.uk>
Date: Fri, 15 Mar 2013 20:38:26 +0000
Subject: fix single-threaded

---
 moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index c680d7245..065368ca7 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -552,7 +552,9 @@ namespace tmmt
   
   bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
   {
+#ifdef WITH_THREADS
     boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
     map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
     if (lookup != m_lsed.end()) {
       value = lookup->second;
@@ -564,7 +566,9 @@ namespace tmmt
 
   void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
   {
+#ifdef WITH_THREADS
     boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
     m_lsed[ key ] = value;
   }
 
-- 
cgit v1.2.3


From 9f4824b2be25e13bf450cddbe683b790c257cf9e Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Sat, 16 Mar 2013 13:25:31 +0000
Subject: single threaded compile error

---
 moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
index c680d7245..065368ca7 100644
--- a/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
+++ b/moses/TranslationModel/fuzzy-match/FuzzyMatchWrapper.cpp
@@ -552,7 +552,9 @@ namespace tmmt
   
   bool FuzzyMatchWrapper::GetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, unsigned int &value) const
   {
+#ifdef WITH_THREADS
     boost::shared_lock<boost::shared_mutex> read_lock(m_accessLock);
+#endif
     map< pair< WORD_ID, WORD_ID >, unsigned int >::const_iterator lookup = m_lsed.find( key );
     if (lookup != m_lsed.end()) {
       value = lookup->second;
@@ -564,7 +566,9 @@ namespace tmmt
 
   void FuzzyMatchWrapper::SetLSEDCache(const std::pair< WORD_ID, WORD_ID > &key, const unsigned int &value)
   {
+#ifdef WITH_THREADS
     boost::unique_lock<boost::shared_mutex> lock(m_accessLock);
+#endif
     m_lsed[ key ] = value;
   }
 
-- 
cgit v1.2.3


From 1b83b85f443be34686792f0c9b7a2997adca1f4f Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Mon, 18 Mar 2013 16:48:40 +0000
Subject: debug info from sort command

---
 scripts/generic/extract-parallel.perl | 6 +++---
 scripts/generic/score-parallel.perl   | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index 7533b39e0..192169c86 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -153,9 +153,9 @@ if (defined($baselineExtract)) {
 		$catOCmd .= "$baselineExtract.o$sorted.gz ";
 }
 
-$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.sorted.gz \n";
-$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.inv.sorted.gz \n";
-$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR | gzip -c > $extract.o.sorted.gz \n";
+$catCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.sorted.gz 2>> /dev/stderr \n";
+$catInvCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.inv.sorted.gz 2>> /dev/stderr \n";
+$catOCmd .= " | LC_ALL=C $sortCmd -T $TMPDIR 2>> /dev/stderr | gzip -c > $extract.o.sorted.gz 2>> /dev/stderr \n";
 
 
 @children = ();
diff --git a/scripts/generic/score-parallel.perl b/scripts/generic/score-parallel.perl
index 520fbddbe..3f763e5d9 100755
--- a/scripts/generic/score-parallel.perl
+++ b/scripts/generic/score-parallel.perl
@@ -163,7 +163,7 @@ else
     $cmd .= "| LC_ALL=C $sortCmd -T $TMPDIR ";
   }
 
-  $cmd .= " | gzip -c > $ptHalf";
+  $cmd .= " | gzip -c > $ptHalf  2>> /dev/stderr ";
 }
 print STDERR $cmd;
 systemCheck($cmd);
-- 
cgit v1.2.3


From 038871fdb36359e1ec2027de1ef0227162f5473a Mon Sep 17 00:00:00 2001
From: Achim <achim@achim01.(none)>
Date: Mon, 18 Mar 2013 17:17:35 -0400
Subject: Hungarian and Latvian non-breaking prefix files

---
 .../nonbreaking_prefixes/nonbreaking_prefix.hu     | 103 +++++++++++++++++++++
 .../nonbreaking_prefixes/nonbreaking_prefix.lv     | 100 ++++++++++++++++++++
 2 files changed, 203 insertions(+)
 create mode 100644 scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
 create mode 100644 scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv

diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
new file mode 100644
index 000000000..c6b9af8ca
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.hu
@@ -0,0 +1,103 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+B
+C
+D
+E
+F
+G
+H
+I
+J
+K
+L
+M
+N
+O
+P
+Q
+R
+S
+T
+U
+V
+W
+X
+Y
+Z
+Á
+É
+Í
+Ó
+Ö
+Ő
+Ú
+Ü
+Ű
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+Dr
+dr
+kb
+Kb
+vö
+Vö
+pl
+Pl
+ca
+Ca
+min
+Min
+max
+Max
+ún
+Ún
+prof
+Prof
+de
+De
+du
+Du
+Szt
+St
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+
+# Month name abbreviations
+jan #NUMERIC_ONLY#
+Jan #NUMERIC_ONLY#
+Feb #NUMERIC_ONLY#
+feb #NUMERIC_ONLY#
+márc #NUMERIC_ONLY#
+Márc #NUMERIC_ONLY#
+ápr #NUMERIC_ONLY#
+Ápr #NUMERIC_ONLY#
+máj #NUMERIC_ONLY#
+Máj #NUMERIC_ONLY#
+jún #NUMERIC_ONLY#
+Jún #NUMERIC_ONLY#
+Júl #NUMERIC_ONLY#
+júl #NUMERIC_ONLY#
+aug #NUMERIC_ONLY#
+Aug #NUMERIC_ONLY#
+Szept #NUMERIC_ONLY#
+szept #NUMERIC_ONLY#
+okt #NUMERIC_ONLY#
+Okt #NUMERIC_ONLY#
+nov #NUMERIC_ONLY#
+Nov #NUMERIC_ONLY#
+dec #NUMERIC_ONLY#
+Dec #NUMERIC_ONLY#
+
+# Other abbreviations
+tel #NUMERIC_ONLY#
+Tel #NUMERIC_ONLY#
+Fax #NUMERIC_ONLY#
+fax #NUMERIC_ONLY#
diff --git a/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
new file mode 100644
index 000000000..81754a17a
--- /dev/null
+++ b/scripts/share/nonbreaking_prefixes/nonbreaking_prefix.lv
@@ -0,0 +1,100 @@
+#Anything in this file, followed by a period (and an upper-case word), does NOT indicate an end-of-sentence marker.
+#Special cases are included for prefixes that ONLY appear before 0-9 numbers.
+
+#any single upper case letter followed by a period is not a sentence ender (excluding I occasionally, but we leave it in)
+#usually upper case letters are initials in a name
+A
+Ā
+B
+C
+Č
+D
+E
+Ē
+F
+G
+Ģ
+H
+I
+Ī
+J
+K
+Ķ
+L
+Ļ
+M
+N
+Ņ
+O
+P
+Q
+R
+S
+Š
+T
+U
+Ū
+V
+W
+X
+Y
+Z
+Ž
+
+#List of titles. These are often followed by upper-case names, but do not indicate sentence breaks
+dr
+Dr
+med
+prof
+Prof
+inž
+Inž
+ist.loc
+Ist.loc
+kor.loc
+Kor.loc
+v.i
+vietn
+Vietn
+
+#misc - odd period-ending items that NEVER indicate breaks (p.m. does NOT fall into this category - it sometimes ends a sentence)
+a.l
+t.p
+pārb
+Pārb
+vec
+Vec
+inv
+Inv
+sk
+Sk
+spec
+Spec
+vienk
+Vienk
+virz
+Virz
+māksl
+Māksl
+mūz
+Mūz
+akad
+Akad
+soc
+Soc
+galv
+Galv
+vad
+Vad
+sertif
+Sertif
+folkl
+Folkl
+hum
+Hum
+
+#Numbers only. These should only induce breaks when followed by a numeric sequence
+# add NUMERIC_ONLY after the word for this function
+#This case is mostly for the English "No." which can either be a sentence of its own, or
+#if followed by a number, a non-breaking prefix
+Nr #NUMERIC_ONLY# 
-- 
cgit v1.2.3


From 8efeb5922816ab3df527c29742961a19b522f5ad Mon Sep 17 00:00:00 2001
From: Barry Haddow <barry.haddow@gmail.com>
Date: Mon, 18 Mar 2013 21:29:17 +0000
Subject: don't lowercase reference if there's a recaser

---
 scripts/ems/experiment.meta | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index d74677a69..7c84839f5 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -1000,6 +1000,7 @@ lowercase-reference
 	out: reference
 	default-name: evaluation/reference
 	pass-unless: output-lowercaser
+  pass-if: recaser
 	multiref: $moses-script-dir/ems/support/run-command-on-multiple-refsets.perl
 	template: $output-lowercaser < IN > OUT	
 nist-bleu
-- 
cgit v1.2.3


From 7dc4faa97ed2021981c69551d2af9a68787e867d Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Tue, 19 Mar 2013 11:17:17 +0000
Subject: Fix cd error when running bjam from non-top

---
 bjam | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bjam b/bjam
index d0d94dedb..0ebf105c3 100755
--- a/bjam
+++ b/bjam
@@ -1,17 +1,17 @@
 #!/bin/bash
 set -e
+top="$(dirname "$0")"
 if
   bjam="$(which bjam 2>/dev/null)" && #exists
   [ ${#bjam} != 0 ] && #paranoia about which printing nothing then returning true
   ! grep UFIHGUFIHBDJKNCFZXAEVA "${bjam}" </dev/null >/dev/null && #bjam in path isn't this script
   "${bjam}" --sanity-test 2>/dev/null |grep Sane >/dev/null && #The test in jam-files/sanity.jam passes
-  (cd jam-files/fail && ! "${bjam}") >/dev/null #Returns non-zero on failure
+  (cd "${top}/jam-files/fail" && ! "${bjam}") >/dev/null #Returns non-zero on failure
 then
   #Delegate to system bjam
   exec "${bjam}" "$@"
 fi
 
-top="$(dirname "$0")"
 if [ ! -x "$top"/jam-files/bjam ] || "$top"/jam-files/bjam -v |grep 2011.4 >/dev/null; then
   pushd "$top/jam-files/engine"
   ./build.sh
-- 
cgit v1.2.3


From 55f02f2fecc9bfdb6720f84beb1b94733166ec64 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Tue, 19 Mar 2013 14:46:52 +0000
Subject: Accept concatenated bzip2 files

---
 util/read_compressed.cc | 100 +++++++++++++++++++++++++++++++++++-------------
 1 file changed, 73 insertions(+), 27 deletions(-)

diff --git a/util/read_compressed.cc b/util/read_compressed.cc
index b81549e42..b62a6e833 100644
--- a/util/read_compressed.cc
+++ b/util/read_compressed.cc
@@ -180,12 +180,73 @@ class GZip : public ReadBase {
 };
 #endif // HAVE_ZLIB
 
+const uint8_t kBZMagic[3] = {'B', 'Z', 'h'};
+
 #ifdef HAVE_BZLIB
 class BZip : public ReadBase {
   public:
-    explicit BZip(int fd, void *already_data, std::size_t already_size) {
+    BZip(int fd, void *already_data, std::size_t already_size) {
       scoped_fd hold(fd);
       closer_.reset(FDOpenReadOrThrow(hold));
+      file_ = NULL;
+      Open(already_data, already_size);
+    }
+
+    BZip(FILE *file, void *already_data, std::size_t already_size) {
+      closer_.reset(file);
+      file_ = NULL;
+      Open(already_data, already_size);
+    }
+
+    ~BZip() {
+      Close(file_);
+    }
+
+    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
+      assert(file_);
+      int bzerror = BZ_OK;
+      int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
+      long pos = ftell(closer_.get());
+      if (pos != -1) ReadCount(thunk) = pos;
+      switch (bzerror) {
+        case BZ_STREAM_END:
+          /* bzip2 files can be concatenated by e.g. pbzip2.  Annoyingly, the
+           * library doesn't handle this internally.  This gets the trailing
+           * data, grows it up to magic as needed, validates the magic, and
+           * reopens.
+           */
+          {
+            bzerror = BZ_OK;
+            void *trailing_data;
+            int trailing_size;
+            BZ2_bzReadGetUnused(&bzerror, file_, &trailing_data, &trailing_size);
+            UTIL_THROW_IF(bzerror != BZ_OK, BZException, "bzip2 error in BZ2_bzReadGetUnused " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+            std::string trailing(static_cast<const char*>(trailing_data), trailing_size);
+            Close(file_);
+
+            if (trailing_size < (int)sizeof(kBZMagic)) {
+              trailing.resize(sizeof(kBZMagic));
+              if (1 != fread(&trailing[trailing_size], sizeof(kBZMagic) - trailing_size, 1, closer_.get())) {
+                UTIL_THROW_IF(trailing_size, BZException, "File has trailing cruft");
+                // Legitimate end of file.
+                ReplaceThis(new Complete(), thunk);
+                return ret;
+              }
+            }
+            UTIL_THROW_IF(memcmp(trailing.data(), kBZMagic, sizeof(kBZMagic)), BZException, "Trailing cruft is not another bzip2 stream");
+            Open(&trailing[0], trailing.size());
+          }
+          return ret;
+        case BZ_OK:
+          return ret;
+        default:
+          UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
+      }
+    }
+
+  private:
+    void Open(void *already_data, std::size_t already_size) {
+      assert(!file_);
       int bzerror = BZ_OK;
       file_ = BZ2_bzReadOpen(&bzerror, closer_.get(), 0, 0, already_data, already_size);
       switch (bzerror) {
@@ -199,38 +260,23 @@ class BZip : public ReadBase {
           UTIL_THROW(BZException, "IO error reading file");
         case BZ_MEM_ERROR:
           throw std::bad_alloc();
+        default:
+          UTIL_THROW(BZException, "Unknown bzip2 error code " << bzerror);
       }
+      assert(file_);
     }
 
-    ~BZip() {
+    static void Close(BZFILE *&file) {
+      if (file == NULL) return;
       int bzerror = BZ_OK;
-      BZ2_bzReadClose(&bzerror, file_);
+      BZ2_bzReadClose(&bzerror, file);
       if (bzerror != BZ_OK) {
-        std::cerr << "bz2 readclose error" << std::endl;
+        std::cerr << "bz2 readclose error number " << bzerror << std::endl;
         abort();
       }
+      file = NULL;
     }
 
-    std::size_t Read(void *to, std::size_t amount, ReadCompressed &thunk) {
-      int bzerror = BZ_OK;
-      int ret = BZ2_bzRead(&bzerror, file_, to, std::min<std::size_t>(static_cast<std::size_t>(INT_MAX), amount));
-      long pos;
-      switch (bzerror) {
-        case BZ_STREAM_END:
-          pos = ftell(closer_.get());
-          if (pos != -1) ReadCount(thunk) = pos;
-          ReplaceThis(new Complete(), thunk);
-          return ret;
-        case BZ_OK:
-          pos = ftell(closer_.get());
-          if (pos != -1) ReadCount(thunk) = pos;
-          return ret;
-        default:
-          UTIL_THROW(BZException, "bzip2 error " << BZ2_bzerror(file_, &bzerror) << " code " << bzerror);
-      }
-    }
-
-  private:
     scoped_FILE closer_;
     BZFILE *file_;
 };
@@ -346,11 +392,11 @@ MagicResult DetectMagic(const void *from_void) {
   if (header[0] == 0x1f && header[1] == 0x8b) {
     return GZIP;
   }
-  if (header[0] == 'B' && header[1] == 'Z' && header[2] == 'h') {
+  if (!memcmp(header, kBZMagic, sizeof(kBZMagic))) {
     return BZIP;
   }
-  const uint8_t xzmagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
-  if (!memcmp(header, xzmagic, 6)) {
+  const uint8_t kXZMagic[6] = { 0xFD, '7', 'z', 'X', 'Z', 0x00 };
+  if (!memcmp(header, kXZMagic, sizeof(kXZMagic))) {
     return XZIP;
   }
   return UNKNOWN;
-- 
cgit v1.2.3


From 6efa1681fc9f385bcdec1243ef059203812018c0 Mon Sep 17 00:00:00 2001
From: Christian Buck <cbuck@lantis.de>
Date: Tue, 19 Mar 2013 18:21:35 +0000
Subject: added operator< to SearchGraphNode - compares Ids

---
 moses/Manager.h | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/moses/Manager.h b/moses/Manager.h
index e2f8ed8e5..11762ec37 100644
--- a/moses/Manager.h
+++ b/moses/Manager.h
@@ -56,6 +56,10 @@ struct SearchGraphNode {
     hypo(theHypo), recombinationHypo(theRecombinationHypo),
     forward(theForward), fscore(theFscore) {}
 
+    bool operator<(const SearchGraphNode& sgn) const {
+        return this->hypo->GetId() < sgn.hypo->GetId();
+    }
+
 };
 
 /** The Manager class implements a stack decoding algorithm for phrase-based decoding
@@ -104,7 +108,7 @@ private:
   // Helper functions to output search graph in the hypergraph format of Kenneth Heafield's lazy hypergraph decoder
   void OutputFeatureValuesForHypergraph(const Hypothesis* hypo, std::ostream &outputSearchGraphStream) const;
   size_t OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const;
-  
+
 
 protected:
   // data
-- 
cgit v1.2.3


From e7f54efa72c346cc9880d564def6785449dac4af Mon Sep 17 00:00:00 2001
From: Christian Buck <cbuck@lantis.de>
Date: Tue, 19 Mar 2013 18:22:21 +0000
Subject: mosesserver gives search graph ordered by hyp-id

---
 contrib/server/mosesserver.cpp | 16 ++++++++++++----
 1 file changed, 12 insertions(+), 4 deletions(-)

diff --git a/contrib/server/mosesserver.cpp b/contrib/server/mosesserver.cpp
index 98024c891..5d9c40a9b 100644
--- a/contrib/server/mosesserver.cpp
+++ b/contrib/server/mosesserver.cpp
@@ -1,6 +1,8 @@
 #include "util/check.hh"
 #include <stdexcept>
 #include <iostream>
+#include <vector>
+#include <algorithm>
 
 
 #include "moses/ChartManager.h"
@@ -54,7 +56,7 @@ public:
     PhraseDictionaryDynSuffixArray* pdsa = (PhraseDictionaryDynSuffixArray*) pdf->GetDictionary();
     cerr << "Inserting into address " << pdsa << endl;
     pdsa->insertSnt(source_, target_, alignment_);
-    if(add2ORLM_) {       
+    if(add2ORLM_) {
       updateORLM();
     }
     cerr << "Done inserting\n";
@@ -83,8 +85,8 @@ public:
     const std::string sBOS = orlm->GetSentenceStart()->GetString();
     const std::string sEOS = orlm->GetSentenceEnd()->GetString();
     Utils::splitToStr(target_, vl, " ");
-    // insert BOS and EOS 
-    vl.insert(vl.begin(), sBOS); 
+    // insert BOS and EOS
+    vl.insert(vl.begin(), sBOS);
     vl.insert(vl.end(), sEOS);
     for(int j=0; j < vl.size(); ++j) {
       int i = (j<ngOrder) ? 0 : j-ngOrder+1;
@@ -177,7 +179,7 @@ public:
     map<string, xmlrpc_c::value> retData;
 
     if (staticData.IsChart()) {
-       TreeInput tinput; 
+       TreeInput tinput;
         const vector<FactorType> &inputFactorOrder =
           staticData.GetInputFactorOrder();
         stringstream in(source + "\n");
@@ -260,10 +262,16 @@ public:
 
   }
 
+
+  bool compareSearchGraphNode(const SearchGraphNode& a, const SearchGraphNode b) {
+    return a.hypo->GetId() < b.hypo->GetId();
+  }
+
   void insertGraphInfo(Manager& manager, map<string, xmlrpc_c::value>& retData) {
     vector<xmlrpc_c::value> searchGraphXml;
     vector<SearchGraphNode> searchGraph;
     manager.GetSearchGraph(searchGraph);
+    std::sort(searchGraph.begin(), searchGraph.end());
     for (vector<SearchGraphNode>::const_iterator i = searchGraph.begin(); i != searchGraph.end(); ++i) {
       map<string, xmlrpc_c::value> searchGraphXmlNode;
       searchGraphXmlNode["forward"] = xmlrpc_c::value_double(i->forward);
-- 
cgit v1.2.3


From 34c8975aa7c4505486b3f8494f48abd3d3b645d1 Mon Sep 17 00:00:00 2001
From: Christian Buck <cbuck@lantis.de>
Date: Tue, 19 Mar 2013 18:22:43 +0000
Subject: ported perl xmlrpc example to python

---
 contrib/server/client.py | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100755 contrib/server/client.py

diff --git a/contrib/server/client.py b/contrib/server/client.py
new file mode 100755
index 000000000..43e77555a
--- /dev/null
+++ b/contrib/server/client.py
@@ -0,0 +1,21 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+# python port of client.perl
+
+import xmlrpclib
+import datetime
+
+url = "http://localhost:8080/RPC2"
+proxy = xmlrpclib.ServerProxy(url)
+
+text = u"il a souhaité que la présidence trace à nice le chemin pour l' avenir ."
+params = {"text":text, "align":"true", "report-all-factors":"true"}
+
+result = proxy.translate(params)
+print result['text']
+if 'align' in result:
+    print "Phrase alignments:"
+    aligns = result['align']
+    for align in aligns:
+        print "%s,%s,%s" %(align['tgt-start'], align['src-start'], align['src-end'])
-- 
cgit v1.2.3


From 7b9c5c1194528ddeea7b0c33fa7d1c43f4a3b373 Mon Sep 17 00:00:00 2001
From: Matous Machacek <machacekmatous@gmail.com>
Date: Tue, 19 Mar 2013 23:08:28 +0100
Subject: fixed bug in InterpolatedScorer

---
 mert/InterpolatedScorer.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mert/InterpolatedScorer.cpp b/mert/InterpolatedScorer.cpp
index e610cbdd0..af3f26bf2 100644
--- a/mert/InterpolatedScorer.cpp
+++ b/mert/InterpolatedScorer.cpp
@@ -164,7 +164,7 @@ void InterpolatedScorer::prepareStats(size_t sid, const string& text, ScoreStats
 {
   stringstream buff;
   string align = text;
-  string sentence = "";
+  string sentence = text;
   size_t alignmentData = text.find("|||");
   //Get sentence and alignment parts
   if(alignmentData != string::npos) {
-- 
cgit v1.2.3


From 0cc07c5c9ae232710e95889da4249e483d877823 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Wed, 20 Mar 2013 17:17:16 +0000
Subject: --max-factors=1 and Sparse features segfault -> different check
 suggested by Philipp

---
 moses/SourceWordDeletionFeature.cpp  | 7 +------
 moses/TargetWordInsertionFeature.cpp | 7 +------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/moses/SourceWordDeletionFeature.cpp b/moses/SourceWordDeletionFeature.cpp
index c5a61111f..c312a3b03 100644
--- a/moses/SourceWordDeletionFeature.cpp
+++ b/moses/SourceWordDeletionFeature.cpp
@@ -55,12 +55,7 @@ void SourceWordDeletionFeature::ComputeFeatures(const TargetPhrase& targetPhrase
   // handle special case: unknown words (they have no word alignment)
 	size_t targetLength = targetPhrase.GetSize();
 	size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize();
-	if (targetLength == 1 && sourceLength == 1) {
-		const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1);
-		if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) {
-			return;
-		}
-	}
+	if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
 
   // flag aligned words
   bool aligned[16];
diff --git a/moses/TargetWordInsertionFeature.cpp b/moses/TargetWordInsertionFeature.cpp
index 537c5c9cb..3b9bf36ba 100644
--- a/moses/TargetWordInsertionFeature.cpp
+++ b/moses/TargetWordInsertionFeature.cpp
@@ -56,12 +56,7 @@ void TargetWordInsertionFeature::ComputeFeatures(const TargetPhrase& targetPhras
   // handle special case: unknown words (they have no word alignment)
   size_t targetLength = targetPhrase.GetSize();
   size_t sourceLength = targetPhrase.GetSourcePhrase().GetSize();
-  if (targetLength == 1 && sourceLength == 1) {
-		const Factor* f1 = targetPhrase.GetWord(0).GetFactor(1);
-		if (f1 && f1->GetString().compare(UNKNOWN_FACTOR) == 0) {
-			return;
-		}
-  }
+  if (targetLength == 1 && sourceLength == 1 && !alignmentInfo.GetSize()) return;
 
   // flag aligned words
   bool aligned[16];
-- 
cgit v1.2.3


From 22c77f73310e1afda4f232b06a3a4b212c40593c Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Thu, 21 Mar 2013 12:17:16 -0400
Subject: Work on decreasing memory requirement for outputting hypergraph

---
 moses-cmd/Main.cpp |  9 +++++----
 moses/Manager.cpp  | 31 +++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 117cac3f9..f4cdb4388 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -171,10 +171,11 @@ public:
       std::ofstream *file = new std::ofstream;
       file->open(fileName.str().c_str());
       if (file->is_open() && file->good()) {
-	ostringstream out;
-	fix(out,PRECISION);
-	manager.OutputSearchGraphAsHypergraph(m_lineNumber, out);
-	*file << out.str();
+	//	ostringstream out;
+	//	fix(out,PRECISION);
+	fix(*file,PRECISION);
+	manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
+	//	*file << out.str();
 	file -> flush();
       } else {
 	TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
diff --git a/moses/Manager.cpp b/moses/Manager.cpp
index 011187cda..2ca689bb0 100644
--- a/moses/Manager.cpp
+++ b/moses/Manager.cpp
@@ -800,7 +800,6 @@ size_t Manager::OutputFeatureValuesForSLF(size_t index, bool zeros, const Hypoth
 
 size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis* hypo, const FeatureFunction* ff, std::ostream &outputSearchGraphStream) const
 {
-
   ScoreComponentCollection scoreCollection = hypo->GetScoreBreakdown(); 
   const Hypothesis *prevHypo = hypo->GetPrevHypo();
   if (prevHypo) {
@@ -823,14 +822,20 @@ size_t Manager::OutputFeatureValuesForHypergraph(size_t index, const Hypothesis*
 /**! Output search graph in hypergraph format of Kenneth Heafield's lazy hypergraph decoder */
 void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &outputSearchGraphStream) const
 {
+
+  VERBOSE(2,"Getting search graph to output as hypergraph for sentence " << translationId << std::endl)
+
   vector<SearchGraphNode> searchGraph;
   GetSearchGraph(searchGraph);
 
+
   map<int,int> mosesIDToHypergraphID;
   // map<int,int> hypergraphIDToMosesID;
   set<int> terminalNodes;
   multimap<int,int> hypergraphIDToArcs;
 
+  VERBOSE(2,"Gathering information about search graph to output as hypergraph for sentence " << translationId << std::endl)
+
   long numNodes = 0;
   long endNode = 0;
   {
@@ -888,11 +893,21 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
   // Print number of nodes and arcs
   outputSearchGraphStream << numNodes << " " << numArcs << endl;
 
+  VERBOSE(2,"Search graph to output as hypergraph for sentence " << translationId 
+	  << " contains " << numArcs << " arcs and " << numNodes << " nodes" << std::endl)
+
+  VERBOSE(2,"Outputting search graph to output as hypergraph for sentence " << translationId << std::endl)
+
+
   for (int hypergraphHypothesisID=0; hypergraphHypothesisID < endNode; hypergraphHypothesisID+=1) {
+    if (hypergraphHypothesisID % 100000 == 0) {
+      VERBOSE(2,"Processed " << hypergraphHypothesisID << " of " << numNodes << " hypergraph nodes for sentence " << translationId << std::endl);
+    }
     //    int mosesID = hypergraphIDToMosesID[hypergraphHypothesisID];
     size_t count = hypergraphIDToArcs.count(hypergraphHypothesisID);
+    //    VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has " << count << " incoming arcs" << std::endl)
     if (count > 0) {
-      outputSearchGraphStream << count << endl;
+      outputSearchGraphStream << count << "\n";
 
       pair<multimap<int,int>::iterator, multimap<int,int>::iterator> range =
 	hypergraphIDToArcs.equal_range(hypergraphHypothesisID);
@@ -917,10 +932,11 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 
 	const Hypothesis *prevHypo = thisHypo->GetPrevHypo();
 	if (prevHypo==NULL) {
-	  outputSearchGraphStream << "<s> ||| " << endl;
+	  //	VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " start of sentence" << std::endl)
+	  outputSearchGraphStream << "<s> ||| \n";
 	} else {
 	  int startNode = mosesIDToHypergraphID[prevHypo->GetId()];
-
+	  //	  VERBOSE(2,"Hypergraph node " << hypergraphHypothesisID << " has parent node " << startNode << std::endl)
 	  UTIL_THROW_IF(
 			(startNode >= hypergraphHypothesisID),
 			util::Exception,
@@ -937,17 +953,16 @@ void Manager::OutputSearchGraphAsHypergraph(long translationId, std::ostream &ou
 	  }
 	  outputSearchGraphStream << " ||| ";
 	  OutputFeatureValuesForHypergraph(thisHypo, outputSearchGraphStream);
-	  outputSearchGraphStream << endl;
+	  outputSearchGraphStream << "\n";
 	}
-
       }
     }
   }
 
   // Print node and arc(s) for end of sentence </s>
-  outputSearchGraphStream << terminalNodes.size() << endl;
+  outputSearchGraphStream << terminalNodes.size() << "\n";
   for (set<int>::iterator it=terminalNodes.begin(); it!=terminalNodes.end(); ++it) {
-    outputSearchGraphStream << "[" << (*it) << "] </s> ||| " << endl;
+    outputSearchGraphStream << "[" << (*it) << "] </s> ||| \n";
   }
 
 }
-- 
cgit v1.2.3


From 28c980d58e4d3b53c257d2979de99a746bb7b0e3 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Thu, 21 Mar 2013 15:19:31 -0400
Subject: Allow hypergraph output to be in plain text, gzip, or bzip2.

The output-search-graph-hypergraph flag now takes two params:

* The first param must be "none", "gzip", or "bzip2"
* The second param is the hypergraph directory,
  which must already exist
---
 moses-cmd/Jamfile  |  2 +-
 moses-cmd/Main.cpp | 23 ++++++++++++++++++-----
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index 04f395a81..42d54568f 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -1,4 +1,4 @@
-alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ../moses//moses ;
+alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ../moses//moses ;
 
 exe moses : Main.cpp deps ;
 exe lmbrgrid : LatticeMBRGrid.cpp deps ;
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index f4cdb4388..90c924a08 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -23,6 +23,11 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  * Moses main, for single-threaded and multi-threaded.
  **/
 
+#include <boost/iostreams/device/file.hpp>
+#include <boost/iostreams/filter/bzip2.hpp>
+#include <boost/iostreams/filter/gzip.hpp>
+#include <boost/iostreams/filtering_stream.hpp>
+
 #include <exception>
 #include <fstream>
 #include <sstream>
@@ -167,10 +172,18 @@ public:
     // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
     if (m_outputSearchGraphHypergraph) {
       stringstream fileName;
-      fileName << staticData.GetParam("output-search-graph-hypergraph")[0] << "/" << m_lineNumber;
-      std::ofstream *file = new std::ofstream;
-      file->open(fileName.str().c_str());
-      if (file->is_open() && file->good()) {
+      fileName << staticData.GetParam("output-search-graph-hypergraph")[1] << "/" << m_lineNumber;
+      boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
+      //      file->open(fileName.str().c_str());
+      string compression = staticData.GetParam("output-search-graph-hypergraph")[0];
+      if ( compression == "gzip" || compression == "gz" ) {
+	file->push( boost::iostreams::gzip_compressor() );
+      } else if ( compression == "bzip2" || compression == "bz2" ) {
+	file->push( boost::iostreams::bzip2_compressor() );
+      } 
+      file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
+      //      if (file->is_open() && file->good()) {
+      if (file->is_complete() && file->good()) {
 	//	ostringstream out;
 	//	fix(out,PRECISION);
 	fix(*file,PRECISION);
@@ -180,7 +193,7 @@ public:
       } else {
 	TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
       }
-      file -> close();
+      file -> pop();
       delete file;
     }
 
-- 
cgit v1.2.3


From b2bba0bae344cdace0ec3fe9dc4292bcae0a2a43 Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Thu, 21 Mar 2013 16:48:47 -0400
Subject: Work on compression and defaults for outputting hypergraphs.

---
 moses-cmd/IOWrapper.cpp |  9 -----
 moses-cmd/IOWrapper.h   |  2 --
 moses-cmd/Jamfile       |  2 +-
 moses-cmd/Main.cpp      | 95 +++++++++++++++++++++++++++++++++++++------------
 4 files changed, 73 insertions(+), 35 deletions(-)

diff --git a/moses-cmd/IOWrapper.cpp b/moses-cmd/IOWrapper.cpp
index 2da30f380..335a570a6 100644
--- a/moses-cmd/IOWrapper.cpp
+++ b/moses-cmd/IOWrapper.cpp
@@ -189,15 +189,6 @@ InputType*IOWrapper::GetInput(InputType* inputType)
   }
 }
 
-  ofstream* IOWrapper::GetOutputSearchGraphHypergraphWeightsStream() {
-    const StaticData &staticData = StaticData::Instance();
-    stringstream fileName;
-    fileName << staticData.GetParam("output-search-graph-hypergraph")[1];
-    std::ofstream *file = new std::ofstream;
-    file->open(fileName.str().c_str());
-    return file;
-  }
-
 /***
  * print surface factor only for the given phrase
  */
diff --git a/moses-cmd/IOWrapper.h b/moses-cmd/IOWrapper.h
index 267a3a0bc..8dbdeda9c 100644
--- a/moses-cmd/IOWrapper.h
+++ b/moses-cmd/IOWrapper.h
@@ -117,8 +117,6 @@ public:
     return *m_outputSearchGraphStream;
   }
 
-  std::ofstream *GetOutputSearchGraphHypergraphWeightsStream();
-
   std::ostream &GetDetailedTranslationReportingStream() {
     assert (m_detailedTranslationReportingStream);
     return *m_detailedTranslationReportingStream;
diff --git a/moses-cmd/Jamfile b/moses-cmd/Jamfile
index 42d54568f..bddc10911 100644
--- a/moses-cmd/Jamfile
+++ b/moses-cmd/Jamfile
@@ -1,4 +1,4 @@
-alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ../moses//moses ;
+alias deps : IOWrapper.cpp mbr.cpp LatticeMBR.cpp TranslationAnalysis.cpp ..//z ..//boost_iostreams ..//boost_filesystem ../moses//moses ;
 
 exe moses : Main.cpp deps ;
 exe lmbrgrid : LatticeMBRGrid.cpp deps ;
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 90c924a08..8f6a6c069 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -23,6 +23,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  * Moses main, for single-threaded and multi-threaded.
  **/
 
+#include <boost/filesystem.hpp>
 #include <boost/iostreams/device/file.hpp>
 #include <boost/iostreams/filter/bzip2.hpp>
 #include <boost/iostreams/filter/gzip.hpp>
@@ -171,30 +172,76 @@ public:
 
     // Output search graph in hypergraph format for Kenneth Heafield's lazy hypergraph decoder
     if (m_outputSearchGraphHypergraph) {
-      stringstream fileName;
-      fileName << staticData.GetParam("output-search-graph-hypergraph")[1] << "/" << m_lineNumber;
-      boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
-      //      file->open(fileName.str().c_str());
-      string compression = staticData.GetParam("output-search-graph-hypergraph")[0];
-      if ( compression == "gzip" || compression == "gz" ) {
-	file->push( boost::iostreams::gzip_compressor() );
-      } else if ( compression == "bzip2" || compression == "bz2" ) {
-	file->push( boost::iostreams::bzip2_compressor() );
+
+      vector<string> hypergraphParameters = staticData.GetParam("output-search-graph-hypergraph");
+
+      bool appendSuffix;
+      if (hypergraphParameters.size() > 0 && hypergraphParameters[0] == "true") {
+	appendSuffix = true;
+      } else {
+	appendSuffix = false;
+      }
+
+      string compression;
+      if (hypergraphParameters.size() > 1) {
+	compression = hypergraphParameters[1];
+      } else {
+	compression = "txt";
+      }
+
+      string hypergraphDir;
+      if ( hypergraphParameters.size() > 2 ) {
+	hypergraphDir = hypergraphParameters[2];
+      } else {
+	string nbestFile = staticData.GetNBestFilePath();
+	if ( ! nbestFile.empty()) {
+	  boost::filesystem::path nbestPath(nbestFile);
+	  hypergraphDir = nbestPath.parent_path().filename();
+	} else {
+	  stringstream hypergraphDirName;
+	  hypergraphDirName << boost::filesystem::current_path() << "/hypergraph";
+	  hypergraphDir = hypergraphDirName.str();
+	}
+      }
+
+      if ( ! boost::filesystem::exists(hypergraphDir) ) {
+	boost::filesystem::create_directory(hypergraphDir);
       } 
-      file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
-      //      if (file->is_open() && file->good()) {
-      if (file->is_complete() && file->good()) {
-	//	ostringstream out;
-	//	fix(out,PRECISION);
-	fix(*file,PRECISION);
-	manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
-	//	*file << out.str();
-	file -> flush();
+
+      if ( ! boost::filesystem::exists(hypergraphDir) ) {
+	TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because the directory does not exist" << std::endl);
+      } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
+	TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
       } else {
-	TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file is not open or not ready for writing" << std::endl);
+	
+	stringstream fileName;
+	fileName << hypergraphDir << "/" << m_lineNumber;
+	if ( appendSuffix ) {
+	  fileName << "." << compression;
+	}
+	boost::iostreams::filtering_ostream *file = new boost::iostreams::filtering_ostream;
+
+	if ( compression == "gz" ) {
+	  file->push( boost::iostreams::gzip_compressor() );
+	} else if ( compression == "bz2" ) {
+	  file->push( boost::iostreams::bzip2_compressor() );
+	} else if ( compression != "txt" ) {
+	  TRACE_ERR("Unrecognized hypergraph compression format (" << compression << ") - using uncompressed plain txt" << std::endl);
+	  compression = "txt";
+	}
+
+	file->push( boost::iostreams::file_sink(fileName.str(), ios_base::out) );
+
+	if (file->is_complete() && file->good()) {
+	  fix(*file,PRECISION);
+	  manager.OutputSearchGraphAsHypergraph(m_lineNumber, *file);
+	  file -> flush();
+	} else {
+	  TRACE_ERR("Cannot output hypergraph for line " << m_lineNumber << " because the output file " << fileName.str() << " is not open or not ready for writing" << std::endl);
+	}
+	file -> pop();
+	delete file;
       }
-      file -> pop();
-      delete file;
     }
 
     // apply decision rule and output best translation(s)
@@ -548,8 +595,10 @@ int main(int argc, char** argv)
       TRACE_ERR(weights);
       TRACE_ERR("\n");
     }
-    if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 1) {
-      ofstream* weightsOut = ioWrapper->GetOutputSearchGraphHypergraphWeightsStream();
+    if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
+      ofstream* weightsOut = new std::ofstream;
+      string weightsFilename = staticData.GetParam("output-search-graph-hypergraph")[3];
+      weightsOut->open(weightsFilename.c_str());
       OutputFeatureWeightsForHypergraph(*weightsOut);
       weightsOut->flush();
       weightsOut->close();
-- 
cgit v1.2.3


From 3a4e63c558a5d5beaefbb2e45497b66e4d9ae72f Mon Sep 17 00:00:00 2001
From: Lane Schwartz <dowobeha@gmail.com>
Date: Fri, 22 Mar 2013 12:14:28 -0400
Subject: Ensure directory exists for outputting hypergraphs

---
 moses-cmd/Main.cpp | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 8f6a6c069..8fd46ba38 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -23,6 +23,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  * Moses main, for single-threaded and multi-threaded.
  **/
 
+#include <boost/algorithm/string/predicate.hpp>
 #include <boost/filesystem.hpp>
 #include <boost/iostreams/device/file.hpp>
 #include <boost/iostreams/filter/bzip2.hpp>
@@ -194,7 +195,7 @@ public:
 	hypergraphDir = hypergraphParameters[2];
       } else {
 	string nbestFile = staticData.GetNBestFilePath();
-	if ( ! nbestFile.empty()) {
+	if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
 	  boost::filesystem::path nbestPath(nbestFile);
 	  hypergraphDir = nbestPath.parent_path().filename();
 	} else {
@@ -595,10 +596,26 @@ int main(int argc, char** argv)
       TRACE_ERR(weights);
       TRACE_ERR("\n");
     }
-    if (staticData.GetOutputSearchGraphHypergraph() && staticData.GetParam("output-search-graph-hypergraph").size() > 3) {
+    if (staticData.GetOutputSearchGraphHypergraph()) {
       ofstream* weightsOut = new std::ofstream;
-      string weightsFilename = staticData.GetParam("output-search-graph-hypergraph")[3];
-      weightsOut->open(weightsFilename.c_str());
+      stringstream weightsFilename;
+      if (staticData.GetParam("output-search-graph-hypergraph").size() > 3) { 
+	weightsFilename << staticData.GetParam("output-search-graph-hypergraph")[3];
+      } else {
+	string nbestFile = staticData.GetNBestFilePath();
+	if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
+	  boost::filesystem::path nbestPath(nbestFile);
+	  weightsFilename << nbestPath.parent_path().filename() << "/weights";
+	} else {
+	  weightsFilename << boost::filesystem::current_path() << "/hypergraph/weights";
+	}
+      }
+      boost::filesystem::path weightsFilePath(weightsFilename.str());
+      if ( ! boost::filesystem::exists(weightsFilePath.parent_path()) ) {
+	boost::filesystem::create_directory(weightsFilePath.parent_path());
+      }
+      TRACE_ERR("The weights file is " << weightsFilename.str() << "\n");
+      weightsOut->open(weightsFilename.str().c_str());
       OutputFeatureWeightsForHypergraph(*weightsOut);
       weightsOut->flush();
       weightsOut->close();
-- 
cgit v1.2.3


From db005f6503a5179e0427b460d8039e8920c8d65b Mon Sep 17 00:00:00 2001
From: Hieu Hoang <fishandfrolick@gmail.com>
Date: Sat, 23 Mar 2013 15:58:07 +0000
Subject: compile error caused by different versions of boost

---
 contrib/other-builds/moses-chart-cmd/.cproject | 5 +++--
 contrib/other-builds/moses-cmd/.cproject       | 5 +++--
 moses-cmd/Main.cpp                             | 4 ++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject
index 90a730cf7..4ca560326 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/moses-chart-cmd/.cproject
@@ -1,5 +1,7 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+<?fileVersion 4.0.0?>
+
+<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
 	<storageModule moduleId="org.eclipse.cdt.core.settings">
 		<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.162355801">
 			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@@ -74,7 +76,6 @@
 									<listOptionValue builtIn="false" value="boost_system-mt"/>
 									<listOptionValue builtIn="false" value="boost_thread-mt"/>
 									<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
-									<listOptionValue builtIn="false" value="rt"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 573fe715f..3d6d32a72 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -1,5 +1,7 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+<?fileVersion 4.0.0?>
+
+<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
 	<storageModule moduleId="org.eclipse.cdt.core.settings">
 		<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.461114338">
 			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@@ -73,7 +75,6 @@
 									<listOptionValue builtIn="false" value="boost_thread-mt"/>
 									<listOptionValue builtIn="false" value="lm"/>
 									<listOptionValue builtIn="false" value="util"/>
-									<listOptionValue builtIn="false" value="rt"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.983725033" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 8fd46ba38..ef85fd66b 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -192,12 +192,12 @@ public:
 
       string hypergraphDir;
       if ( hypergraphParameters.size() > 2 ) {
-	hypergraphDir = hypergraphParameters[2];
+        hypergraphDir = hypergraphParameters[2];
       } else {
 	string nbestFile = staticData.GetNBestFilePath();
 	if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
 	  boost::filesystem::path nbestPath(nbestFile);
-	  hypergraphDir = nbestPath.parent_path().filename();
+	  //hypergraphDir = nbestPath.parent_path().filename();
 	} else {
 	  stringstream hypergraphDirName;
 	  hypergraphDirName << boost::filesystem::current_path() << "/hypergraph";
-- 
cgit v1.2.3


From 4a954f2be7cc4cef32c08f6748d39f7430c1e4a5 Mon Sep 17 00:00:00 2001
From: Joan Puigcerver <joapuipe@gmail.com>
Date: Sun, 24 Mar 2013 20:01:59 +0100
Subject: Fixes compilation error using libboost_1_50

Problem spotted in https://github.com/moses-smt/mosesdecoder/issues/32
fixed.

According to the Boost documentation, nbestPath.parent_path().filename()
returns a path object; to get the corresponding std::string
representation, one must call one of the methods listed in:
http://www.boost.org/doc/libs/1_53_0/libs/filesystem/doc/reference.html#path-native-format-observers

native() is supposed to return the path in the specific OS
path format (using backslashes for Windows). Anyway, since we
are considering only the filename here, the result is the same.
---
 moses-cmd/Main.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index ef85fd66b..68d8049c4 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -197,7 +197,7 @@ public:
 	string nbestFile = staticData.GetNBestFilePath();
 	if ( ! nbestFile.empty() && nbestFile!="-" && !boost::starts_with(nbestFile,"/dev/stdout") ) {
 	  boost::filesystem::path nbestPath(nbestFile);
-	  //hypergraphDir = nbestPath.parent_path().filename();
+	  hypergraphDir = nbestPath.parent_path().filename().native();
 	} else {
 	  stringstream hypergraphDirName;
 	  hypergraphDirName << boost::filesystem::current_path() << "/hypergraph";
@@ -214,7 +214,6 @@ public:
       } else if ( ! boost::filesystem::is_directory(hypergraphDir) ) {
 	TRACE_ERR("Cannot output hypergraphs to " << hypergraphDir << " because that path exists, but is not a directory" << std::endl);
       } else {
-	
 	stringstream fileName;
 	fileName << hypergraphDir << "/" << m_lineNumber;
 	if ( appendSuffix ) {
-- 
cgit v1.2.3


From f2acca0943a7594740b0dbaa8ec70b6b8bbcb67d Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieuhoang@gmail.com>
Date: Tue, 26 Mar 2013 11:50:10 +0000
Subject: eclipse

---
 contrib/other-builds/moses-chart-cmd/.cproject | 9 +++++----
 contrib/other-builds/moses-cmd/.cproject       | 8 +++++---
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject
index 4ca560326..aae6822b6 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/moses-chart-cmd/.cproject
@@ -1,7 +1,5 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?>
-
-<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
 	<storageModule moduleId="org.eclipse.cdt.core.settings">
 		<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.162355801">
 			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.162355801" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@@ -62,6 +60,7 @@
 								</option>
 								<option id="gnu.cpp.link.option.libs.1177721357" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
 									<listOptionValue builtIn="false" value="dstruct"/>
+									<listOptionValue builtIn="false" value="rt"/>
 									<listOptionValue builtIn="false" value="flm"/>
 									<listOptionValue builtIn="false" value="oolm"/>
 									<listOptionValue builtIn="false" value="lattice"/>
@@ -73,9 +72,11 @@
 									<listOptionValue builtIn="false" value="lm"/>
 									<listOptionValue builtIn="false" value="util"/>
 									<listOptionValue builtIn="false" value="z"/>
+									<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+									<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
 									<listOptionValue builtIn="false" value="boost_system-mt"/>
 									<listOptionValue builtIn="false" value="boost_thread-mt"/>
-									<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
+									<listOptionValue builtIn="false" value="bz2"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 3d6d32a72..3a098bb9f 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -1,7 +1,5 @@
 <?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<?fileVersion 4.0.0?>
-
-<cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
+<?fileVersion 4.0.0?><cproject storage_type_id="org.eclipse.cdt.core.XmlProjectDescriptionStorage">
 	<storageModule moduleId="org.eclipse.cdt.core.settings">
 		<cconfiguration id="cdt.managedbuild.config.gnu.exe.debug.461114338">
 			<storageModule buildSystemId="org.eclipse.cdt.managedbuilder.core.configurationDataProvider" id="cdt.managedbuild.config.gnu.exe.debug.461114338" moduleId="org.eclipse.cdt.core.settings" name="Debug">
@@ -62,6 +60,7 @@
 								</option>
 								<option id="gnu.cpp.link.option.libs.998577284" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
 									<listOptionValue builtIn="false" value="dstruct"/>
+									<listOptionValue builtIn="false" value="rt"/>
 									<listOptionValue builtIn="false" value="flm"/>
 									<listOptionValue builtIn="false" value="lattice"/>
 									<listOptionValue builtIn="false" value="misc"/>
@@ -73,8 +72,11 @@
 									<listOptionValue builtIn="false" value="z"/>
 									<listOptionValue builtIn="false" value="boost_system-mt"/>
 									<listOptionValue builtIn="false" value="boost_thread-mt"/>
+									<listOptionValue builtIn="false" value="boost_iostreams-mt"/>
+									<listOptionValue builtIn="false" value="boost_filesystem-mt"/>
 									<listOptionValue builtIn="false" value="lm"/>
 									<listOptionValue builtIn="false" value="util"/>
+									<listOptionValue builtIn="false" value="bz2"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.983725033" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
-- 
cgit v1.2.3


From 51a59b881ec7c37761c80809b44e46d34bee39a5 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 26 Mar 2013 12:45:27 +0000
Subject: move ReadVoc() as method in PDTimp class. Ready for getting rid of
 static variable

---
 moses/TranslationModel/PhraseDictionaryTree.cpp | 35 ++++++++++++++-----------
 1 file changed, 19 insertions(+), 16 deletions(-)

diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp
index 515d2f649..024294c61 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@@ -156,22 +156,6 @@ PhraseDictionaryTree::PrefixPtr::operator bool() const
 
 typedef LVoc<std::string> WordVoc;
 
-static WordVoc* ReadVoc(const std::string& filename)
-{
-  static std::map<std::string,WordVoc*> vocs;
-#ifdef WITH_THREADS
-  boost::mutex mutex;
-  boost::mutex::scoped_lock lock(mutex);
-#endif
-  std::map<std::string,WordVoc*>::iterator vi = vocs.find(filename);
-  if (vi == vocs.end()) {
-    WordVoc* voc = new WordVoc();
-    voc->Read(filename);
-    vocs[filename] = voc;
-  }
-  return vocs[filename];
-}
-
 
 class PDTimp {
 public:
@@ -190,6 +174,8 @@ public:
   ObjectPool<PPimp> pPool;
   // a comparison with the Boost MemPools might be useful
 
+  std::map<std::string,WordVoc*> vocs;
+
   bool needwordalign, haswordAlign;
   bool printwordalign;
 
@@ -304,6 +290,8 @@ public:
 
     return PPtr();
   }
+
+  WordVoc* ReadVoc(const std::string& filename);
 };
 
 
@@ -376,6 +364,21 @@ void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const
   }
 }
 
+WordVoc* PDTimp::ReadVoc(const std::string& filename)
+{
+	#ifdef WITH_THREADS
+	boost::mutex mutex;
+	boost::mutex::scoped_lock lock(mutex);
+	#endif
+	std::map<std::string,WordVoc*>::iterator vi = vocs.find(filename);
+	if (vi == vocs.end()) {
+	  WordVoc* voc = new WordVoc();
+	  voc->Read(filename);
+	  vocs[filename] = voc;
+	}
+	return vocs[filename];
+}
+
 ////////////////////////////////////////////////////////////
 //
 // member functions of PhraseDictionaryTree
-- 
cgit v1.2.3


From e2b18c5337ca0b2e7a2cd94dfdae5fd7708f263b Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 26 Mar 2013 13:29:59 +0000
Subject: no leak message due to static variable in binary phrase table.
 Doesn't actually solve the mem leak though

---
 moses/TranslationModel/PhraseDictionaryTree.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp
index 024294c61..05b7afc4d 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@@ -186,6 +186,12 @@ public:
     if(os) fClose(os);
     if(ot) fClose(ot);
     FreeMemory();
+
+    std::map<std::string,WordVoc*>::iterator iter;
+    for (iter = vocs.begin(); iter != vocs.end(); ++iter) {
+      WordVoc *voc = iter->second;
+      delete voc;
+    }
   }
 
   inline void NeedAlignmentInfo(bool a) {
-- 
cgit v1.2.3


From f8afc7356965701fe4cb5fef51aa7e30a07e360e Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 26 Mar 2013 14:05:50 +0000
Subject: get rid of locking altogether. PDTimp already has separate source &
 target vocab variables. Use those instead

---
 moses/TranslationModel/PhraseDictionaryTree.cpp | 59 +++++++------------------
 1 file changed, 16 insertions(+), 43 deletions(-)

diff --git a/moses/TranslationModel/PhraseDictionaryTree.cpp b/moses/TranslationModel/PhraseDictionaryTree.cpp
index 05b7afc4d..675656112 100644
--- a/moses/TranslationModel/PhraseDictionaryTree.cpp
+++ b/moses/TranslationModel/PhraseDictionaryTree.cpp
@@ -168,14 +168,12 @@ public:
   std::vector<OFF_T> srcOffsets;
 
   FILE *os,*ot;
-  WordVoc* sv;
-  WordVoc* tv;
+  WordVoc sv;
+  WordVoc tv;
 
   ObjectPool<PPimp> pPool;
   // a comparison with the Boost MemPools might be useful
 
-  std::map<std::string,WordVoc*> vocs;
-
   bool needwordalign, haswordAlign;
   bool printwordalign;
 
@@ -186,12 +184,6 @@ public:
     if(os) fClose(os);
     if(ot) fClose(ot);
     FreeMemory();
-
-    std::map<std::string,WordVoc*>::iterator iter;
-    for (iter = vocs.begin(); iter != vocs.end(); ++iter) {
-      WordVoc *voc = iter->second;
-      delete voc;
-    }
   }
 
   inline void NeedAlignmentInfo(bool a) {
@@ -261,12 +253,12 @@ public:
 
       rv.back().tokens.reserve(iphrase.size());
       for(size_t j=0; j<iphrase.size(); ++j) {
-        rv.back().tokens.push_back(&tv->symbol(iphrase[j]));
+        rv.back().tokens.push_back(&tv.symbol(iphrase[j]));
       }
       rv.back().scores = i->GetScores();
       const IPhrase& fnames = i->GetFeatureNames();
       for (size_t j = 0; j < fnames.size(); ++j) {
-        rv.back().fnames.push_back(&tv->symbol(fnames[j]));
+        rv.back().fnames.push_back(&tv.symbol(fnames[j]));
       }
       rv.back().fvalues = i->GetFeatureValues();
       if (wa) wa->push_back(i->GetAlignment());
@@ -281,7 +273,7 @@ public:
     CHECK(p);
     if(w.empty() || w==EPSILON) return p;
 
-    LabelId wi=sv->index(w);
+    LabelId wi=sv.index(w);
 
     if(wi==InvalidLabelId) return PPtr(); // unknown word
     else if(p.imp->isRoot()) {
@@ -344,10 +336,8 @@ int PDTimp::Read(const std::string& fn)
   for(size_t i=0; i<data.size(); ++i)
     data[i]=CPT(os,srcOffsets[i]);
 
-  sv = ReadVoc(ifsv);
-  tv = ReadVoc(iftv);
-  //sv.Read(ifsv);
-  //tv.Read(iftv);
+  sv.Read(ifsv);
+  tv.Read(iftv);
 
   TRACE_ERR("binary phrasefile loaded, default OFF_T: "<<PTF::getDefault()
             <<"\n");
@@ -364,27 +354,12 @@ void PDTimp::PrintTgtCand(const TgtCands& tcand,std::ostream& out) const
     const IPhrase& iphr=tcand[i].GetPhrase();
 
     out << i << " -- " << sc << " -- ";
-    for(size_t j=0; j<iphr.size(); ++j)			out << tv->symbol(iphr[j])<<" ";
+    for(size_t j=0; j<iphr.size(); ++j)			out << tv.symbol(iphr[j])<<" ";
     out<< " -- " << trgAlign;
     out << std::endl;
   }
 }
 
-WordVoc* PDTimp::ReadVoc(const std::string& filename)
-{
-	#ifdef WITH_THREADS
-	boost::mutex mutex;
-	boost::mutex::scoped_lock lock(mutex);
-	#endif
-	std::map<std::string,WordVoc*>::iterator vi = vocs.find(filename);
-	if (vi == vocs.end()) {
-	  WordVoc* voc = new WordVoc();
-	  voc->Read(filename);
-	  vocs[filename] = voc;
-	}
-	return vocs[filename];
-}
-
 ////////////////////////////////////////////////////////////
 //
 // member functions of PhraseDictionaryTree
@@ -432,7 +407,7 @@ GetTargetCandidates(const std::vector<std::string>& src,
 {
   IPhrase f(src.size());
   for(size_t i=0; i<src.size(); ++i) {
-    f[i]=imp->sv->index(src[i]);
+    f[i]=imp->sv.index(src[i]);
     if(f[i]==InvalidLabelId) return;
   }
 
@@ -448,7 +423,7 @@ GetTargetCandidates(const std::vector<std::string>& src,
 {
   IPhrase f(src.size());
   for(size_t i=0; i<src.size(); ++i) {
-    f[i]=imp->sv->index(src[i]);
+    f[i]=imp->sv.index(src[i]);
     if(f[i]==InvalidLabelId) return;
   }
 
@@ -464,7 +439,7 @@ PrintTargetCandidates(const std::vector<std::string>& src,
 {
   IPhrase f(src.size());
   for(size_t i=0; i<src.size(); ++i) {
-    f[i]=imp->sv->index(src[i]);
+    f[i]=imp->sv.index(src[i]);
     if(f[i]==InvalidLabelId) {
       TRACE_ERR("the source phrase '"<<src<<"' contains an unknown word '"
                 <<src[i]<<"'\n");
@@ -506,8 +481,6 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
   std::vector<OFF_T> vo;
   size_t lnc=0;
   size_t numElement = NOT_FOUND; // 3=old format, 5=async format which include word alignment info
-  imp->sv = new WordVoc();
-  imp->tv = new WordVoc();
   size_t missingAlignmentCount = 0; 
 
   while(getline(inFile, line)) {
@@ -541,11 +514,11 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
 
     std::vector<std::string> wordVec = Tokenize(sourcePhraseString);
     for (size_t i = 0 ; i < wordVec.size() ; ++i)
-      f.push_back(imp->sv->add(wordVec[i]));
+      f.push_back(imp->sv.add(wordVec[i]));
 
     wordVec = Tokenize(targetPhraseString);
     for (size_t i = 0 ; i < wordVec.size() ; ++i)
-      e.push_back(imp->tv->add(wordVec[i]));
+      e.push_back(imp->tv.add(wordVec[i]));
 
     //			while(is>>w && w!="|||") sc.push_back(atof(w.c_str()));
     // Mauro: to handle 0 probs in phrase tables
@@ -585,7 +558,7 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
         abort();  
       }
       for (size_t i = 0; i < sparseTokens.size(); i+=2) {
-        fnames.push_back(imp->tv->add(sparseTokens[i]));
+        fnames.push_back(imp->tv.add(sparseTokens[i]));
         fvalues.push_back(Scan<FValue>(sparseTokens[i+1]));
       }
     }
@@ -672,8 +645,8 @@ int PhraseDictionaryTree::Create(std::istream& inFile,const std::string& out)
   fWriteVector(oi,vo);
   fClose(oi);
 
-  imp->sv->Write(ofsv);
-  imp->tv->Write(oftv);
+  imp->sv.Write(ofsv);
+  imp->tv.Write(oftv);
 
   return 1;
 }
-- 
cgit v1.2.3


From eeeda717a41b41db784564c88020f15a2f5bef57 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 26 Mar 2013 15:47:30 +0000
Subject: eclipse

---
 contrib/other-builds/moses-cmd/.cproject | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/other-builds/moses-cmd/.cproject b/contrib/other-builds/moses-cmd/.cproject
index 3a098bb9f..42d2100d8 100644
--- a/contrib/other-builds/moses-cmd/.cproject
+++ b/contrib/other-builds/moses-cmd/.cproject
@@ -60,7 +60,6 @@
 								</option>
 								<option id="gnu.cpp.link.option.libs.998577284" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
 									<listOptionValue builtIn="false" value="dstruct"/>
-									<listOptionValue builtIn="false" value="rt"/>
 									<listOptionValue builtIn="false" value="flm"/>
 									<listOptionValue builtIn="false" value="lattice"/>
 									<listOptionValue builtIn="false" value="misc"/>
@@ -77,6 +76,7 @@
 									<listOptionValue builtIn="false" value="lm"/>
 									<listOptionValue builtIn="false" value="util"/>
 									<listOptionValue builtIn="false" value="bz2"/>
+									<listOptionValue builtIn="false" value="rt"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.983725033" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
-- 
cgit v1.2.3


From ae53bc91d1a0186d791a98570c544b9999f39bd8 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Thu, 28 Mar 2013 10:27:11 +0000
Subject: Nicer error message for too many factor delimiters

---
 moses/Word.cpp | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/moses/Word.cpp b/moses/Word.cpp
index c23e8de8c..2c1ac09ea 100644
--- a/moses/Word.cpp
+++ b/moses/Word.cpp
@@ -25,6 +25,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "Word.h"
 #include "TypeDef.h"
 #include "StaticData.h"  // needed to determine the FactorDelimiter
+#include "util/exception.hh"
 #include "util/tokenize_piece.hh"
 
 using namespace std;
@@ -95,6 +96,8 @@ std::string Word::GetString(FactorType factorType) const
   	return NULL;
 }
 
+class StrayFactorException : public util::Exception {};
+
 void Word::CreateFromString(FactorDirection direction
                             , const std::vector<FactorType> &factorOrder
                             , const StringPiece &str
@@ -106,7 +109,7 @@ void Word::CreateFromString(FactorDirection direction
   for (size_t ind = 0; ind < factorOrder.size() && fit; ++ind, ++fit) {
     m_factorArray[factorOrder[ind]] = factorCollection.AddFactor(*fit);
   }
-  CHECK(!fit);
+  UTIL_THROW_IF(fit, StrayFactorException, "You have configured " << factorOrder.size() << " factors but the word " << str << " contains factor delimiter " << StaticData::Instance().GetFactorDelimiter() << " too many times.");
 
   // assume term/non-term same for all factors
   m_isNonTerminal = isNonTerminal;
-- 
cgit v1.2.3


From 627f3f908cbc4a1dc4c5970a13b45bd5fcc66f82 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Sun, 31 Mar 2013 15:58:34 +0100
Subject: OS X returns EINVAL for write > 2^31

---
 util/file.cc | 37 ++++++++++++++++++++++++++++---------
 1 file changed, 28 insertions(+), 9 deletions(-)

diff --git a/util/file.cc b/util/file.cc
index 86d9b12de..c7d8e23b2 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -111,15 +111,26 @@ void ResizeOrThrow(int fd, uint64_t to) {
   UTIL_THROW_IF_ARG(ret, FDException, (fd), "while resizing to " << to << " bytes");
 }
 
+namespace {
+std::size_t GuardLarge(std::size_t size) {
+  // The following operating systems have broken read/write/pread/pwrite that
+  // only supports up to 2^31.
+#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID)
+  return std::min(static_cast<std::size_t>(INT_MAX), size);
+#else
+  return size;
+#endif
+}
+}
+
 std::size_t PartialRead(int fd, void *to, std::size_t amount) {
 #if defined(_WIN32) || defined(_WIN64)
-  amount = min(static_cast<std::size_t>(INT_MAX), amount);
-  int ret = _read(fd, to, amount); 
+  int ret = _read(fd, to, GuardLarge(amount));
 #else
   errno = 0;
   ssize_t ret;
   do {
-    ret = read(fd, to, amount);
+    ret = read(fd, to, GuardLarge(amount));
   } while (ret == -1 && errno == EINTR);
 #endif
   UTIL_THROW_IF_ARG(ret < 0, FDException, (fd), "while reading " << amount << " bytes");
@@ -169,11 +180,13 @@ void PReadOrThrow(int fd, void *to_void, std::size_t size, uint64_t off) {
     ssize_t ret;
     errno = 0;
     do {
+      ret =
 #ifdef OS_ANDROID
-      ret = pread64(fd, to, size, off);
+        pread64
 #else
-      ret = pread(fd, to, size, off);
+        pread
 #endif
+        (fd, to, GuardLarge(size), off);
     } while (ret == -1 && errno == EINTR);
     if (ret <= 0) {
       UTIL_THROW_IF(ret == 0, EndOfFileException, " for reading " << size << " bytes at " << off << " from " << NameFromFD(fd));
@@ -190,14 +203,20 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
   const uint8_t *data = static_cast<const uint8_t*>(data_void);
   while (size) {
 #if defined(_WIN32) || defined(_WIN64)
-    int ret = write(fd, data, min(static_cast<std::size_t>(INT_MAX), size));
+    int ret;
 #else
-    errno = 0;
     ssize_t ret;
+#endif
+    errno = 0;
     do {
-      ret = write(fd, data, size);
-    } while (ret == -1 && errno == EINTR);
+      ret = 
+#if defined(_WIN32) || defined(_WIN64)
+        _write
+#else
+        write
 #endif
+        (fd, data, GuardLarge(size));
+    } while (ret == -1 && errno == EINTR);
     UTIL_THROW_IF_ARG(ret < 1, FDException, (fd), "while writing " << size << " bytes");
     data += ret;
     size -= ret;
-- 
cgit v1.2.3


From b144e48fcc5c84b24595899ec53eae5dc4bf2428 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Mon, 1 Apr 2013 11:05:55 +0100
Subject: Make failure to parse a boolean argument fatal instead of log +
 interpret as false.

---
 moses/Util.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/moses/Util.cpp b/moses/Util.cpp
index 98de1241e..495e05124 100644
--- a/moses/Util.cpp
+++ b/moses/Util.cpp
@@ -35,6 +35,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 #include "TypeDef.h"
 #include "Util.h"
 #include "Timer.h"
+#include "util/exception.hh"
 #include "util/file.hh"
 
 using namespace std;
@@ -65,6 +66,8 @@ const std::string ToLower(const std::string& str)
   return lc;
 }
 
+class BoolValueException : public util::Exception {};
+
 template<>
 bool Scan<bool>(const std::string &input)
 {
@@ -73,8 +76,7 @@ bool Scan<bool>(const std::string &input)
     return true;
   if (lc == "no" || lc == "n" || lc =="false" || lc == "0")
     return false;
-  TRACE_ERR( "Scan<bool>: didn't understand '" << lc << "', returning false" << std::endl);
-  return false;
+  UTIL_THROW(BoolValueException, "Could not interpret " << input << " as a boolean.  After lowercasing, valid values are yes, y, true, 1, no, n, false, and 0.");
 }
 
 bool FileExists(const std::string& filePath)
-- 
cgit v1.2.3


From 0a978e9f01c566292da6586ca4aa82ae5e77b53c Mon Sep 17 00:00:00 2001
From: phikoehn <pkoehn@inf.ed.ac.uk>
Date: Mon, 1 Apr 2013 14:31:32 +0100
Subject: bug fixes

---
 OnDiskPt/Main.cpp                 |  1 +
 scripts/ems/support/analysis.perl | 17 +++++++++++++----
 scripts/training/train-model.perl |  3 ++-
 3 files changed, 16 insertions(+), 5 deletions(-)

diff --git a/OnDiskPt/Main.cpp b/OnDiskPt/Main.cpp
index 5f6da5a33..5d4e0be8d 100644
--- a/OnDiskPt/Main.cpp
+++ b/OnDiskPt/Main.cpp
@@ -174,6 +174,7 @@ OnDiskPt::PhrasePtr Tokenize(SourcePhrase &sourcePhrase, TargetPhrase &targetPhr
         break;
 	}
       default:
+        cerr << "ERROR in line " << line << endl;
         assert(false);
         break;
       }
diff --git a/scripts/ems/support/analysis.perl b/scripts/ems/support/analysis.perl
index 29962ca71..a2f9580a9 100755
--- a/scripts/ems/support/analysis.perl
+++ b/scripts/ems/support/analysis.perl
@@ -745,7 +745,8 @@ sub hierarchical_segmentation {
     open(OUTPUT_TREE,">$dir/output-tree") or die "Cannot open: $!";
     open(NODE,">$dir/node") or die "Cannot open: $!";
     while(<TRACE>) {
-	/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+)  : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ || die("cannot scan line $_");
+	/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+)  : (\S+) \-\>(.+) :([\(\),\d\- ]*): pC=[\d\.\-e]+, c=/ ||
+	/^Trans Opt (\d+) \[(\d+)\.\.(\d+)\]: (.+)  : (\S+) \-\>(.+) :([\(\),\d\- ]*): c=/ || die("cannot scan line $_");
 	my ($sentence,$start,$end,$spans,$rule_lhs,$rule_rhs,$alignment) = ($1,$2,$3,$4,$5,$6,$7);
 	if ($last_sentence >= 0 && $sentence != $last_sentence) {
 	    &hs_process($last_sentence,\@DERIVATION,\%STATS);
@@ -1137,9 +1138,17 @@ sub process_search_graph {
   `mkdir -p $dir/search-graph`;
   my $last_sentence = -1;
   while(<OSG>) {
-    /^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</ || die("ERROR: buggy search graph line: $_"); 
-    my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) 
-      = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
+    my ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score);
+    if (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): pC=([\de\-\.]+), c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] \<\</) {
+      ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$heuristic_rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
+    }
+    elsif (/^(\d+) (\d+)\-?\>?(\S*) (\S+) =\> (.+) :(.*): c=([\de\-\.]+) \[(\d+)\.\.(\d+)\] (.*)\[total=([\d\-\.]+)\] core/) {
+      ($sentence,$id,$recomb,$lhs,$output,$alignment,$rule_score,$from,$to,$children,$hyp_score) = ($1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12);
+      $heuristic_rule_score = $rule_score; # hmmmm....
+    }
+    else {
+      die("ERROR: buggy search graph line: $_"); 
+    }
     chop($alignment) if $alignment;
     chop($children) if $children;
     $recomb = 0 unless $recomb;
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 5b0553581..e4292007e 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -38,8 +38,9 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
    $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
    @_ADDITIONAL_INI,$_ADDITIONAL_INI_FILE,
-   $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_CORPUS, $_BASELINE_ALIGNMENT,
+   $_SPARSE_TRANSLATION_TABLE, @_BASELINE_ALIGNMENT_MODEL, $_BASELINE_EXTRACT, $_BASELINE_ALIGNMENT,
    $_DICTIONARY, $_SPARSE_PHRASE_FEATURES, $_EPPEX, $_INSTANCE_WEIGHTS_FILE, $_LMODEL_OOV_FEATURE, $IGNORE);
+my $_BASELINE_CORPUS = "";
 my $_CORES = 1;
 
 my $debug = 0; # debug this script, do not delete any files in debug mode
-- 
cgit v1.2.3


From 354d1a9474eef9aaa6ae569b4b82f2d16b2d6395 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 2 Apr 2013 14:35:20 +0100
Subject: add back -early-distortion-cost accidentally deleted

---
 moses/Parameter.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/moses/Parameter.cpp b/moses/Parameter.cpp
index 356cf219b..6a9745ade 100644
--- a/moses/Parameter.cpp
+++ b/moses/Parameter.cpp
@@ -107,6 +107,7 @@ Parameter::Parameter()
 	AddParam("monotone-at-punctuation", "mp", "do not reorder over punctuation");
 	AddParam("distortion-file", "source factors (0 if table independent of source), target factors, location of the factorized/lexicalized reordering tables");
  	AddParam("distortion", "configurations for each factorized/lexicalized reordering model.");
+ 	AddParam("early-distortion-cost", "edc", "include estimate of distortion cost yet to be incurred in the score [Moore & Quirk 2007]. Default is no");
 	AddParam("xml-input", "xi", "allows markup of input with desired translations and probabilities. values can be 'pass-through' (default), 'inclusive', 'exclusive', 'ignore'");
   AddParam("xml-brackets", "xb", "specify strings to be used as xml tags opening and closing, e.g. \"{{ }}\" (default \"< >\"). Avoid square brackets because of configuration file format. Valid only with text input mode" );
  	AddParam("minimum-bayes-risk", "mbr", "use miminum Bayes risk to determine best translation");
-- 
cgit v1.2.3


From 93433cf0157ecadd19162929a077aa6179dc0d8b Mon Sep 17 00:00:00 2001
From: Ondrej Bojar <bojar@ufal.mff.cuni.cz>
Date: Wed, 3 Apr 2013 18:07:42 +0200
Subject: support --translation-details OUTFILE in moses-parallel

---
 scripts/generic/moses-parallel.pl | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl
index d1840fc55..b8d393e71 100755
--- a/scripts/generic/moses-parallel.pl
+++ b/scripts/generic/moses-parallel.pl
@@ -64,6 +64,7 @@ my $wordgraphfile=undef;
 my $wordgraphflag=0;
 my $robust=5; # resubmit crashed jobs robust-times
 my $alifile=undef;
+my $detailsfile=undef;
 my $logfile="";
 my $logflag="";
 my $searchgraphlist="";
@@ -93,6 +94,7 @@ sub init(){
 	     'output-search-graph|osg=s'=> \$searchgraphlist,
              'output-word-graph|owg=s'=> \$wordgraphlist,
              'alignment-output-file=s'=> \$alifile,
+             'translation-details|T=s'=> \$detailsfile,
 	     'qsub-prefix=s'=> \$qsubname,
 	     'queue-parameters=s'=> \$queueparameters,
 	     'inputtype=i'=> \$inputtype,
@@ -539,6 +541,7 @@ while ($robust && scalar @idx_todo) {
 concatenate_1best();
 concatenate_logs() if $logflag;
 concatenate_ali() if defined $alifile;  
+concatenate_details() if defined $detailsfile;  
 concatenate_nbest() if $nbestflag;  
 safesystem("cat nbest$$ >> /dev/stdout") if $nbestlist[0] eq '-';
 
@@ -580,6 +583,11 @@ sub preparing_script(){
       $tmpalioutfile="-alignment-output-file $tmpdir/$alifile.$splitpfx$idx";
     }
 
+    my $tmpdetailsoutfile = "";
+    if (defined $detailsfile){
+      $tmpdetailsoutfile="-translation-details $tmpdir/$detailsfile.$splitpfx$idx";
+    }
+
     my $tmpsearchgraphlist="";
     if ($searchgraphflag){
       $tmpsearchgraphlist="-output-search-graph $tmpdir/$searchgraphfile.$splitpfx$idx";
@@ -592,13 +600,17 @@ sub preparing_script(){
 
 	my $tmpStartTranslationId = ""; # "-start-translation-id $currStartTranslationId";
 
-    print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n";
+    print OUT "$mosescmd $mosesparameters $tmpStartTranslationId $tmpalioutfile $tmpdetailsoutfile $tmpwordgraphlist $tmpsearchgraphlist $tmpnbestlist $inputmethod ${inputfile}.$splitpfx$idx > $tmpdir/${inputfile}.$splitpfx$idx.trans\n\n";
     print OUT "echo exit status \$\?\n\n";
 
     if (defined $alifile){
       print OUT "\\mv -f $tmpdir/${alifile}.$splitpfx$idx .\n\n";
       print OUT "echo exit status \$\?\n\n";
     }
+    if (defined $detailsfile){
+      print OUT "\\mv -f $tmpdir/${detailsfile}.$splitpfx$idx .\n\n";
+      print OUT "echo exit status \$\?\n\n";
+    }
     if ($nbestflag){
       print OUT "\\mv -f $tmpdir/${nbestfile}.$splitpfx$idx .\n\n";
       print OUT "echo exit status \$\?\n\n";
@@ -827,6 +839,18 @@ sub concatenate_ali(){
   close(OUT);
 }
 
+# Merge the per-split translation-details files, in @idxlist order, into $detailsfile.
+sub concatenate_details(){
+  open (OUT, "> ${detailsfile}");
+  foreach my $idx (@idxlist){
+    open (IN, "$detailsfile.$splitpfx$idx");
+    my @in=<IN>;
+    print OUT @in;  # print the list itself: "@in" would join the lines with spaces
+    close(IN);
+  }
+  close(OUT);
+}
+
 
 sub check_exit_status(){
   print STDERR "check_exit_status\n";
@@ -925,6 +949,7 @@ sub remove_temporary_files(){
     unlink("${inputfile}.${splitpfx}${idx}.trans");
     unlink("${inputfile}.${splitpfx}${idx}");
     if (defined $alifile){ unlink("${alifile}.${splitpfx}${idx}"); }
+    if (defined $detailsfile){ unlink("${detailsfile}.${splitpfx}${idx}"); }
     if ($nbestflag){ unlink("${nbestfile}.${splitpfx}${idx}"); }
     if ($searchgraphflag){ unlink("${searchgraphfile}.${splitpfx}${idx}"); }
     if ($wordgraphflag){ unlink("${wordgraphfile}.${splitpfx}${idx}"); }
-- 
cgit v1.2.3


From ac82be3120ac0b143039e2e36dff8b8538bdcb68 Mon Sep 17 00:00:00 2001
From: phikoehn <pkoehn@inf.ed.ac.uk>
Date: Wed, 3 Apr 2013 21:59:03 +0100
Subject: Hal moved. We follow.

---
 scripts/training/mert-moses.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index 0ac3b414f..f73b58120 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -360,7 +360,7 @@ my $pro_optimizer = File::Spec->catfile($mertdir, "megam_i686.opt");  # or set t
 
 if (($___PAIRWISE_RANKED_OPTIMIZER || $___PRO_STARTING_POINT) && ! -x $pro_optimizer) {
   print "Could not find $pro_optimizer, installing it in $mertdir\n";
-  my $megam_url = "http://www.umiacs.umd.edu/~hal/megam/";
+  my $megam_url = "http://hal3.name/megam";
   if (&is_mac_osx()) {
     die "Error: Sorry for Mac OS X users! Please get the source code of megam and compile by hand. Please see $megam_url for details.";
   }
-- 
cgit v1.2.3


From c016b6e04b9a0c6eddc423b0b839021f00599bbe Mon Sep 17 00:00:00 2001
From: phikoehn <pkoehn@inf.ed.ac.uk>
Date: Fri, 5 Apr 2013 11:26:00 +0100
Subject: extended display options for biconcor

---
 biconcor/PhrasePair.cpp           | 37 ++++++++++++++++-
 biconcor/PhrasePair.h             |  3 +-
 biconcor/PhrasePairCollection.cpp | 87 ++++++++++++++++++++++-----------------
 biconcor/PhrasePairCollection.h   | 10 ++---
 biconcor/biconcor.cpp             | 65 +++++++++++++++++++++++++----
 5 files changed, 150 insertions(+), 52 deletions(-)

diff --git a/biconcor/PhrasePair.cpp b/biconcor/PhrasePair.cpp
index 9c16be77c..038fa3a31 100644
--- a/biconcor/PhrasePair.cpp
+++ b/biconcor/PhrasePair.cpp
@@ -8,7 +8,42 @@
 
 using namespace std;
 
-void PhrasePair::Print( ostream* out, int width ) const
+void PhrasePair::Print( ostream* out ) const
+{
+  // source
+  int sentence_start = m_source_position - m_source_start;
+  char source_length = m_suffixArray->GetSentenceLength( m_suffixArray->GetSentence( m_source_position ) );
+
+  for( char i=0; i<source_length; i++ ) {
+    if (i>0) *out << " ";
+    *out << m_suffixArray->GetWord( sentence_start + i );
+  }
+
+  // target
+  *out << " |||";
+  for( char i=0; i<m_target_length; i++ ) {
+    *out << " " << m_targetCorpus->GetWord( m_sentence_id, i);
+  }
+
+  // source span
+  *out << " ||| " << (int)m_source_start << " " << (int)m_source_end;
+
+  // target span
+  *out << " ||| " << (int)m_target_start << " " << (int)m_target_end;
+
+  // word alignment
+  *out << " |||";
+
+  INDEX ap_points = m_alignment->GetNumberOfAlignmentPoints( m_sentence_id );
+  for( INDEX i=0; i<ap_points; i++) {
+    *out << " " << m_alignment->GetSourceWord( m_sentence_id, i )
+	 << "-" << m_alignment->GetTargetWord( m_sentence_id, i );
+  }
+
+  *out << endl;
+}
+
+void PhrasePair::PrintPretty( ostream* out, int width ) const
 {
   vector< WORD_ID >::const_iterator t;
 
diff --git a/biconcor/PhrasePair.h b/biconcor/PhrasePair.h
index f8a7881a0..f1dadb637 100644
--- a/biconcor/PhrasePair.h
+++ b/biconcor/PhrasePair.h
@@ -43,7 +43,8 @@ public:
   ~PhrasePair () {}
 
   void PrintTarget( std::ostream* out ) const;
-  void Print( std::ostream* out, int width ) const;
+  void Print( std::ostream* out ) const;
+  void PrintPretty( std::ostream* out, int width ) const;
   void PrintHTML( std::ostream* out ) const;
   void PrintClippedHTML( std::ostream* out, int width ) const;
 };
diff --git a/biconcor/PhrasePairCollection.cpp b/biconcor/PhrasePairCollection.cpp
index 17c95d24a..7497b2af8 100644
--- a/biconcor/PhrasePairCollection.cpp
+++ b/biconcor/PhrasePairCollection.cpp
@@ -13,31 +13,32 @@
 
 using namespace std;
 
-PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a )
+PhrasePairCollection::PhrasePairCollection( SuffixArray *sa, TargetCorpus *tc, Alignment *a, int max_translation, int max_example )
   :m_suffixArray(sa)
   ,m_targetCorpus(tc)
   ,m_alignment(a)
   ,m_size(0)
-  ,m_max_lookup(10000)
-  ,m_max_pp_target(50)
-  ,m_max_pp(50)
+  ,m_max_lookup(10000)          // maximum number of source occurrences sampled
+  ,m_max_translation(max_translation)    // max number of different distinct translations returned
+  ,m_max_example(max_example) // max number of examples returned for each distinct translation
 {}
 
 PhrasePairCollection::~PhrasePairCollection()
 {}
 
-bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
+int PhrasePairCollection::GetCollection( const vector< string >& sourceString )
 {
   INDEX first_match, last_match;
   if (! m_suffixArray->FindMatches( sourceString, first_match, last_match )) {
-    return false;
+    return 0;
   }
-  cerr << "\tfirst match " << first_match << endl;
-  cerr << "\tlast match " << last_match << endl;
+  //cerr << "\tfirst match " << first_match << endl;
+  //cerr << "\tlast match " << last_match << endl;
 
   INDEX found = last_match - first_match +1;
 
   map< vector< WORD_ID >, INDEX > index;
+  int real_count = 0;
   for( INDEX i=first_match; i<=last_match; i++ ) {
     int position = m_suffixArray->GetPosition( i );
     int source_start = m_suffixArray->GetWordInSentence( position );
@@ -45,23 +46,23 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
     INDEX sentence_id = m_suffixArray->GetSentence( position );
     int sentence_length = m_suffixArray->GetSentenceLength( sentence_id );
     int target_length = m_targetCorpus->GetSentenceLength( sentence_id );
-    cerr << "match " << (i-first_match)
-         << " in sentence " << sentence_id
-         << ", starting at word " << source_start
-         << " of " << sentence_length
-         << ". target sentence has " << target_length << " words.";
+    //cerr << "match " << (i-first_match)
+         //<< " in sentence " << sentence_id
+         //<< ", starting at word " << source_start
+         //<< " of " << sentence_length
+         //<< ". target sentence has " << target_length << " words.";
     int target_start, target_end, pre_null, post_null;
     if (m_alignment->PhraseAlignment( sentence_id, target_length, source_start, source_end, target_start, target_end, pre_null, post_null)) {
-      cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
-      cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
+      //cerr << " aligned to [" << (int)target_start << "," << (int)target_end << "]";
+      //cerr << " +(" << (int)pre_null << "," << (int)post_null << ")";
 			bool null_boundary_words = false;
       for (int pre = 0; pre <= pre_null && (pre == 0 || null_boundary_words); pre++ ) {
         for (int post = 0; post <= post_null && (post == 0 || null_boundary_words); post++ ) {
           vector< WORD_ID > targetString;
-          cerr << "; ";
+          //cerr << "; ";
           for (int target = target_start - pre; target <= target_end + post; target++) {
             targetString.push_back( m_targetCorpus->GetWordId( sentence_id, target) );
-            cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
+            //cerr << m_targetCorpus->GetWord( sentence_id, target) << " ";
           }
           PhrasePair *phrasePair = new PhrasePair( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, target_length, position, source_start, source_end, target_start-pre, target_end+post, pre, post, pre_null-pre, post_null-post);
           // matchCollection.Add( sentence_id, )
@@ -76,37 +77,47 @@ bool PhrasePairCollection::GetCollection( const vector< string >& sourceString )
       }
     }
 		else {
-			cerr << "mismatch " << (i-first_match)
-					 << " in sentence " << sentence_id
-					 << ", starting at word " << source_start
-					 << " of " << sentence_length
-					 << ". target sentence has " << target_length << " words.";
+			//cerr << "mismatch " << (i-first_match)
+			//		 << " in sentence " << sentence_id
+			//		 << ", starting at word " << source_start
+			//		 << " of " << sentence_length
+			//		 << ". target sentence has " << target_length << " words.";
 			Mismatch *mismatch = new Mismatch( m_suffixArray, m_targetCorpus, m_alignment, sentence_id, position, sentence_length, target_length, source_start, source_end );
 			if (mismatch->Unaligned())
 				m_unaligned.push_back( mismatch );
 			else
 				m_mismatch.push_back( mismatch );
 		}
-    cerr << endl;
+    //cerr << endl;
 
     if (found > (INDEX)m_max_lookup) {
       i += found/m_max_lookup-1;
     }
+    real_count++;
   }
   sort(m_collection.begin(), m_collection.end(), CompareBySize());
-  return true;
+  return real_count;
 }
 
-void PhrasePairCollection::Print() const
+void PhrasePairCollection::Print(bool pretty) const
 {
   vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
-  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end(); ppWithSameTarget++ ) {
+  int i=0;
+  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && i<m_max_translation; i++, ppWithSameTarget++ ) {
     (*(ppWithSameTarget->begin()))->PrintTarget( &cout );
     int count = ppWithSameTarget->size();
     cout << "(" << count << ")" << endl;
-    vector< PhrasePair* >::const_iterator p;
-    for(p = ppWithSameTarget->begin(); p != ppWithSameTarget->end(); p++ ) {
-      (*p)->Print( &cout, 100 );
+    vector< PhrasePair* >::const_iterator p = ppWithSameTarget->begin();
+    for(int j=0; j<ppWithSameTarget->size() && j<m_max_example; j++, p++ ) {
+      if (pretty) {
+        (*p)->PrintPretty( &cout, 100 );
+      }
+      else {
+        (*p)->Print( &cout );
+      }
+      if (ppWithSameTarget->size() > m_max_example) {
+        p += ppWithSameTarget->size()/m_max_example-1;
+      }
     }
   }
 }
@@ -117,7 +128,7 @@ void PhrasePairCollection::PrintHTML() const
 	bool singleton = false;
 	// loop over all translations
   vector< vector<PhrasePair*> >::const_iterator ppWithSameTarget;
-  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_pp_target; ppWithSameTarget++, pp_target++ ) {
+  for( ppWithSameTarget = m_collection.begin(); ppWithSameTarget != m_collection.end() && pp_target<m_max_translation; ppWithSameTarget++, pp_target++ ) {
 
 		int count = ppWithSameTarget->size();
 		if (!singleton) {
@@ -143,9 +154,9 @@ void PhrasePairCollection::PrintHTML() const
 		int i=0;
     for(p = ppWithSameTarget->begin(); i<10 && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
       (*p)->PrintClippedHTML( &cout, 160 );
-      if (count > m_max_pp) {
-        p += count/m_max_pp-1;
-        pp += count/m_max_pp-1;
+      if (count > m_max_example) {
+        p += count/m_max_example-1;
+        pp += count/m_max_example-1;
       }
     }
 		if (i == 10 && pp < count) {			
@@ -153,11 +164,11 @@ void PhrasePairCollection::PrintHTML() const
 			cout << "<tr><td colspan=7 align=center class=\"pp_more\" onclick=\"javascript:document.getElementById('pp_" << pp_target << "').style.display = 'none'; document.getElementById('pp_ext_" << pp_target << "').style.display = 'block';\">(more)</td></tr></table></div>";
 			cout << "<div id=\"pp_ext_" << pp_target << "\" style=\"display:none;\";\">";
 			cout << "<table align=\"center\">";
-			for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_pp && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
+			for(i=0, pp=0, p = ppWithSameTarget->begin(); i<m_max_example && pp<count && p != ppWithSameTarget->end(); p++, pp++, i++ ) {
 				(*p)->PrintClippedHTML( &cout, 160 );
-				if (count > m_max_pp) {
-					p += count/m_max_pp-1;
-					pp += count/m_max_pp-1;
+				if (count > m_max_example) {
+					p += count/m_max_example-1;
+					pp += count/m_max_example-1;
 				}
 			}
 		}
@@ -172,7 +183,7 @@ void PhrasePairCollection::PrintHTML() const
 	if (singleton) cout << "</table></div>\n";
 	else if (pp_target > 9)	cout << "</div>";
 
-	size_t max_mismatch = m_max_pp/3;
+	size_t max_mismatch = m_max_example/3;
 	// unaligned phrases
 	if (m_unaligned.size() > 0) {
 		cout << "<p class=\"pp_singleton_header\">unaligned" 
diff --git a/biconcor/PhrasePairCollection.h b/biconcor/PhrasePairCollection.h
index f88bfc10f..e076eba9b 100644
--- a/biconcor/PhrasePairCollection.h
+++ b/biconcor/PhrasePairCollection.h
@@ -22,19 +22,19 @@ private:
   std::vector< Mismatch* > m_mismatch, m_unaligned;
   int m_size;
   int m_max_lookup;
-  int m_max_pp_target;
-  int m_max_pp;
+  int m_max_translation;
+  int m_max_example;
 
   // No copying allowed.
   PhrasePairCollection(const PhrasePairCollection&);
   void operator=(const PhrasePairCollection&);
 
 public:
-  PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment * );
+  PhrasePairCollection ( SuffixArray *, TargetCorpus *, Alignment *, int, int );
   ~PhrasePairCollection ();
 
-  bool GetCollection( const std::vector<std::string >& sourceString );
-  void Print() const;
+  int GetCollection( const std::vector<std::string >& sourceString );
+  void Print(bool pretty) const;
   void PrintHTML() const;
 };
 
diff --git a/biconcor/biconcor.cpp b/biconcor/biconcor.cpp
index a25e63cb7..f4e7c03fb 100644
--- a/biconcor/biconcor.cpp
+++ b/biconcor/biconcor.cpp
@@ -19,8 +19,12 @@ int main(int argc, char* argv[])
   int saveFlag = false;
   int createFlag = false;
   int queryFlag = false;
-  int htmlFlag = false;
-  string info = "usage: suffix-query\n\t[--load file]\n\t[--save file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n";
+  int htmlFlag = false;   // output as HTML
+  int prettyFlag = false; // output readable on screen
+  int stdioFlag = false;  // receive requests from STDIN, respond to STDOUT
+  int max_translation = 20;
+  int max_example = 50;
+  string info = "usage: biconcor\n\t[--load model-file]\n\t[--save model-file]\n\t[--create source-corpus]\n\t[--query string]\n\t[--target target-corpus]\n\t[--alignment file]\n\t[--translations count]\n\t[--examples count]\n\t[--html]\n\t[--stdio]\n";
   while(1) {
     static struct option long_options[] = {
       {"load", required_argument, 0, 'l'},
@@ -29,11 +33,15 @@ int main(int argc, char* argv[])
       {"query", required_argument, 0, 'q'},
       {"target", required_argument, 0, 't'},
       {"alignment", required_argument, 0, 'a'},
-      {"html", no_argument, &htmlFlag, 0},
+      {"html", no_argument, 0, 'h'},
+      {"pretty", no_argument, 0, 'p'},
+      {"stdio", no_argument, 0, 'i'},
+      {"translations", required_argument, 0, 'o'},
+      {"examples", required_argument, 0, 'e'},
       {0, 0, 0, 0}
     };
     int option_index = 0;
-    int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:h", long_options, &option_index);
+    int c = getopt_long (argc, argv, "l:s:c:q:Q:t:a:hpio:e:", long_options, &option_index);
     if (c == -1) break;
     switch (c) {
     case 'l':
@@ -62,11 +70,29 @@ int main(int argc, char* argv[])
       query = string(optarg);
       queryFlag = true;
       break;
+    case 'o':
+      max_translation = atoi(optarg);
+      break;
+    case 'e':
+      max_example = atoi(optarg);
+      break;
+    case 'p':
+      prettyFlag = true;
+      break;
+    case 'h':
+      htmlFlag = true;
+      break;
+    case 'i':
+      stdioFlag = true;
+      break;
     default:
       cerr << info;
       exit(1);
     }
   }
+  if (stdioFlag) {
+    queryFlag = true;
+  }
 
   // check if parameter settings are legal
   if (saveFlag && !createFlag) {
@@ -111,12 +137,37 @@ int main(int argc, char* argv[])
     targetCorpus.Load( fileNameSuffix );
     alignment.Load( fileNameSuffix );
   }
-  if (queryFlag) {
+  if (stdioFlag) {
+    cout << "-|||- BICONCOR START -|||-" << endl << flush;
+    while(true) {
+      string query;
+      if (getline(cin, query, '\n').eof()) {
+        return 0;
+      }
+      vector< string > queryString = alignment.Tokenize( query.c_str() );
+      PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
+      int total = ppCollection.GetCollection( queryString );
+      cout << "TOTAL: " << total << endl;
+      if (htmlFlag) {
+        ppCollection.PrintHTML();
+      }
+      else {
+	ppCollection.Print(prettyFlag);
+      }
+      cout << "-|||- BICONCOR END -|||-" << endl << flush;
+    }
+  }
+  else if (queryFlag) {
     cerr << "query is " << query << endl;
     vector< string > queryString = alignment.Tokenize( query.c_str() );
-    PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment );
+    PhrasePairCollection ppCollection( &suffixArray, &targetCorpus, &alignment, max_translation, max_example );
     ppCollection.GetCollection( queryString );
-    ppCollection.PrintHTML();
+    if (htmlFlag) {
+      ppCollection.PrintHTML();
+    }
+    else {
+      ppCollection.Print(prettyFlag);
+    }
   }
 
   return 0;
-- 
cgit v1.2.3


From 38aa0c74b0f6ac994ccc4affd69435935f450ae6 Mon Sep 17 00:00:00 2001
From: Rico Sennrich <rico.sennrich@gmx.ch>
Date: Tue, 9 Apr 2013 11:13:11 +0200
Subject: sigtest-filter: hierarchical mode now works with syntactic models
 (labels other than X)

---
 contrib/sigtest-filter/filter-pt.cpp | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/contrib/sigtest-filter/filter-pt.cpp b/contrib/sigtest-filter/filter-pt.cpp
index f06d2b430..6ab1a5657 100644
--- a/contrib/sigtest-filter/filter-pt.cpp
+++ b/contrib/sigtest-filter/filter-pt.cpp
@@ -287,24 +287,24 @@ SentIdSet find_occurrences(const std::string& rule, C_SuffixArraySearchApplicati
     if (hierarchical) {
         //   std::cerr << "splitting up phrase: " << phrase << "\n";
         int pos = 0;
-        int endPos = 0;
+        int NTStartPos, NTEndPos;
         vector<std::string> phrases;
-
-        while (rule.find("[X][X] ", pos) < rule.size()) {
-            endPos = rule.find("[X][X] ",pos) - 1; // -1 to cut space before NT
-            if (endPos < pos) { // no space: NT at start of rule (or two consecutive NTs)
-                pos += 7; 
+        while (rule.find("] ", pos) < rule.size()) {
+            NTStartPos = rule.find("[",pos) - 1; // -1 to cut space before NT
+            NTEndPos = rule.find("] ",pos);
+            if (NTStartPos < pos) { // no space: NT at start of rule (or two consecutive NTs)
+                pos = NTEndPos + 2;
                 continue;
             }
-            phrases.push_back(rule.substr(pos,endPos-pos));
-            pos = endPos + 8;
+            phrases.push_back(rule.substr(pos,NTStartPos-pos));
+            pos = NTEndPos + 2;
         }
 
-        // cut LHS of rule
-        endPos = rule.size()-4;
-        if (endPos > pos) {
-            phrases.push_back(rule.substr(pos,endPos-pos));
+        NTStartPos = rule.find("[",pos) - 1; // LHS of rule
+        if (NTStartPos > pos) {
+            phrases.push_back(rule.substr(pos,NTStartPos-pos));
         }
+
         sa_set = lookup_multiple_phrases(phrases, my_sa, rule, cache);
     }
     else {
-- 
cgit v1.2.3


From 5dce1463e715295fa9442d9b5eac079a76f890b6 Mon Sep 17 00:00:00 2001
From: Rico Sennrich <rico.sennrich@gmx.ch>
Date: Tue, 9 Apr 2013 11:15:28 +0200
Subject: documentation: -phrase-word-alignment is on by default.

---
 contrib/tmcombine/README.md    | 2 +-
 contrib/tmcombine/tmcombine.py | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/contrib/tmcombine/README.md b/contrib/tmcombine/README.md
index 2cbc83299..7b8ebd45e 100644
--- a/contrib/tmcombine/README.md
+++ b/contrib/tmcombine/README.md
@@ -58,7 +58,7 @@ Regression tests (check if the output files (`test/phrase-table_testN`) differ f
 FURTHER NOTES
 -------------
 
- - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models.
+ - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models.
 
  - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). Sort the tables with `LC_ALL=C`. Phrase tables produced by Moses are sorted correctly.
 
diff --git a/contrib/tmcombine/tmcombine.py b/contrib/tmcombine/tmcombine.py
index 0bbcf7c78..5b65cc590 100755
--- a/contrib/tmcombine/tmcombine.py
+++ b/contrib/tmcombine/tmcombine.py
@@ -15,7 +15,7 @@
 
 
 # Some general things to note:
-#  - Different combination algorithms require different statistics. To be on the safe side, use the options `-phrase-word-alignment` and `-write-lexical-counts` when training models.
+#  - Different combination algorithms require different statistics. To be on the safe side, use the option `-write-lexical-counts` when training models.
 #  - The script assumes that phrase tables are sorted (to allow incremental, more memory-friendly processing). sort with LC_ALL=C.
 #  - Some configurations require additional statistics that are loaded in memory (lexical tables; complete list of target phrases). If memory consumption is a problem, use the option --lowmem (slightly slower and writes temporary files to disk), or consider pruning your phrase table before combining (e.g. using Johnson et al. 2007).
 #  - The script can read/write gzipped files, but the Python implementation is slow. You're better off unzipping the files on the command line and working with the unzipped files.
@@ -306,7 +306,7 @@ class Moses():
         # assuming that alignment is empty
         elif len(line) == 4:
             if self.require_alignment:
-                sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment\n')
+                sys.stderr.write('Error: unexpected phrase table format. Your current configuration requires alignment information. Make sure you trained your model with -phrase-word-alignment (default in newer Moses versions)\n')
                 exit()
             
             self.phrase_pairs[src][target][1] = [b'',line[3].lstrip(b'| ')]
-- 
cgit v1.2.3


From 44a0e52e3052371a850f7ee463279f4cc0522ea5 Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Tue, 9 Apr 2013 14:44:32 +0100
Subject: fixed ShowWeights() for confusion networks. This is a reason why we
 should get rid of ShortNames and move to refactored moses pdq

---
 contrib/other-builds/moses-chart-cmd/.cproject | 2 +-
 moses-cmd/Main.cpp                             | 2 +-
 scripts/training/mert-moses.pl                 | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject
index aae6822b6..71462b5df 100644
--- a/contrib/other-builds/moses-chart-cmd/.cproject
+++ b/contrib/other-builds/moses-chart-cmd/.cproject
@@ -60,7 +60,6 @@
 								</option>
 								<option id="gnu.cpp.link.option.libs.1177721357" name="Libraries (-l)" superClass="gnu.cpp.link.option.libs" valueType="libs">
 									<listOptionValue builtIn="false" value="dstruct"/>
-									<listOptionValue builtIn="false" value="rt"/>
 									<listOptionValue builtIn="false" value="flm"/>
 									<listOptionValue builtIn="false" value="oolm"/>
 									<listOptionValue builtIn="false" value="lattice"/>
@@ -77,6 +76,7 @@
 									<listOptionValue builtIn="false" value="boost_system-mt"/>
 									<listOptionValue builtIn="false" value="boost_thread-mt"/>
 									<listOptionValue builtIn="false" value="bz2"/>
+									<listOptionValue builtIn="false" value="rt"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.linker.input.128214028" superClass="cdt.managedbuild.tool.gnu.cpp.linker.input">
 									<additionalInput kind="additionalinputdependency" paths="$(USER_OBJS)"/>
diff --git a/moses-cmd/Main.cpp b/moses-cmd/Main.cpp
index 68d8049c4..b08ba532a 100644
--- a/moses-cmd/Main.cpp
+++ b/moses-cmd/Main.cpp
@@ -431,7 +431,7 @@ static void PrintFeatureWeight(const FeatureFunction* ff)
     vector<float> values = StaticData::Instance().GetAllWeights().GetScoresForProducer(ff);
     for (size_t i = 0; i < numScoreComps; ++i) 
       cout << ff->GetScoreProducerDescription() <<  " "
-           << ff->GetScoreProducerWeightShortName() << " "
+           << ff->GetScoreProducerWeightShortName(i) << " "
            << values[i] << endl;
   }
   else {
diff --git a/scripts/training/mert-moses.pl b/scripts/training/mert-moses.pl
index f73b58120..175fa12fb 100755
--- a/scripts/training/mert-moses.pl
+++ b/scripts/training/mert-moses.pl
@@ -1,4 +1,4 @@
-#!/usr/bin/perl -w
+#!/usr/bin/perl -w 
 # $Id$
 # Usage:
 # mert-moses.pl <foreign> <english> <decoder-executable> <decoder-config>
-- 
cgit v1.2.3


From 73035543d6086d98185b38d313a670176adde1f2 Mon Sep 17 00:00:00 2001
From: Kenneth Heafield <github@kheafield.com>
Date: Wed, 10 Apr 2013 18:27:25 +0100
Subject: Binary phrase table does string ops, at least make them fast

---
 moses/PDTAimp.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/moses/PDTAimp.h b/moses/PDTAimp.h
index 25131b98a..5680b8ecb 100644
--- a/moses/PDTAimp.h
+++ b/moses/PDTAimp.h
@@ -11,6 +11,7 @@
 #include "moses/TranslationModel/PhraseDictionaryTreeAdaptor.h"
 #include "SparsePhraseDictionaryFeature.h"
 #include "Util.h"
+#include "util/tokenize_piece.hh"
 
 namespace Moses
 {
@@ -284,11 +285,10 @@ protected:
     FactorCollection &factorCollection = FactorCollection::Instance();
 
     for(size_t k=0; k<factorStrings.size(); ++k) {
-      std::vector<std::string> factors=TokenizeMultiCharSeparator(*factorStrings[k],StaticData::Instance().GetFactorDelimiter());
-      CHECK(factors.size()==m_output.size());
+      util::TokenIter<util::MultiCharacter, false> word(*factorStrings[k], StaticData::Instance().GetFactorDelimiter());
       Word& w=targetPhrase.AddWord();
-      for(size_t l=0; l<m_output.size(); ++l) {
-        w[m_output[l]]= factorCollection.AddFactor(Output, m_output[l], factors[l]);
+      for(size_t l=0; l<m_output.size(); ++l, ++word) {
+        w[m_output[l]]= factorCollection.AddFactor(*word);
       }
     }
 
-- 
cgit v1.2.3


From 517d6c7bb834e40bcf25e8cbc79985180cb7f29f Mon Sep 17 00:00:00 2001
From: Hieu Hoang <hieu@hoang.co.uk>
Date: Wed, 10 Apr 2013 18:40:25 +0100
Subject: add score breakdown to target phrase debugging output

---
 moses/TargetPhrase.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/moses/TargetPhrase.cpp b/moses/TargetPhrase.cpp
index b1d99ab50..6f14657a3 100644
--- a/moses/TargetPhrase.cpp
+++ b/moses/TargetPhrase.cpp
@@ -326,8 +326,10 @@ TO_STRING_BODY(TargetPhrase);
 
 std::ostream& operator<<(std::ostream& os, const TargetPhrase& tp)
 {
-  os << static_cast<const Phrase&>(tp) << ":" << tp.GetAlignNonTerm();
-  os << ": c=" << tp.m_fullScore;
+  os << static_cast<const Phrase&>(tp) << ":" << flush;
+  os << tp.GetAlignNonTerm() << flush;
+  os << ": c=" << tp.m_fullScore << flush;
+  os << " " << tp.m_scoreBreakdown << flush;
 
   return os;
 }
-- 
cgit v1.2.3