Merge branch 'master' of git://github.com/moses-smt/mosesdecoder

author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-26 03:10:08 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-26 03:10:08 +0400
commit: 07fafd51b509e93db7be238107325c45ca5f57cd (patch)
tree: e8cccec20ee05726accd9f87a041b68370adeaab /scripts/training
parent: 561b9ac9567d3e5b0bbc56fdae3b29961b8bc728 (diff)
parent: a72744c49b7821bf0355e7fe4638c392a74b0d60 (diff)
53 files changed, 2087 insertions, 176 deletions
diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h
index 170ccf892..be6e30836 100644
--- a/scripts/training/phrase-extract/ExtractedRule.h
+++ b/scripts/training/phrase-extract/ExtractedRule.h
@@ -43,6 +43,7 @@ public:
   int startS;
   int endS;
   float count;
+  double pcfgScore;
 
   std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
   
@@ -58,6 +59,7 @@ public:
     , startS(sS)
     , endS(eS)
     , count(0)
+    , pcfgScore(0.0)
   {}
   
   void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile
index 0872130f9..9be67e80a 100644
--- a/scripts/training/phrase-extract/Jamfile
+++ b/scripts/training/phrase-extract/Jamfile
@@ -10,13 +10,13 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
 alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
 alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
 
-exe extract : tables-core.o SentenceAlignment.o extract.cpp InputFileStream ../../..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
 
 exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../../../moses/src//ThreadPool ../../..//boost_iostreams ;
 
 exe extract-lex : extract-lex.cpp InputFileStream ;
 
-exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp InputFileStream ../../..//boost_iostreams ;
+exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
 
 exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
 
@@ -33,3 +33,5 @@ alias programs : extract extract-rules extract-lex score consolidate consolidate
 install legacy : programs : <location>. <install-type>EXE ;
 
 build-project extract-ghkm ;
+build-project pcfg-extract ;
+build-project pcfg-score ;
diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp
index c0bfbde3e..ceb74f04c 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.cpp
+++ b/scripts/training/phrase-extract/PhraseAlignment.cpp
@@ -13,6 +13,8 @@
 #include "tables-core.h"
 #include "score.h"
 
+#include <cstdlib>
+
 using namespace std;
 
 extern Vocabulary vcbT;
@@ -111,6 +113,9 @@ void PhraseAlignment::create( char line[], int lineID )
     }
     else if (item == 5) { // non-term lengths
       addNTLength(token[j]);
+    } else if (item == 6) { // target syntax PCFG score
+      float pcfgScore = std::atof(token[j].c_str());
+      pcfgSum = pcfgScore * count;
     }
   }
 
@@ -119,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
   if (item == 3) {
     count = 1.0;
   }
-  if (item < 3 || item > 5) {
+  if (item < 3 || item > 6) {
     cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
   }
 }
diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h
index 8b8f5115c..8bd83503d 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.h
+++ b/scripts/training/phrase-extract/PhraseAlignment.h
@@ -25,6 +25,7 @@ protected:
   void createAlignVec(size_t sourceSize, size_t targetSize);
   void addNTLength(const std::string &tok);
 public:
+  float pcfgSum;
   float count;
   std::vector< std::set<size_t> > alignedToT;
   std::vector< std::set<size_t> > alignedToS;
diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h
index 70bb548c9..f9123de86 100644
--- a/scripts/training/phrase-extract/RuleExtractionOptions.h
+++ b/scripts/training/phrase-extract/RuleExtractionOptions.h
@@ -45,6 +45,7 @@ public:
   bool targetSyntax;
   bool duplicateRules;
   bool fractionalCounting;
+  bool pcfgScore;
   bool outputNTLengths;
   bool gzOutput;
   
@@ -74,6 +75,7 @@ public:
     , targetSyntax(false)
     , duplicateRules(true)
     , fractionalCounting(true)
+    , pcfgScore(false)
     , outputNTLengths(false)
     , gzOutput(false)
   {}
diff --git a/scripts/training/phrase-extract/SyntaxTree.cpp b/scripts/training/phrase-extract/SyntaxTree.cpp
index e181b1e8a..f2783ffd2 100644
--- a/scripts/training/phrase-extract/SyntaxTree.cpp
+++ b/scripts/training/phrase-extract/SyntaxTree.cpp
@@ -42,11 +42,12 @@ void SyntaxTree::Clear()
   m_index.clear();
 }
 
-void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
+SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )  // allocates a SyntaxNode, registers it in m_nodes and m_index, and returns it
 {
   SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
   m_nodes.push_back( newNode );
   m_index[ startPos ][ endPos ].push_back( newNode );
+  return newNode;  // returned so the caller can set further fields (XmlTree.cpp uses it to call SetPcfgScore)
 }
 
 ParentNodes SyntaxTree::Parse()
diff --git a/scripts/training/phrase-extract/SyntaxTree.h b/scripts/training/phrase-extract/SyntaxTree.h
index 0ca5ca472..17c106b49 100644
--- a/scripts/training/phrase-extract/SyntaxTree.h
+++ b/scripts/training/phrase-extract/SyntaxTree.h
@@ -34,12 +34,14 @@ protected:
   std::string m_label;
   std::vector< SyntaxNode* > m_children;
   SyntaxNode* m_parent;
+  float m_pcfgScore;
 public:
   SyntaxNode( int startPos, int endPos, std::string label )
     :m_start(startPos)
     ,m_end(endPos)
     ,m_label(label)
     ,m_parent(0)
+    ,m_pcfgScore(0.0f)
   {}
   int GetStart() const {
     return m_start;
@@ -50,6 +52,12 @@ public:
   std::string GetLabel() const {
     return m_label;
   }
+  float GetPcfgScore() const {
+    return m_pcfgScore;
+  }
+  void SetPcfgScore(float score) {
+    m_pcfgScore = score;
+  }
   SyntaxNode *GetParent() {
     return m_parent;
   }
@@ -89,11 +97,12 @@ public:
   }
   ~SyntaxTree();
 
+  SyntaxNode *AddNode( int startPos, int endPos, std::string label );
+
   SyntaxNode *GetTop() {
     return m_top;
   }
 
-  void AddNode( int startPos, int endPos, std::string label );
   ParentNodes Parse();
   bool HasNode( int startPos, int endPos ) const;
   const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
diff --git a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp
index 19825c02c..29c0d94aa 100644
--- a/scripts/training/phrase-extract/XmlTree.cpp
+++ b/scripts/training/phrase-extract/XmlTree.cpp
@@ -25,7 +25,7 @@
 #include <string>
 #include <set>
 #include <iostream>
-#include <stdlib.h>
+#include <cstdlib>
 #include <sstream>
 #include "SyntaxTree.h"
 #include "XmlException.h"
@@ -355,13 +355,18 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
         string label = ParseXmlTagAttribute(tagContent,"label");
         labelCollection.insert( label );
 
+        string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
+        float pcfgScore = pcfgString == "" ? 0.0f
+                                           : std::atof(pcfgString.c_str());
+
         // report what we have processed so far
         if (0) {
           cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
           cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
           cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
         }
-        tree.AddNode( startPos, endPos-1, label );
+        SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
+        node->SetPcfgScore(pcfgScore);
       }
     }
   }
diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 0ecffae5c..6bd32a13b 100644
--- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -212,6 +212,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root)
 
   std::auto_ptr<Node> n(new Node(root->GetLabel(), nodeType));
 
+  if (nodeType == TREE) {
+    n->SetPcfgScore(root->GetPcfgScore());
+  }
+
   const std::vector<ParseTree *> &children = root->GetChildren();
   std::vector<Node *> childNodes;
   childNodes.reserve(children.size());
diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 008026e1a..397ce1e3c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -285,6 +285,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
         "set maximum allowed scope")
     ("Minimal",
         "extract minimal rules only")
+    ("PCFG",
+        "include score based on PCFG scores in target corpus")
     ("UnknownWordLabel",
         po::value(&options.unknownWordFile),
         "write unknown word labels to named file")
@@ -361,6 +363,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
   if (vm.count("Minimal")) {
     options.minimal = true;
   }
+  if (vm.count("PCFG")) {
+    options.pcfg = true;
+  }
   if (vm.count("UnpairedExtractFormat")) {
     options.unpairedExtractFormat = true;
   }
diff --git a/scripts/training/phrase-extract/extract-ghkm/Node.h b/scripts/training/phrase-extract/extract-ghkm/Node.h
index 228fdc812..775473362 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Node.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Node.h
@@ -41,8 +41,7 @@ class Node
   Node(const std::string &label, NodeType type)
       : m_label(label)
       , m_type(type)
-      , m_children()
-      , m_parents() {}
+      , m_pcfgScore(0.0f) {}
 
   ~Node();
 
@@ -50,12 +49,14 @@ class Node
   NodeType GetType() const { return m_type; }
   const std::vector<Node*> &GetChildren() const { return m_children; }
   const std::vector<Node*> &GetParents() const { return m_parents; }
+  float GetPcfgScore() const { return m_pcfgScore; }
   const Span &GetSpan() const { return m_span; }
   const Span &GetComplementSpan() const { return m_complementSpan; }
   const std::vector<const Subgraph*> &GetRules() const { return m_rules; }
 
   void SetChildren(const std::vector<Node*> &c) { m_children = c; }
   void SetParents(const std::vector<Node*> &p) { m_parents = p; }
+  void SetPcfgScore(float s) { m_pcfgScore = s; }
   void SetSpan(const Span &s) { m_span = s; }
   void SetComplementSpan(const Span &cs) { m_complementSpan = cs; }
 
@@ -92,6 +93,7 @@ class Node
   NodeType m_type;
   std::vector<Node*> m_children;
   std::vector<Node*> m_parents;
+  float m_pcfgScore;
   Span m_span;
   Span m_complementSpan;
   std::vector<const Subgraph*> m_rules;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h
index 108e19d66..c4b57f311 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Options.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Options.h
@@ -36,6 +36,7 @@ struct Options {
       , maxRuleSize(3)
       , maxScope(3)
       , minimal(false)
+      , pcfg(false)
       , unpairedExtractFormat(false) {}
 
   // Positional options
@@ -53,6 +54,7 @@ struct Options {
   int maxRuleSize;
   int maxScope;
   bool minimal;
+  bool pcfg;
   bool unpairedExtractFormat;
   std::string unknownWordFile;
 };
diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
index ec6fc147a..273e2e04e 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
@@ -32,17 +32,19 @@ class ParseTree
  public:
   ParseTree(const std::string &label)
       : m_label(label)
-      , m_children()
-      , m_parent() {}
+      , m_parent(0)
+      , m_pcfgScore(0.0) {}
 
   ~ParseTree();
 
   const std::string &GetLabel() const { return m_label; }
   const std::vector<ParseTree*> &GetChildren() const { return m_children; }
   const ParseTree *GetParent() const { return m_parent; }
+  float GetPcfgScore() const { return m_pcfgScore; }
 
   void SetParent(ParseTree *);
   void SetChildren(const std::vector<ParseTree*> &);
+  void SetPcfgScore(float score) { m_pcfgScore = score; }
 
   void AddChild(ParseTree *);
 
@@ -59,6 +61,7 @@ class ParseTree
   std::string m_label;
   std::vector<ParseTree*> m_children;
   ParseTree *m_parent;
+  float m_pcfgScore;  // log probability
 };
 
 template<typename OutputIterator>
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
index 8473e4283..5dc70052c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -30,6 +30,7 @@ namespace GHKM {
 ScfgRule::ScfgRule(const Subgraph &fragment)
     : m_sourceLHS("X", NonTerminal)
     , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
+    , m_pcfgScore(fragment.GetPcfgScore())
 {
   // Source RHS
 
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
index 1ed534d9e..2405d8fa3 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
@@ -57,6 +57,7 @@ class ScfgRule
   const std::vector<Symbol> &GetSourceRHS() const { return m_sourceRHS; }
   const std::vector<Symbol> &GetTargetRHS() const { return m_targetRHS; }
   const Alignment &GetAlignment() const { return m_alignment; }
+  float GetPcfgScore() const { return m_pcfgScore; }
 
   int Scope() const;
 
@@ -68,6 +69,7 @@ class ScfgRule
   std::vector<Symbol> m_sourceRHS;
   std::vector<Symbol> m_targetRHS;
   Alignment m_alignment;
+  float m_pcfgScore;
 };
 
 }  // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index 4be3f048d..d5d16b790 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -24,6 +24,7 @@
 #include "ScfgRule.h"
 
 #include <cassert>
+#include <cmath>
 #include <ostream>
 #include <map>
 #include <sstream>
@@ -34,14 +35,43 @@ namespace GHKM {
 
 void ScfgRuleWriter::Write(const ScfgRule &rule)
 {
+  std::ostringstream sourceSS;  // receives the formatted source side of the rule
+  std::ostringstream targetSS;  // receives the formatted target side of the rule
+  // The format writers below only fill these two buffers; all file output happens afterwards.
   if (m_options.unpairedExtractFormat) {
-    WriteUnpairedFormat(rule);
+    WriteUnpairedFormat(rule, sourceSS, targetSS);
   } else {
-    WriteStandardFormat(rule);
+    WriteStandardFormat(rule, sourceSS, targetSS);
+  }
+
+  // Write the rule to the forward and inverse extract files (the two sides are swapped in m_inv).
+  m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+  m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+  // Alignment points: written as first-second to m_fwd and as second-first to m_inv.
+  const Alignment &alignment = rule.GetAlignment();
+  for (Alignment::const_iterator p = alignment.begin();
+       p != alignment.end(); ++p) {
+    m_fwd << " " << p->first << "-" << p->second;
+    m_inv << " " << p->second << "-" << p->first;
+  }
+
+  // Write a count of 1 and an empty NT length column to the forward extract
+  // file.
+  // TODO Add option to write NT length?
+  m_fwd << " ||| 1 ||| |||";
+  if (m_options.pcfg) {
+    // Write the PCFG score; exp() is applied because the score is kept as a log probability (see ParseTree.h).
+    m_fwd << " " << std::exp(rule.GetPcfgScore());
   }
+  m_fwd << std::endl;  // NOTE(review): std::endl flushes m_fwd once per rule
+
+  // Write a count of 1 to the inverse extract file (no NT length or PCFG columns here).
+  m_inv << " ||| 1" << std::endl;
 }
 
-void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
+                                         std::ostream &sourceSS,
+                                         std::ostream &targetSS)
 {
   const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
   const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
@@ -60,9 +90,6 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
     }
   }
 
-  std::ostringstream sourceSS;
-  std::ostringstream targetSS;
-
   // Write the source side of the rule to sourceSS.
   int i = 0;
   for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
@@ -88,27 +115,14 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
     targetSS << " ";
   }
   WriteSymbol(rule.GetTargetLHS(), targetSS);
-
-  // Write the rule to the forward and inverse extract files.
-  m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
-  m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
-  for (Alignment::const_iterator p(alignment.begin());
-       p != alignment.end(); ++p) {
-    m_fwd << " " << p->first << "-" << p->second;
-    m_inv << " " << p->second << "-" << p->first;
-  }
-  m_fwd << " ||| 1" << std::endl;
-  m_inv << " ||| 1" << std::endl;
 }
 
-void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
+                                         std::ostream &sourceSS,
+                                         std::ostream &targetSS)
 {
   const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
   const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
-  const Alignment &alignment = rule.GetAlignment();
-
-  std::ostringstream sourceSS;
-  std::ostringstream targetSS;
 
   // Write the source side of the rule to sourceSS.
   int i = 0;
@@ -127,17 +141,6 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
     targetSS << " ";
   }
   WriteSymbol(rule.GetTargetLHS(), targetSS);
-
-  // Write the rule to the forward and inverse extract files.
-  m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
-  m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
-  for (Alignment::const_iterator p(alignment.begin());
-       p != alignment.end(); ++p) {
-    m_fwd << " " << p->first << "-" << p->second;
-    m_inv << " " << p->second << "-" << p->first;
-  }
-  m_fwd << " ||| 1" << std::endl;
-  m_inv << " ||| 1" << std::endl;
 }
 
 void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index 738d09ce9..b92a432a1 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -45,8 +45,8 @@ class ScfgRuleWriter
   ScfgRuleWriter(const ScfgRuleWriter &);
   ScfgRuleWriter &operator=(const ScfgRuleWriter &);
 
-  void WriteStandardFormat(const ScfgRule &);
-  void WriteUnpairedFormat(const ScfgRule &);
+  void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &);
+  void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &);
   void WriteSymbol(const Symbol &, std::ostream &);
 
   std::ostream &m_fwd;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
index e5aedbb16..e048f2c55 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
@@ -101,5 +101,21 @@ int Subgraph::CalcDepth(const Node *n) const
   return maxChildDepth + 1;
 }
 
+float Subgraph::CalcPcfgScore() const
+{
+  // Score of this fragment: the root's PCFG score minus the PCFG scores of
+  // its TREE-typed leaves; 0 for a non-TREE root or a leafless subgraph.
+  const bool rootIsTree = (m_root->GetType() == TREE);
+  if (!rootIsTree || m_leaves.empty()) {
+    return 0.0f;
+  }
+  float total = m_root->GetPcfgScore();
+  typedef std::set<const Node *>::const_iterator LeafIter;
+  for (LeafIter it = m_leaves.begin(); it != m_leaves.end(); ++it) {
+    if ((*it)->GetType() == TREE) total -= (*it)->GetPcfgScore();
+  }
+  return total;
+}
+
 }  // namespace Moses
 }  // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
index e84903502..ede1233e9 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
@@ -38,7 +38,8 @@ class Subgraph
       : m_root(root)
       , m_depth(0)
       , m_size(root->GetType() == TREE ? 1 : 0)
-      , m_nodeCount(1) {}
+      , m_nodeCount(1)
+      , m_pcfgScore(0.0f) {}
 
   Subgraph(const Node *root, const std::set<const Node *> &leaves)
       : m_root(root)
@@ -46,10 +47,12 @@ class Subgraph
       , m_depth(-1)
       , m_size(-1)
       , m_nodeCount(-1)
+      , m_pcfgScore(0.0f)
   {
     m_depth = CalcDepth(m_root);
     m_size = CalcSize(m_root);
     m_nodeCount = CountNodes(m_root);
+    m_pcfgScore = CalcPcfgScore();
   }
 
   const Node *GetRoot() const { return m_root; }
@@ -57,6 +60,7 @@ class Subgraph
   int GetDepth() const { return m_depth; }
   int GetSize() const { return m_size; }
   int GetNodeCount() const { return m_nodeCount; }
+  float GetPcfgScore() const { return m_pcfgScore; }
 
   bool IsTrivial() const { return m_leaves.empty(); }
 
@@ -66,6 +70,7 @@ class Subgraph
   void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
   int CalcDepth(const Node *) const;
   int CalcSize(const Node *) const;
+  float CalcPcfgScore() const;
   int CountNodes(const Node *) const;
 
   const Node *m_root;
@@ -73,6 +78,7 @@ class Subgraph
   int m_depth;
   int m_size;
   int m_nodeCount;
+  float m_pcfgScore;
 };
 
 }  // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
index 31c0e3843..cc961dc0c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@@ -61,6 +61,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree(
     const std::vector<std::string> &words)
 {
   std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel()));
+  root->SetPcfgScore(tree.GetPcfgScore());
   const std::vector<SyntaxNode*> &children = tree.GetChildren();
   if (children.empty()) {
     if (tree.GetStart() != tree.GetEnd()) {
diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp
index 2cc9dc54d..a00667b82 100644
--- a/scripts/training/phrase-extract/extract-rules.cpp
+++ b/scripts/training/phrase-extract/extract-rules.cpp
@@ -90,7 +90,7 @@ void addHieroRule( int startT, int endT, int startS, int endS
 void printHieroPhrase( int startT, int endT, int startS, int endS
                        , HoleCollection &holeColl, LabelIndex &labelIndex);
 string printTargetHieroPhrase(  int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex);
+                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
 string printSourceHieroPhrase( int startT, int endT, int startS, int endS
                                , HoleCollection &holeColl, const LabelIndex &labelIndex);
 void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
@@ -257,6 +257,8 @@ int main(int argc, char* argv[])
     // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
     else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
       options.fractionalCounting = false;
+    } else if (strcmp(argv[i],"--PCFG") == 0) {
+      options.pcfgScore = true;
     } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
       options.outputNTLengths = true;
 #ifdef WITH_THREADS
@@ -517,7 +519,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
 }
 
 string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex)
+                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
 {
   HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
   assert(iterHoleList != holeColl.GetHoles().end());
@@ -545,6 +547,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
 
       out += "[" + sourceLabel + "][" + targetLabel + "] ";
 
+      if (m_options.pcfgScore) {
+        double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+        logPCFGScore -= score;
+      }
+
       currPos = hole.GetEnd(1);
       hole.SetPos(outPos, 1);
       ++iterHoleList;
@@ -658,8 +665,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
   preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
 
   // target
-  rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex)
+  if (m_options.pcfgScore) {
+    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+                + " [" + targetLabel + "]";
+    rule.pcfgScore = std::exp(logPCFGScore);
+  } else {
+    double logPCFGScore = 0.0f;
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
                 + " [" + targetLabel + "]";
+  }
 
   // source
   // holeColl.SortSourceHoles();
@@ -877,6 +892,11 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
     rule.target += m_sentence->target[ti] + " ";
   rule.target += "[" + targetLabel + "]";
 
+  if (m_options.pcfgScore) {
+    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+    rule.pcfgScore = std::exp(logPCFGScore);
+  }
+
   // alignment
   for(int ti=startT; ti<=endT; ti++) {
     for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
@@ -957,11 +977,13 @@ void ExtractTask::writeRulesToFile()
     out << rule->source << " ||| "
         << rule->target << " ||| "
         << rule->alignment << " ||| "
-        << rule->count;
+        << rule->count << " ||| ";
     if (m_options.outputNTLengths) {
-      out << " ||| ";
       rule->OutputNTLengths(out); 
     }
+    if (m_options.pcfgScore) {
+      out << " ||| " << rule->pcfgScore;
+    }
     out << "\n";
 
     if (!m_options.onlyDirectFlag) {
diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp
index f6d6cbb9b..16b413da9 100644
--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@@ -22,6 +22,7 @@
 #include "SentenceAlignment.h"
 #include "tables-core.h"
 #include "InputFileStream.h"
+#include "OutputFileStream.h"
 
 using namespace std;
 
@@ -82,15 +83,16 @@ bool hierModel = false;
 REO_MODEL_TYPE hierType = REO_MSD;
 
 
-ofstream extractFile;
-ofstream extractFileInv;
-ofstream extractFileOrientation;
-ofstream extractFileSentenceId;
+Moses::OutputFileStream extractFile;
+Moses::OutputFileStream extractFileInv;
+Moses::OutputFileStream extractFileOrientation;
+Moses::OutputFileStream extractFileSentenceId;
 int maxPhraseLength;
 bool orientationFlag = false;
 bool translationFlag = true;
 bool sentenceIdFlag = false; //create extract file with sentence id
 bool onlyOutputSpanInfo = false;
+bool gzOutput = false;
 
 int main(int argc, char* argv[])
 {
@@ -116,6 +118,8 @@ int main(int argc, char* argv[])
       translationFlag = false;
     } else if (strcmp(argv[i], "--SentenceId") == 0) {
       sentenceIdFlag = true;  
+    } else if (strcmp(argv[i], "--GZOutput") == 0) {
+      gzOutput = true;  
     } else if(strcmp(argv[i],"--model") == 0) {
       if (i+1 >= argc) {
         cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@@ -193,18 +197,18 @@ int main(int argc, char* argv[])
 
   // open output files
   if (translationFlag) {
-    string fileNameExtractInv = fileNameExtract + ".inv";
-    extractFile.open(fileNameExtract.c_str());
-    extractFileInv.open(fileNameExtractInv.c_str());
+    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
+    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+    extractFileInv.Open(fileNameExtractInv.c_str());
   }
   if (orientationFlag) {
-    string fileNameExtractOrientation = fileNameExtract + ".o";
-    extractFileOrientation.open(fileNameExtractOrientation.c_str());
+    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
   }
 
   if (sentenceIdFlag) {
-    string fileNameExtractSentenceId = fileNameExtract + ".sid";
-    extractFileSentenceId.open(fileNameExtractSentenceId.c_str());
+    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
   }
 
   int i=0;
@@ -239,12 +243,12 @@ int main(int argc, char* argv[])
   //az: only close if we actually opened it
   if (!onlyOutputSpanInfo) {
     if (translationFlag) {
-      extractFile.close();
-      extractFileInv.close();
+      extractFile.Close();
+      extractFileInv.Close();
     }
-    if (orientationFlag) extractFileOrientation.close();
+    if (orientationFlag) extractFileOrientation.Close();
     if (sentenceIdFlag) {
-      extractFileSentenceId.close();
+      extractFileSentenceId.Close();
     }
   }
 }
diff --git a/scripts/training/phrase-extract/pcfg-common/Jamfile b/scripts/training/phrase-extract/pcfg-common/Jamfile
new file mode 100644
index 000000000..3dc272a56
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/Jamfile
@@ -0,0 +1 @@
+lib pcfg_common : [ glob *.cc ] ..//trees ;
diff --git a/scripts/training/phrase-extract/pcfg-common/exception.h b/scripts/training/phrase-extract/pcfg-common/exception.h
new file mode 100644
index 000000000..3dbd59d0e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/exception.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXCEPTION_H_
+#define PCFG_EXCEPTION_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+class Exception {  // message-carrying error type; note it has no base class, so catch (const std::exception &) will not catch it
+ public:
+  Exception(const char *msg) : msg_(msg) {}  // stores a copy of the C string
+  Exception(const std::string &msg) : msg_(msg) {}  // stores a copy of msg
+  const std::string &msg() const { return msg_; }  // the stored message, by const reference
+ private:
+  std::string msg_;  // message text supplied at construction
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
new file mode 100644
index 000000000..f88d710ed
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
@@ -0,0 +1,109 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_NUMBERED_SET_H_
+#define PCFG_NUMBERED_SET_H_
+
+#include "exception.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <limits>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores a set of elements of type T, each of which is allocated an integral
+// ID of type I.  IDs are contiguous starting at 0.  Individual elements cannot
+// be removed once inserted (but the whole set can be cleared).
+//
+// The elements are stored as the keys of a hash map (element -> ID).  A vector
+// of pointers to those keys provides the reverse (ID -> element) mapping.
+// Because the vector points into the map owned by the same object, a
+// member-wise copy would leave the copy's vector pointing at the source
+// object's keys; the copy constructor and assignment operator therefore
+// rebuild the vector from the copied map.
+template<typename T, typename I=size_t>
+class NumberedSet {
+ private:
+  typedef boost::unordered_map<T, I> ElementToIdMap;
+  typedef std::vector<const T *> IdToElementMap;
+
+ public:
+  typedef I IdType;
+  // Iterates over pointers to the stored elements, in ID order.
+  typedef typename IdToElementMap::const_iterator const_iterator;
+
+  NumberedSet() {}
+
+  // Copies the elements and their IDs, then rebuilds the ID -> element index
+  // so that it refers to this object's own map.
+  NumberedSet(const NumberedSet &other)
+      : element_to_id_(other.element_to_id_) {
+    RebuildIdToElement();
+  }
+
+  // Copy-and-swap: the index in the temporary points at the temporary's map
+  // nodes, and swapping both containers together keeps those pointers valid.
+  NumberedSet &operator=(const NumberedSet &other) {
+    if (this != &other) {
+      NumberedSet tmp(other);
+      element_to_id_.swap(tmp.element_to_id_);
+      id_to_element_.swap(tmp.id_to_element_);
+    }
+    return *this;
+  }
+
+  const_iterator begin() const { return id_to_element_.begin(); }
+  const_iterator end() const { return id_to_element_.end(); }
+
+  // Static value returned by Lookup(const T &) for unknown elements.
+  static I NullId() { return std::numeric_limits<I>::max(); }
+
+  bool Empty() const { return id_to_element_.empty(); }
+  size_t Size() const { return id_to_element_.size(); }
+
+  // Insert the given object and return its ID.
+  I Insert(const T &);
+
+  // Returns the ID of the given element, or NullId() if it is not present.
+  I Lookup(const T &) const;
+  // Returns the element with the given ID; throws Exception if out of range.
+  const T &Lookup(I) const;
+
+  // Removes all elements.
+  void Clear();
+
+ private:
+  // Recomputes id_to_element_ from element_to_id_.  Relies on the IDs stored
+  // in the map being exactly 0 .. size-1, which Insert() guarantees.
+  void RebuildIdToElement() {
+    id_to_element_.assign(element_to_id_.size(), static_cast<const T *>(0));
+    for (typename ElementToIdMap::const_iterator p = element_to_id_.begin();
+         p != element_to_id_.end(); ++p) {
+      id_to_element_[p->second] = &p->first;
+    }
+  }
+
+  ElementToIdMap element_to_id_;
+  IdToElementMap id_to_element_;
+};
+
+// Returns the ID assigned to the given element, or NullId() if the element is
+// not in the set.
+template<typename T, typename I>
+I NumberedSet<T, I>::Lookup(const T &s) const {
+  const typename ElementToIdMap::const_iterator pos = element_to_id_.find(s);
+  if (pos != element_to_id_.end()) {
+    return pos->second;
+  }
+  return NullId();
+}
+
+// Returns the element that was assigned the given ID.  Throws Exception with
+// a "Value not found" message if the ID is outside the range of issued IDs.
+template<typename T, typename I>
+const T &NumberedSet<T, I>::Lookup(I id) const {
+  const bool in_range = !(id < 0) && id < id_to_element_.size();
+  if (in_range) {
+    return *(id_to_element_[id]);
+  }
+  std::ostringstream msg;
+  msg << "Value not found: " << id;
+  throw Exception(msg.str());
+}
+
+// Inserts x if it is not already present and returns its ID.  A new element
+// receives the next unused ID (the current number of elements); an existing
+// element keeps the ID it was given when first inserted.
+template<typename T, typename I>
+I NumberedSet<T, I>::Insert(const T &x) {
+  typedef typename ElementToIdMap::iterator MapIterator;
+  const I candidate_id = id_to_element_.size();
+  const std::pair<T, I> entry(x, candidate_id);
+  const std::pair<MapIterator, bool> outcome = element_to_id_.insert(entry);
+  const MapIterator slot = outcome.first;
+  if (outcome.second) {
+    // x was not present before: index the key now stored in the map.
+    id_to_element_.push_back(&slot->first);
+  }
+  return slot->second;
+}
+
+// Removes every element.  Both directions of the mapping are emptied, so the
+// next inserted element is given ID 0 again.
+template<typename T, typename I>
+void NumberedSet<T, I>::Clear() {
+  // Drop the pointers first, then the keys they point to.
+  id_to_element_.clear();
+  element_to_id_.clear();
+}
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
new file mode 100644
index 000000000..d045b820b
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
@@ -0,0 +1,106 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg.h"
+
+#include "exception.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores the score for the given rule key, replacing any score previously
+// stored for the same key.
+void Pcfg::Add(const Key &key, double score) {
+  const std::pair<Map::iterator, bool> outcome =
+      rules_.insert(Map::value_type(key, score));
+  if (!outcome.second) {
+    // The key was already present: overwrite its score.
+    outcome.first->second = score;
+  }
+}
+
+// Looks up the score stored for the given rule key.  On success, writes the
+// score to the output parameter and returns true; otherwise returns false and
+// leaves the output parameter untouched.
+bool Pcfg::Lookup(const Key &key, double &score) const {
+  const Map::const_iterator match = rules_.find(key);
+  const bool found = (match != rules_.end());
+  if (found) {
+    score = match->second;
+  }
+  return found;
+}
+
+// Reads rules from the input stream, one per line, in the form
+//
+//   LHS ||| RHS-symbol RHS-symbol ... ||| score
+//
+// Each symbol is inserted into vocab and the rule is stored under the key
+// (LHS id, RHS ids...).  Throws Exception if a line lacks either "|||"
+// delimiter.  The score is converted with boost::lexical_cast<double>.
+void Pcfg::Read(std::istream &input, Vocabulary &vocab) {
+  static const char kDelimiter[] = "|||";
+  const std::string::size_type kDelimiterLength = sizeof(kDelimiter) - 1;
+
+  std::string line;
+  while (std::getline(input, line)) {
+    // Locate the two delimiters that split the line into three fields.
+    const std::string::size_type first = line.find(kDelimiter);
+    if (first == std::string::npos) {
+      throw Exception("missing first delimiter");
+    }
+    const std::string::size_type rhs_begin = first + kDelimiterLength;
+    const std::string::size_type second = line.find(kDelimiter, rhs_begin);
+    if (second == std::string::npos) {
+      throw Exception("missing second delimiter");
+    }
+
+    // Extract the fields and strip surrounding whitespace.
+    std::string lhs = line.substr(0, first);
+    boost::trim(lhs);
+    std::string rhs = line.substr(rhs_begin, second - rhs_begin);
+    boost::trim(rhs);
+    std::string score_text = line.substr(second + kDelimiterLength);
+    boost::trim(score_text);
+
+    // Split the RHS on runs of whitespace.
+    std::vector<std::string> rhs_symbols;
+    boost::split(rhs_symbols, rhs, boost::algorithm::is_space(),
+                 boost::algorithm::token_compress_on);
+
+    // Build the key: LHS id followed by the RHS ids, in order.
+    Key key;
+    key.reserve(rhs_symbols.size()+1);
+    key.push_back(vocab.Insert(lhs));
+    for (size_t i = 0; i < rhs_symbols.size(); ++i) {
+      key.push_back(vocab.Insert(rhs_symbols[i]));
+    }
+
+    // Parse the score last (after the vocabulary has been updated) and store
+    // the rule.
+    const double score = boost::lexical_cast<double>(score_text);
+    Add(key, score);
+  }
+}
+
+// Writes every rule to the output stream, one per line, in the form
+//
+//   LHS ||| RHS-symbol RHS-symbol ... ||| score
+//
+// Symbol IDs are mapped back to strings through vocab.  Throws Exception if a
+// rule has an empty key (there is then no LHS to write) and propagates the
+// Exception thrown by vocab.Lookup() for an ID it does not know.
+void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const {
+  for (const_iterator p = begin(); p != end(); ++p) {
+    const Key &key = p->first;
+    // Add() accepts any key, including an empty one; dereferencing
+    // key.begin() below would be undefined in that case.
+    if (key.empty()) {
+      throw Exception("cannot write rule with empty key");
+    }
+    const double score = p->second;
+    Key::const_iterator q = key.begin();
+    const Key::const_iterator key_end = key.end();
+    // The first ID is the LHS; the rest are the RHS symbols.
+    output << vocab.Lookup(*q++) << " |||";
+    while (q != key_end) {
+      output << " " << vocab.Lookup(*q++);
+    }
+    output << " ||| " << score << std::endl;
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h
new file mode 100644
index 000000000..757eea449
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h
@@ -0,0 +1,61 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_H_
+#define PCFG_PCFG_H_
+
+#include "typedef.h"
+
+#include <istream>
+#include <map>
+#include <ostream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// A table of scored rules.  Each rule is identified by a key holding the ID
+// of its LHS symbol followed by the IDs of its RHS symbols (see Read() and
+// Write() in pcfg.cc); the key maps to a single score.
+class Pcfg {
+ public:
+  typedef std::vector<size_t> Key;
+  typedef std::map<Key, double> Map;
+  typedef Map::iterator iterator;
+  typedef Map::const_iterator const_iterator;
+
+  Pcfg() {}
+
+  // Iteration over the (key, score) pairs in the map's key order.
+  iterator begin() { return rules_.begin(); }
+  iterator end() { return rules_.end(); }
+  const_iterator begin() const { return rules_.begin(); }
+  const_iterator end() const { return rules_.end(); }
+
+  // Stores a score for a key, overwriting any existing score.
+  void Add(const Key &, double);
+
+  // Retrieves the score for a key; returns false if the key is absent.
+  bool Lookup(const Key &, double &) const;
+
+  // Reads rules in "LHS ||| RHS ||| score" format, adding symbols to the
+  // supplied vocabulary.
+  void Read(std::istream &, Vocabulary &);
+
+  // Writes rules in "LHS ||| RHS ||| score" format, using the supplied
+  // vocabulary to turn IDs back into strings.
+  void Write(const Vocabulary &, std::ostream &) const;
+
+ private:
+  Map rules_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
new file mode 100644
index 000000000..bdac64dfc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
@@ -0,0 +1,77 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_TREE_H_
+#define PCFG_PCFG_TREE_H_
+
+#include "syntax_tree.h"
+#include "xml_tree_writer.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// A syntax tree node with a string label and an associated score.  The score
+// starts at 0.0.  DerivedType is the concrete node type, which is forwarded
+// to SyntaxTreeBase as the type of the parent and child pointers.
+template<typename DerivedType>
+class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
+ public:
+  typedef std::string LabelType;
+  typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
+
+  PcfgTreeBase(const LabelType &label)
+      : BaseType(label)
+      , score_(0.0) {}
+
+  // Accessors for the node's score.
+  double score() const { return score_; }
+  void set_score(double s) { score_ = s; }
+
+ private:
+  double score_;
+};
+
+// Concrete scored tree node: a PcfgTreeBase whose parent and children are
+// themselves PcfgTree objects.
+class PcfgTree : public PcfgTreeBase<PcfgTree> {
+ public:
+  typedef PcfgTreeBase<PcfgTree> BaseType;
+
+  PcfgTree(const BaseType::LabelType &label)
+      : BaseType(label) {}
+};
+
+// Specialise XmlOutputHandler for PcfgTree: the XML label is the node's label
+// and a non-zero score is exposed as a "pcfg" attribute.
+template<>
+class XmlOutputHandler<PcfgTree> {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // Copies the node's label into the output parameter.
+  void GetLabel(const PcfgTree &tree, std::string &label) const {
+    label = tree.label();
+  }
+
+  // Replaces the contents of attribute_map with the node's attributes.  A
+  // score of exactly 0.0 (the value a node is constructed with) produces no
+  // attribute.
+  void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
+    attribute_map.clear();
+    const double score = tree.score();
+    if (score == 0.0) {
+      return;
+    }
+    std::ostringstream formatted;
+    formatted << score;
+    attribute_map["pcfg"] = formatted.str();
+  }
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
new file mode 100644
index 000000000..37f72dd58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SYNTAX_TREE_H_
+#define PCFG_SYNTAX_TREE_H_
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Base class for SyntaxTree, AgreementTree, and friends.
+//
+// T is the label type and DerivedType is the concrete node type used for the
+// parent and child pointers.  The destructor deletes every child pointer, so
+// a node owns the children it holds at destruction time.
+// NOTE(review): copying is not disabled, so a copied node would hold the same
+// child pointers as the original and both destructors would delete them.
+template<typename T, typename DerivedType>
+class SyntaxTreeBase {
+ public:
+  // Creates a node with the given label, no children and no parent.
+  SyntaxTreeBase(const T &label)
+      : label_(label), parent_(0) {}
+
+  // Creates a node with the given label and children and no parent.  The
+  // children's parent pointers are not modified.
+  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
+      : label_(label), children_(children), parent_(0) {}
+
+  // Deletes all children.
+  virtual ~SyntaxTreeBase();
+
+  // Accessors.
+  const T &label() const { return label_; }
+  DerivedType *parent() { return parent_; }
+  const DerivedType *parent() const { return parent_; }
+  std::vector<DerivedType *> &children() { return children_; }
+  const std::vector<DerivedType *> &children() const { return children_; }
+
+  // Mutators.  set_children() replaces the stored pointers without deleting
+  // the previous children.
+  void set_label(const T &label) { label_ = label; }
+  void set_parent(DerivedType *parent) { parent_ = parent; }
+  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
+
+  // A leaf is a node without children.
+  bool IsLeaf() const { return children_.empty(); }
+
+  // A preterminal is a node whose only child is a leaf.
+  bool IsPreterminal() const {
+    return children_.size() == 1 && children_[0]->IsLeaf();
+  }
+
+  // Appends a child.  The child's parent pointer is not modified.
+  void AddChild(DerivedType *child) { children_.push_back(child); }
+
+ private:
+  T label_;
+  std::vector<DerivedType *> children_;
+  DerivedType *parent_;
+};
+
+// Plain syntax tree node with label type T; parent and children are
+// SyntaxTree<T> nodes.
+template<typename T>
+class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
+ public:
+  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
+
+  // Creates a node without children.
+  SyntaxTree(const T &label)
+      : BaseType(label) {}
+
+  // Creates a node with the given children.
+  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
+      : BaseType(label, children) {}
+};
+
+// Deletes each child; as every child's destructor does the same, the whole
+// subtree below this node is freed.
+template<typename T, typename DerivedType>
+SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
+  typedef typename std::vector<DerivedType *>::iterator ChildIterator;
+  for (ChildIterator child = children_.begin(); child != children_.end();
+       ++child) {
+    delete *child;
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.cc b/scripts/training/phrase-extract/pcfg-common/tool.cc
new file mode 100644
index 000000000..bebd220e1
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.cc
@@ -0,0 +1,80 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "tool.h"
+
+#include <sstream>
+
+namespace Moses {
+namespace PCFG {
+
+// Selects the tool's main input: standard input when filename is empty or
+// "-", otherwise the named file.  Reports a failure to open the file through
+// Error(), which terminates the process.
+std::istream &Tool::OpenInputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdin = filename.empty() || filename == "-";
+  if (use_stdin) {
+    input_ptr_ = &(std::cin);
+    return *input_ptr_;
+  }
+  input_file_stream_.open(filename.c_str());
+  if (input_file_stream_.fail()) {
+    Error("failed to open input file: " + filename);
+  }
+  input_ptr_ = &input_file_stream_;
+  return *input_ptr_;
+}
+
+// Selects the tool's main output: standard output when filename is empty or
+// "-", otherwise the named file.  Reports a failure to open the file through
+// Error(), which terminates the process.
+std::ostream &Tool::OpenOutputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdout = filename.empty() || filename == "-";
+  if (use_stdout) {
+    output_ptr_ = &(std::cout);
+    return *output_ptr_;
+  }
+  output_file_stream_.open(filename.c_str());
+  if (output_file_stream_.fail()) {
+    Error("failed to open output file: " + filename);
+  }
+  output_ptr_ = &output_file_stream_;
+  return *output_ptr_;
+}
+
+// Opens the named file for reading on the caller-supplied stream.  Reports a
+// failure through Error(), which terminates the process.
+void Tool::OpenNamedInputOrDie(const std::string &filename,
+                               std::ifstream &stream) {
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    Error("failed to open input file: " + filename);
+  }
+}
+
+// Opens the named file for writing on the caller-supplied stream.  Reports a
+// failure through Error(), which terminates the process.
+void Tool::OpenNamedOutputOrDie(const std::string &filename,
+                                std::ofstream &stream) {
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    Error("failed to open output file: " + filename);
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.h b/scripts/training/phrase-extract/pcfg-common/tool.h
new file mode 100644
index 000000000..0af342569
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TOOL_H_
+#define PCFG_TOOL_H_
+
+#include <boost/program_options/cmdline.hpp>
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Abstract base class for the command-line tools.  A concrete tool passes
+// its name to the protected constructor and implements Main().  The base
+// class supplies diagnostic helpers and helpers for opening input and output
+// streams.
+class Tool {
+ public:
+  virtual ~Tool() {}
+
+  // The name given at construction; used as the prefix of diagnostics.
+  const std::string &name() const { return name_; }
+
+  // Entry point of the tool; receives the program's command-line arguments.
+  virtual int Main(int argc, char *argv[]) = 0;
+
+ protected:
+  // The stream pointers are initialised to null so that they never hold an
+  // indeterminate value before OpenInputOrDie() / OpenOutputOrDie() is called.
+  Tool(const std::string &name)
+      : name_(name)
+      , input_ptr_(0)
+      , output_ptr_(0) {}
+
+  // Returns the boost::program_options style that should be used by all tools.
+  static int CommonOptionStyle() {
+    namespace cls = boost::program_options::command_line_style;
+    return cls::default_style & (~cls::allow_guessing);
+  }
+
+  // Prints "<name>: warning: <msg>" to standard error.
+  void Warn(const std::string &msg) const {
+    std::cerr << name_ << ": warning: " << msg << std::endl;
+  }
+
+  // Prints "<name>: error: <msg>" to standard error and exits with status 1.
+  void Error(const std::string &msg) const {
+    std::cerr << name_ << ": error: " << msg << std::endl;
+    std::exit(1);
+  }
+
+  // Initialises the tool's main input stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then input is standard input; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for reading.
+  std::istream &OpenInputOrDie(const std::string &filename);
+
+  // Initialises the tool's main output stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then output is standard output; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for writing.
+  std::ostream &OpenOutputOrDie(const std::string &filename);
+
+  // Opens the named input file using the supplied ifstream.  Calls Error() if
+  // the file cannot be opened for reading.
+  void OpenNamedInputOrDie(const std::string &, std::ifstream &);
+
+  // Opens the named output file using the supplied ofstream.  Calls Error() if
+  // the file cannot be opened for writing.
+  void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
+
+ private:
+  std::string name_;
+  // Main input: points at std::cin or at input_file_stream_ once opened.
+  std::istream *input_ptr_;
+  std::ifstream input_file_stream_;
+  // Main output: points at std::cout or at output_file_stream_ once opened.
+  std::ostream *output_ptr_;
+  std::ofstream output_file_stream_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/typedef.h b/scripts/training/phrase-extract/pcfg-common/typedef.h
new file mode 100644
index 000000000..49a12d681
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/typedef.h
@@ -0,0 +1,37 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TYPEDEF_H_
+#define PCFG_TYPEDEF_H_
+
+#include "numbered_set.h"
+#include "syntax_tree.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+typedef NumberedSet<std::string> Vocabulary;
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
new file mode 100644
index 000000000..5c596a0fb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -0,0 +1,85 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "xml_tree_parser.h"
+
+#include "exception.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Nothing to set up: every member is default-constructed.
+XmlTreeParser::XmlTreeParser() {}
+
+// Parses one line in Moses' XML parse tree format and returns the
+// corresponding PcfgTree.
+//
+// Throws Exception if ProcessAndStripXMLTags() reports failure (with an empty
+// message) or throws XmlException (with that exception's message), and if
+// the parsed tree has no top node.
+std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line)
+{
+  // Work on a copy of the line, which ProcessAndStripXMLTags() takes as an
+  // argument together with the tree and label sets.
+  m_line = line;
+  m_tree.Clear();
+  try {
+    if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
+      throw Exception("");
+    }
+  } catch (const XmlException &e) {
+    throw Exception(e.getMsg());
+  }
+  m_tree.ConnectNodes();
+  SyntaxNode *root = m_tree.GetTop();
+  // An assert would vanish under NDEBUG and leave a null dereference below,
+  // so a missing root is reported like any other parse failure.
+  if (!root) {
+    throw Exception("XML tree has no root node");
+  }
+  // NOTE(review): m_line is presumably tag-free at this point, so that the
+  // tokens line up with the nodes' word positions -- confirm against
+  // ProcessAndStripXMLTags().
+  m_words = tokenize(m_line.c_str());
+  return ConvertTree(*root, m_words);
+}
+
+// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
+//
+// A SyntaxNode with children becomes a PcfgTree node whose children are the
+// recursively converted child nodes.  A SyntaxNode without children becomes
+// a PcfgTree node with a single leaf child labelled with the word at the
+// node's start position.  Throws Exception if such a node's start and end
+// positions differ.
+std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
+    const SyntaxNode &tree,
+    const std::vector<std::string> &words)
+{
+  std::auto_ptr<PcfgTree> node(new PcfgTree(tree.GetLabel()));
+  const std::vector<SyntaxNode*> &sub_nodes = tree.GetChildren();
+
+  if (!sub_nodes.empty()) {
+    // Internal node: convert each child and attach it.
+    for (size_t i = 0; i < sub_nodes.size(); ++i) {
+      assert(sub_nodes[i]);
+      std::auto_ptr<PcfgTree> converted = ConvertTree(*sub_nodes[i], words);
+      converted->set_parent(node.get());
+      node->AddChild(converted.release());
+    }
+    return node;
+  }
+
+  // Childless node: it must cover exactly one word, which becomes its leaf.
+  if (tree.GetStart() != tree.GetEnd()) {
+    std::ostringstream msg;
+    msg << "leaf node covers multiple words (" << tree.GetStart()
+        << "-" << tree.GetEnd() << "): this is currently unsupported";
+    throw Exception(msg.str());
+  }
+  std::auto_ptr<PcfgTree> word_leaf(new PcfgTree(words[tree.GetStart()]));
+  word_leaf->set_parent(node.get());
+  node->AddChild(word_leaf.release());
+  return node;
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
new file mode 100644
index 000000000..6b418c44e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
@@ -0,0 +1,56 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_PARSER_H_
+#define PCFG_XML_TREE_PARSER_H_
+
+#include "pcfg_tree.h"
+#include "SyntaxTree.h"
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Parses a string in Moses' XML parse tree format and returns a PcfgTree
+// object.
+//
+// The parser keeps its working state in members, which Parse() overwrites on
+// every call.
+class XmlTreeParser {
+ public:
+  XmlTreeParser();
+
+  // Parses a single line and returns the resulting tree; ownership passes to
+  // the caller through the auto_ptr.
+  std::auto_ptr<PcfgTree> Parse(const std::string &);
+
+ private:
+  // Recursively converts a SyntaxNode (and the words of the sentence) into a
+  // PcfgTree.
+  std::auto_ptr<PcfgTree> ConvertTree(const SyntaxNode &,
+                                      const std::vector<std::string> &);
+
+  // Label sets handed to ProcessAndStripXMLTags().
+  std::set<std::string> m_labelSet;
+  std::map<std::string, int> m_topLabelSet;
+  // Copy of the line being parsed.
+  std::string m_line;
+  // Intermediate tree filled in by ProcessAndStripXMLTags().
+  ::SyntaxTree m_tree;
+  // Tokens of m_line.
+  std::vector<std::string> m_words;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
new file mode 100644
index 000000000..347c352bb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
@@ -0,0 +1,127 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_WRITER_H_
+#define PCFG_XML_TREE_WRITER_H_
+
+#include "syntax_tree.h"
+
+#include "XmlTree.h"
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <vector>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Supplies XmlTreeWriter with the label and attributes of a tree node.  The
+// primary template only declares the two members and this header does not
+// define them, so a tree type is expected to provide a specialisation (as
+// pcfg_tree.h does for PcfgTree).
+template<typename InputTree>
+class XmlOutputHandler {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // Writes the node's XML label to the output parameter.
+  void GetLabel(const InputTree &, std::string &) const;
+
+  // Writes the node's XML attributes (name -> value) to the output parameter.
+  void GetAttributes(const InputTree &, AttributeMap &) const;
+};
+
+// Writes a tree as nested <tree> elements.  Labels and attributes are
+// obtained from the XmlOutputHandler<InputTree> base class.
+template<typename InputTree>
+class XmlTreeWriter : public XmlOutputHandler<InputTree> {
+ public:
+  typedef XmlOutputHandler<InputTree> Base;
+
+  // Writes the given (non-leaf) tree to the stream.
+  void Write(const InputTree &, std::ostream &) const;
+
+ private:
+  // Returns a copy of the string with XML special characters replaced by
+  // entity references.
+  std::string Escape(const std::string &) const;
+};
+
+// Writes the tree rooted at the given node in the form
+//
+//   <tree label="L" attr="value"> child child ... </tree>
+//
+// Leaf children are written as their escaped label; other children are
+// written recursively.  A newline is written after the closing tag of a node
+// that has no parent.  The node itself must not be a leaf.
+template<typename InputTree>
+void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
+                                     std::ostream &out) const {
+  assert(!tree.IsLeaf());
+
+  // Opening tag
+
+  std::string label;
+  Base::GetLabel(tree, label);
+  out << "<tree label=\"" << Escape(label) << "\"";
+
+  typename Base::AttributeMap attribute_map;
+  Base::GetAttributes(tree, attribute_map);
+
+  for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
+       p != attribute_map.end(); ++p) {
+    // Attribute values are escaped like the label, so that a value
+    // containing a quote, '<' or '&' cannot produce malformed XML.
+    out << " " << p->first << "=\"" << Escape(p->second) << "\"";
+  }
+
+  out << ">";
+
+  // Children
+
+  const std::vector<InputTree *> &children = tree.children();
+  for (typename std::vector<InputTree *>::const_iterator p = children.begin();
+       p != children.end(); ++p) {
+    InputTree &child = **p;
+    if (child.IsLeaf()) {
+      Base::GetLabel(child, label);
+      out << " " << Escape(label);
+    } else {
+      out << " ";
+      Write(**p, out);
+    }
+  }
+
+  // Closing tag
+  out << " </tree>";
+
+  // Only the outermost element ends the line.
+  if (tree.parent() == 0) {
+    out << std::endl;
+  }
+}
+
+// Escapes XML special characters: '<', '>', '&', '\'' and '"' are replaced by
+// the corresponding entity references; every other character is copied
+// unchanged.
+template<typename InputTree>
+std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
+  std::string escaped;
+  escaped.reserve(s.size());
+  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
+    switch (*c) {
+      case '<':
+        escaped += "&lt;";
+        break;
+      case '>':
+        escaped += "&gt;";
+        break;
+      case '&':
+        escaped += "&amp;";
+        break;
+      case '\'':
+        escaped += "&apos;";
+        break;
+      case '"':
+        escaped += "&quot;";
+        break;
+      default:
+        escaped += *c;
+        break;
+    }
+  }
+  return escaped;
+}
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/Jamfile b/scripts/training/phrase-extract/pcfg-extract/Jamfile
new file mode 100644
index 000000000..be91d6d2f
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/Jamfile
@@ -0,0 +1 @@
+# Builds the pcfg-extract executable from every .cc file in this directory,
+# using the pcfg-common target from the parent directory and the top-level
+# boost_program_options target.
+exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-extract/main.cc b/scripts/training/phrase-extract/pcfg-extract/main.cc
new file mode 100644
index 000000000..47b45afc3
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+// Program entry point: hands the command line to the PcfgExtract tool and
+// returns whatever status its Main() reports.
+int main(int argc, char *argv[]) {
+  Moses::PCFG::PcfgExtract extract_tool;
+  const int status = extract_tool.Main(argc, argv);
+  return status;
+}
diff --git a/scripts/training/phrase-extract/pcfg-extract/options.h b/scripts/training/phrase-extract/pcfg-extract/options.h
new file mode 100644
index 000000000..3acb31b58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_OPTIONS_H_
+#define PCFG_EXTRACT_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Command-line options for the pcfg-extract tool.
+struct Options {
+  // NOTE(review): presumably the path of the corpus to read -- confirm where
+  // (and whether) this field is populated.
+  std::string corpus_file;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
new file mode 100644
index 000000000..151c9959c
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -0,0 +1,131 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+#include "options.h"
+#include "rule_collection.h"
+#include "rule_extractor.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: parses the command line, reads one tree per line from
+// standard input, collects the rules found in every tree, and finally writes
+// the resulting PCFG to standard output.  Lines that yield no tree are
+// reported with a warning and skipped; parse exceptions are passed to Error().
+// Returns 0.
+int PcfgExtract::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Collect rule counts over the whole corpus.
+  Vocabulary non_term_vocab;
+  RuleExtractor extractor(non_term_vocab);
+  RuleCollection rules;
+  XmlTreeParser parser;
+  std::string line;
+  std::auto_ptr<PcfgTree> tree;
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+    if (tree.get()) {
+      extractor.Extract(*tree, rules);
+    } else {
+      std::ostringstream msg;
+      msg << "no tree at line " << line_num;
+      Warn(msg.str());
+    }
+  }
+
+  // Turn the counts into scores and emit the grammar.
+  Pcfg pcfg;
+  rules.CreatePcfg(pcfg);
+  pcfg.Write(non_term_vocab, std::cout);
+
+  return 0;
+}
+
+// Parses the command line.  The only option defined is "help", which prints
+// the usage text to standard output and exits with status 0.  A failure while
+// parsing is reported through Error() together with the usage text.  No field
+// of |options| is written here.
+void PcfgExtract::ProcessOptions(int argc, char *argv[],
+                                 Options &options) const {
+  namespace po = boost::program_options;
+
+  // The usage banner is used as the caption of the visible option group.
+  std::ostringstream banner;
+  banner << "Usage: " << name() << "\n\n" << "Options";
+
+  // Options that are shown to the user.
+  po::options_description visible(banner.str());
+  visible.add_options()("help", "print help message and exit");
+
+  // Options that are hidden from the user (intended for positional
+  // arguments; none are defined for this tool).
+  po::options_description hidden("Hidden options");
+  hidden.add_options();
+
+  // Everything the parser should recognise.
+  po::options_description all_options;
+  all_options.add(visible).add(hidden);
+
+  // No positional options are registered.
+  po::positional_options_description positional;
+
+  po::variables_map vm;
+  try {
+    po::store(po::command_line_parser(argc, argv)
+                  .style(CommonOptionStyle())
+                  .options(all_options)
+                  .positional(positional)
+                  .run(),
+              vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") > 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
new file mode 100644
index 000000000..1af6cb4fe
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_
+#define PCFG_EXTRACT_PCFG_EXTRACT_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Options is defined as a struct (see options.h), so it is forward-declared
+// with the same class-key.  Mixing "class" and "struct" for one type draws
+// warnings from some compilers.
+struct Options;
+
+// The pcfg-extract command-line tool.
+class PcfgExtract : public Tool {
+ public:
+  PcfgExtract() : Tool("pcfg-extract") {}
+  // Runs the tool with the given command-line arguments and returns the
+  // value to be used as the process exit status.
+  virtual int Main(int, char *[]);
+ private:
+  // Parses the command-line arguments on behalf of Main().
+  void ProcessOptions(int, char *[], Options &) const;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
new file mode 100644
index 000000000..503b1a9e6
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
@@ -0,0 +1,58 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "rule_collection.h"
+
+#include "pcfg-common/pcfg.h"
+
+#include <cmath>
+
+namespace Moses {
+namespace PCFG {
+
+// Records one more occurrence of the rule lhs -> rhs, creating the entry
+// (with a count of one) if the rule has not been seen before.
+void RuleCollection::Add(size_t lhs, const std::vector<size_t> &rhs) {
+  RhsCountMap &counts_for_lhs = collection_[lhs];
+  ++counts_for_lhs[rhs];
+}
+
+// Adds every collected rule to |pcfg|.  A rule's key is its left-hand side id
+// followed by its right-hand side ids; its score is log(count / total), where
+// total is the summed count of all rules that share the same left-hand side.
+void RuleCollection::CreatePcfg(Pcfg &pcfg) {
+  std::vector<size_t> key;
+  for (const_iterator entry = begin(); entry != end(); ++entry) {
+    const RhsCountMap &rhs_counts = entry->second;
+
+    // First pass: total count for this left-hand side.
+    size_t lhs_total = 0;
+    RhsCountMap::const_iterator r;
+    for (r = rhs_counts.begin(); r != rhs_counts.end(); ++r) {
+      lhs_total += r->second;
+    }
+    const double denominator = static_cast<double>(lhs_total);
+
+    // Second pass: score each rule relative to that total.
+    for (r = rhs_counts.begin(); r != rhs_counts.end(); ++r) {
+      const double numerator = static_cast<double>(r->second);
+      key.clear();
+      key.push_back(entry->first);
+      key.insert(key.end(), r->first.begin(), r->first.end());
+      pcfg.Add(key, std::log(numerator / denominator));
+    }
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
new file mode 100644
index 000000000..1b768dd21
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
@@ -0,0 +1,59 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_
+#define PCFG_EXTRACT_RULE_COLLECTION_H_
+
+#include "pcfg-common/pcfg.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Contains PCFG rules and their counts.  Rules are grouped by left-hand side:
+// each left-hand side id maps to a table from right-hand side (a sequence of
+// ids) to the number of times that rule has been added.
+class RuleCollection {
+ public:
+  // Maps a rule's right-hand side to its count.
+  typedef boost::unordered_map<std::vector<size_t>, size_t> RhsCountMap;
+  // Maps a left-hand side id to the counts of its right-hand sides.
+  typedef boost::unordered_map<size_t, RhsCountMap> Map;
+  typedef Map::iterator iterator;
+  typedef Map::const_iterator const_iterator;
+
+  RuleCollection() {}
+
+  // Iteration over the (left-hand side, RhsCountMap) entries.
+  iterator begin() { return collection_.begin(); }
+  const_iterator begin() const { return collection_.begin(); }
+
+  iterator end() { return collection_.end(); }
+  const_iterator end() const { return collection_.end(); }
+
+  // Increments the count of the rule with the given left-hand side and
+  // right-hand side.
+  void Add(size_t, const std::vector<size_t> &);
+  // Adds the collected rules, with scores derived from their counts, to the
+  // given Pcfg.
+  void CreatePcfg(Pcfg &);
+
+ private:
+  Map collection_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
new file mode 100644
index 000000000..48a82a6d0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -0,0 +1,51 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "rule_extractor.h"
+
+#include "pcfg-common/pcfg_tree.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the extractor to |non_term_vocab|.  Only a reference is kept, so the
+// vocabulary must outlive the extractor.
+RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
+    : non_term_vocab_(non_term_vocab) {}
+
+// Adds to |rc| one rule for |tree| (its label rewritten as the labels of its
+// children) and, recursively, the rules of every subtree.  Preterminal and
+// leaf nodes contribute no rule.  Labels are inserted into the vocabulary as
+// they are encountered.
+void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const {
+  if (tree.IsPreterminal() || tree.IsLeaf()) {
+    return;
+  }
+
+  const size_t lhs_id = non_term_vocab_.Insert(tree.label());
+
+  const std::vector<PcfgTree *> &kids = tree.children();
+  std::vector<size_t> rhs_ids;
+  rhs_ids.reserve(kids.size());
+  for (size_t i = 0; i < kids.size(); ++i) {
+    const PcfgTree &kid = *kids[i];
+    rhs_ids.push_back(non_term_vocab_.Insert(kid.label()));
+    // Descend before recording this node's rule, as the children's rules
+    // are added to the collection first.
+    Extract(kid, rc);
+  }
+  rc.Add(lhs_id, rhs_ids);
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
new file mode 100644
index 000000000..6bcffbc61
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
@@ -0,0 +1,45 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
+#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
+
+#include "rule_collection.h"
+
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+class PcfgTree;
+
+// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
+// The extractor keeps a reference to a Vocabulary, which it uses to map node
+// labels to ids.
+class RuleExtractor {
+ public:
+  // The given Vocabulary is held by reference, not copied.
+  RuleExtractor(Vocabulary &);
+  // Adds the rules found in the given tree to the given RuleCollection.
+  void Extract(const PcfgTree &, RuleCollection &) const;
+ private:
+  Vocabulary &non_term_vocab_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/Jamfile b/scripts/training/phrase-extract/pcfg-score/Jamfile
new file mode 100644
index 000000000..7225381c0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/Jamfile
@@ -0,0 +1 @@
+# Builds the pcfg-score executable from every .cc file in this directory,
+# using the pcfg-common target from the parent directory and the top-level
+# boost_program_options target.
+exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-score/main.cc b/scripts/training/phrase-extract/pcfg-score/main.cc
new file mode 100644
index 000000000..da5392add
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+// Program entry point: hands the command line to the PcfgScore tool and
+// returns whatever status its Main() reports.
+int main(int argc, char *argv[]) {
+  Moses::PCFG::PcfgScore score_tool;
+  const int status = score_tool.Main(argc, argv);
+  return status;
+}
diff --git a/scripts/training/phrase-extract/pcfg-score/options.h b/scripts/training/phrase-extract/pcfg-score/options.h
new file mode 100644
index 000000000..e54b2a0b9
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_OPTIONS_H_
+#define PCFG_SCORE_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Command-line options for the pcfg-score tool.
+struct Options {
+  // Name of the PCFG file to read; supplied as the tool's positional
+  // "pcfg-file" argument.
+  std::string pcfg_file;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
new file mode 100644
index 000000000..d780200ad
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
@@ -0,0 +1,152 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+#include "options.h"
+#include "tree_scorer.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: parses the command line, loads the PCFG named there, then
+// reads one tree per line from standard input, scores it and writes the
+// scored tree to standard output.  When a line yields no tree, or the tree
+// cannot be scored, a warning is issued and an empty line is written in place
+// of the tree.  Parse exceptions are passed to Error().  Returns 0.
+int PcfgScore::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Load the grammar.
+  std::ifstream pcfg_stream;
+  OpenNamedInputOrDie(options.pcfg_file, pcfg_stream);
+  Pcfg pcfg;
+  Vocabulary non_term_vocab;
+  pcfg.Read(pcfg_stream, non_term_vocab);
+
+  // Score the corpus tree by tree.
+  TreeScorer scorer(pcfg, non_term_vocab);
+  XmlTreeParser parser;
+  XmlTreeWriter<PcfgTree> writer;
+  std::string line;
+  std::auto_ptr<PcfgTree> tree;
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+
+    // Scoring is attempted only when a tree was actually produced.
+    const char *problem = 0;
+    if (!tree.get()) {
+      problem = "no tree at line ";
+    } else if (!scorer.Score(*tree)) {
+      problem = "failed to score tree at line ";
+    }
+
+    if (problem) {
+      std::ostringstream msg;
+      msg << problem << line_num;
+      Warn(msg.str());
+      std::cout << std::endl;
+      continue;
+    }
+    writer.Write(*tree, std::cout);
+  }
+
+  return 0;
+}
+
+// Parses the command line into |options|.  One positional argument, the PCFG
+// file, is required and is stored in options.pcfg_file.  The "help" option
+// prints the usage text to standard output and exits with status 0.  A parse
+// failure or a missing PCFG argument is reported through Error() together
+// with the usage text.
+void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
+  namespace po = boost::program_options;
+
+  // The usage banner is used as the caption of the visible option group.
+  std::ostringstream banner;
+  banner << "Usage: " << name() << " PCFG\n\n"
+         << "Options";
+
+  // Options that are shown to the user.
+  po::options_description visible(banner.str());
+  visible.add_options()("help", "print help message and exit");
+
+  // Options that are hidden from the user; they back the positional
+  // arguments.
+  po::options_description hidden("Hidden options");
+  hidden.add_options()(
+      "pcfg-file", po::value(&options.pcfg_file), "pcfg file");
+
+  // Everything the parser should recognise.
+  po::options_description all_options;
+  all_options.add(visible).add(hidden);
+
+  // The single positional argument is the PCFG file.
+  po::positional_options_description positional;
+  positional.add("pcfg-file", 1);
+
+  po::variables_map vm;
+  try {
+    po::store(po::command_line_parser(argc, argv)
+                  .style(CommonOptionStyle())
+                  .options(all_options)
+                  .positional(positional)
+                  .run(),
+              vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") > 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+
+  // The positional argument is mandatory.
+  if (vm.count("pcfg-file") == 0) {
+    std::ostringstream msg;
+    msg << "missing required argument\n\n" << visible << std::endl;
+    Error(msg.str());
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.h b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
new file mode 100644
index 000000000..5e506c39d
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_PCFG_SCORE_H_
+#define PCFG_SCORE_PCFG_SCORE_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Options is defined as a struct (see options.h), so it is forward-declared
+// with the same class-key.  Mixing "class" and "struct" for one type draws
+// warnings from some compilers.
+struct Options;
+
+// The pcfg-score command-line tool.
+class PcfgScore : public Tool {
+ public:
+  PcfgScore() : Tool("pcfg-score") {}
+  // Runs the tool with the given command-line arguments and returns the
+  // value to be used as the process exit status.
+  virtual int Main(int, char *[]);
+ private:
+  // Parses the command-line arguments into the given Options object.
+  void ProcessOptions(int, char *[], Options &) const;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
new file mode 100644
index 000000000..5f695e4fc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
@@ -0,0 +1,68 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "tree_scorer.h"
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the scorer to a grammar and its non-terminal vocabulary.  Both are
+// kept by reference, so they must outlive the scorer.
+TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
+    : pcfg_(pcfg), non_term_vocab_(non_term_vocab) {}
+
+// Scores the subtree rooted at |root|, storing in each scored node the sum of
+// its own rule score and the scores of its non-preterminal children.
+// Preterminal and leaf nodes are left untouched and count as success.
+// Returns false as soon as a rule is not found in the PCFG; nodes scored
+// before that point keep their scores.
+bool TreeScorer::Score(PcfgTree &root) const {
+  if (root.IsPreterminal() || root.IsLeaf()) {
+    return true;
+  }
+
+  const std::vector<PcfgTree *> &kids = root.children();
+
+  // The lookup key is the id of the root label followed by the ids of the
+  // child labels.
+  std::vector<size_t> rule_key;
+  rule_key.reserve(kids.size() + 1);
+  rule_key.push_back(non_term_vocab_.Lookup(root.label()));
+
+  double log_prob = 0.0;
+  for (size_t i = 0; i < kids.size(); ++i) {
+    PcfgTree &kid = *kids[i];
+    assert(!kid.IsLeaf());
+    rule_key.push_back(non_term_vocab_.Lookup(kid.label()));
+    if (!Score(kid)) {
+      return false;
+    }
+    // Preterminals are never assigned a score, so they are left out of
+    // the sum.
+    if (!kid.IsPreterminal()) {
+      log_prob += kid.score();
+    }
+  }
+
+  double rule_score;
+  if (!pcfg_.Lookup(rule_key, rule_score)) {
+    return false;
+  }
+  log_prob += rule_score;
+  root.set_score(log_prob);
+  return true;
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.h b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
new file mode 100644
index 000000000..36f4e1e99
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
@@ -0,0 +1,47 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_TREE_SCORER_H_
+#define PCFG_SCORE_TREE_SCORER_H_
+
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Assigns scores to the nodes of a PcfgTree using the rule scores of a Pcfg.
+// The Pcfg and the Vocabulary are held by reference, not copied.
+class TreeScorer {
+ public:
+  TreeScorer(const Pcfg &, const Vocabulary &);
+
+  // Score tree according to PCFG.  Returns false if unsuccessful (due to
+  // missing rule).
+  bool Score(PcfgTree &) const;
+
+ private:
+  const Pcfg &pcfg_;  // grammar supplying the rule scores
+  const Vocabulary &non_term_vocab_;  // maps node labels to the ids used in rule keys
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp
index af7401132..c5fb0b99f 100644
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@@ -32,6 +32,7 @@
 #include "PhraseAlignment.h"
 #include "score.h"
 #include "InputFileStream.h"
+#include "OutputFileStream.h"
 
 using namespace std;
 
@@ -56,7 +57,7 @@ public:
 
 vector<string> tokenize( const char [] );
 
-void writeCountOfCounts( const char* fileNameCountOfCounts );
+void writeCountOfCounts( const string &fileNameCountOfCounts );
 void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
 PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
 void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile );
@@ -71,6 +72,7 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
 LexicalTable lexTable;
 bool inverseFlag = false;
 bool hierarchicalFlag = false;
+bool pcfgFlag = false;
 bool wordAlignmentFlag = false;
 bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
@@ -91,13 +93,13 @@ int main(int argc, char* argv[])
        << "scoring methods for extracted rules\n";
 
   if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n";
     exit(1);
   }
   char* fileNameExtract = argv[1];
   char* fileNameLex = argv[2];
   char* fileNamePhraseTable = argv[3];
-  char* fileNameCountOfCounts;
+  string fileNameCountOfCounts;
   char* fileNameFunctionWords;
 
   for(int i=4; i<argc; i++) {
@@ -107,6 +109,9 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--Hierarchical") == 0) {
       hierarchicalFlag = true;
       cerr << "processing hierarchical rules\n";
+    } else if (strcmp(argv[i],"--PCFG") == 0) {
+      pcfgFlag = true;
+      cerr << "including PCFG scores\n";
     } else if (strcmp(argv[i],"--WordAlignment") == 0) {
       wordAlignmentFlag = true;
       cerr << "outputing word alignment" << endl;
@@ -115,19 +120,11 @@ int main(int argc, char* argv[])
       cerr << "not computing lexical translation score\n";
     } else if (strcmp(argv[i],"--GoodTuring") == 0) {
       goodTuringFlag = true;
-      if (i+1==argc) { 
-        cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
-        exit(1);
-      }
-      fileNameCountOfCounts = argv[++i];
+			fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
       cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
     } else if (strcmp(argv[i],"--KneserNey") == 0) {
       kneserNeyFlag = true;
-      if (i+1==argc) { 
-        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
-        exit(1);
-      }
-      fileNameCountOfCounts = argv[++i];
+			fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
       cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
     } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
       unalignedFlag = true;
@@ -188,9 +185,9 @@ int main(int argc, char* argv[])
 		phraseTableFile = &cout;
 	}
 	else {
-		ofstream *outputFile = new ofstream();
-		outputFile->open(fileNamePhraseTable);
-		if (outputFile->fail()) {
+		Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
+		bool success = outputFile->Open(fileNamePhraseTable);
+		if (!success) {
 			cerr << "ERROR: could not open file phrase table file "
 					 << fileNamePhraseTable << endl;
 			exit(1);
@@ -200,6 +197,7 @@ int main(int argc, char* argv[])
 	
   // loop through all extracted phrase translations
   float lastCount = 0.0f;
+  float lastPcfgSum = 0.0f;
   vector< PhraseAlignment > phrasePairsWithSameF;
   int i=0;
   char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
@@ -214,6 +212,7 @@ int main(int argc, char* argv[])
     // identical to last line? just add count
     if (strcmp(line,lastLine) == 0) {
       lastPhrasePair->count += lastCount;
+      lastPhrasePair->pcfgSum += lastPcfgSum;
       continue;
     }
     strcpy( lastLine, line );
@@ -222,10 +221,12 @@ int main(int argc, char* argv[])
     PhraseAlignment phrasePair;
     phrasePair.create( line, i );
     lastCount = phrasePair.count;
+    lastPcfgSum = phrasePair.pcfgSum;
 
     // only differs in count? just add count
     if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
       lastPhrasePair->count += phrasePair.count;
+      lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
       continue;
     }
 
@@ -245,7 +246,6 @@ int main(int argc, char* argv[])
 	
 	phraseTableFile->flush();
 	if (phraseTableFile != &cout) {
-		(dynamic_cast<ofstream*>(phraseTableFile))->close();
 		delete phraseTableFile;
 	}
 
@@ -255,12 +255,12 @@ int main(int argc, char* argv[])
   }
 }
 
-void writeCountOfCounts( const char* fileNameCountOfCounts )
+void writeCountOfCounts( const string &fileNameCountOfCounts )
 {
   // open file
-	ofstream countOfCountsFile;
-	countOfCountsFile.open(fileNameCountOfCounts);
-	if (countOfCountsFile.fail()) {
+	Moses::OutputFileStream countOfCountsFile;
+	bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
+	if (!success) {
 		cerr << "ERROR: could not open count-of-counts file "
 				 << fileNameCountOfCounts << endl;
     return;
@@ -273,7 +273,7 @@ void writeCountOfCounts( const char* fileNameCountOfCounts )
   for(int i=1; i<=COC_MAX; i++) {
     countOfCountsFile << countOfCounts[ i ] << endl;
   }
-	countOfCountsFile.close();
+	countOfCountsFile.Close();
 }
 
 void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
@@ -446,6 +446,16 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
       countOfCounts[ countInt ]++;
   }
 
+  // compute PCFG score
+  float pcfgScore;
+  if (pcfgFlag && !inverseFlag) {
+    float pcfgSum = 0;
+    for(size_t i=0; i<phrasePair.size(); ++i) {
+        pcfgSum += phrasePair[i]->pcfgSum;
+    }
+    pcfgScore = pcfgSum / count;
+  }
+
   // output phrases
   const PHRASE &phraseS = phrasePair[0]->GetSource();
   const PHRASE &phraseT = phrasePair[0]->GetTarget();
@@ -501,6 +511,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
     phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
   }
 
+  // target-side PCFG score
+  if (pcfgFlag && !inverseFlag) {
+    phraseTableFile << " " << pcfgScore;
+  }
+
   phraseTableFile << " ||| ";
 
   // alignment info for non-terminals
diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir
index d3748fdc9..869f979fc 100755
--- a/scripts/training/train-model.perl.missing_bin_dir
+++ b/scripts/training/train-model.perl.missing_bin_dir
@@ -19,7 +19,7 @@ if ($SCRIPTS_ROOTDIR eq '') {
 $SCRIPTS_ROOTDIR =~ s/\/training$//;
 $SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
 
-my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_CORPUS,
+my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE,  $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS,
    $_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
    $_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_VERBOSE, $_ALIGNMENT,
    $_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
@@ -29,18 +29,19 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
    $_DECODING_GRAPH_BACKOFF,
    $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
    @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
-   $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS,  $_HMM_ALIGN, $_CONFIG,
-   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS,
+   $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
+   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS,
    $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
    $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
    $_ADDITIONAL_INI,
    $_DICTIONARY, $_EPPEX);
+my $_CORES = 1;
 
 my $debug = 0; # debug this script, do not delete any files in debug mode
 
 # the following line is set installation time by 'make release'.  BEWARE!
-my $BINDIR="/Users/hieuhoang/workspace/bin/training-tools";
+my $BINDIR="/home/hieu/workspace/bin/training-tools/";
 
 $_HELP = 1
     unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
@@ -57,7 +58,9 @@ $_HELP = 1
 		       'model-dir=s' => \$_MODEL_DIR,
 		       'temp-dir=s' => \$_TEMP_DIR,
            'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
-           'sort-batch-size=s' => \$_SORT_BATCH_SIZE,
+           'sort-batch-size=i' => \$_SORT_BATCH_SIZE,
+           'sort-compress=s' => \$_SORT_COMPRESS,
+           'sort-parallel=i' => \$_SORT_PARALLEL,
 		       'extract-file=s' => \$_EXTRACT_FILE,
 		       'alignment=s' => \$_ALIGNMENT,
 		       'alignment-file=s' => \$_ALIGNMENT_FILE,
@@ -72,6 +75,7 @@ $_HELP = 1
 		       'help' => \$_HELP,
 		       'mgiza' => \$_MGIZA, # multi-thread 
 		       'mgiza-cpus=i' => \$_MGIZA_CPUS, # multi-thread 
+		       'snt2cooc=s' => \$_SNT2COOC, # override snt2cooc exe. For when you want to run reduced memory snt2cooc.perl from mgiza
 		       'hmm-align' => \$_HMM_ALIGN,
 		       'final-alignment-model=s' => \$_FINAL_ALIGNMENT_MODEL, # use word alignment model 1/2/hmm/3/4/5 as final (default is 4); value 'hmm' equivalent to the --hmm-align switch
 		       'debug' => \$debug,
@@ -101,6 +105,7 @@ $_HELP = 1
 		       'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
 		       'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
 		       'ghkm' => \$_GHKM,
+		       'pcfg' => \$_PCFG,
 		       'extract-options=s' => \$_EXTRACT_OPTIONS,
 		       'score-options=s' => \$_SCORE_OPTIONS,
 		       'source-syntax' => \$_SOURCE_SYNTAX,
@@ -114,7 +119,8 @@ $_HELP = 1
 		       'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
 		       'dictionary=s' => \$_DICTIONARY,
 		       'eppex:s' => \$_EPPEX,
-           'additional-ini=s' => \$_ADDITIONAL_INI
+           'additional-ini=s' => \$_ADDITIONAL_INI, 
+           'cores=i' => \$_CORES
                );
 
 if ($_HELP) {
@@ -185,29 +191,63 @@ my $SNT2COOC;
 if(!defined $_MGIZA ){
 	$GIZA = "$BINDIR/GIZA++";
 	if (-x "$BINDIR/snt2cooc.out") {
-   		$SNT2COOC = "$BINDIR/snt2cooc.out";
+  	$SNT2COOC = "$BINDIR/snt2cooc.out";
 	} elsif (-x "$BINDIR/snt2cooc") { # Since "snt2cooc.out" and "snt2cooc" work the same   
 		$SNT2COOC = "$BINDIR/snt2cooc";
 	}
 	print STDERR "Using single-thread GIZA\n";
 } else {
-    	$GIZA = "$BINDIR/mgiza";
+  $GIZA = "$BINDIR/mgiza";
 	if (-x "$BINDIR/snt2cooc") {
-                $SNT2COOC = "$BINDIR/snt2cooc";
-        } elsif (-x "$BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $BINDIR 
-                $SNT2COOC = "$BINDIR/snt2cooc.out";
-        }
+  	$SNT2COOC = "$BINDIR/snt2cooc";
+  } elsif (-x "$BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $BINDIR 
+    $SNT2COOC = "$BINDIR/snt2cooc.out";
+  }
 	print STDERR "Using multi-thread GIZA\n";	
-    	if (!defined($_MGIZA_CPUS)) {
-        	$_MGIZA_CPUS=4;
-    	}
-    die("ERROR: Cannot find $MGIZA_MERGE_ALIGN") unless (-x $MGIZA_MERGE_ALIGN);
+  if (!defined($_MGIZA_CPUS)) {
+  	$_MGIZA_CPUS=4;
+  }
+  die("ERROR: Cannot find $MGIZA_MERGE_ALIGN") unless (-x $MGIZA_MERGE_ALIGN);
 }
 
+# override
+$SNT2COOC = "$BINDIR/$_SNT2COOC" if defined($_SNT2COOC);
+
 my $MKCLS = "$BINDIR/mkcls";
 
+# parallel extract
+my $SPLIT_EXEC = `gsplit --help 2>/dev/null`; 
+if($SPLIT_EXEC) {
+  $SPLIT_EXEC = 'gsplit';
+}
+else {
+  $SPLIT_EXEC = 'split';
+}
+
+my $SORT_EXEC = `gsort --help 2>/dev/null`; 
+if($SORT_EXEC) {
+  $SORT_EXEC = 'gsort';
+}
+else {
+  $SORT_EXEC = 'sort';
+}
+
+my $__SORT_BUFFER_SIZE = "";
+$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
+
+my $__SORT_BATCH_SIZE = "";
+$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
+
+my $__SORT_COMPRESS = "";
+$__SORT_COMPRESS = "--compress-program $_SORT_COMPRESS" if $_SORT_COMPRESS;
+
+my $__SORT_PARALLEL = "";
+$__SORT_PARALLEL = "--parallel $_SORT_PARALLEL" if $_SORT_PARALLEL;
+
 # supporting scripts/binaries from this package
 my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
+$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_EXTRACT";
+
 my $RULE_EXTRACT;
 if (defined($_GHKM)) {
   $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm";
@@ -215,12 +255,17 @@ if (defined($_GHKM)) {
 else {
   $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
 }
+$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $RULE_EXTRACT";
+
 my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
 my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
 my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex";
 my $SYMAL = "$SCRIPTS_ROOTDIR/training/symal/symal";
 my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl";
+
 my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/training/phrase-extract/score";
+$PHRASE_SCORE = "$SCRIPTS_ROOTDIR/generic/score-parallel.perl $_CORES \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_SCORE";
+
 my $PHRASE_CONSOLIDATE = "$SCRIPTS_ROOTDIR/training/phrase-extract/consolidate";
 
 # utilities
@@ -308,12 +353,6 @@ $_DONT_ZIP = $___DONT_ZIP unless $___DONT_ZIP;
 my $___TEMP_DIR = $___MODEL_DIR;
 $___TEMP_DIR = $_TEMP_DIR if $_TEMP_DIR;
 
-my $__SORT_BUFFER_SIZE = "";
-$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
-
-my $__SORT_BATCH_SIZE = "";
-$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
-
 my $___CONTINUE = 0; 
 $___CONTINUE = $_CONTINUE if $_CONTINUE;
 
@@ -1335,6 +1374,7 @@ sub extract_phrase {
         $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file";
         $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
         $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
+        $cmd .= " --PCFG" if $_PCFG;
         if (!defined($_GHKM)) {
           $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
           $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
@@ -1362,20 +1402,16 @@ sub extract_phrase {
         $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
       }
     }
+    
+    $cmd .= " --GZOutput ";
+    
     map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
     print STDERR "$cmd\n";
     safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)";
     foreach my $f (@tempfiles) {
       unlink $f;
     }
-    if (! $___DONT_ZIP) { 
-      safesystem("gzip $extract_file.o") if -e "$extract_file.o";
-      safesystem("gzip $extract_file.sid") if -e "$extract_file.sid";
-      if ($ttable_flag) {
-        safesystem("gzip $extract_file.inv") or die("ERROR");
-        safesystem("gzip $extract_file") or die("ERROR");
-      }
-    }
+    
 }
 
 ### (6) PHRASE SCORING
@@ -1457,41 +1493,32 @@ sub score_phrase_phrase_extract {
 	          $inverse = " --Inverse";
                   $extract_filename = $extract_file.".inv";
               }
-	      my $extract = "$extract_filename.sorted";
-
-	      if (!($___CONTINUE && -e "$extract_filename.sorted")) {
-	          # sorting
-	          print STDERR "(6.".($substep++).")  sorting $direction @ ".`date`;
-	          if (-e "$extract_filename.gz") {
-		      safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
-	          }
-	          else {
-		      safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
-	          }
-              }
+	      my $extract = "$extract_filename.sorted.gz";
 
 	      print STDERR "(6.".($substep++).")  creating table half $ttable_file.half.$direction @ ".`date`;
 
         my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
         $cmd .= " --Hierarchical" if $_HIERARCHICAL;
         $cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
-        $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
-        $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq "";
+        $cmd .= " --KneserNey" if $KNESER_NEY;
+        $cmd .= " --GoodTuring" if $GOOD_TURING && $inverse eq "";
         $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
         $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
         $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
+        $cmd .= " --PCFG" if $_PCFG;
         $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
-        print $cmd."\n";
+
+				# sorting
+				if ($direction eq "e2f") {
+					$cmd .= " 1 ";
+				}
+				else {
+					$cmd .= " 0 ";
+				}
+
+      print $cmd."\n";
         safesystem($cmd) or die "ERROR: Scoring of phrases failed";	    
-        if (! $debug) { safesystem("rm -f $extract") or die("ERROR"); }
   
-        # sorting inverse phrase-table-half to sync up with regular one
-        if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) {
-          print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`;
-          safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f > $ttable_file.half.e2f.sorted") or die("ERROR");
-          if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); }
-        }
-
         exit();
       }
       else
@@ -1516,20 +1543,17 @@ sub score_phrase_phrase_extract {
     # merging the two halves
     print STDERR "(6.6) consolidating the two halves @ ".`date`;
     return if $___CONTINUE && -e "$ttable_file.gz";
-    my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
+    my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.sorted.gz $ttable_file.gz";
     $cmd .= " --Hierarchical" if $_HIERARCHICAL;
     $cmd .= " --LogProb" if $LOG_PROB;
     $cmd .= " --NegLogProb" if $NEG_LOG_PROB;
     $cmd .= " --OnlyDirect" if $ONLY_DIRECT;
     $cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
     $cmd .= " --LowCountFeature" if $LOW_COUNT;
-    $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING;
-    $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
+    $cmd .= " --GoodTuring $ttable_file.half.f2e.coc" if $GOOD_TURING;
+    $cmd .= " --KneserNey $ttable_file.half.f2e.coc" if $KNESER_NEY;
     safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
     if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
-    if (! $___DONT_ZIP) {
-        safesystem("gzip $ttable_file") || die("ERROR: could not gzip $ttable_file");
-    }
 }
 
 sub score_phrase_memscore {
@@ -1597,35 +1621,27 @@ sub get_reordering_factored {
 }
 
 sub get_reordering {
-    my ($extract_file,$reo_model_path) = @_;
-    if (-e "$extract_file.o.gz") {
-	safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
-    }
-    else {
-        safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o > $extract_file.o.sorted") or die("ERROR");
-    }
-
-    my $smooth = $___REORDERING_SMOOTH;
-
-    print STDERR "(7.2) building tables @ ".`date`;
-
-    #create cmd string for lexical reordering scoring
-    my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted $smooth $reo_model_path";
-    $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
-    for my $mtype (keys %REORDERING_MODEL_TYPES) {
-	$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
-	foreach my $model (@REORDERING_MODELS) {
-	    if ($model->{"type"} eq $mtype) {
-		$cmd .= " ".$model->{"filename"};
-	    }
+	my ($extract_file,$reo_model_path) = @_; # extract file stem, and the path handed to the scorer as its third argument
+	my $smooth = $___REORDERING_SMOOTH; # smoothing spec, passed to the scorer as its second argument
+	
+	print STDERR "(7.2) building tables @ ".`date`;
+	
+	# Create the cmd string for lexical reordering scoring. The scorer reads $extract_file.o.sorted.gz directly: this sub no longer sorts the extract file itself, nor removes the sorted file afterwards.
+	my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path";
+	$cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/); # only when the spec ends in "u" with at least one character before it
+	for my $mtype (keys %REORDERING_MODEL_TYPES) { # one quoted --model argument per reordering model type
+		$cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
+		foreach my $model (@REORDERING_MODELS) {
+			if ($model->{"type"} eq $mtype) { # append the filename of every model of this type
+				$cmd .= " ".$model->{"filename"};
+			}
+		}
+		$cmd .= "\""; # close the quote opened by --model
 	}
-	$cmd .= "\"";
-    }
-    
-    #Call the lexical reordering scorer
-    safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
-
-    if (! $debug) { safesystem("rm $extract_file.o.sorted") or die("ERROR");}
+	
+	# Call the lexical reordering scorer; the script dies if safesystem returns false.
+	safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
+	
 }
 
 
@@ -1788,6 +1804,7 @@ sub create_ini {
    $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
    $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
    $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
+   $basic_weight_count++ if $_PCFG;
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
      $num_of_ttables++;
      my $ff = $f;
author	phikoehn <pkoehn@inf.ed.ac.uk>	2012-05-26 03:10:08 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2012-05-26 03:10:08 +0400
commit	07fafd51b509e93db7be238107325c45ca5f57cd (patch)
tree	e8cccec20ee05726accd9f87a041b68370adeaab /scripts/training
parent	561b9ac9567d3e5b0bbc56fdae3b29961b8bc728 (diff)
parent	a72744c49b7821bf0355e7fe4638c392a74b0d60 (diff)