Add an optional PCFG scoring feature for target syntax models (similar to
the p_cfg feature used in Marcu, Wang, Echihabi, and Knight (2006)).
author: Phil Williams <philip.williams@mac.com> 2012-05-25 20:29:47 +0400
committer: Phil Williams <philip.williams@mac.com> 2012-05-25 20:29:47 +0400
commit: 90c0bc9f5ceec4e7d33386ec597fd753e7d23d4a (patch)
tree: 2e4aa63e87c6150a5317e3e8bae3cc00d9187db3
parent: 2fab137aaeeda8077734e4c6e5627bfb44d27691 (diff)
55 files changed, 1970 insertions, 51 deletions
diff --git a/scripts/Jamfile b/scripts/Jamfile
index 6fb9bad39..b9eefcffe 100644
--- a/scripts/Jamfile
+++ b/scripts/Jamfile
@@ -42,6 +42,8 @@ if $(location) {
   install compactify : training/compact-rule-table//compactify : <location>$(location)/training/compact-rule-table/tools ;
 
   install phrase-extract : training/phrase-extract//programs : <location>$(location)/training/phrase-extract ;
+  install pcfg-extract : training/phrase-extract/pcfg-extract//pcfg-extract : <location>$(location)/training/phrase-extract/pcfg-extract ;
+  install pcfg-score : training/phrase-extract/pcfg-score//pcfg-score : <location>$(location)/training/phrase-extract/pcfg-score ;
   install lexical-reordering : training/lexical-reordering//score : <location>$(location)/training/lexical-reordering ;
   install symal : training/symal//symal : <location>$(location)/training/symal ;
 
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 51ac0f67a..b33c589d2 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -344,8 +344,21 @@ parse-relax
 	pass-unless: input-parse-relaxer output-parse-relaxer
 	template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension
 	template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension
+pcfg-extract
+  in: parse-relaxed-corpus
+  out: pcfg
+  default-name: model/pcfg
+  ignore-unless: use-pcfg-feature
+  rerun-on-change: use-pcfg-feature
+  template: $moses-script-dir/training/phrase-extract/pcfg-extract/pcfg-extract < IN.$output-extension > OUT.$output-extension
+pcfg-score
+  in: parse-relaxed-corpus pcfg
+  out: scored-corpus
+  default-name: model/scored-corpus
+  pass-unless: use-pcfg-feature
+  template: ln -s IN.$input-extension OUT.$input-extension ; $moses-script-dir/training/phrase-extract/pcfg-score/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
 extract-phrases
-	in: word-alignment parse-relaxed-corpus
+	in: word-alignment scored-corpus
 	out: extracted-phrases
 	rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm
 	default-name: model/extract
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 59bd2788f..0c61a2a05 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -2007,6 +2007,7 @@ sub get_training_setting {
     my $target_syntax = &get("GENERAL:output-parser");
     my $score_settings = &get("TRAINING:score-settings");
     my $parallel = &get("TRAINING:parallel");
+    my $pcfg = &get("TRAINING:use-pcfg-feature");
 
     my $xml = $source_syntax || $target_syntax;
 
@@ -2029,6 +2030,7 @@ sub get_training_setting {
     $cmd .= "-glue-grammar " if $hierarchical;
     $cmd .= "-score-options '".$score_settings."' " if $score_settings;
     $cmd .= "-parallel " if $parallel;
+    $cmd .= "-pcfg " if $pcfg;
 
     # factored training
     if (&backoff_and_get("TRAINING:input-factors")) {
diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h
index 170ccf892..be6e30836 100644
--- a/scripts/training/phrase-extract/ExtractedRule.h
+++ b/scripts/training/phrase-extract/ExtractedRule.h
@@ -43,6 +43,7 @@ public:
   int startS;
   int endS;
   float count;
+  double pcfgScore;
 
   std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
   
@@ -58,6 +59,7 @@ public:
     , startS(sS)
     , endS(eS)
     , count(0)
+    , pcfgScore(0.0)
   {}
   
   void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile
index 5ed3f20f1..9be67e80a 100644
--- a/scripts/training/phrase-extract/Jamfile
+++ b/scripts/training/phrase-extract/Jamfile
@@ -33,3 +33,5 @@ alias programs : extract extract-rules extract-lex score consolidate consolidate
 install legacy : programs : <location>. <install-type>EXE ;
 
 build-project extract-ghkm ;
+build-project pcfg-extract ;
+build-project pcfg-score ;
diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp
index c0bfbde3e..ceb74f04c 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.cpp
+++ b/scripts/training/phrase-extract/PhraseAlignment.cpp
@@ -13,6 +13,8 @@
 #include "tables-core.h"
 #include "score.h"
 
+#include <cstdlib>
+
 using namespace std;
 
 extern Vocabulary vcbT;
@@ -111,6 +113,9 @@ void PhraseAlignment::create( char line[], int lineID )
     }
     else if (item == 5) { // non-term lengths
       addNTLength(token[j]);
+    } else if (item == 6) { // target syntax PCFG score
+      float pcfgScore = std::atof(token[j].c_str());
+      pcfgSum = pcfgScore * count;
     }
   }
 
@@ -119,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
   if (item == 3) {
     count = 1.0;
   }
-  if (item < 3 || item > 5) {
+  if (item < 3 || item > 6) {
     cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
   }
 }
diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h
index 8b8f5115c..8bd83503d 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.h
+++ b/scripts/training/phrase-extract/PhraseAlignment.h
@@ -25,6 +25,7 @@ protected:
   void createAlignVec(size_t sourceSize, size_t targetSize);
   void addNTLength(const std::string &tok);
 public:
+  float pcfgSum;
   float count;
   std::vector< std::set<size_t> > alignedToT;
   std::vector< std::set<size_t> > alignedToS;
diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h
index 70bb548c9..f9123de86 100644
--- a/scripts/training/phrase-extract/RuleExtractionOptions.h
+++ b/scripts/training/phrase-extract/RuleExtractionOptions.h
@@ -45,6 +45,7 @@ public:
   bool targetSyntax;
   bool duplicateRules;
   bool fractionalCounting;
+  bool pcfgScore;
   bool outputNTLengths;
   bool gzOutput;
   
@@ -74,6 +75,7 @@ public:
     , targetSyntax(false)
     , duplicateRules(true)
     , fractionalCounting(true)
+    , pcfgScore(false)
     , outputNTLengths(false)
     , gzOutput(false)
   {}
diff --git a/scripts/training/phrase-extract/SyntaxTree.cpp b/scripts/training/phrase-extract/SyntaxTree.cpp
index e181b1e8a..f2783ffd2 100644
--- a/scripts/training/phrase-extract/SyntaxTree.cpp
+++ b/scripts/training/phrase-extract/SyntaxTree.cpp
@@ -42,11 +42,12 @@ void SyntaxTree::Clear()
   m_index.clear();
 }
 
-void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
+SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )
 {
   SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
   m_nodes.push_back( newNode );
   m_index[ startPos ][ endPos ].push_back( newNode );
+  return newNode;
 }
 
 ParentNodes SyntaxTree::Parse()
diff --git a/scripts/training/phrase-extract/SyntaxTree.h b/scripts/training/phrase-extract/SyntaxTree.h
index 0ca5ca472..17c106b49 100644
--- a/scripts/training/phrase-extract/SyntaxTree.h
+++ b/scripts/training/phrase-extract/SyntaxTree.h
@@ -34,12 +34,14 @@ protected:
   std::string m_label;
   std::vector< SyntaxNode* > m_children;
   SyntaxNode* m_parent;
+  float m_pcfgScore;
 public:
   SyntaxNode( int startPos, int endPos, std::string label )
     :m_start(startPos)
     ,m_end(endPos)
     ,m_label(label)
     ,m_parent(0)
+    ,m_pcfgScore(0.0f)
   {}
   int GetStart() const {
     return m_start;
@@ -50,6 +52,12 @@ public:
   std::string GetLabel() const {
     return m_label;
   }
+  float GetPcfgScore() const {
+    return m_pcfgScore;
+  }
+  void SetPcfgScore(float score) {
+    m_pcfgScore = score;
+  }
   SyntaxNode *GetParent() {
     return m_parent;
   }
@@ -89,11 +97,12 @@ public:
   }
   ~SyntaxTree();
 
+  SyntaxNode *AddNode( int startPos, int endPos, std::string label );
+
   SyntaxNode *GetTop() {
     return m_top;
   }
 
-  void AddNode( int startPos, int endPos, std::string label );
   ParentNodes Parse();
   bool HasNode( int startPos, int endPos ) const;
   const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
diff --git a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp
index 716414f86..b22c159a1 100644
--- a/scripts/training/phrase-extract/XmlTree.cpp
+++ b/scripts/training/phrase-extract/XmlTree.cpp
@@ -25,7 +25,7 @@
 #include <string>
 #include <set>
 #include <iostream>
-#include <stdlib.h>
+#include <cstdlib>
 #include <sstream>
 #include "SyntaxTree.h"
 #include "XmlException.h"
@@ -345,13 +345,18 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
         string label = ParseXmlTagAttribute(tagContent,"label");
         labelCollection.insert( label );
 
+        string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
+        float pcfgScore = pcfgString == "" ? 0.0f
+                                           : std::atof(pcfgString.c_str());
+
         // report what we have processed so far
         if (0) {
           cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
           cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
           cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
         }
-        tree.AddNode( startPos, endPos-1, label );
+        SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
+        node->SetPcfgScore(pcfgScore);
       }
     }
   }
diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 0ecffae5c..6bd32a13b 100644
--- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -212,6 +212,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root)
 
   std::auto_ptr<Node> n(new Node(root->GetLabel(), nodeType));
 
+  if (nodeType == TREE) {
+    n->SetPcfgScore(root->GetPcfgScore());
+  }
+
   const std::vector<ParseTree *> &children = root->GetChildren();
   std::vector<Node *> childNodes;
   childNodes.reserve(children.size());
diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 008026e1a..397ce1e3c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -285,6 +285,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
         "set maximum allowed scope")
     ("Minimal",
         "extract minimal rules only")
+    ("PCFG",
+        "include score based on PCFG scores in target corpus")
     ("UnknownWordLabel",
         po::value(&options.unknownWordFile),
         "write unknown word labels to named file")
@@ -361,6 +363,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
   if (vm.count("Minimal")) {
     options.minimal = true;
   }
+  if (vm.count("PCFG")) {
+    options.pcfg = true;
+  }
   if (vm.count("UnpairedExtractFormat")) {
     options.unpairedExtractFormat = true;
   }
diff --git a/scripts/training/phrase-extract/extract-ghkm/Node.h b/scripts/training/phrase-extract/extract-ghkm/Node.h
index 228fdc812..775473362 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Node.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Node.h
@@ -41,8 +41,7 @@ class Node
   Node(const std::string &label, NodeType type)
       : m_label(label)
       , m_type(type)
-      , m_children()
-      , m_parents() {}
+      , m_pcfgScore(0.0f) {}
 
   ~Node();
 
@@ -50,12 +49,14 @@ class Node
   NodeType GetType() const { return m_type; }
   const std::vector<Node*> &GetChildren() const { return m_children; }
   const std::vector<Node*> &GetParents() const { return m_parents; }
+  float GetPcfgScore() const { return m_pcfgScore; }
   const Span &GetSpan() const { return m_span; }
   const Span &GetComplementSpan() const { return m_complementSpan; }
   const std::vector<const Subgraph*> &GetRules() const { return m_rules; }
 
   void SetChildren(const std::vector<Node*> &c) { m_children = c; }
   void SetParents(const std::vector<Node*> &p) { m_parents = p; }
+  void SetPcfgScore(float s) { m_pcfgScore = s; }
   void SetSpan(const Span &s) { m_span = s; }
   void SetComplementSpan(const Span &cs) { m_complementSpan = cs; }
 
@@ -92,6 +93,7 @@ class Node
   NodeType m_type;
   std::vector<Node*> m_children;
   std::vector<Node*> m_parents;
+  float m_pcfgScore;
   Span m_span;
   Span m_complementSpan;
   std::vector<const Subgraph*> m_rules;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h
index 108e19d66..c4b57f311 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Options.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Options.h
@@ -36,6 +36,7 @@ struct Options {
       , maxRuleSize(3)
       , maxScope(3)
       , minimal(false)
+      , pcfg(false)
       , unpairedExtractFormat(false) {}
 
   // Positional options
@@ -53,6 +54,7 @@ struct Options {
   int maxRuleSize;
   int maxScope;
   bool minimal;
+  bool pcfg;
   bool unpairedExtractFormat;
   std::string unknownWordFile;
 };
diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
index ec6fc147a..273e2e04e 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
@@ -32,17 +32,19 @@ class ParseTree
  public:
   ParseTree(const std::string &label)
       : m_label(label)
-      , m_children()
-      , m_parent() {}
+      , m_parent(0)
+      , m_pcfgScore(0.0) {}
 
   ~ParseTree();
 
   const std::string &GetLabel() const { return m_label; }
   const std::vector<ParseTree*> &GetChildren() const { return m_children; }
   const ParseTree *GetParent() const { return m_parent; }
+  float GetPcfgScore() const { return m_pcfgScore; }
 
   void SetParent(ParseTree *);
   void SetChildren(const std::vector<ParseTree*> &);
+  void SetPcfgScore(float score) { m_pcfgScore = score; }
 
   void AddChild(ParseTree *);
 
@@ -59,6 +61,7 @@ class ParseTree
   std::string m_label;
   std::vector<ParseTree*> m_children;
   ParseTree *m_parent;
+  float m_pcfgScore;  // log probability
 };
 
 template<typename OutputIterator>
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
index 8473e4283..5dc70052c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -30,6 +30,7 @@ namespace GHKM {
 ScfgRule::ScfgRule(const Subgraph &fragment)
     : m_sourceLHS("X", NonTerminal)
     , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
+    , m_pcfgScore(fragment.GetPcfgScore())
 {
   // Source RHS
 
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
index 1ed534d9e..2405d8fa3 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
@@ -57,6 +57,7 @@ class ScfgRule
   const std::vector<Symbol> &GetSourceRHS() const { return m_sourceRHS; }
   const std::vector<Symbol> &GetTargetRHS() const { return m_targetRHS; }
   const Alignment &GetAlignment() const { return m_alignment; }
+  float GetPcfgScore() const { return m_pcfgScore; }
 
   int Scope() const;
 
@@ -68,6 +69,7 @@ class ScfgRule
   std::vector<Symbol> m_sourceRHS;
   std::vector<Symbol> m_targetRHS;
   Alignment m_alignment;
+  float m_pcfgScore;
 };
 
 }  // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index 4be3f048d..d5d16b790 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -24,6 +24,7 @@
 #include "ScfgRule.h"
 
 #include <cassert>
+#include <cmath>
 #include <ostream>
 #include <map>
 #include <sstream>
@@ -34,14 +35,43 @@ namespace GHKM {
 
 void ScfgRuleWriter::Write(const ScfgRule &rule)
 {
+  std::ostringstream sourceSS;
+  std::ostringstream targetSS;
+
   if (m_options.unpairedExtractFormat) {
-    WriteUnpairedFormat(rule);
+    WriteUnpairedFormat(rule, sourceSS, targetSS);
   } else {
-    WriteStandardFormat(rule);
+    WriteStandardFormat(rule, sourceSS, targetSS);
+  }
+
+  // Write the rule to the forward and inverse extract files.
+  m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+  m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+
+  const Alignment &alignment = rule.GetAlignment();
+  for (Alignment::const_iterator p = alignment.begin();
+       p != alignment.end(); ++p) {
+    m_fwd << " " << p->first << "-" << p->second;
+    m_inv << " " << p->second << "-" << p->first;
+  }
+
+  // Write a count of 1 and an empty NT length column to the forward extract
+  // file.
+  // TODO Add option to write NT length?
+  m_fwd << " ||| 1 ||| |||";
+  if (m_options.pcfg) {
+    // Write the PCFG score.
+    m_fwd << " " << std::exp(rule.GetPcfgScore());
   }
+  m_fwd << std::endl;
+
+  // Write a count of 1 to the inverse extract file.
+  m_inv << " ||| 1" << std::endl;
 }
 
-void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
+                                         std::ostream &sourceSS,
+                                         std::ostream &targetSS)
 {
   const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
   const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
@@ -60,9 +90,6 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
     }
   }
 
-  std::ostringstream sourceSS;
-  std::ostringstream targetSS;
-
   // Write the source side of the rule to sourceSS.
   int i = 0;
   for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
@@ -88,27 +115,14 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
     targetSS << " ";
   }
   WriteSymbol(rule.GetTargetLHS(), targetSS);
-
-  // Write the rule to the forward and inverse extract files.
-  m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
-  m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
-  for (Alignment::const_iterator p(alignment.begin());
-       p != alignment.end(); ++p) {
-    m_fwd << " " << p->first << "-" << p->second;
-    m_inv << " " << p->second << "-" << p->first;
-  }
-  m_fwd << " ||| 1" << std::endl;
-  m_inv << " ||| 1" << std::endl;
 }
 
-void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
+                                         std::ostream &sourceSS,
+                                         std::ostream &targetSS)
 {
   const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
   const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
-  const Alignment &alignment = rule.GetAlignment();
-
-  std::ostringstream sourceSS;
-  std::ostringstream targetSS;
 
   // Write the source side of the rule to sourceSS.
   int i = 0;
@@ -127,17 +141,6 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
     targetSS << " ";
   }
   WriteSymbol(rule.GetTargetLHS(), targetSS);
-
-  // Write the rule to the forward and inverse extract files.
-  m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
-  m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
-  for (Alignment::const_iterator p(alignment.begin());
-       p != alignment.end(); ++p) {
-    m_fwd << " " << p->first << "-" << p->second;
-    m_inv << " " << p->second << "-" << p->first;
-  }
-  m_fwd << " ||| 1" << std::endl;
-  m_inv << " ||| 1" << std::endl;
 }
 
 void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index 738d09ce9..b92a432a1 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -45,8 +45,8 @@ class ScfgRuleWriter
   ScfgRuleWriter(const ScfgRuleWriter &);
   ScfgRuleWriter &operator=(const ScfgRuleWriter &);
 
-  void WriteStandardFormat(const ScfgRule &);
-  void WriteUnpairedFormat(const ScfgRule &);
+  void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &);
+  void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &);
   void WriteSymbol(const Symbol &, std::ostream &);
 
   std::ostream &m_fwd;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
index e5aedbb16..e048f2c55 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
@@ -101,5 +101,21 @@ int Subgraph::CalcDepth(const Node *n) const
   return maxChildDepth + 1;
 }
 
+float Subgraph::CalcPcfgScore() const
+{
+  if (m_root->GetType() != TREE || m_leaves.empty()) {
+    return 0.0f;
+  }
+  float score = m_root->GetPcfgScore();
+  for (std::set<const Node *>::const_iterator p = m_leaves.begin();
+       p != m_leaves.end(); ++p) {
+    const Node *leaf = *p;
+    if (leaf->GetType() == TREE) {
+      score -= leaf->GetPcfgScore();
+    }
+  }
+  return score;
+}
+
 }  // namespace Moses
 }  // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
index e84903502..ede1233e9 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
@@ -38,7 +38,8 @@ class Subgraph
       : m_root(root)
       , m_depth(0)
       , m_size(root->GetType() == TREE ? 1 : 0)
-      , m_nodeCount(1) {}
+      , m_nodeCount(1)
+      , m_pcfgScore(0.0f) {}
 
   Subgraph(const Node *root, const std::set<const Node *> &leaves)
       : m_root(root)
@@ -46,10 +47,12 @@ class Subgraph
       , m_depth(-1)
       , m_size(-1)
       , m_nodeCount(-1)
+      , m_pcfgScore(0.0f)
   {
     m_depth = CalcDepth(m_root);
     m_size = CalcSize(m_root);
     m_nodeCount = CountNodes(m_root);
+    m_pcfgScore = CalcPcfgScore();
   }
 
   const Node *GetRoot() const { return m_root; }
@@ -57,6 +60,7 @@ class Subgraph
   int GetDepth() const { return m_depth; }
   int GetSize() const { return m_size; }
   int GetNodeCount() const { return m_nodeCount; }
+  float GetPcfgScore() const { return m_pcfgScore; }
 
   bool IsTrivial() const { return m_leaves.empty(); }
 
@@ -66,6 +70,7 @@ class Subgraph
   void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
   int CalcDepth(const Node *) const;
   int CalcSize(const Node *) const;
+  float CalcPcfgScore() const;
   int CountNodes(const Node *) const;
 
   const Node *m_root;
@@ -73,6 +78,7 @@ class Subgraph
   int m_depth;
   int m_size;
   int m_nodeCount;
+  float m_pcfgScore;
 };
 
 }  // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
index 31c0e3843..cc961dc0c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@@ -61,6 +61,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree(
     const std::vector<std::string> &words)
 {
   std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel()));
+  root->SetPcfgScore(tree.GetPcfgScore());
   const std::vector<SyntaxNode*> &children = tree.GetChildren();
   if (children.empty()) {
     if (tree.GetStart() != tree.GetEnd()) {
diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp
index 2cc9dc54d..a00667b82 100644
--- a/scripts/training/phrase-extract/extract-rules.cpp
+++ b/scripts/training/phrase-extract/extract-rules.cpp
@@ -90,7 +90,7 @@ void addHieroRule( int startT, int endT, int startS, int endS
 void printHieroPhrase( int startT, int endT, int startS, int endS
                        , HoleCollection &holeColl, LabelIndex &labelIndex);
 string printTargetHieroPhrase(  int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex);
+                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
 string printSourceHieroPhrase( int startT, int endT, int startS, int endS
                                , HoleCollection &holeColl, const LabelIndex &labelIndex);
 void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
@@ -257,6 +257,8 @@ int main(int argc, char* argv[])
     // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
     else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
       options.fractionalCounting = false;
+    } else if (strcmp(argv[i],"--PCFG") == 0) {
+      options.pcfgScore = true;
     } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
       options.outputNTLengths = true;
 #ifdef WITH_THREADS
@@ -517,7 +519,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
 }
 
 string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex)
+                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
 {
   HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
   assert(iterHoleList != holeColl.GetHoles().end());
@@ -545,6 +547,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
 
       out += "[" + sourceLabel + "][" + targetLabel + "] ";
 
+      if (m_options.pcfgScore) {
+        double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+        logPCFGScore -= score;
+      }
+
       currPos = hole.GetEnd(1);
       hole.SetPos(outPos, 1);
       ++iterHoleList;
@@ -658,8 +665,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
   preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
 
   // target
-  rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex)
+  if (m_options.pcfgScore) {
+    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+                + " [" + targetLabel + "]";
+    rule.pcfgScore = std::exp(logPCFGScore);
+  } else {
+    double logPCFGScore = 0.0f;
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
                 + " [" + targetLabel + "]";
+  }
 
   // source
   // holeColl.SortSourceHoles();
@@ -877,6 +892,11 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
     rule.target += m_sentence->target[ti] + " ";
   rule.target += "[" + targetLabel + "]";
 
+  if (m_options.pcfgScore) {
+    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+    rule.pcfgScore = std::exp(logPCFGScore);
+  }
+
   // alignment
   for(int ti=startT; ti<=endT; ti++) {
     for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
@@ -957,11 +977,13 @@ void ExtractTask::writeRulesToFile()
     out << rule->source << " ||| "
         << rule->target << " ||| "
         << rule->alignment << " ||| "
-        << rule->count;
+        << rule->count << " ||| ";
     if (m_options.outputNTLengths) {
-      out << " ||| ";
       rule->OutputNTLengths(out); 
     }
+    if (m_options.pcfgScore) {
+      out << " ||| " << rule->pcfgScore;
+    }
     out << "\n";
 
     if (!m_options.onlyDirectFlag) {
diff --git a/scripts/training/phrase-extract/pcfg-common/Jamfile b/scripts/training/phrase-extract/pcfg-common/Jamfile
new file mode 100644
index 000000000..3dc272a56
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/Jamfile
@@ -0,0 +1 @@
+lib pcfg_common : [ glob *.cc ] ..//trees ;
diff --git a/scripts/training/phrase-extract/pcfg-common/exception.h b/scripts/training/phrase-extract/pcfg-common/exception.h
new file mode 100644
index 000000000..3dbd59d0e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/exception.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXCEPTION_H_
+#define PCFG_EXCEPTION_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+class Exception {
+ public:
+  Exception(const char *msg) : msg_(msg) {}
+  Exception(const std::string &msg) : msg_(msg) {}
+  const std::string &msg() const { return msg_; }
+ private:
+  std::string msg_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
new file mode 100644
index 000000000..f88d710ed
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
@@ -0,0 +1,109 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_NUMBERED_SET_H_
+#define PCFG_NUMBERED_SET_H_
+
+#include "exception.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <limits>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores a set of elements of type T, each of which is allocated an integral
+// ID of type I.  IDs are contiguous starting at 0.  Individual elements cannot
+// be removed once inserted (but the whole set can be cleared).
+template<typename T, typename I=size_t>
+class NumberedSet {
+ private:
+  typedef boost::unordered_map<T, I> ElementToIdMap;  // element -> ID
+  typedef std::vector<const T *> IdToElementMap;  // ID -> pointer to map key
+
+ public:
+  typedef I IdType;
+  typedef typename IdToElementMap::const_iterator const_iterator;
+
+  NumberedSet() {}
+
+  const_iterator begin() const { return id_to_element_.begin(); }  // ID order;
+  const_iterator end() const { return id_to_element_.end(); }  // yields const T *
+
+  // Sentinel ID that Lookup(const T &) returns for an element not in the set.
+  static I NullId() { return std::numeric_limits<I>::max(); }
+
+  bool Empty() const { return id_to_element_.empty(); }
+  size_t Size() const { return id_to_element_.size(); }
+
+  // Inserts the given object if it is not already present and returns its ID.
+  I Insert(const T &);
+
+  I Lookup(const T &) const;  // element -> ID, or NullId() if not present
+  const T &Lookup(I) const;  // ID -> element; throws Exception for a bad ID
+
+  void Clear();
+
+ private:  // NOTE(review): the vector holds pointers to the map's keys and
+  ElementToIdMap element_to_id_;  // copying is not disabled, so a copy would
+  IdToElementMap id_to_element_;  // still point into the original set.
+};
+
+template<typename T, typename I>
+I NumberedSet<T, I>::Lookup(const T &s) const {  // ID of s, or NullId()
+  typename ElementToIdMap::const_iterator pos = element_to_id_.find(s);
+  return (pos != element_to_id_.end()) ? pos->second : NullId();
+}
+
+template<typename T, typename I>
+const T &NumberedSet<T, I>::Lookup(I id) const {
+  if (id >= 0 && id < id_to_element_.size()) {  // valid IDs: 0..Size()-1
+    return *(id_to_element_[id]);
+  }
+  std::ostringstream msg;  // any other ID is reported via Exception
+  msg << "Value not found: " << id;
+  throw Exception(msg.str());
+}
+
+template<typename T, typename I>
+I NumberedSet<T, I>::Insert(const T &x) {
+  typedef typename ElementToIdMap::iterator MapIter;
+  // Offer x the next free ID; insert() keeps the old entry if x is present.
+  const std::pair<MapIter, bool> outcome = element_to_id_.insert(
+      std::pair<T, I>(x, id_to_element_.size()));
+  if (outcome.second) {
+    id_to_element_.push_back(&outcome.first->first);  // new: record its key
+  }
+  return outcome.first->second;
+}
+
+template<typename T, typename I>
+void NumberedSet<T, I>::Clear() {
+  id_to_element_.clear();  // drop the pointers before the keys they point at
+  element_to_id_.clear();
+}
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
new file mode 100644
index 000000000..d045b820b
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
@@ -0,0 +1,106 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg.h"
+
+#include "exception.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+void Pcfg::Add(const Key &key, double score) {
+  rules_[key] = score;  // inserts the rule, or overwrites an existing score
+}
+
+bool Pcfg::Lookup(const Key &key, double &score) const {
+  const Map::const_iterator hit = rules_.find(key);
+  if (hit != rules_.end()) {
+    score = hit->second;
+    return true;
+  }
+  return false;  // unknown rule: score is left unmodified
+}
+
+void Pcfg::Read(std::istream &input, Vocabulary &vocab) {
+  // Parses one rule per line, "LHS ||| RHS_1 ... RHS_n ||| score", interning
+  // each symbol in vocab.  Throws Exception if a line is malformed.
+  std::string line;
+  std::string lhs_string;
+  std::vector<std::string> rhs_strings;
+  Key key;
+  while (std::getline(input, line)) {
+    // Read LHS.
+    size_t pos = line.find("|||");
+    if (pos == std::string::npos) {
+      throw Exception("missing first delimiter");
+    }
+    lhs_string = line.substr(0, pos);
+    boost::trim(lhs_string);
+    // Read RHS.
+    size_t begin = pos+3;
+    pos = line.find("|||", begin);
+    if (pos == std::string::npos) {
+      throw Exception("missing second delimiter");
+    }
+    std::string rhs_text = line.substr(begin, pos-begin);
+    boost::trim(rhs_text);
+    rhs_strings.clear();
+    boost::split(rhs_strings, rhs_text, boost::algorithm::is_space(),
+                 boost::algorithm::token_compress_on);
+    // Read score.
+    std::string score_string = line.substr(pos+3);
+    boost::trim(score_string);
+    // Construct key.
+    key.clear();
+    key.reserve(rhs_strings.size()+1);
+    key.push_back(vocab.Insert(lhs_string));
+    for (size_t i = 0; i < rhs_strings.size(); ++i) {
+      key.push_back(vocab.Insert(rhs_strings[i]));
+    }
+    // Add rule.  lexical_cast signals a bad score with bad_lexical_cast, so
+    // translate that into Exception like the delimiter errors above.
+    try {
+      Add(key, boost::lexical_cast<double>(score_string));
+    } catch (const boost::bad_lexical_cast &) {
+      throw Exception("invalid score: " + score_string);
+    }
+  }
+}
+
+void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const {
+  // Writes one rule per line: "LHS ||| RHS ||| score" (key[0] is the LHS).
+  for (const_iterator p = begin(); p != end(); ++p) {
+    const Key &key = p->first;
+    // Add() accepts any key, so reject an empty one before reading key[0].
+    if (key.empty()) throw Exception("cannot write rule with empty key");
+    output << vocab.Lookup(key[0]) << " |||";
+    for (size_t i = 1; i < key.size(); ++i) {
+      output << " " << vocab.Lookup(key[i]);
+    }
+    output << " ||| " << p->second << '\n';  // no per-rule flush (was endl)
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h
new file mode 100644
index 000000000..757eea449
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h
@@ -0,0 +1,61 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_H_
+#define PCFG_PCFG_H_
+
+#include "typedef.h"
+
+#include <istream>
+#include <map>
+#include <ostream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+class Pcfg {  // Maps rules, keyed by their symbol IDs, to scores.
+ public:
+  typedef std::vector<size_t> Key;  // symbol IDs: LHS first, then the RHS
+  typedef std::map<Key, double> Map;
+  typedef Map::iterator iterator;
+  typedef Map::const_iterator const_iterator;
+
+  Pcfg() {}
+
+  iterator begin() { return rules_.begin(); }
+  const_iterator begin() const { return rules_.begin(); }
+
+  iterator end() { return rules_.end(); }
+  const_iterator end() const { return rules_.end(); }
+
+  void Add(const Key &, double);  // stores (or overwrites) the score for a rule
+  bool Lookup(const Key &, double &) const;  // false if the rule is unknown
+  void Read(std::istream &, Vocabulary &);  // parses "LHS ||| RHS ||| score"
+  void Write(const Vocabulary &, std::ostream &) const;  // one rule per line
+
+ private:
+  Map rules_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
new file mode 100644
index 000000000..bdac64dfc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
@@ -0,0 +1,77 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_TREE_H_
+#define PCFG_PCFG_TREE_H_
+
+#include "syntax_tree.h"
+#include "xml_tree_writer.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+template<typename DerivedType>
+class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
+ public:  // A string-labelled syntax tree node that also carries a score.
+  typedef std::string LabelType;
+  typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
+
+  PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {}
+
+  double score() const { return score_; }
+  void set_score(double s) { score_ = s; }
+
+ private:
+  double score_;  // 0.0 until set_score() is called
+};
+
+class PcfgTree : public PcfgTreeBase<PcfgTree> {  // concrete scored tree node
+ public:
+  typedef PcfgTreeBase<PcfgTree> BaseType;
+  PcfgTree(const BaseType::LabelType &label) : BaseType(label) {}
+};
+
+// XmlOutputHandler specialisation describing how a PcfgTree node is written.
+template<>
+class XmlOutputHandler<PcfgTree> {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // The XML label is the node's own label.
+  void GetLabel(const PcfgTree &tree, std::string &label) const {
+    label = tree.label();
+  }
+  // Sets attribute_map to {"pcfg": score}; leaves it empty if score is 0.0.
+  void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
+    attribute_map.clear();
+    const double score = tree.score();
+    if (score == 0.0) return;
+    std::ostringstream out;
+    out << score;
+    attribute_map["pcfg"] = out.str();
+  }
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
new file mode 100644
index 000000000..37f72dd58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SYNTAX_TREE_H_
+#define PCFG_SYNTAX_TREE_H_
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Base class for SyntaxTree, AgreementTree, and friends (T is the label type).
+template<typename T, typename DerivedType>
+class SyntaxTreeBase {
+ public:
+  // Constructors.  Neither one sets the parent pointer of the given children.
+  SyntaxTreeBase(const T &label)
+      : label_(label)
+      , children_()
+      , parent_(0) {}
+
+  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
+      : label_(label)
+      , children_(children)
+      , parent_(0) {}
+
+  // Destructor.  Deletes every child, so a node owns the subtree below it.
+  virtual ~SyntaxTreeBase();
+
+  const T &label() const { return label_; }
+  const DerivedType *parent() const { return parent_; }
+  DerivedType *parent() { return parent_; }
+  const std::vector<DerivedType *> &children() const { return children_; }
+  std::vector<DerivedType *> &children() { return children_; }
+
+  void set_label(const T &label) { label_ = label; }
+  void set_parent(DerivedType *parent) { parent_ = parent; }
+  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }  // old children are not deleted
+
+  bool IsLeaf() const { return children_.empty(); }
+
+  bool IsPreterminal() const {  // exactly one child, and that child is a leaf
+    return children_.size() == 1 && children_[0]->IsLeaf();
+  }
+
+  void AddChild(DerivedType *child) { children_.push_back(child); }  // does not call set_parent()
+
+ private:  // NOTE(review): copying is not disabled; a copy would share children
+  T label_;
+  std::vector<DerivedType *> children_;  // deleted by the destructor
+  DerivedType *parent_;  // not deleted here; 0 until set_parent() is called
+};
+
+template<typename T>
+class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
+ public:  // Plain tree with labels of type T and no data beyond the base class.
+  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
+  SyntaxTree(const T &label) : BaseType(label) {}
+  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
+      : BaseType(label, children) {}
+};
+
+template<typename T, typename DerivedType>
+SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
+  // Children are owned by their parent, so destroy them front to back.
+  size_t i = 0;
+  while (i < children_.size()) delete children_[i++];
+}
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.cc b/scripts/training/phrase-extract/pcfg-common/tool.cc
new file mode 100644
index 000000000..bebd220e1
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.cc
@@ -0,0 +1,80 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "tool.h"
+
+#include <sstream>
+
+namespace Moses {
+namespace PCFG {
+
+std::istream &Tool::OpenInputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  // An empty name or "-" selects standard input.
+  const bool use_stdin = filename.empty() || filename == "-";
+  if (use_stdin) {
+    input_ptr_ = &(std::cin);
+    return *input_ptr_;
+  }
+  // Otherwise open the named file on the member stream, so the returned
+  // reference outlives this call.  OpenNamedInputOrDie() calls Error() with
+  // the "failed to open input file: <name>" message if the open fails.
+  OpenNamedInputOrDie(filename, input_file_stream_);
+  input_ptr_ = &input_file_stream_;
+  return *input_ptr_;
+}
+
+std::ostream &Tool::OpenOutputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  // An empty name or "-" selects standard output.
+  const bool use_stdout = filename.empty() || filename == "-";
+  if (use_stdout) {
+    output_ptr_ = &(std::cout);
+    return *output_ptr_;
+  }
+  // Otherwise open the named file on the member stream, so the returned
+  // reference outlives this call.  OpenNamedOutputOrDie() calls Error() with
+  // the "failed to open output file: <name>" message if the open fails.
+  OpenNamedOutputOrDie(filename, output_file_stream_);
+  output_ptr_ = &output_file_stream_;
+  return *output_ptr_;
+}
+
+void Tool::OpenNamedInputOrDie(const std::string &filename,
+                               std::ifstream &stream) {
+  stream.open(filename.c_str());
+  if (stream) {
+    return;  // opened successfully
+  }
+  // Error() prints the message and terminates the program.
+  Error("failed to open input file: " + filename);
+}
+
+void Tool::OpenNamedOutputOrDie(const std::string &filename,
+                                std::ofstream &stream) {
+  stream.open(filename.c_str());
+  if (stream) {
+    return;  // opened successfully
+  }
+  // Error() prints the message and terminates the program.
+  Error("failed to open output file: " + filename);
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.h b/scripts/training/phrase-extract/pcfg-common/tool.h
new file mode 100644
index 000000000..0af342569
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TOOL_H_
+#define PCFG_TOOL_H_
+
+#include <boost/program_options/cmdline.hpp>
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+class Tool {  // Base class for the tools: name, diagnostics and stream setup.
+ public:
+  virtual ~Tool() {}
+
+  const std::string &name() const { return name_; }
+
+  virtual int Main(int argc, char *argv[]) = 0;  // implemented by each tool
+
+ protected:  // The stream pointers start out null instead of indeterminate.
+  Tool(const std::string &name) : name_(name), input_ptr_(0), output_ptr_(0) {}
+
+  // Returns the boost::program_options style that should be used by all tools.
+  static int CommonOptionStyle() {
+    namespace cls = boost::program_options::command_line_style;
+    return cls::default_style & (~cls::allow_guessing);
+  }
+
+  void Warn(const std::string &msg) const {  // prints to stderr and returns
+    std::cerr << name_ << ": warning: " << msg << std::endl;
+  }
+
+  void Error(const std::string &msg) const {  // prints to stderr, then exits
+    std::cerr << name_ << ": error: " << msg << std::endl;
+    std::exit(1);
+  }
+
+  // Initialises the tool's main input stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then input is standard input; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for reading.
+  std::istream &OpenInputOrDie(const std::string &filename);
+
+  // Initialises the tool's main output stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then output is standard output; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for writing.
+  std::ostream &OpenOutputOrDie(const std::string &filename);
+
+  // Opens the named input file using the supplied ifstream.  Calls Error() if
+  // the file cannot be opened for reading.
+  void OpenNamedInputOrDie(const std::string &, std::ifstream &);
+
+  // Opens the named output file using the supplied ofstream.  Calls Error() if
+  // the file cannot be opened for writing.
+  void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
+
+ private:
+  std::string name_;
+  std::istream *input_ptr_;  // 0 until OpenInputOrDie() has been called
+  std::ifstream input_file_stream_;  // used when the input is a named file
+  std::ostream *output_ptr_;  // 0 until OpenOutputOrDie() has been called
+  std::ofstream output_file_stream_;  // used when the output is a named file
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/typedef.h b/scripts/training/phrase-extract/pcfg-common/typedef.h
new file mode 100644
index 000000000..49a12d681
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/typedef.h
@@ -0,0 +1,37 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TYPEDEF_H_
+#define PCFG_TYPEDEF_H_
+
+#include "numbered_set.h"
+#include "syntax_tree.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+typedef NumberedSet<std::string> Vocabulary;  // symbol string <-> size_t ID
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
new file mode 100644
index 000000000..5c596a0fb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -0,0 +1,85 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "xml_tree_parser.h"
+
+#include "exception.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+XmlTreeParser::XmlTreeParser()
+{  // nothing to do: every member is default-constructed
+}
+
+std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line)
+{  // Builds the PcfgTree for one line of XML; throws Exception on failure.
+  m_line = line;  // working copy handed to ProcessAndStripXMLTags
+  m_tree.Clear();
+  try {
+    if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
+      throw Exception("failed to process XML tags");
+    }
+  } catch (const XmlException &e) {
+    throw Exception(e.getMsg());  // rethrow XML errors as PCFG exceptions
+  }
+  m_tree.ConnectNodes();
+  SyntaxNode *root = m_tree.GetTop();
+  if (!root) throw Exception("no root node in parse tree");  // also in NDEBUG
+  m_words = tokenize(m_line.c_str());
+  return ConvertTree(*root, m_words);
+}
+
+// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree (recursively).
+std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
+    const SyntaxNode &tree,
+    const std::vector<std::string> &words)
+{
+  typedef std::vector<SyntaxNode*>::const_iterator ChildIter;
+  std::auto_ptr<PcfgTree> node(new PcfgTree(tree.GetLabel()));
+  const std::vector<SyntaxNode*> &kids = tree.GetChildren();
+  for (ChildIter it = kids.begin(); it != kids.end(); ++it) {
+    assert(*it);
+    std::auto_ptr<PcfgTree> sub = ConvertTree(**it, words);
+    sub->set_parent(node.get());
+    node->AddChild(sub.release());
+  }
+  if (!kids.empty()) return node;
+  // A childless SyntaxNode must cover exactly one word: that word becomes a
+  // leaf PcfgTree below node, so the result is a preterminal.
+  if (tree.GetStart() != tree.GetEnd()) {
+    std::ostringstream msg;
+    msg << "leaf node covers multiple words (" << tree.GetStart()
+        << "-" << tree.GetEnd() << "): this is currently unsupported";
+    throw Exception(msg.str());
+  }
+  std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[tree.GetStart()]));
+  leaf->set_parent(node.get());
+  node->AddChild(leaf.release());
+  return node;
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
new file mode 100644
index 000000000..6b418c44e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
@@ -0,0 +1,56 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_PARSER_H_
+#define PCFG_XML_TREE_PARSER_H_
+
+#include "pcfg_tree.h"
+#include "SyntaxTree.h"
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Parses a string in Moses' XML parse tree format and returns a PcfgTree
+// object.  Parse() throws Exception on failure; the caller owns the result.
+class XmlTreeParser {
+ public:
+  XmlTreeParser();
+  std::auto_ptr<PcfgTree> Parse(const std::string &);
+ private:
+  std::auto_ptr<PcfgTree> ConvertTree(const SyntaxNode &,
+                                      const std::vector<std::string> &);
+
+  std::set<std::string> m_labelSet;  // passed to ProcessAndStripXMLTags;
+  std::map<std::string, int> m_topLabelSet;  // neither is cleared by Parse()
+  std::string m_line;  // copy of the line currently being parsed
+  ::SyntaxTree m_tree;  // the global-namespace SyntaxTree, not PCFG::SyntaxTree
+  std::vector<std::string> m_words;  // tokens of m_line, from tokenize()
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
new file mode 100644
index 000000000..347c352bb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
@@ -0,0 +1,127 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_WRITER_H_
+#define PCFG_XML_TREE_WRITER_H_
+
+#include "syntax_tree.h"
+
+#include "XmlTree.h"
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <vector>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+template<typename InputTree>
+class XmlOutputHandler {  // members are only declared here, not defined
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;  // name -> value
+
+  void GetLabel(const InputTree &, std::string &) const;
+  void GetAttributes(const InputTree &, AttributeMap &) const;
+};
+
+template<typename InputTree>
+class XmlTreeWriter : public XmlOutputHandler<InputTree> {
+ public:
+  typedef XmlOutputHandler<InputTree> Base;  // supplies labels and attributes
+  void Write(const InputTree &, std::ostream &) const;  // tree must not be a leaf
+ private:
+  std::string Escape(const std::string &) const;  // XML-escapes < > & ' "
+};
+
+template<typename InputTree>
+void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
+                                     std::ostream &out) const {
+  // Writes tree as nested <tree label="..."> elements; only the root call
+  // (parent() == 0) appends a newline.
+  assert(!tree.IsLeaf());
+
+  // Opening tag
+  std::string label;
+  Base::GetLabel(tree, label);
+  out << "<tree label=\"" << Escape(label) << "\"";
+
+  typename Base::AttributeMap attribute_map;
+  Base::GetAttributes(tree, attribute_map);
+
+  // Attribute values are escaped exactly like the label, so that a value
+  // containing a quote or angle bracket cannot break the tag.
+  for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
+       p != attribute_map.end(); ++p) {
+    out << " " << p->first << "=\"" << Escape(p->second) << "\"";
+  }
+  out << ">";
+
+  // Children
+  const std::vector<InputTree *> &children = tree.children();
+  for (typename std::vector<InputTree *>::const_iterator p = children.begin();
+       p != children.end(); ++p) {
+    InputTree &child = **p;
+    if (child.IsLeaf()) {
+      Base::GetLabel(child, label);
+      out << " " << Escape(label);
+    } else {
+      out << " ";
+      Write(child, out);
+    }
+  }
+
+  // Closing tag
+  out << " </tree>";
+  if (tree.parent() == 0) {
+    out << std::endl;
+  }
+}
+
+// Escapes the five XML special characters (< > & ' ") as named entities.
+template<typename InputTree>
+std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
+  std::string escaped;
+  escaped.reserve(s.size());  // lower bound: escaping never shortens s
+  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
+    const char *entity = 0;  // stays 0 for characters that need no escaping
+    switch (*c) {
+      case '<':  entity = "&lt;";   break;
+      case '>':  entity = "&gt;";   break;
+      case '&':  entity = "&amp;";  break;
+      case '\'': entity = "&apos;"; break;
+      case '"':  entity = "&quot;"; break;
+      default:   break;
+    }
+    if (entity) {
+      escaped += entity;
+    } else {
+      escaped += *c;
+    }
+  }
+  return escaped;
+}
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/Jamfile b/scripts/training/phrase-extract/pcfg-extract/Jamfile
new file mode 100644
index 000000000..be91d6d2f
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/Jamfile
@@ -0,0 +1 @@
+# Builds the pcfg-extract executable from every .cc file in this directory,
+# together with the pcfg-common target of the parent directory and the
+# boost_program_options target declared four directories up.
+exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-extract/main.cc b/scripts/training/phrase-extract/pcfg-extract/main.cc
new file mode 100644
index 000000000..47b45afc3
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+// Entry point of pcfg-extract: forwards the command line to the tool object
+// and uses the value returned by its Main() as the exit status.
+int main(int argc, char *argv[]) {
+  return Moses::PCFG::PcfgExtract().Main(argc, argv);
+}
diff --git a/scripts/training/phrase-extract/pcfg-extract/options.h b/scripts/training/phrase-extract/pcfg-extract/options.h
new file mode 100644
index 000000000..3acb31b58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_OPTIONS_H_
+#define PCFG_EXTRACT_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Command-line options of the pcfg-extract tool.
+struct Options {
+  // NOTE(review): PcfgExtract::ProcessOptions() does not assign this field
+  // and PcfgExtract::Main() does not read it -- presumably reserved for a
+  // corpus path; confirm before relying on it.
+  std::string corpus_file;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
new file mode 100644
index 000000000..151c9959c
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -0,0 +1,131 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+#include "options.h"
+#include "rule_collection.h"
+#include "rule_extractor.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: parses each line of standard input with XmlTreeParser,
+// collects rule counts from every line that yields a tree, then converts the
+// counts into a Pcfg and writes it to standard output.  Returns 0.
+int PcfgExtract::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // The vocabulary is filled by the rule extractor and is needed again when
+  // the PCFG is written out.
+  Vocabulary non_term_vocab;
+  RuleExtractor rule_extractor(non_term_vocab);
+  RuleCollection rule_collection;
+  XmlTreeParser parser;
+  std::string line;
+  // Lives outside the loop; it is only replaced when Parse() returns
+  // normally.
+  std::auto_ptr<PcfgTree> tree;
+
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      // Report the parse failure together with the offending line number.
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+    if (tree.get() == 0) {
+      // Nothing to extract from this line; warn and move on.
+      std::ostringstream msg;
+      msg << "no tree at line " << line_num;
+      Warn(msg.str());
+    } else {
+      rule_extractor.Extract(*tree, rule_collection);
+    }
+  }
+
+  // Turn the collected counts into scored rules and emit them.
+  Pcfg pcfg;
+  rule_collection.CreatePcfg(pcfg);
+  pcfg.Write(non_term_vocab, std::cout);
+
+  return 0;
+}
+
+// Parses the command line.  "help" is the only option declared; when it is
+// given, the usage text is printed to standard output and the process exits
+// with status 0.  A parsing failure is reported through Error() together
+// with the usage text.  |options| is not modified.
+void PcfgExtract::ProcessOptions(int argc, char *argv[],
+                                 Options &options) const {
+  namespace po = boost::program_options;
+
+  // Heading of the usage text.
+  std::ostringstream heading;
+  heading << "Usage: " << name() << "\n\n" << "Options";
+
+  // Options that appear in the usage text.
+  po::options_description visible(heading.str());
+  visible.add_options()("help", "print help message and exit");
+
+  // Options kept out of the usage text (none are declared at present).
+  po::options_description hidden("Hidden options");
+  hidden.add_options();
+
+  // Everything the parser accepts: visible plus hidden options.
+  po::options_description all_options;
+  all_options.add(visible).add(hidden);
+
+  // No positional arguments are registered.
+  po::positional_options_description positional;
+
+  po::variables_map vm;
+  try {
+    po::command_line_parser cl_parser(argc, argv);
+    cl_parser.style(CommonOptionStyle());
+    cl_parser.options(all_options);
+    cl_parser.positional(positional);
+    po::store(cl_parser.run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") != 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
new file mode 100644
index 000000000..1af6cb4fe
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_
+#define PCFG_EXTRACT_PCFG_EXTRACT_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Options is defined as a struct in options.h; the forward declaration uses
+// the same class-key so that compilers do not report mismatched tags.
+struct Options;
+
+// The pcfg-extract command-line tool.
+class PcfgExtract : public Tool {
+ public:
+  PcfgExtract() : Tool("pcfg-extract") {}
+
+  // Runs the tool with the given command-line arguments.  The return value
+  // is used by main() as the process exit status.
+  virtual int Main(int, char *[]);
+
+ private:
+  // Parses the command-line arguments (argc, argv) on behalf of Main().
+  void ProcessOptions(int, char *[], Options &) const;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
new file mode 100644
index 000000000..503b1a9e6
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
@@ -0,0 +1,58 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "rule_collection.h"
+
+#include "pcfg-common/pcfg.h"
+
+#include <cmath>
+
+namespace Moses {
+namespace PCFG {
+
+// Records one more occurrence of the rule lhs -> rhs.  An entry that does
+// not exist yet is created by the map lookups before being incremented.
+void RuleCollection::Add(size_t lhs, const std::vector<size_t> &rhs) {
+  RhsCountMap &rhs_counts = collection_[lhs];
+  ++rhs_counts[rhs];
+}
+
+// Adds every collected rule to |pcfg|.  A rule's score is the natural
+// logarithm of its count divided by the total count of all rules that share
+// its left-hand side.  The key passed to Pcfg::Add() is the left-hand-side
+// id followed by the right-hand-side ids.
+void RuleCollection::CreatePcfg(Pcfg &pcfg) {
+  std::vector<size_t> key;  // Reused for every rule.
+  for (const_iterator lhs_it = begin(); lhs_it != end(); ++lhs_it) {
+    const size_t lhs = lhs_it->first;
+    const RhsCountMap &rhs_counts = lhs_it->second;
+
+    // First pass: total count over all right-hand sides of this LHS.
+    size_t total = 0;
+    for (RhsCountMap::const_iterator rhs_it = rhs_counts.begin();
+         rhs_it != rhs_counts.end(); ++rhs_it) {
+      total += rhs_it->second;
+    }
+    const double denominator = static_cast<double>(total);
+
+    // Second pass: score each rule and hand it to the PCFG.
+    for (RhsCountMap::const_iterator rhs_it = rhs_counts.begin();
+         rhs_it != rhs_counts.end(); ++rhs_it) {
+      const std::vector<size_t> &rhs = rhs_it->first;
+      const double numerator = static_cast<double>(rhs_it->second);
+      const double score = std::log(numerator / denominator);
+      key.assign(1, lhs);
+      key.insert(key.end(), rhs.begin(), rhs.end());
+      pcfg.Add(key, score);
+    }
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
new file mode 100644
index 000000000..1b768dd21
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
@@ -0,0 +1,59 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_
+#define PCFG_EXTRACT_RULE_COLLECTION_H_
+
+#include "pcfg-common/pcfg.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Accumulates occurrence counts of PCFG rules.  Counts are grouped by
+// left-hand side and, within a group, keyed by the right-hand-side sequence.
+class RuleCollection {
+ public:
+  // Maps a right-hand side (a sequence of ids) to its count.
+  typedef boost::unordered_map<std::vector<size_t>, size_t> RhsCountMap;
+  // Maps a left-hand-side id to the counts of its right-hand sides.
+  typedef boost::unordered_map<size_t, RhsCountMap> Map;
+  typedef Map::const_iterator const_iterator;
+  typedef Map::iterator iterator;
+
+  RuleCollection() {}
+
+  // Records one occurrence of the rule lhs -> rhs.
+  void Add(size_t lhs, const std::vector<size_t> &rhs);
+
+  // Scores the collected rules and adds them to |pcfg|.
+  void CreatePcfg(Pcfg &pcfg);
+
+  // Iteration over the (left-hand side, RhsCountMap) entries.
+  iterator begin() { return collection_.begin(); }
+  iterator end() { return collection_.end(); }
+  const_iterator begin() const { return collection_.begin(); }
+  const_iterator end() const { return collection_.end(); }
+
+ private:
+  Map collection_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
new file mode 100644
index 000000000..48a82a6d0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -0,0 +1,51 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "rule_extractor.h"
+
+#include "pcfg-common/pcfg_tree.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the extractor to |non_term_vocab|; only a reference is stored.
+RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
+    : non_term_vocab_(non_term_vocab) {}
+
+// Walks |tree| recursively and adds one rule to |rc| for every node that is
+// neither a preterminal nor a leaf.  The rule's left-hand side is the id of
+// the node's label and its right-hand side the ids of the children's labels,
+// with ids obtained from Vocabulary::Insert().
+void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const {
+  // Preterminals and leaves contribute no rule.
+  if (tree.IsPreterminal() || tree.IsLeaf()) {
+    return;
+  }
+
+  const size_t lhs = non_term_vocab_.Insert(tree.label());
+
+  const std::vector<PcfgTree *> &children = tree.children();
+  std::vector<size_t> rhs;
+  rhs.reserve(children.size());
+  for (size_t i = 0; i < children.size(); ++i) {
+    const PcfgTree &child = *children[i];
+    // The child's label is inserted before descending into the child, which
+    // fixes the order in which labels reach the vocabulary.
+    rhs.push_back(non_term_vocab_.Insert(child.label()));
+    Extract(child, rc);
+  }
+
+  // The rule for this node is added after the rules of its descendants.
+  rc.Add(lhs, rhs);
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
new file mode 100644
index 000000000..6bcffbc61
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
@@ -0,0 +1,45 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
+#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
+
+#include "rule_collection.h"
+
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+class PcfgTree;
+
+// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
+class RuleExtractor {
+ public:
+  // Stores a reference to the given vocabulary, which Extract() uses to map
+  // tree labels to ids.  Declared explicit so that a Vocabulary is never
+  // converted to a RuleExtractor implicitly.
+  explicit RuleExtractor(Vocabulary &);
+
+  // Adds the rules found in the given tree to the given collection.
+  void Extract(const PcfgTree &, RuleCollection &) const;
+
+ private:
+  Vocabulary &non_term_vocab_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/Jamfile b/scripts/training/phrase-extract/pcfg-score/Jamfile
new file mode 100644
index 000000000..7225381c0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/Jamfile
@@ -0,0 +1 @@
+# Builds the pcfg-score executable from every .cc file in this directory,
+# together with the pcfg-common target of the parent directory and the
+# boost_program_options target declared four directories up.
+exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-score/main.cc b/scripts/training/phrase-extract/pcfg-score/main.cc
new file mode 100644
index 000000000..da5392add
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+// Entry point of pcfg-score: forwards the command line to the tool object
+// and uses the value returned by its Main() as the exit status.
+int main(int argc, char *argv[]) {
+  return Moses::PCFG::PcfgScore().Main(argc, argv);
+}
diff --git a/scripts/training/phrase-extract/pcfg-score/options.h b/scripts/training/phrase-extract/pcfg-score/options.h
new file mode 100644
index 000000000..e54b2a0b9
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_OPTIONS_H_
+#define PCFG_SCORE_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Command-line options of the pcfg-score tool.
+struct Options {
+  // Name of the PCFG file to read.  Filled from the positional "pcfg-file"
+  // argument by PcfgScore::ProcessOptions().
+  std::string pcfg_file;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
new file mode 100644
index 000000000..d780200ad
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
@@ -0,0 +1,152 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+#include "options.h"
+#include "tree_scorer.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: reads the PCFG named on the command line, then parses each
+// line of standard input with XmlTreeParser, scores the resulting tree and
+// writes it to standard output.  For a line that yields no tree, or whose
+// tree cannot be scored, a warning is issued and an empty line is written
+// instead.  Returns 0.
+int PcfgScore::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Load the PCFG together with its non-terminal vocabulary.
+  std::ifstream pcfg_stream;
+  OpenNamedInputOrDie(options.pcfg_file, pcfg_stream);
+  Pcfg pcfg;
+  Vocabulary non_term_vocab;
+  pcfg.Read(pcfg_stream, non_term_vocab);
+
+  TreeScorer scorer(pcfg, non_term_vocab);
+  XmlTreeParser parser;
+  XmlTreeWriter<PcfgTree> writer;
+  std::string line;
+  // Lives outside the loop; it is only replaced when Parse() returns
+  // normally.
+  std::auto_ptr<PcfgTree> tree;
+
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      // Report the parse failure together with the offending line number.
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+    if (tree.get() == 0) {
+      std::ostringstream msg;
+      msg << "no tree at line " << line_num;
+      Warn(msg.str());
+      std::cout << std::endl;
+    } else if (!scorer.Score(*tree)) {
+      std::ostringstream msg;
+      msg << "failed to score tree at line " << line_num;
+      Warn(msg.str());
+      std::cout << std::endl;
+    } else {
+      writer.Write(*tree, std::cout);
+    }
+  }
+
+  return 0;
+}
+
+// Parses the command line.  The single positional argument is stored in
+// |options|.pcfg_file.  When "help" is given, the usage text is printed to
+// standard output and the process exits with status 0.  A parsing failure or
+// a missing positional argument is reported through Error() together with
+// the usage text.
+void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
+  namespace po = boost::program_options;
+
+  // Heading of the usage text.
+  std::ostringstream heading;
+  heading << "Usage: " << name() << " PCFG\n\n"
+          << "Options";
+
+  // Options that appear in the usage text.
+  po::options_description visible(heading.str());
+  visible.add_options()("help", "print help message and exit");
+
+  // Options kept out of the usage text; they back the positional arguments.
+  po::options_description hidden("Hidden options");
+  hidden.add_options()
+    ("pcfg-file", po::value(&options.pcfg_file), "pcfg file");
+
+  // Everything the parser accepts: visible plus hidden options.
+  po::options_description all_options;
+  all_options.add(visible).add(hidden);
+
+  // The first positional argument is treated as the value of pcfg-file.
+  po::positional_options_description positional;
+  positional.add("pcfg-file", 1);
+
+  po::variables_map vm;
+  try {
+    po::command_line_parser cl_parser(argc, argv);
+    cl_parser.style(CommonOptionStyle());
+    cl_parser.options(all_options);
+    cl_parser.positional(positional);
+    po::store(cl_parser.run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") != 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+
+  // The PCFG file name is mandatory.
+  if (vm.count("pcfg-file") == 0) {
+    std::ostringstream msg;
+    msg << "missing required argument\n\n" << visible << std::endl;
+    Error(msg.str());
+  }
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.h b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
new file mode 100644
index 000000000..5e506c39d
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_PCFG_SCORE_H_
+#define PCFG_SCORE_PCFG_SCORE_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Options is defined as a struct in options.h; the forward declaration uses
+// the same class-key so that compilers do not report mismatched tags.
+struct Options;
+
+// The pcfg-score command-line tool.
+class PcfgScore : public Tool {
+ public:
+  PcfgScore() : Tool("pcfg-score") {}
+
+  // Runs the tool with the given command-line arguments.  The return value
+  // is used by main() as the process exit status.
+  virtual int Main(int, char *[]);
+
+ private:
+  // Parses the command-line arguments (argc, argv) on behalf of Main().
+  void ProcessOptions(int, char *[], Options &) const;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
new file mode 100644
index 000000000..5f695e4fc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
@@ -0,0 +1,68 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "tree_scorer.h"
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the scorer to |pcfg| and |non_term_vocab|; only references are
+// stored.
+TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
+    : pcfg_(pcfg), non_term_vocab_(non_term_vocab) {}
+
+// Scores the subtree rooted at |root|.  A preterminal or leaf is left
+// untouched and counts as success.  Otherwise the children are scored
+// recursively and |root| receives the sum of the scores of its
+// non-preterminal children plus the score of the rule formed by its own
+// label and its children's labels.  Returns false, without setting a score
+// on |root|, as soon as a child fails to score or the rule is not found.
+bool TreeScorer::Score(PcfgTree &root) const {
+  if (root.IsPreterminal() || root.IsLeaf()) {
+    return true;
+  }
+
+  const std::vector<PcfgTree *> &children = root.children();
+
+  // Rule key: id of the root label followed by the ids of the child labels.
+  std::vector<size_t> key;
+  key.reserve(children.size() + 1);
+  key.push_back(non_term_vocab_.Lookup(root.label()));
+
+  double children_log_prob = 0.0;
+  for (size_t i = 0; i < children.size(); ++i) {
+    PcfgTree &child = *children[i];
+    assert(!child.IsLeaf());
+    key.push_back(non_term_vocab_.Lookup(child.label()));
+    if (!Score(child)) {
+      return false;
+    }
+    // Preterminal children are not scored, so they add nothing to the sum.
+    if (!child.IsPreterminal()) {
+      children_log_prob += child.score();
+    }
+  }
+
+  double rule_score;
+  if (!pcfg_.Lookup(key, rule_score)) {
+    return false;
+  }
+  root.set_score(children_log_prob + rule_score);
+  return true;
+}
+
+}  // namespace PCFG
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.h b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
new file mode 100644
index 000000000..36f4e1e99
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
@@ -0,0 +1,47 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_TREE_SCORER_H_
+#define PCFG_SCORE_TREE_SCORER_H_
+
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Assigns PCFG-based scores to the nodes of a PcfgTree.
+class TreeScorer {
+ public:
+  // Both arguments are stored by reference.
+  TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab);
+
+  // Scores the tree rooted at |root| according to the PCFG.  The result is
+  // false when scoring is unsuccessful (due to a missing rule).
+  bool Score(PcfgTree &root) const;
+
+ private:
+  const Pcfg &pcfg_;
+  const Vocabulary &non_term_vocab_;
+};
+
+}  // namespace PCFG
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp
index 8bcc9be3b..c5fb0b99f 100644
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@@ -72,6 +72,7 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
 LexicalTable lexTable;
 bool inverseFlag = false;
 bool hierarchicalFlag = false;
+bool pcfgFlag = false;
 bool wordAlignmentFlag = false;
 bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
@@ -108,6 +109,9 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--Hierarchical") == 0) {
       hierarchicalFlag = true;
       cerr << "processing hierarchical rules\n";
+    } else if (strcmp(argv[i],"--PCFG") == 0) {
+      pcfgFlag = true;
+      cerr << "including PCFG scores\n";
     } else if (strcmp(argv[i],"--WordAlignment") == 0) {
       wordAlignmentFlag = true;
       cerr << "outputing word alignment" << endl;
@@ -193,6 +197,7 @@ int main(int argc, char* argv[])
 	
   // loop through all extracted phrase translations
   float lastCount = 0.0f;
+  float lastPcfgSum = 0.0f;
   vector< PhraseAlignment > phrasePairsWithSameF;
   int i=0;
   char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
@@ -207,6 +212,7 @@ int main(int argc, char* argv[])
     // identical to last line? just add count
     if (strcmp(line,lastLine) == 0) {
       lastPhrasePair->count += lastCount;
+      lastPhrasePair->pcfgSum += lastPcfgSum;
       continue;
     }
     strcpy( lastLine, line );
@@ -215,10 +221,12 @@ int main(int argc, char* argv[])
     PhraseAlignment phrasePair;
     phrasePair.create( line, i );
     lastCount = phrasePair.count;
+    lastPcfgSum = phrasePair.pcfgSum;
 
     // only differs in count? just add count
     if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
       lastPhrasePair->count += phrasePair.count;
+      lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
       continue;
     }
 
@@ -438,6 +446,16 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
       countOfCounts[ countInt ]++;
   }
 
+  // compute PCFG score
+  float pcfgScore;
+  if (pcfgFlag && !inverseFlag) {
+    float pcfgSum = 0;
+    for(size_t i=0; i<phrasePair.size(); ++i) {
+        pcfgSum += phrasePair[i]->pcfgSum;
+    }
+    pcfgScore = pcfgSum / count;
+  }
+
   // output phrases
   const PHRASE &phraseS = phrasePair[0]->GetSource();
   const PHRASE &phraseT = phrasePair[0]->GetTarget();
@@ -493,6 +511,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
     phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
   }
 
+  // target-side PCFG score
+  if (pcfgFlag && !inverseFlag) {
+    phraseTableFile << " " << pcfgScore;
+  }
+
   phraseTableFile << " ||| ";
 
   // alignment info for non-terminals
diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir
index 1a7cb3a39..41ea2d682 100755
--- a/scripts/training/train-model.perl.missing_bin_dir
+++ b/scripts/training/train-model.perl.missing_bin_dir
@@ -105,6 +105,7 @@ $_HELP = 1
 		       'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
 		       'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
 		       'ghkm' => \$_GHKM,
+		       'pcfg' => \$_PCFG,
 		       'extract-options=s' => \$_EXTRACT_OPTIONS,
 		       'score-options=s' => \$_SCORE_OPTIONS,
 		       'source-syntax' => \$_SOURCE_SYNTAX,
@@ -1373,6 +1374,7 @@ sub extract_phrase {
         $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file";
         $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
         $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
+        $cmd .= " --PCFG" if $_PCFG;
         if (!defined($_GHKM)) {
           $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
           $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
@@ -1503,6 +1505,7 @@ sub score_phrase_phrase_extract {
         $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
         $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
         $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
+        $cmd .= " --PCFG" if $_PCFG;
         $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
 
 				# sorting
@@ -1801,6 +1804,7 @@ sub create_ini {
    $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
    $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
    $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
+   $basic_weight_count++ if $_PCFG;
    foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
      $num_of_ttables++;
      my $ff = $f;
author	Phil Williams <philip.williams@mac.com>	2012-05-25 20:29:47 +0400
committer	Phil Williams <philip.williams@mac.com>	2012-05-25 20:29:47 +0400
commit	90c0bc9f5ceec4e7d33386ec597fd753e7d23d4a (patch)
tree	2e4aa63e87c6150a5317e3e8bae3cc00d9187db3
parent	2fab137aaeeda8077734e4c6e5627bfb44d27691 (diff)