Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Phil Williams <philip.williams@mac.com> 2012-05-25 20:29:47 +0400
committer Phil Williams <philip.williams@mac.com> 2012-05-25 20:29:47 +0400
commit 90c0bc9f5ceec4e7d33386ec597fd753e7d23d4a (patch)
tree 2e4aa63e87c6150a5317e3e8bae3cc00d9187db3
parent 2fab137aaeeda8077734e4c6e5627bfb44d27691 (diff)
Add an optional PCFG scoring feature for target syntax models (similar to
the p_cfg feature used in Marcu, Wang, Echihabi, and Knight (2006)).
-rw-r--r-- scripts/Jamfile | 2
-rw-r--r-- scripts/ems/experiment.meta | 15
-rwxr-xr-x scripts/ems/experiment.perl | 2
-rw-r--r-- scripts/training/phrase-extract/ExtractedRule.h | 2
-rw-r--r-- scripts/training/phrase-extract/Jamfile | 2
-rw-r--r-- scripts/training/phrase-extract/PhraseAlignment.cpp | 7
-rw-r--r-- scripts/training/phrase-extract/PhraseAlignment.h | 1
-rw-r--r-- scripts/training/phrase-extract/RuleExtractionOptions.h | 2
-rw-r--r-- scripts/training/phrase-extract/SyntaxTree.cpp | 3
-rw-r--r-- scripts/training/phrase-extract/SyntaxTree.h | 11
-rw-r--r-- scripts/training/phrase-extract/XmlTree.cpp | 9
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp | 4
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp | 5
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Node.h | 6
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Options.h | 2
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ParseTree.h | 7
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp | 1
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRule.h | 2
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp | 69
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h | 4
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp | 16
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Subgraph.h | 8
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp | 1
-rw-r--r-- scripts/training/phrase-extract/extract-rules.cpp | 32
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/Jamfile | 1
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/exception.h | 41
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/numbered_set.h | 109
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/pcfg.cc | 106
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/pcfg.h | 61
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/pcfg_tree.h | 77
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/syntax_tree.h | 91
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/tool.cc | 80
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/tool.h | 91
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/typedef.h | 37
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc | 85
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h | 56
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h | 127
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/Jamfile | 1
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/main.cc | 25
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/options.h | 36
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc | 131
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h | 42
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_collection.cc | 58
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_collection.h | 59
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc | 51
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_extractor.h | 45
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/Jamfile | 1
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/main.cc | 25
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/options.h | 36
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/pcfg_score.cc | 152
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/pcfg_score.h | 42
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/tree_scorer.cc | 68
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/tree_scorer.h | 47
-rw-r--r-- scripts/training/phrase-extract/score.cpp | 23
-rwxr-xr-x scripts/training/train-model.perl.missing_bin_dir | 4
55 files changed, 1970 insertions, 51 deletions
diff --git a/scripts/Jamfile b/scripts/Jamfile
index 6fb9bad39..b9eefcffe 100644
--- a/scripts/Jamfile
+++ b/scripts/Jamfile
@@ -42,6 +42,8 @@ if $(location) {
install compactify : training/compact-rule-table//compactify : <location>$(location)/training/compact-rule-table/tools ;
install phrase-extract : training/phrase-extract//programs : <location>$(location)/training/phrase-extract ;
+ install pcfg-extract : training/phrase-extract/pcfg-extract//pcfg-extract : <location>$(location)/training/phrase-extract/pcfg-extract ;
+ install pcfg-score : training/phrase-extract/pcfg-score//pcfg-score : <location>$(location)/training/phrase-extract/pcfg-score ;
install lexical-reordering : training/lexical-reordering//score : <location>$(location)/training/lexical-reordering ;
install symal : training/symal//symal : <location>$(location)/training/symal ;
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 51ac0f67a..b33c589d2 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -344,8 +344,21 @@ parse-relax
pass-unless: input-parse-relaxer output-parse-relaxer
template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension
template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension
+pcfg-extract
+ in: parse-relaxed-corpus
+ out: pcfg
+ default-name: model/pcfg
+ ignore-unless: use-pcfg-feature
+ rerun-on-change: use-pcfg-feature
+ template: $moses-script-dir/training/phrase-extract/pcfg-extract/pcfg-extract < IN.$output-extension > OUT.$output-extension
+pcfg-score
+ in: parse-relaxed-corpus pcfg
+ out: scored-corpus
+ default-name: model/scored-corpus
+ pass-unless: use-pcfg-feature
+ template: ln -s IN.$input-extension OUT.$input-extension ; $moses-script-dir/training/phrase-extract/pcfg-score/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension
extract-phrases
- in: word-alignment parse-relaxed-corpus
+ in: word-alignment scored-corpus
out: extracted-phrases
rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm
default-name: model/extract
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 59bd2788f..0c61a2a05 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -2007,6 +2007,7 @@ sub get_training_setting {
my $target_syntax = &get("GENERAL:output-parser");
my $score_settings = &get("TRAINING:score-settings");
my $parallel = &get("TRAINING:parallel");
+ my $pcfg = &get("TRAINING:use-pcfg-feature");
my $xml = $source_syntax || $target_syntax;
@@ -2029,6 +2030,7 @@ sub get_training_setting {
$cmd .= "-glue-grammar " if $hierarchical;
$cmd .= "-score-options '".$score_settings."' " if $score_settings;
$cmd .= "-parallel " if $parallel;
+ $cmd .= "-pcfg " if $pcfg;
# factored training
if (&backoff_and_get("TRAINING:input-factors")) {
diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h
index 170ccf892..be6e30836 100644
--- a/scripts/training/phrase-extract/ExtractedRule.h
+++ b/scripts/training/phrase-extract/ExtractedRule.h
@@ -43,6 +43,7 @@ public:
int startS;
int endS;
float count;
+ double pcfgScore;
std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
@@ -58,6 +59,7 @@ public:
, startS(sS)
, endS(eS)
, count(0)
+ , pcfgScore(0.0)
{}
void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile
index 5ed3f20f1..9be67e80a 100644
--- a/scripts/training/phrase-extract/Jamfile
+++ b/scripts/training/phrase-extract/Jamfile
@@ -33,3 +33,5 @@ alias programs : extract extract-rules extract-lex score consolidate consolidate
install legacy : programs : <location>. <install-type>EXE ;
build-project extract-ghkm ;
+build-project pcfg-extract ;
+build-project pcfg-score ;
diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp
index c0bfbde3e..ceb74f04c 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.cpp
+++ b/scripts/training/phrase-extract/PhraseAlignment.cpp
@@ -13,6 +13,8 @@
#include "tables-core.h"
#include "score.h"
+#include <cstdlib>
+
using namespace std;
extern Vocabulary vcbT;
@@ -111,6 +113,9 @@ void PhraseAlignment::create( char line[], int lineID )
}
else if (item == 5) { // non-term lengths
addNTLength(token[j]);
+ } else if (item == 6) { // target syntax PCFG score
+ float pcfgScore = std::atof(token[j].c_str());
+ pcfgSum = pcfgScore * count;
}
}
@@ -119,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
if (item == 3) {
count = 1.0;
}
- if (item < 3 || item > 5) {
+ if (item < 3 || item > 6) {
cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
}
}
diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h
index 8b8f5115c..8bd83503d 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.h
+++ b/scripts/training/phrase-extract/PhraseAlignment.h
@@ -25,6 +25,7 @@ protected:
void createAlignVec(size_t sourceSize, size_t targetSize);
void addNTLength(const std::string &tok);
public:
+ float pcfgSum;
float count;
std::vector< std::set<size_t> > alignedToT;
std::vector< std::set<size_t> > alignedToS;
diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h
index 70bb548c9..f9123de86 100644
--- a/scripts/training/phrase-extract/RuleExtractionOptions.h
+++ b/scripts/training/phrase-extract/RuleExtractionOptions.h
@@ -45,6 +45,7 @@ public:
bool targetSyntax;
bool duplicateRules;
bool fractionalCounting;
+ bool pcfgScore;
bool outputNTLengths;
bool gzOutput;
@@ -74,6 +75,7 @@ public:
, targetSyntax(false)
, duplicateRules(true)
, fractionalCounting(true)
+ , pcfgScore(false)
, outputNTLengths(false)
, gzOutput(false)
{}
diff --git a/scripts/training/phrase-extract/SyntaxTree.cpp b/scripts/training/phrase-extract/SyntaxTree.cpp
index e181b1e8a..f2783ffd2 100644
--- a/scripts/training/phrase-extract/SyntaxTree.cpp
+++ b/scripts/training/phrase-extract/SyntaxTree.cpp
@@ -42,11 +42,12 @@ void SyntaxTree::Clear()
m_index.clear();
}
-void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
+SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )
{
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
m_nodes.push_back( newNode );
m_index[ startPos ][ endPos ].push_back( newNode );
+ return newNode;
}
ParentNodes SyntaxTree::Parse()
diff --git a/scripts/training/phrase-extract/SyntaxTree.h b/scripts/training/phrase-extract/SyntaxTree.h
index 0ca5ca472..17c106b49 100644
--- a/scripts/training/phrase-extract/SyntaxTree.h
+++ b/scripts/training/phrase-extract/SyntaxTree.h
@@ -34,12 +34,14 @@ protected:
std::string m_label;
std::vector< SyntaxNode* > m_children;
SyntaxNode* m_parent;
+ float m_pcfgScore;
public:
SyntaxNode( int startPos, int endPos, std::string label )
:m_start(startPos)
,m_end(endPos)
,m_label(label)
,m_parent(0)
+ ,m_pcfgScore(0.0f)
{}
int GetStart() const {
return m_start;
@@ -50,6 +52,12 @@ public:
std::string GetLabel() const {
return m_label;
}
+ float GetPcfgScore() const {
+ return m_pcfgScore;
+ }
+ void SetPcfgScore(float score) {
+ m_pcfgScore = score;
+ }
SyntaxNode *GetParent() {
return m_parent;
}
@@ -89,11 +97,12 @@ public:
}
~SyntaxTree();
+ SyntaxNode *AddNode( int startPos, int endPos, std::string label );
+
SyntaxNode *GetTop() {
return m_top;
}
- void AddNode( int startPos, int endPos, std::string label );
ParentNodes Parse();
bool HasNode( int startPos, int endPos ) const;
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
diff --git a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp
index 716414f86..b22c159a1 100644
--- a/scripts/training/phrase-extract/XmlTree.cpp
+++ b/scripts/training/phrase-extract/XmlTree.cpp
@@ -25,7 +25,7 @@
#include <string>
#include <set>
#include <iostream>
-#include <stdlib.h>
+#include <cstdlib>
#include <sstream>
#include "SyntaxTree.h"
#include "XmlException.h"
@@ -345,13 +345,18 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
string label = ParseXmlTagAttribute(tagContent,"label");
labelCollection.insert( label );
+ string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
+ float pcfgScore = pcfgString == "" ? 0.0f
+ : std::atof(pcfgString.c_str());
+
// report what we have processed so far
if (0) {
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
}
- tree.AddNode( startPos, endPos-1, label );
+ SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
+ node->SetPcfgScore(pcfgScore);
}
}
}
diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 0ecffae5c..6bd32a13b 100644
--- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -212,6 +212,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root)
std::auto_ptr<Node> n(new Node(root->GetLabel(), nodeType));
+ if (nodeType == TREE) {
+ n->SetPcfgScore(root->GetPcfgScore());
+ }
+
const std::vector<ParseTree *> &children = root->GetChildren();
std::vector<Node *> childNodes;
childNodes.reserve(children.size());
diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 008026e1a..397ce1e3c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -285,6 +285,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
"set maximum allowed scope")
("Minimal",
"extract minimal rules only")
+ ("PCFG",
+ "include score based on PCFG scores in target corpus")
("UnknownWordLabel",
po::value(&options.unknownWordFile),
"write unknown word labels to named file")
@@ -361,6 +363,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("Minimal")) {
options.minimal = true;
}
+ if (vm.count("PCFG")) {
+ options.pcfg = true;
+ }
if (vm.count("UnpairedExtractFormat")) {
options.unpairedExtractFormat = true;
}
diff --git a/scripts/training/phrase-extract/extract-ghkm/Node.h b/scripts/training/phrase-extract/extract-ghkm/Node.h
index 228fdc812..775473362 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Node.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Node.h
@@ -41,8 +41,7 @@ class Node
Node(const std::string &label, NodeType type)
: m_label(label)
, m_type(type)
- , m_children()
- , m_parents() {}
+ , m_pcfgScore(0.0f) {}
~Node();
@@ -50,12 +49,14 @@ class Node
NodeType GetType() const { return m_type; }
const std::vector<Node*> &GetChildren() const { return m_children; }
const std::vector<Node*> &GetParents() const { return m_parents; }
+ float GetPcfgScore() const { return m_pcfgScore; }
const Span &GetSpan() const { return m_span; }
const Span &GetComplementSpan() const { return m_complementSpan; }
const std::vector<const Subgraph*> &GetRules() const { return m_rules; }
void SetChildren(const std::vector<Node*> &c) { m_children = c; }
void SetParents(const std::vector<Node*> &p) { m_parents = p; }
+ void SetPcfgScore(float s) { m_pcfgScore = s; }
void SetSpan(const Span &s) { m_span = s; }
void SetComplementSpan(const Span &cs) { m_complementSpan = cs; }
@@ -92,6 +93,7 @@ class Node
NodeType m_type;
std::vector<Node*> m_children;
std::vector<Node*> m_parents;
+ float m_pcfgScore;
Span m_span;
Span m_complementSpan;
std::vector<const Subgraph*> m_rules;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h
index 108e19d66..c4b57f311 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Options.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Options.h
@@ -36,6 +36,7 @@ struct Options {
, maxRuleSize(3)
, maxScope(3)
, minimal(false)
+ , pcfg(false)
, unpairedExtractFormat(false) {}
// Positional options
@@ -53,6 +54,7 @@ struct Options {
int maxRuleSize;
int maxScope;
bool minimal;
+ bool pcfg;
bool unpairedExtractFormat;
std::string unknownWordFile;
};
diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
index ec6fc147a..273e2e04e 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
@@ -32,17 +32,19 @@ class ParseTree
public:
ParseTree(const std::string &label)
: m_label(label)
- , m_children()
- , m_parent() {}
+ , m_parent(0)
+ , m_pcfgScore(0.0) {}
~ParseTree();
const std::string &GetLabel() const { return m_label; }
const std::vector<ParseTree*> &GetChildren() const { return m_children; }
const ParseTree *GetParent() const { return m_parent; }
+ float GetPcfgScore() const { return m_pcfgScore; }
void SetParent(ParseTree *);
void SetChildren(const std::vector<ParseTree*> &);
+ void SetPcfgScore(float score) { m_pcfgScore = score; }
void AddChild(ParseTree *);
@@ -59,6 +61,7 @@ class ParseTree
std::string m_label;
std::vector<ParseTree*> m_children;
ParseTree *m_parent;
+ float m_pcfgScore; // log probability
};
template<typename OutputIterator>
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
index 8473e4283..5dc70052c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -30,6 +30,7 @@ namespace GHKM {
ScfgRule::ScfgRule(const Subgraph &fragment)
: m_sourceLHS("X", NonTerminal)
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
+ , m_pcfgScore(fragment.GetPcfgScore())
{
// Source RHS
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
index 1ed534d9e..2405d8fa3 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
@@ -57,6 +57,7 @@ class ScfgRule
const std::vector<Symbol> &GetSourceRHS() const { return m_sourceRHS; }
const std::vector<Symbol> &GetTargetRHS() const { return m_targetRHS; }
const Alignment &GetAlignment() const { return m_alignment; }
+ float GetPcfgScore() const { return m_pcfgScore; }
int Scope() const;
@@ -68,6 +69,7 @@ class ScfgRule
std::vector<Symbol> m_sourceRHS;
std::vector<Symbol> m_targetRHS;
Alignment m_alignment;
+ float m_pcfgScore;
};
} // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index 4be3f048d..d5d16b790 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -24,6 +24,7 @@
#include "ScfgRule.h"
#include <cassert>
+#include <cmath>
#include <ostream>
#include <map>
#include <sstream>
@@ -34,14 +35,43 @@ namespace GHKM {
void ScfgRuleWriter::Write(const ScfgRule &rule)
{
+ std::ostringstream sourceSS;
+ std::ostringstream targetSS;
+
if (m_options.unpairedExtractFormat) {
- WriteUnpairedFormat(rule);
+ WriteUnpairedFormat(rule, sourceSS, targetSS);
} else {
- WriteStandardFormat(rule);
+ WriteStandardFormat(rule, sourceSS, targetSS);
+ }
+
+ // Write the rule to the forward and inverse extract files.
+ m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+
+ const Alignment &alignment = rule.GetAlignment();
+ for (Alignment::const_iterator p = alignment.begin();
+ p != alignment.end(); ++p) {
+ m_fwd << " " << p->first << "-" << p->second;
+ m_inv << " " << p->second << "-" << p->first;
+ }
+
+ // Write a count of 1 and an empty NT length column to the forward extract
+ // file.
+ // TODO Add option to write NT length?
+ m_fwd << " ||| 1 ||| |||";
+ if (m_options.pcfg) {
+ // Write the PCFG score.
+ m_fwd << " " << std::exp(rule.GetPcfgScore());
}
+ m_fwd << std::endl;
+
+ // Write a count of 1 to the inverse extract file.
+ m_inv << " ||| 1" << std::endl;
}
-void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
+ std::ostream &sourceSS,
+ std::ostream &targetSS)
{
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
@@ -60,9 +90,6 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
}
}
- std::ostringstream sourceSS;
- std::ostringstream targetSS;
-
// Write the source side of the rule to sourceSS.
int i = 0;
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
@@ -88,27 +115,14 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);
-
- // Write the rule to the forward and inverse extract files.
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
- m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
- for (Alignment::const_iterator p(alignment.begin());
- p != alignment.end(); ++p) {
- m_fwd << " " << p->first << "-" << p->second;
- m_inv << " " << p->second << "-" << p->first;
- }
- m_fwd << " ||| 1" << std::endl;
- m_inv << " ||| 1" << std::endl;
}
-void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
+ std::ostream &sourceSS,
+ std::ostream &targetSS)
{
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
- const Alignment &alignment = rule.GetAlignment();
-
- std::ostringstream sourceSS;
- std::ostringstream targetSS;
// Write the source side of the rule to sourceSS.
int i = 0;
@@ -127,17 +141,6 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);
-
- // Write the rule to the forward and inverse extract files.
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
- m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
- for (Alignment::const_iterator p(alignment.begin());
- p != alignment.end(); ++p) {
- m_fwd << " " << p->first << "-" << p->second;
- m_inv << " " << p->second << "-" << p->first;
- }
- m_fwd << " ||| 1" << std::endl;
- m_inv << " ||| 1" << std::endl;
}
void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index 738d09ce9..b92a432a1 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -45,8 +45,8 @@ class ScfgRuleWriter
ScfgRuleWriter(const ScfgRuleWriter &);
ScfgRuleWriter &operator=(const ScfgRuleWriter &);
- void WriteStandardFormat(const ScfgRule &);
- void WriteUnpairedFormat(const ScfgRule &);
+ void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &);
+ void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &);
void WriteSymbol(const Symbol &, std::ostream &);
std::ostream &m_fwd;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
index e5aedbb16..e048f2c55 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
@@ -101,5 +101,21 @@ int Subgraph::CalcDepth(const Node *n) const
return maxChildDepth + 1;
}
+float Subgraph::CalcPcfgScore() const
+{
+ if (m_root->GetType() != TREE || m_leaves.empty()) {
+ return 0.0f;
+ }
+ float score = m_root->GetPcfgScore();
+ for (std::set<const Node *>::const_iterator p = m_leaves.begin();
+ p != m_leaves.end(); ++p) {
+ const Node *leaf = *p;
+ if (leaf->GetType() == TREE) {
+ score -= leaf->GetPcfgScore();
+ }
+ }
+ return score;
+}
+
} // namespace GHKM
} // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
index e84903502..ede1233e9 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
@@ -38,7 +38,8 @@ class Subgraph
: m_root(root)
, m_depth(0)
, m_size(root->GetType() == TREE ? 1 : 0)
- , m_nodeCount(1) {}
+ , m_nodeCount(1)
+ , m_pcfgScore(0.0f) {}
Subgraph(const Node *root, const std::set<const Node *> &leaves)
: m_root(root)
@@ -46,10 +47,12 @@ class Subgraph
, m_depth(-1)
, m_size(-1)
, m_nodeCount(-1)
+ , m_pcfgScore(0.0f)
{
m_depth = CalcDepth(m_root);
m_size = CalcSize(m_root);
m_nodeCount = CountNodes(m_root);
+ m_pcfgScore = CalcPcfgScore();
}
const Node *GetRoot() const { return m_root; }
@@ -57,6 +60,7 @@ class Subgraph
int GetDepth() const { return m_depth; }
int GetSize() const { return m_size; }
int GetNodeCount() const { return m_nodeCount; }
+ float GetPcfgScore() const { return m_pcfgScore; }
bool IsTrivial() const { return m_leaves.empty(); }
@@ -66,6 +70,7 @@ class Subgraph
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
int CalcDepth(const Node *) const;
int CalcSize(const Node *) const;
+ float CalcPcfgScore() const;
int CountNodes(const Node *) const;
const Node *m_root;
@@ -73,6 +78,7 @@ class Subgraph
int m_depth;
int m_size;
int m_nodeCount;
+ float m_pcfgScore;
};
} // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
index 31c0e3843..cc961dc0c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@@ -61,6 +61,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree(
const std::vector<std::string> &words)
{
std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel()));
+ root->SetPcfgScore(tree.GetPcfgScore());
const std::vector<SyntaxNode*> &children = tree.GetChildren();
if (children.empty()) {
if (tree.GetStart() != tree.GetEnd()) {
diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp
index 2cc9dc54d..a00667b82 100644
--- a/scripts/training/phrase-extract/extract-rules.cpp
+++ b/scripts/training/phrase-extract/extract-rules.cpp
@@ -90,7 +90,7 @@ void addHieroRule( int startT, int endT, int startS, int endS
void printHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, LabelIndex &labelIndex);
string printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex);
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
string printSourceHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, const LabelIndex &labelIndex);
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
@@ -257,6 +257,8 @@ int main(int argc, char* argv[])
// if a source phrase is paired with two target phrases, then count(t|s) = 0.5
else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
options.fractionalCounting = false;
+ } else if (strcmp(argv[i],"--PCFG") == 0) {
+ options.pcfgScore = true;
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
options.outputNTLengths = true;
#ifdef WITH_THREADS
@@ -517,7 +519,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
}
string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex)
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
{
HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
assert(iterHoleList != holeColl.GetHoles().end());
@@ -545,6 +547,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
out += "[" + sourceLabel + "][" + targetLabel + "] ";
+ if (m_options.pcfgScore) {
+ double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+ logPCFGScore -= score;
+ }
+
currPos = hole.GetEnd(1);
hole.SetPos(outPos, 1);
++iterHoleList;
@@ -658,8 +665,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
// target
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex)
+ if (m_options.pcfgScore) {
+ double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ + " [" + targetLabel + "]";
+ rule.pcfgScore = std::exp(logPCFGScore);
+ } else {
+ double logPCFGScore = 0.0f;
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ " [" + targetLabel + "]";
+ }
// source
// holeColl.SortSourceHoles();
@@ -877,6 +892,11 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
rule.target += m_sentence->target[ti] + " ";
rule.target += "[" + targetLabel + "]";
+ if (m_options.pcfgScore) {
+ double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+ rule.pcfgScore = std::exp(logPCFGScore);
+ }
+
// alignment
for(int ti=startT; ti<=endT; ti++) {
for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
@@ -957,11 +977,13 @@ void ExtractTask::writeRulesToFile()
out << rule->source << " ||| "
<< rule->target << " ||| "
<< rule->alignment << " ||| "
- << rule->count;
+ << rule->count << " ||| ";
if (m_options.outputNTLengths) {
- out << " ||| ";
rule->OutputNTLengths(out);
}
+ if (m_options.pcfgScore) {
+ out << " ||| " << rule->pcfgScore;
+ }
out << "\n";
if (!m_options.onlyDirectFlag) {
diff --git a/scripts/training/phrase-extract/pcfg-common/Jamfile b/scripts/training/phrase-extract/pcfg-common/Jamfile
new file mode 100644
index 000000000..3dc272a56
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/Jamfile
@@ -0,0 +1 @@
+lib pcfg_common : [ glob *.cc ] ..//trees ;
diff --git a/scripts/training/phrase-extract/pcfg-common/exception.h b/scripts/training/phrase-extract/pcfg-common/exception.h
new file mode 100644
index 000000000..3dbd59d0e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/exception.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXCEPTION_H_
+#define PCFG_EXCEPTION_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+class Exception {
+ public:
+ Exception(const char *msg) : msg_(msg) {}
+ Exception(const std::string &msg) : msg_(msg) {}
+ const std::string &msg() const { return msg_; }
+ private:
+ std::string msg_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
new file mode 100644
index 000000000..f88d710ed
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
@@ -0,0 +1,109 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_NUMBERED_SET_H_
+#define PCFG_NUMBERED_SET_H_
+
+#include "exception.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <limits>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores a set of elements of type T, each of which is allocated an integral
+// ID of type I.  IDs are contiguous starting at 0.  Individual elements cannot
+// be removed once inserted (but the whole set can be cleared).
+//
+// id_to_element_ holds pointers to the keys stored inside element_to_id_, so
+// a member-wise copy would leave the copy pointing into the source object's
+// map.  The copy constructor and assignment operator therefore copy the map
+// and then rebuild the pointer table against the copied keys.
+template<typename T, typename I=size_t>
+class NumberedSet {
+ private:
+  typedef boost::unordered_map<T, I> ElementToIdMap;
+  typedef std::vector<const T *> IdToElementMap;
+
+ public:
+  typedef I IdType;
+  typedef typename IdToElementMap::const_iterator const_iterator;
+
+  NumberedSet() {}
+
+  // Deep copy: the new object's ID table refers to its own keys.
+  NumberedSet(const NumberedSet &other)
+      : element_to_id_(other.element_to_id_) {
+    RebuildIdToElementMap();
+  }
+
+  // Deep assignment.  The map is copied into a temporary first so that a
+  // failed copy leaves this object untouched.
+  NumberedSet &operator=(const NumberedSet &other) {
+    if (this != &other) {
+      ElementToIdMap tmp(other.element_to_id_);
+      element_to_id_.swap(tmp);
+      RebuildIdToElementMap();
+    }
+    return *this;
+  }
+
+  // Iterate over pointers to the stored elements, ordered by ID.
+  const_iterator begin() const { return id_to_element_.begin(); }
+  const_iterator end() const { return id_to_element_.end(); }
+
+  // Static value
+  // The largest value representable by I; returned by Lookup(const T &) when
+  // the element is not present.
+  static I NullId() { return std::numeric_limits<I>::max(); }
+
+  bool Empty() const { return id_to_element_.empty(); }
+  size_t Size() const { return id_to_element_.size(); }
+
+  // Insert the given object and return its ID.
+  I Insert(const T &);
+
+  // Element -> ID (NullId() if absent) and ID -> element (throws Exception
+  // if the ID is out of range).
+  I Lookup(const T &) const;
+  const T &Lookup(I) const;
+
+  // Removes every element; subsequent insertions start again from ID 0.
+  void Clear();
+
+ private:
+  // Re-creates id_to_element_ so that slot i points at the key in
+  // element_to_id_ whose ID is i.
+  void RebuildIdToElementMap() {
+    id_to_element_.assign(element_to_id_.size(), static_cast<const T *>(0));
+    for (typename ElementToIdMap::const_iterator p = element_to_id_.begin();
+         p != element_to_id_.end(); ++p) {
+      id_to_element_[p->second] = &p->first;
+    }
+  }
+
+  ElementToIdMap element_to_id_;
+  IdToElementMap id_to_element_;
+};
+
+// Returns the ID assigned to s, or NullId() if s is not in the set.
+template<typename T, typename I>
+I NumberedSet<T, I>::Lookup(const T &s) const {
+  typename ElementToIdMap::const_iterator pos = element_to_id_.find(s);
+  if (pos != element_to_id_.end()) {
+    return pos->second;
+  }
+  return NullId();
+}
+
+// Returns the element whose ID is id.  Throws Exception ("Value not found")
+// if id does not name an element of the set.
+template<typename T, typename I>
+const T &NumberedSet<T, I>::Lookup(I id) const {
+  const bool in_range = !(id < 0) && !(id >= id_to_element_.size());
+  if (in_range) {
+    return *(id_to_element_[id]);
+  }
+  std::ostringstream msg;
+  msg << "Value not found: " << id;
+  throw Exception(msg.str());
+}
+
+// Adds x to the set if it is not already present and returns its ID.  A new
+// element receives the next unused ID (the current number of elements); an
+// existing element keeps the ID it was given when first inserted.
+template<typename T, typename I>
+I NumberedSet<T, I>::Insert(const T &x) {
+  typedef std::pair<typename ElementToIdMap::iterator, bool> InsertResult;
+
+  // Tentatively pair x with the next free ID; the map ignores the pair if x
+  // is already a key.
+  std::pair<T, I> candidate(x, id_to_element_.size());
+  InsertResult outcome = element_to_id_.insert(candidate);
+
+  const bool is_new = outcome.second;
+  if (is_new) {
+    // Remember where the map stored the key so it can be found by ID.
+    id_to_element_.push_back(&outcome.first->first);
+  }
+  return outcome.first->second;
+}
+
+// Empties the set.  Both lookup directions are reset, so IDs handed out
+// afterwards start again from 0.
+template<typename T, typename I>
+void NumberedSet<T, I>::Clear() {
+  // Drop the pointers before the keys they point at.
+  id_to_element_.clear();
+  element_to_id_.clear();
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
new file mode 100644
index 000000000..d045b820b
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
@@ -0,0 +1,106 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg.h"
+
+#include "exception.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores score for the rule identified by key, replacing any score that was
+// previously stored for the same key.
+void Pcfg::Add(const Key &key, double score) {
+  std::pair<Map::iterator, bool> slot =
+      rules_.insert(Map::value_type(key, score));
+  if (!slot.second) {
+    // The rule already existed: overwrite its score.
+    slot.first->second = score;
+  }
+}
+
+// Looks up the rule identified by key.  On success writes its score to
+// `score` and returns true; otherwise returns false and leaves `score`
+// unmodified.
+bool Pcfg::Lookup(const Key &key, double &score) const {
+  const Map::const_iterator hit = rules_.find(key);
+  const bool found = (hit != rules_.end());
+  if (found) {
+    score = hit->second;
+  }
+  return found;
+}
+
+// Reads rules from input, one per line, in the form
+//
+//   LHS ||| RHS_1 RHS_2 ... ||| SCORE
+//
+// Every symbol is interned in vocab and the rule is stored via Add(), keyed
+// by the vector of symbol IDs (LHS first).  Throws Exception if a line lacks
+// either "|||" delimiter.  The score is converted with boost::lexical_cast,
+// whose own exception propagates if the text is not a valid double.
+void Pcfg::Read(std::istream &input, Vocabulary &vocab) {
+  const std::string delimiter("|||");
+  std::string line;
+  while (std::getline(input, line)) {
+    // Locate the two delimiters that split the line into three fields.
+    const size_t first = line.find(delimiter);
+    if (first == std::string::npos) {
+      throw Exception("missing first delimiter");
+    }
+    const size_t rhs_begin = first + delimiter.size();
+    const size_t second = line.find(delimiter, rhs_begin);
+    if (second == std::string::npos) {
+      throw Exception("missing second delimiter");
+    }
+
+    // Field 1: the left-hand side symbol.
+    std::string lhs_string = line.substr(0, first);
+    boost::trim(lhs_string);
+
+    // Field 2: the whitespace-separated right-hand side symbols.
+    std::string rhs_text = line.substr(rhs_begin, second - rhs_begin);
+    boost::trim(rhs_text);
+    std::vector<std::string> rhs_strings;
+    boost::split(rhs_strings, rhs_text, boost::algorithm::is_space(),
+                 boost::algorithm::token_compress_on);
+
+    // Field 3: the score (everything after the second delimiter).
+    std::string score_string = line.substr(second + delimiter.size());
+    boost::trim(score_string);
+
+    // Build the key: LHS ID followed by the RHS IDs, in order.
+    Key key;
+    key.reserve(rhs_strings.size() + 1);
+    key.push_back(vocab.Insert(lhs_string));
+    for (size_t i = 0; i < rhs_strings.size(); ++i) {
+      key.push_back(vocab.Insert(rhs_strings[i]));
+    }
+
+    // Convert the score only after the symbols have been interned, then
+    // record the rule.
+    const double score = boost::lexical_cast<double>(score_string);
+    Add(key, score);
+  }
+}
+
+// Writes every stored rule to output, one per line, as
+//
+//   LHS ||| RHS_1 RHS_2 ... ||| SCORE
+//
+// translating symbol IDs back to strings through vocab.  The first symbol of
+// each key is dereferenced unconditionally, so keys must not be empty.
+void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const {
+  for (const_iterator rule = begin(); rule != end(); ++rule) {
+    const Key &symbols = rule->first;
+    Key::const_iterator sym = symbols.begin();
+
+    // Left-hand side.
+    output << vocab.Lookup(*sym) << " |||";
+
+    // Right-hand side symbols, each preceded by a single space.
+    for (++sym; sym != symbols.end(); ++sym) {
+      output << " " << vocab.Lookup(*sym);
+    }
+
+    output << " ||| " << rule->second << std::endl;
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h
new file mode 100644
index 000000000..757eea449
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h
@@ -0,0 +1,61 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_H_
+#define PCFG_PCFG_H_
+
+#include "typedef.h"
+
+#include <istream>
+#include <map>
+#include <ostream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// A table of scored rules.  Each rule is identified by a Key: a vector of
+// symbol IDs.  Read() and Write() place the left-hand side first, followed
+// by the right-hand side symbols.
+class Pcfg {
+ public:
+  typedef std::vector<size_t> Key;
+  typedef std::map<Key, double> Map;
+  typedef Map::iterator iterator;
+  typedef Map::const_iterator const_iterator;
+
+  Pcfg() {}
+
+  // Iteration over (key, score) pairs in the order defined by std::map.
+  iterator begin() { return rules_.begin(); }
+  iterator end() { return rules_.end(); }
+  const_iterator begin() const { return rules_.begin(); }
+  const_iterator end() const { return rules_.end(); }
+
+  // Stores (or overwrites) the score for a rule.
+  void Add(const Key &, double);
+
+  // Fetches the score for a rule; returns false if the rule is unknown.
+  bool Lookup(const Key &, double &) const;
+
+  // Text (de)serialisation; symbols are mapped to IDs via the Vocabulary.
+  void Read(std::istream &, Vocabulary &);
+  void Write(const Vocabulary &, std::ostream &) const;
+
+ private:
+  Map rules_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
new file mode 100644
index 000000000..bdac64dfc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
@@ -0,0 +1,77 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_TREE_H_
+#define PCFG_PCFG_TREE_H_
+
+#include "syntax_tree.h"
+#include "xml_tree_writer.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// A syntax tree with string labels in which every node additionally carries
+// a floating-point score.  DerivedType is the concrete node type and is
+// forwarded to SyntaxTreeBase.
+template<typename DerivedType>
+class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
+ public:
+  typedef std::string LabelType;
+  typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
+
+  // Creates a childless node with the given label and a score of 0.0.
+  PcfgTreeBase(const LabelType &label)
+      : BaseType(label)
+      , score_(0.0) {}
+
+  // Accessors for the node's score.
+  double score() const { return score_; }
+  void set_score(double s) { score_ = s; }
+
+ private:
+  double score_;
+};
+
+// Concrete PCFG tree node: PcfgTreeBase instantiated with itself as the
+// derived type.  Adds nothing beyond the constructor.
+class PcfgTree : public PcfgTreeBase<PcfgTree> {
+ public:
+  typedef PcfgTreeBase<PcfgTree> BaseType;
+
+  PcfgTree(const BaseType::LabelType &label)
+      : BaseType(label) {}
+};
+
+// Specialise XmlOutputHandler for PcfgTree: the label is the node's label
+// and the only attribute is "pcfg", which holds the node's score.
+// NOTE(review): std::ostringstream is used below but this header does not
+// include <sstream> itself; presumably it arrives via another header.
+template<>
+class XmlOutputHandler<PcfgTree> {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // Copies the node's label into `label`.
+  void GetLabel(const PcfgTree &tree, std::string &label) const {
+    label = tree.label();
+  }
+
+  // Replaces the contents of attribute_map with the node's attributes.  A
+  // score of exactly 0.0 produces no "pcfg" attribute.
+  void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
+    attribute_map.clear();
+    const double score = tree.score();
+    if (score == 0.0) {
+      return;
+    }
+    std::ostringstream formatted;
+    formatted << score;
+    attribute_map["pcfg"] = formatted.str();
+  }
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
new file mode 100644
index 000000000..37f72dd58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SYNTAX_TREE_H_
+#define PCFG_SYNTAX_TREE_H_
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Base class for SyntaxTree, AgreementTree, and friends.
+//
+// A node has a label of type T, a list of child pointers and a parent
+// pointer.  DerivedType is the concrete node class, so that children and
+// parents are exposed with their real type.
+//
+// The destructor deletes the children, i.e. a node owns its subtree.
+// NOTE(review): no copy constructor or assignment operator is declared, so a
+// copied node would share child pointers with the original; avoid copying.
+template<typename T, typename DerivedType>
+class SyntaxTreeBase {
+ public:
+  // Constructors
+
+  // Creates a leaf: no children, no parent.
+  SyntaxTreeBase(const T &label)
+      : label_(label)
+      , children_()
+      , parent_(0) {}
+
+  // Creates a node with the given children.  The children's parent pointers
+  // are not touched here.
+  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
+      : label_(label)
+      , children_(children)
+      , parent_(0) {}
+
+  // Destructor
+  virtual ~SyntaxTreeBase();
+
+  // Accessors
+  const T &label() const { return label_; }
+  DerivedType *parent() { return parent_; }
+  const DerivedType *parent() const { return parent_; }
+  std::vector<DerivedType *> &children() { return children_; }
+  const std::vector<DerivedType *> &children() const { return children_; }
+
+  // Mutators
+  void set_label(const T &label) { label_ = label; }
+  void set_parent(DerivedType *parent) { parent_ = parent; }
+  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
+
+  // Appends child to the child list; does not set the child's parent.
+  void AddChild(DerivedType *child) { children_.push_back(child); }
+
+  // A leaf is a node without children.
+  bool IsLeaf() const { return children_.empty(); }
+
+  // A preterminal has exactly one child and that child is a leaf.
+  bool IsPreterminal() const {
+    return children_.size() == 1 && children_[0]->IsLeaf();
+  }
+
+ private:
+  T label_;
+  std::vector<DerivedType *> children_;
+  DerivedType *parent_;
+};
+
+// Plain syntax tree with labels of type T: SyntaxTreeBase instantiated with
+// itself as the derived type.  Only forwards the two constructors.
+template<typename T>
+class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
+ public:
+  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
+
+  SyntaxTree(const T &label)
+      : BaseType(label) {}
+
+  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
+      : BaseType(label, children) {}
+};
+
+// Deletes every child, which in turn deletes that child's own subtree.
+template<typename T, typename DerivedType>
+SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
+  typedef typename std::vector<DerivedType *>::iterator ChildIterator;
+  for (ChildIterator child = children_.begin();
+       child != children_.end(); ++child) {
+    delete *child;
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.cc b/scripts/training/phrase-extract/pcfg-common/tool.cc
new file mode 100644
index 000000000..bebd220e1
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.cc
@@ -0,0 +1,80 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "tool.h"
+
+#include <sstream>
+
+namespace Moses {
+namespace PCFG {
+
+// Selects the tool's main input: standard input when filename is empty or
+// "-", otherwise the named file.  Calls Error() if the file cannot be
+// opened.  Returns a reference to the selected stream.
+std::istream &Tool::OpenInputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdin = filename.empty() || filename == "-";
+  if (use_stdin) {
+    input_ptr_ = &(std::cin);
+    return *input_ptr_;
+  }
+
+  input_file_stream_.open(filename.c_str());
+  if (!input_file_stream_) {
+    std::ostringstream msg;
+    msg << "failed to open input file: " << filename;
+    Error(msg.str());
+  }
+  input_ptr_ = &input_file_stream_;
+  return *input_ptr_;
+}
+
+// Selects the tool's main output: standard output when filename is empty or
+// "-", otherwise the named file.  Calls Error() if the file cannot be
+// opened.  Returns a reference to the selected stream.
+std::ostream &Tool::OpenOutputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdout = filename.empty() || filename == "-";
+  if (use_stdout) {
+    output_ptr_ = &(std::cout);
+    return *output_ptr_;
+  }
+
+  output_file_stream_.open(filename.c_str());
+  if (!output_file_stream_) {
+    std::ostringstream msg;
+    msg << "failed to open output file: " << filename;
+    Error(msg.str());
+  }
+  output_ptr_ = &output_file_stream_;
+  return *output_ptr_;
+}
+
+void Tool::OpenNamedInputOrDie(const std::string &filename,
+ std::ifstream &stream) {
+ stream.open(filename.c_str());
+ if (!stream) {
+ std::ostringstream msg;
+ msg << "failed to open input file: " << filename;
+ Error(msg.str());
+ }
+}
+
+void Tool::OpenNamedOutputOrDie(const std::string &filename,
+ std::ofstream &stream) {
+ stream.open(filename.c_str());
+ if (!stream) {
+ std::ostringstream msg;
+ msg << "failed to open output file: " << filename;
+ Error(msg.str());
+ }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.h b/scripts/training/phrase-extract/pcfg-common/tool.h
new file mode 100644
index 000000000..0af342569
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TOOL_H_
+#define PCFG_TOOL_H_
+
+#include <boost/program_options/cmdline.hpp>
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Abstract base class for the PCFG command-line tools.  A concrete tool
+// implements Main(); this class supplies the tool name, warning/error
+// reporting and helpers for opening the input and output streams.
+class Tool {
+ public:
+  virtual ~Tool() {}
+
+  // The name given at construction; used as a prefix in diagnostics.
+  const std::string &name() const { return name_; }
+
+  // Entry point of the tool; receives the program's command-line arguments.
+  virtual int Main(int argc, char *argv[]) = 0;
+
+ protected:
+  // The stream pointers start out null so that they never hold an
+  // indeterminate value before OpenInputOrDie() / OpenOutputOrDie() is
+  // called.
+  Tool(const std::string &name)
+      : name_(name)
+      , input_ptr_(0)
+      , output_ptr_(0) {}
+
+  // Returns the boost::program_options style that should be used by all tools.
+  // This is the default style with the allow_guessing flag cleared.
+  static int CommonOptionStyle() {
+    namespace cls = boost::program_options::command_line_style;
+    return cls::default_style & (~cls::allow_guessing);
+  }
+
+  // Prints "<name>: warning: <msg>" to standard error.
+  void Warn(const std::string &msg) const {
+    std::cerr << name_ << ": warning: " << msg << std::endl;
+  }
+
+  // Prints "<name>: error: <msg>" to standard error and terminates the
+  // process with exit status 1.  Does not return.
+  void Error(const std::string &msg) const {
+    std::cerr << name_ << ": error: " << msg << std::endl;
+    std::exit(1);
+  }
+
+  // Initialises the tool's main input stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then input is standard input; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for reading.
+  std::istream &OpenInputOrDie(const std::string &filename);
+
+  // Initialises the tool's main output stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then output is standard output; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for writing.
+  std::ostream &OpenOutputOrDie(const std::string &filename);
+
+  // Opens the named input file using the supplied ifstream.  Calls Error() if
+  // the file cannot be opened for reading.
+  void OpenNamedInputOrDie(const std::string &, std::ifstream &);
+
+  // Opens the named output file using the supplied ofstream.  Calls Error() if
+  // the file cannot be opened for writing.
+  void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
+
+ private:
+  std::string name_;
+  // Main input: points at std::cin or at input_file_stream_ once opened.
+  std::istream *input_ptr_;
+  std::ifstream input_file_stream_;
+  // Main output: points at std::cout or at output_file_stream_ once opened.
+  std::ostream *output_ptr_;
+  std::ofstream output_file_stream_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/typedef.h b/scripts/training/phrase-extract/pcfg-common/typedef.h
new file mode 100644
index 000000000..49a12d681
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/typedef.h
@@ -0,0 +1,37 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TYPEDEF_H_
+#define PCFG_TYPEDEF_H_
+
+#include "numbered_set.h"
+#include "syntax_tree.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+typedef NumberedSet<std::string> Vocabulary;
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
new file mode 100644
index 000000000..5c596a0fb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -0,0 +1,85 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "xml_tree_parser.h"
+
+#include "exception.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Nothing to set up: every member is default-constructed.
+XmlTreeParser::XmlTreeParser() {}
+
+// Parses one line in Moses' XML parse tree format and returns the
+// corresponding PcfgTree.  The parser's members are overwritten on every
+// call (they serve as scratch space).
+//
+// Throws Exception if the XML tags cannot be processed or if no root node is
+// found after processing.
+std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line)
+{
+  m_line = line;
+  m_tree.Clear();
+  try {
+    if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
+      // Report a usable message rather than an empty string.
+      throw Exception("failed to process XML tags in tree string");
+    }
+  } catch (const XmlException &e) {
+    // Translate the XML layer's exception into this module's type.
+    throw Exception(e.getMsg());
+  }
+  m_tree.ConnectNodes();
+  SyntaxNode *root = m_tree.GetTop();
+  // Whether a root exists depends on the input, so check at run time; an
+  // assert would vanish under NDEBUG and *root would be dereferenced below.
+  // NOTE(review): presumably GetTop() is null when the line held no tree
+  // nodes; confirm against ::SyntaxTree.
+  if (!root) {
+    throw Exception("tree string contains no root node");
+  }
+  // m_line now has the tags stripped, so tokenizing it yields the words.
+  m_words = tokenize(m_line.c_str());
+  return ConvertTree(*root, m_words);
+}
+
+// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
+//
+// An interior SyntaxNode becomes a PcfgTree node whose children are the
+// recursively converted children.  A childless SyntaxNode becomes a node
+// with a single leaf child labelled with the word at the node's start
+// position.  Throws Exception if a childless node's start and end differ.
+std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
+    const SyntaxNode &tree,
+    const std::vector<std::string> &words)
+{
+  std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
+  const std::vector<SyntaxNode*> &children = tree.GetChildren();
+
+  // Interior node: convert each child and attach it.
+  if (!children.empty()) {
+    for (size_t i = 0; i < children.size(); ++i) {
+      assert(children[i]);
+      std::auto_ptr<PcfgTree> child = ConvertTree(*children[i], words);
+      child->set_parent(root.get());
+      root->AddChild(child.release());
+    }
+    return root;
+  }
+
+  // Childless node: it must cover exactly one word.
+  if (tree.GetStart() != tree.GetEnd()) {
+    std::ostringstream msg;
+    msg << "leaf node covers multiple words (" << tree.GetStart()
+        << "-" << tree.GetEnd() << "): this is currently unsupported";
+    throw Exception(msg.str());
+  }
+  std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[tree.GetStart()]));
+  leaf->set_parent(root.get());
+  root->AddChild(leaf.release());
+  return root;
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
new file mode 100644
index 000000000..6b418c44e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
@@ -0,0 +1,56 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_PARSER_H_
+#define PCFG_XML_TREE_PARSER_H_
+
+#include "pcfg_tree.h"
+#include "SyntaxTree.h"
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Parses a string in Moses' XML parse tree format and returns a PcfgTree
+// object.  The data members are working storage that Parse() overwrites on
+// each call.
+class XmlTreeParser {
+ public:
+  XmlTreeParser();
+
+  // Parses one line; the result is returned through an owning pointer.
+  std::auto_ptr<PcfgTree> Parse(const std::string &);
+
+ private:
+  // Recursively converts a SyntaxNode (and its subtree) to a PcfgTree, using
+  // the given word list to label the leaves.
+  std::auto_ptr<PcfgTree> ConvertTree(const SyntaxNode &,
+                                      const std::vector<std::string> &);
+
+  std::set<std::string> m_labelSet;
+  std::map<std::string, int> m_topLabelSet;
+  std::string m_line;
+  ::SyntaxTree m_tree;
+  std::vector<std::string> m_words;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
new file mode 100644
index 000000000..347c352bb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
@@ -0,0 +1,127 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_WRITER_H_
+#define PCFG_XML_TREE_WRITER_H_
+
+#include "syntax_tree.h"
+
+#include "XmlTree.h"
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <vector>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Describes how a node of type InputTree is rendered as an XML element.  The
+// primary template only declares the two member functions; definitions come
+// from specialisations for concrete tree types.
+template<typename InputTree>
+class XmlOutputHandler {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // Writes the node's label into the string argument.
+  void GetLabel(const InputTree &, std::string &) const;
+
+  // Fills the map with the node's attribute name/value pairs.
+  void GetAttributes(const InputTree &, AttributeMap &) const;
+};
+
+// Writes a tree of type InputTree as nested <tree> XML elements, using the
+// XmlOutputHandler for InputTree to obtain labels and attributes.
+template<typename InputTree>
+class XmlTreeWriter : public XmlOutputHandler<InputTree> {
+ public:
+  typedef XmlOutputHandler<InputTree> Base;
+
+  // Writes the given (non-leaf) tree to the stream.
+  void Write(const InputTree &, std::ostream &) const;
+
+ private:
+  // Returns a copy of the string with XML special characters escaped.
+  std::string Escape(const std::string &) const;
+};
+
+// Writes tree to out as a <tree> element:
+//
+//   <tree label="L" attr="value"...> child child ... </tree>
+//
+// Leaf children are written as their (escaped) label; non-leaf children are
+// written recursively.  A newline is emitted after the element of a node
+// that has no parent.  The tree itself must not be a leaf.
+template<typename InputTree>
+void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
+                                     std::ostream &out) const {
+  assert(!tree.IsLeaf());
+
+  // Opening tag
+
+  std::string label;
+  Base::GetLabel(tree, label);
+  out << "<tree label=\"" << Escape(label) << "\"";
+
+  typename Base::AttributeMap attribute_map;
+  Base::GetAttributes(tree, attribute_map);
+
+  for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
+       p != attribute_map.end(); ++p) {
+    // Attribute values are escaped just like the label, so that a value
+    // containing a quote, '<' or '&' cannot produce malformed XML.
+    out << " " << p->first << "=\"" << Escape(p->second) << "\"";
+  }
+
+  out << ">";
+
+  // Children
+
+  const std::vector<InputTree *> &children = tree.children();
+  for (typename std::vector<InputTree *>::const_iterator p = children.begin();
+       p != children.end(); ++p) {
+    InputTree &child = **p;
+    if (child.IsLeaf()) {
+      // A leaf is written as plain text (its label), not as an element.
+      Base::GetLabel(child, label);
+      out << " " << Escape(label);
+    } else {
+      out << " ";
+      Write(**p, out);
+    }
+  }
+
+  // Closing tag
+  out << " </tree>";
+
+  // Only the outermost element terminates the line.
+  if (tree.parent() == 0) {
+    out << std::endl;
+  }
+}
+
+// Escapes XML special characters.
+// Returns a copy of s in which <, >, &, ' and " are replaced by the entity
+// references &lt; &gt; &amp; &apos; and &quot;.  All other characters are
+// copied unchanged.
+template<typename InputTree>
+std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
+  std::string escaped;
+  escaped.reserve(s.size());
+  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
+    switch (*c) {
+      case '<':
+        escaped += "&lt;";
+        break;
+      case '>':
+        escaped += "&gt;";
+        break;
+      case '&':
+        escaped += "&amp;";
+        break;
+      case '\'':
+        escaped += "&apos;";
+        break;
+      case '"':
+        escaped += "&quot;";
+        break;
+      default:
+        escaped += *c;
+        break;
+    }
+  }
+  return escaped;
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/Jamfile b/scripts/training/phrase-extract/pcfg-extract/Jamfile
new file mode 100644
index 000000000..be91d6d2f
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/Jamfile
@@ -0,0 +1 @@
+exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-extract/main.cc b/scripts/training/phrase-extract/pcfg-extract/main.cc
new file mode 100644
index 000000000..47b45afc3
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+// Program entry point: runs the pcfg-extract tool and returns whatever its
+// Main() returns as the process exit status.
+int main(int argc, char *argv[]) {
+  Moses::PCFG::PcfgExtract extractor;
+  const int exit_status = extractor.Main(argc, argv);
+  return exit_status;
+}
diff --git a/scripts/training/phrase-extract/pcfg-extract/options.h b/scripts/training/phrase-extract/pcfg-extract/options.h
new file mode 100644
index 000000000..3acb31b58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_OPTIONS_H_
+#define PCFG_EXTRACT_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Holds the command-line settings of the pcfg-extract tool.
+struct Options {
+ // NOTE(review): the pcfg-extract ProcessOptions() does not appear to assign
+ // this field (no positional option is registered for it) -- confirm whether
+ // it is still needed.
+ std::string corpus_file;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
new file mode 100644
index 000000000..151c9959c
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -0,0 +1,131 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+#include "options.h"
+#include "rule_collection.h"
+#include "rule_extractor.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: parses the command line, reads one XML-encoded tree per line
+// from standard input, collects rule counts from every tree, and finally
+// writes the resulting PCFG to standard output.  Always returns 0.
+int PcfgExtract::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Gather rule counts from the corpus, one tree per input line.
+  Vocabulary non_term_vocab;
+  RuleExtractor rule_extractor(non_term_vocab);
+  RuleCollection rule_collection;
+  XmlTreeParser parser;
+  std::auto_ptr<PcfgTree> tree;
+  std::string line;
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      std::ostringstream error_text;
+      error_text << "line " << line_num << ": " << e.msg();
+      Error(error_text.str());
+    }
+    // Lines for which the parser produced no tree are reported and skipped.
+    if (tree.get() == 0) {
+      std::ostringstream warning_text;
+      warning_text << "no tree at line " << line_num;
+      Warn(warning_text.str());
+      continue;
+    }
+    rule_extractor.Extract(*tree, rule_collection);
+  }
+
+  // Turn the counts into scores and emit the grammar.
+  Pcfg pcfg;
+  rule_collection.CreatePcfg(pcfg);
+  pcfg.Write(non_term_vocab, std::cout);
+
+  return 0;
+}
+
+// Parses the command line with Boost.Program_options.
+//
+// The only user-visible option is "help", which prints the usage text to
+// standard output and exits with status 0.  A parsing failure is reported
+// through Error() together with the usage text.  No hidden or positional
+// options are registered, and |options| is not modified here.
+void PcfgExtract::ProcessOptions(int argc, char *argv[],
+                                 Options &options) const {
+  namespace po = boost::program_options;
+
+  std::ostringstream usage_header;
+  usage_header << "Usage: " << name() << "\n\n" << "Options";
+
+  // Options shown to the user in the usage text.
+  po::options_description visible_options(usage_header.str());
+  visible_options.add_options()
+    ("help", "print help message and exit")
+  ;
+
+  // Options hidden from the usage text (these would back positional
+  // arguments; this tool currently has none).
+  po::options_description hidden_options("Hidden options");
+  hidden_options.add_options();
+
+  // Everything the parser should recognise.
+  po::options_description all_options;
+  all_options.add(visible_options).add(hidden_options);
+
+  // Positional arguments (none registered).
+  po::positional_options_description positional;
+
+  // Parse the command line.
+  po::variables_map parsed;
+  try {
+    po::command_line_parser cmd_parser(argc, argv);
+    cmd_parser.style(CommonOptionStyle());
+    cmd_parser.options(all_options);
+    cmd_parser.positional(positional);
+    po::store(cmd_parser.run(), parsed);
+    po::notify(parsed);
+  } catch (const std::exception &e) {
+    std::ostringstream error_text;
+    error_text << e.what() << "\n\n" << visible_options;
+    Error(error_text.str());
+  }
+
+  if (parsed.count("help") > 0) {
+    std::cout << visible_options << std::endl;
+    std::exit(0);
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
new file mode 100644
index 000000000..1af6cb4fe
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_
+#define PCFG_EXTRACT_PCFG_EXTRACT_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Forward declaration.  Options is defined with the 'struct' keyword in
+// options.h, so the same class-key is used here; a mismatched 'class'
+// declaration triggers -Wmismatched-tags (clang) / C4099 (MSVC).
+struct Options;
+
+// The pcfg-extract command-line tool.  Main() is the tool's entry point and
+// is called from main() with the program's argc/argv.
+class PcfgExtract : public Tool {
+ public:
+  PcfgExtract() : Tool("pcfg-extract") {}
+
+  // Runs the tool with the given command-line arguments and returns the
+  // value to be used as the process exit status.
+  virtual int Main(int, char *[]);
+
+ private:
+  // Parses the command-line arguments (argc, argv) on behalf of Main().
+  void ProcessOptions(int, char *[], Options &) const;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
new file mode 100644
index 000000000..503b1a9e6
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
@@ -0,0 +1,58 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "rule_collection.h"
+
+#include "pcfg-common/pcfg.h"
+
+#include <cmath>
+
+namespace Moses {
+namespace PCFG {
+
+// Increments the stored count of the rule |lhs| -> |rhs|, creating the entry
+// on first use.
+void RuleCollection::Add(size_t lhs, const std::vector<size_t> &rhs) {
+  RhsCountMap &counts_for_lhs = collection_[lhs];
+  ++counts_for_lhs[rhs];
+}
+
+// Adds every collected rule to |pcfg|.
+//
+// Each rule is keyed by the sequence (lhs, rhs[0], rhs[1], ...) and scored
+// with the natural log of its relative frequency among all rules sharing the
+// same left-hand side: log(count / total count for that lhs).
+void RuleCollection::CreatePcfg(Pcfg &pcfg) {
+  std::vector<size_t> key;
+  for (const_iterator entry = begin(); entry != end(); ++entry) {
+    const size_t lhs = entry->first;
+    const RhsCountMap &rhs_counts = entry->second;
+
+    // First pass: total number of observations for this left-hand side.
+    size_t total = 0;
+    for (RhsCountMap::const_iterator r = rhs_counts.begin();
+         r != rhs_counts.end(); ++r) {
+      total += r->second;
+    }
+    const double denominator = static_cast<double>(total);
+
+    // Second pass: emit each rule with its log relative frequency.
+    for (RhsCountMap::const_iterator r = rhs_counts.begin();
+         r != rhs_counts.end(); ++r) {
+      const std::vector<size_t> &rhs = r->first;
+      const double numerator = static_cast<double>(r->second);
+      const double score = std::log(numerator / denominator);
+      key.assign(1, lhs);
+      key.insert(key.end(), rhs.begin(), rhs.end());
+      pcfg.Add(key, score);
+    }
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
new file mode 100644
index 000000000..1b768dd21
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
@@ -0,0 +1,59 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_
+#define PCFG_EXTRACT_RULE_COLLECTION_H_
+
+#include "pcfg-common/pcfg.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Contains PCFG rules and their counts.
+//
+// Rules are held in a two-level hash map: the left-hand-side symbol id maps
+// to a second map from the right-hand-side id sequence to the number of times
+// that rule has been added.
+class RuleCollection {
+ public:
+ // Right-hand side (sequence of symbol ids) -> count.
+ typedef boost::unordered_map<std::vector<size_t>, size_t> RhsCountMap;
+ // Left-hand-side symbol id -> counts of its right-hand sides.
+ typedef boost::unordered_map<size_t, RhsCountMap> Map;
+ typedef Map::iterator iterator;
+ typedef Map::const_iterator const_iterator;
+
+ RuleCollection() {}
+
+ // Iteration over (lhs, RhsCountMap) pairs.
+ iterator begin() { return collection_.begin(); }
+ const_iterator begin() const { return collection_.begin(); }
+
+ iterator end() { return collection_.end(); }
+ const_iterator end() const { return collection_.end(); }
+
+ // Adds one occurrence of the rule with the given left-hand side and
+ // right-hand side.
+ void Add(size_t, const std::vector<size_t> &);
+ // Adds all collected rules, with their scores, to the given Pcfg.
+ void CreatePcfg(Pcfg &);
+
+ private:
+ Map collection_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
new file mode 100644
index 000000000..48a82a6d0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -0,0 +1,51 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "rule_extractor.h"
+
+#include "pcfg-common/pcfg_tree.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the extractor to |non_term_vocab|.  The vocabulary is stored by
+// reference (not copied), so it must outlive this object.
+RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
+ : non_term_vocab_(non_term_vocab) {
+}
+
+// Recursively adds to |rc| one rule (node label -> child labels) for |tree|
+// and for every descendant that is neither a preterminal nor a leaf.
+//
+// Labels are converted to ids by inserting them into the shared vocabulary;
+// the parent's label is inserted first, then each child's label immediately
+// before that child's subtree is visited.
+void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const {
+  // Preterminals and leaves contribute no rule.
+  if (tree.IsPreterminal() || tree.IsLeaf()) {
+    return;
+  }
+
+  const size_t lhs_id = non_term_vocab_.Insert(tree.label());
+
+  const std::vector<PcfgTree *> &kids = tree.children();
+  std::vector<size_t> rhs_ids;
+  rhs_ids.reserve(kids.size());
+  for (size_t i = 0; i < kids.size(); ++i) {
+    const PcfgTree &kid = *kids[i];
+    rhs_ids.push_back(non_term_vocab_.Insert(kid.label()));
+    Extract(kid, rc);
+  }
+
+  // The parent's own rule is recorded after all of its descendants' rules.
+  rc.Add(lhs_id, rhs_ids);
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
new file mode 100644
index 000000000..6bcffbc61
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
@@ -0,0 +1,45 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
+#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
+
+#include "rule_collection.h"
+
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+class PcfgTree;
+
+// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
+//
+// Node labels are mapped to ids through a Vocabulary that is held by
+// reference, so the Vocabulary passed to the constructor must outlive the
+// extractor.
+class RuleExtractor {
+ public:
+ RuleExtractor(Vocabulary &);
+ // Adds the rules found in the given tree to the given collection.  Declared
+ // const, but inserts labels into the referenced Vocabulary.
+ void Extract(const PcfgTree &, RuleCollection &) const;
+ private:
+ Vocabulary &non_term_vocab_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/Jamfile b/scripts/training/phrase-extract/pcfg-score/Jamfile
new file mode 100644
index 000000000..7225381c0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/Jamfile
@@ -0,0 +1 @@
+exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-score/main.cc b/scripts/training/phrase-extract/pcfg-score/main.cc
new file mode 100644
index 000000000..da5392add
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+// Program entry point: runs the pcfg-score tool and returns whatever its
+// Main() returns as the process exit status.
+int main(int argc, char *argv[]) {
+  Moses::PCFG::PcfgScore scorer_tool;
+  const int exit_status = scorer_tool.Main(argc, argv);
+  return exit_status;
+}
diff --git a/scripts/training/phrase-extract/pcfg-score/options.h b/scripts/training/phrase-extract/pcfg-score/options.h
new file mode 100644
index 000000000..e54b2a0b9
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_OPTIONS_H_
+#define PCFG_SCORE_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Holds the command-line settings of the pcfg-score tool.
+struct Options {
+ // Name of the PCFG file to load; filled from the tool's single positional
+ // command-line argument ("pcfg-file").
+ std::string pcfg_file;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
new file mode 100644
index 000000000..d780200ad
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
@@ -0,0 +1,152 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+#include "options.h"
+#include "tree_scorer.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: parses the command line, loads the PCFG from the named file,
+// then reads one XML-encoded tree per line from standard input, scores it and
+// writes it to standard output.
+//
+// For an input line that yields no tree, or whose tree cannot be scored, a
+// warning is issued and an empty line is written instead.  Always returns 0.
+int PcfgScore::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Load the grammar and its non-terminal vocabulary.
+  std::ifstream pcfg_stream;
+  OpenNamedInputOrDie(options.pcfg_file, pcfg_stream);
+  Pcfg pcfg;
+  Vocabulary non_term_vocab;
+  pcfg.Read(pcfg_stream, non_term_vocab);
+
+  // Score the corpus, one tree per input line.
+  TreeScorer scorer(pcfg, non_term_vocab);
+  XmlTreeParser parser;
+  XmlTreeWriter<PcfgTree> writer;
+  std::auto_ptr<PcfgTree> tree;
+  std::string line;
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      std::ostringstream error_text;
+      error_text << "line " << line_num << ": " << e.msg();
+      Error(error_text.str());
+    }
+
+    // Both recoverable failures end the same way (warning + empty output
+    // line); only the warning text differs.
+    std::ostringstream warning_text;
+    if (tree.get() == 0) {
+      warning_text << "no tree at line " << line_num;
+    } else if (!scorer.Score(*tree)) {
+      warning_text << "failed to score tree at line " << line_num;
+    } else {
+      writer.Write(*tree, std::cout);
+      continue;
+    }
+    Warn(warning_text.str());
+    std::cout << std::endl;
+  }
+
+  return 0;
+}
+
+// Parses the command line with Boost.Program_options and fills |options|.
+//
+// The single positional argument is stored in options.pcfg_file.  "help"
+// prints the usage text to standard output and exits with status 0.  A
+// parsing failure or a missing positional argument is reported through
+// Error() together with the usage text.
+void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
+  namespace po = boost::program_options;
+
+  std::ostringstream usage_header;
+  usage_header << "Usage: " << name() << " PCFG\n\n"
+               << "Options";
+
+  // Options shown to the user in the usage text.
+  po::options_description visible_options(usage_header.str());
+  visible_options.add_options()
+    ("help", "print help message and exit")
+  ;
+
+  // Options hidden from the usage text; they back the positional arguments.
+  po::options_description hidden_options("Hidden options");
+  hidden_options.add_options()
+    ("pcfg-file", po::value(&options.pcfg_file), "pcfg file")
+  ;
+
+  // Everything the parser should recognise.
+  po::options_description all_options;
+  all_options.add(visible_options).add(hidden_options);
+
+  // The first (and only) positional argument is the PCFG file.
+  po::positional_options_description positional;
+  positional.add("pcfg-file", 1);
+
+  // Parse the command line.
+  po::variables_map parsed;
+  try {
+    po::command_line_parser cmd_parser(argc, argv);
+    cmd_parser.style(CommonOptionStyle());
+    cmd_parser.options(all_options);
+    cmd_parser.positional(positional);
+    po::store(cmd_parser.run(), parsed);
+    po::notify(parsed);
+  } catch (const std::exception &e) {
+    std::ostringstream error_text;
+    error_text << e.what() << "\n\n" << visible_options;
+    Error(error_text.str());
+  }
+
+  if (parsed.count("help") > 0) {
+    std::cout << visible_options << std::endl;
+    std::exit(0);
+  }
+
+  // The positional argument is mandatory.
+  if (parsed.count("pcfg-file") == 0) {
+    std::ostringstream error_text;
+    error_text << "missing required argument\n\n" << visible_options
+               << std::endl;
+    Error(error_text.str());
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.h b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
new file mode 100644
index 000000000..5e506c39d
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_PCFG_SCORE_H_
+#define PCFG_SCORE_PCFG_SCORE_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Forward declaration.  Options is defined with the 'struct' keyword in
+// options.h, so the same class-key is used here; a mismatched 'class'
+// declaration triggers -Wmismatched-tags (clang) / C4099 (MSVC).
+struct Options;
+
+// The pcfg-score command-line tool.  Main() is the tool's entry point and is
+// called from main() with the program's argc/argv.
+class PcfgScore : public Tool {
+ public:
+  PcfgScore() : Tool("pcfg-score") {}
+
+  // Runs the tool with the given command-line arguments and returns the
+  // value to be used as the process exit status.
+  virtual int Main(int, char *[]);
+
+ private:
+  // Parses the command-line arguments (argc, argv) into the given Options.
+  void ProcessOptions(int, char *[], Options &) const;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
new file mode 100644
index 000000000..5f695e4fc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
@@ -0,0 +1,68 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "tree_scorer.h"
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the scorer to |pcfg| and |non_term_vocab|.  Both are stored by
+// reference (not copied), so they must outlive this object.
+TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
+ : pcfg_(pcfg)
+ , non_term_vocab_(non_term_vocab) {
+}
+
+// Recursively scores the subtree rooted at |root|.
+//
+// A preterminal or leaf is left untouched and counts as success.  Otherwise
+// the node's score is set to the score of the rule (root label -> child
+// labels) plus the scores of all non-preterminal children.  Returns false,
+// without setting |root|'s score, as soon as any rule in the subtree is not
+// found in the PCFG; descendants scored before the failure keep their scores.
+bool TreeScorer::Score(PcfgTree &root) const {
+  if (root.IsPreterminal() || root.IsLeaf()) {
+    return true;
+  }
+
+  const std::vector<PcfgTree *> &kids = root.children();
+
+  // Rule key: id of the parent label followed by the ids of the child labels.
+  std::vector<size_t> rule_key;
+  rule_key.reserve(kids.size() + 1);
+  rule_key.push_back(non_term_vocab_.Lookup(root.label()));
+
+  double subtree_log_prob = 0.0;
+  for (size_t i = 0; i < kids.size(); ++i) {
+    PcfgTree *kid = kids[i];
+    assert(!kid->IsLeaf());
+    rule_key.push_back(non_term_vocab_.Lookup(kid->label()));
+    if (!Score(*kid)) {
+      return false;
+    }
+    // Preterminal children are not scored, so they contribute nothing.
+    if (!kid->IsPreterminal()) {
+      subtree_log_prob += kid->score();
+    }
+  }
+
+  double rule_score;
+  if (!pcfg_.Lookup(rule_key, rule_score)) {
+    return false;
+  }
+  subtree_log_prob += rule_score;
+  root.set_score(subtree_log_prob);
+  return true;
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.h b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
new file mode 100644
index 000000000..36f4e1e99
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
@@ -0,0 +1,47 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_TREE_SCORER_H_
+#define PCFG_SCORE_TREE_SCORER_H_
+
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Scores PcfgTree objects against a Pcfg.  The Pcfg and the non-terminal
+// Vocabulary are held by reference, so both must outlive the scorer.
+class TreeScorer {
+ public:
+ TreeScorer(const Pcfg &, const Vocabulary &);
+
+ // Score tree according to PCFG. Returns false if unsuccessful (due to
+ // missing rule).
+ bool Score(PcfgTree &) const;
+
+ private:
+ const Pcfg &pcfg_;
+ const Vocabulary &non_term_vocab_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp
index 8bcc9be3b..c5fb0b99f 100644
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@@ -72,6 +72,7 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
LexicalTable lexTable;
bool inverseFlag = false;
bool hierarchicalFlag = false;
+bool pcfgFlag = false;
bool wordAlignmentFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
@@ -108,6 +109,9 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--Hierarchical") == 0) {
hierarchicalFlag = true;
cerr << "processing hierarchical rules\n";
+ } else if (strcmp(argv[i],"--PCFG") == 0) {
+ pcfgFlag = true;
+ cerr << "including PCFG scores\n";
} else if (strcmp(argv[i],"--WordAlignment") == 0) {
wordAlignmentFlag = true;
cerr << "outputing word alignment" << endl;
@@ -193,6 +197,7 @@ int main(int argc, char* argv[])
// loop through all extracted phrase translations
float lastCount = 0.0f;
+ float lastPcfgSum = 0.0f;
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
@@ -207,6 +212,7 @@ int main(int argc, char* argv[])
// identical to last line? just add count
if (strcmp(line,lastLine) == 0) {
lastPhrasePair->count += lastCount;
+ lastPhrasePair->pcfgSum += lastPcfgSum;
continue;
}
strcpy( lastLine, line );
@@ -215,10 +221,12 @@ int main(int argc, char* argv[])
PhraseAlignment phrasePair;
phrasePair.create( line, i );
lastCount = phrasePair.count;
+ lastPcfgSum = phrasePair.pcfgSum;
// only differs in count? just add count
if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
lastPhrasePair->count += phrasePair.count;
+ lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
continue;
}
@@ -438,6 +446,16 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
countOfCounts[ countInt ]++;
}
+ // compute PCFG score
+ float pcfgScore;
+ if (pcfgFlag && !inverseFlag) {
+ float pcfgSum = 0;
+ for(size_t i=0; i<phrasePair.size(); ++i) {
+ pcfgSum += phrasePair[i]->pcfgSum;
+ }
+ pcfgScore = pcfgSum / count;
+ }
+
// output phrases
const PHRASE &phraseS = phrasePair[0]->GetSource();
const PHRASE &phraseT = phrasePair[0]->GetTarget();
@@ -493,6 +511,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
}
+ // target-side PCFG score
+ if (pcfgFlag && !inverseFlag) {
+ phraseTableFile << " " << pcfgScore;
+ }
+
phraseTableFile << " ||| ";
// alignment info for non-terminals
diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir
index 1a7cb3a39..41ea2d682 100755
--- a/scripts/training/train-model.perl.missing_bin_dir
+++ b/scripts/training/train-model.perl.missing_bin_dir
@@ -105,6 +105,7 @@ $_HELP = 1
'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
'ghkm' => \$_GHKM,
+ 'pcfg' => \$_PCFG,
'extract-options=s' => \$_EXTRACT_OPTIONS,
'score-options=s' => \$_SCORE_OPTIONS,
'source-syntax' => \$_SOURCE_SYNTAX,
@@ -1373,6 +1374,7 @@ sub extract_phrase {
$cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file";
$cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
$cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
+ $cmd .= " --PCFG" if $_PCFG;
if (!defined($_GHKM)) {
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
@@ -1503,6 +1505,7 @@ sub score_phrase_phrase_extract {
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
$cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
$cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
+ $cmd .= " --PCFG" if $_PCFG;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
# sorting
@@ -1801,6 +1804,7 @@ sub create_ini {
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
+ $basic_weight_count++ if $_PCFG;
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
$num_of_ttables++;
my $ff = $f;