Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-26 03:10:08 +0400
committer phikoehn <pkoehn@inf.ed.ac.uk> 2012-05-26 03:10:08 +0400
commit 07fafd51b509e93db7be238107325c45ca5f57cd (patch)
tree e8cccec20ee05726accd9f87a041b68370adeaab /scripts/training
parent 561b9ac9567d3e5b0bbc56fdae3b29961b8bc728 (diff)
parent a72744c49b7821bf0355e7fe4638c392a74b0d60 (diff)
Merge branch 'master' of git://github.com/moses-smt/mosesdecoder
Diffstat (limited to 'scripts/training')
-rw-r--r-- scripts/training/phrase-extract/ExtractedRule.h 2
-rw-r--r-- scripts/training/phrase-extract/Jamfile 6
-rw-r--r-- scripts/training/phrase-extract/PhraseAlignment.cpp 7
-rw-r--r-- scripts/training/phrase-extract/PhraseAlignment.h 1
-rw-r--r-- scripts/training/phrase-extract/RuleExtractionOptions.h 2
-rw-r--r-- scripts/training/phrase-extract/SyntaxTree.cpp 3
-rw-r--r-- scripts/training/phrase-extract/SyntaxTree.h 11
-rw-r--r-- scripts/training/phrase-extract/XmlTree.cpp 9
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp 4
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp 5
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Node.h 6
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Options.h 2
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ParseTree.h 7
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp 1
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRule.h 2
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp 69
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h 4
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp 16
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/Subgraph.h 8
-rw-r--r-- scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp 1
-rw-r--r-- scripts/training/phrase-extract/extract-rules.cpp 32
-rw-r--r-- scripts/training/phrase-extract/extract.cpp 34
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/Jamfile 1
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/exception.h 41
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/numbered_set.h 109
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/pcfg.cc 106
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/pcfg.h 61
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/pcfg_tree.h 77
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/syntax_tree.h 91
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/tool.cc 80
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/tool.h 91
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/typedef.h 37
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc 85
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h 56
-rw-r--r-- scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h 127
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/Jamfile 1
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/main.cc 25
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/options.h 36
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc 131
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h 42
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_collection.cc 58
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_collection.h 59
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc 51
-rw-r--r-- scripts/training/phrase-extract/pcfg-extract/rule_extractor.h 45
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/Jamfile 1
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/main.cc 25
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/options.h 36
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/pcfg_score.cc 152
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/pcfg_score.h 42
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/tree_scorer.cc 68
-rw-r--r-- scripts/training/phrase-extract/pcfg-score/tree_scorer.h 47
-rw-r--r-- scripts/training/phrase-extract/score.cpp 59
-rwxr-xr-x scripts/training/train-model.perl.missing_bin_dir 191
53 files changed, 2087 insertions, 176 deletions
diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h
index 170ccf892..be6e30836 100644
--- a/scripts/training/phrase-extract/ExtractedRule.h
+++ b/scripts/training/phrase-extract/ExtractedRule.h
@@ -43,6 +43,7 @@ public:
int startS;
int endS;
float count;
+ double pcfgScore;
std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
@@ -58,6 +59,7 @@ public:
, startS(sS)
, endS(eS)
, count(0)
+ , pcfgScore(0.0)
{}
void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile
index 0872130f9..9be67e80a 100644
--- a/scripts/training/phrase-extract/Jamfile
+++ b/scripts/training/phrase-extract/Jamfile
@@ -10,13 +10,13 @@ obj XmlTree.o : XmlTree.cpp : <include>. ;
alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
-exe extract : tables-core.o SentenceAlignment.o extract.cpp InputFileStream ../../..//boost_iostreams ;
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../../../moses/src//ThreadPool ../../..//boost_iostreams ;
exe extract-lex : extract-lex.cpp InputFileStream ;
-exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp InputFileStream ../../..//boost_iostreams ;
+exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ../../..//boost_iostreams ;
@@ -33,3 +33,5 @@ alias programs : extract extract-rules extract-lex score consolidate consolidate
install legacy : programs : <location>. <install-type>EXE ;
build-project extract-ghkm ;
+build-project pcfg-extract ;
+build-project pcfg-score ;
diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp
index c0bfbde3e..ceb74f04c 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.cpp
+++ b/scripts/training/phrase-extract/PhraseAlignment.cpp
@@ -13,6 +13,8 @@
#include "tables-core.h"
#include "score.h"
+#include <cstdlib>
+
using namespace std;
extern Vocabulary vcbT;
@@ -111,6 +113,9 @@ void PhraseAlignment::create( char line[], int lineID )
}
else if (item == 5) { // non-term lengths
addNTLength(token[j]);
+ } else if (item == 6) { // target syntax PCFG score
+ float pcfgScore = std::atof(token[j].c_str());
+ pcfgSum = pcfgScore * count;
}
}
@@ -119,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
if (item == 3) {
count = 1.0;
}
- if (item < 3 || item > 5) {
+ if (item < 3 || item > 6) {
cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
}
}
diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h
index 8b8f5115c..8bd83503d 100644
--- a/scripts/training/phrase-extract/PhraseAlignment.h
+++ b/scripts/training/phrase-extract/PhraseAlignment.h
@@ -25,6 +25,7 @@ protected:
void createAlignVec(size_t sourceSize, size_t targetSize);
void addNTLength(const std::string &tok);
public:
+ float pcfgSum;
float count;
std::vector< std::set<size_t> > alignedToT;
std::vector< std::set<size_t> > alignedToS;
diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h
index 70bb548c9..f9123de86 100644
--- a/scripts/training/phrase-extract/RuleExtractionOptions.h
+++ b/scripts/training/phrase-extract/RuleExtractionOptions.h
@@ -45,6 +45,7 @@ public:
bool targetSyntax;
bool duplicateRules;
bool fractionalCounting;
+ bool pcfgScore;
bool outputNTLengths;
bool gzOutput;
@@ -74,6 +75,7 @@ public:
, targetSyntax(false)
, duplicateRules(true)
, fractionalCounting(true)
+ , pcfgScore(false)
, outputNTLengths(false)
, gzOutput(false)
{}
diff --git a/scripts/training/phrase-extract/SyntaxTree.cpp b/scripts/training/phrase-extract/SyntaxTree.cpp
index e181b1e8a..f2783ffd2 100644
--- a/scripts/training/phrase-extract/SyntaxTree.cpp
+++ b/scripts/training/phrase-extract/SyntaxTree.cpp
@@ -42,11 +42,12 @@ void SyntaxTree::Clear()
m_index.clear();
}
-void SyntaxTree::AddNode( int startPos, int endPos, std::string label )
+SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )
{
SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label );
m_nodes.push_back( newNode );
m_index[ startPos ][ endPos ].push_back( newNode );
+ return newNode;
}
ParentNodes SyntaxTree::Parse()
diff --git a/scripts/training/phrase-extract/SyntaxTree.h b/scripts/training/phrase-extract/SyntaxTree.h
index 0ca5ca472..17c106b49 100644
--- a/scripts/training/phrase-extract/SyntaxTree.h
+++ b/scripts/training/phrase-extract/SyntaxTree.h
@@ -34,12 +34,14 @@ protected:
std::string m_label;
std::vector< SyntaxNode* > m_children;
SyntaxNode* m_parent;
+ float m_pcfgScore;
public:
SyntaxNode( int startPos, int endPos, std::string label )
:m_start(startPos)
,m_end(endPos)
,m_label(label)
,m_parent(0)
+ ,m_pcfgScore(0.0f)
{}
int GetStart() const {
return m_start;
@@ -50,6 +52,12 @@ public:
std::string GetLabel() const {
return m_label;
}
+ float GetPcfgScore() const {
+ return m_pcfgScore;
+ }
+ void SetPcfgScore(float score) {
+ m_pcfgScore = score;
+ }
SyntaxNode *GetParent() {
return m_parent;
}
@@ -89,11 +97,12 @@ public:
}
~SyntaxTree();
+ SyntaxNode *AddNode( int startPos, int endPos, std::string label );
+
SyntaxNode *GetTop() {
return m_top;
}
- void AddNode( int startPos, int endPos, std::string label );
ParentNodes Parse();
bool HasNode( int startPos, int endPos ) const;
const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
diff --git a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp
index 19825c02c..29c0d94aa 100644
--- a/scripts/training/phrase-extract/XmlTree.cpp
+++ b/scripts/training/phrase-extract/XmlTree.cpp
@@ -25,7 +25,7 @@
#include <string>
#include <set>
#include <iostream>
-#include <stdlib.h>
+#include <cstdlib>
#include <sstream>
#include "SyntaxTree.h"
#include "XmlException.h"
@@ -355,13 +355,18 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label
string label = ParseXmlTagAttribute(tagContent,"label");
labelCollection.insert( label );
+ string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
+ float pcfgScore = pcfgString == "" ? 0.0f
+ : std::atof(pcfgString.c_str());
+
// report what we have processed so far
if (0) {
cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
}
- tree.AddNode( startPos, endPos-1, label );
+ SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
+ node->SetPcfgScore(pcfgScore);
}
}
}
diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 0ecffae5c..6bd32a13b 100644
--- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -212,6 +212,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root)
std::auto_ptr<Node> n(new Node(root->GetLabel(), nodeType));
+ if (nodeType == TREE) {
+ n->SetPcfgScore(root->GetPcfgScore());
+ }
+
const std::vector<ParseTree *> &children = root->GetChildren();
std::vector<Node *> childNodes;
childNodes.reserve(children.size());
diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index 008026e1a..397ce1e3c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -285,6 +285,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
"set maximum allowed scope")
("Minimal",
"extract minimal rules only")
+ ("PCFG",
+ "include score based on PCFG scores in target corpus")
("UnknownWordLabel",
po::value(&options.unknownWordFile),
"write unknown word labels to named file")
@@ -361,6 +363,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("Minimal")) {
options.minimal = true;
}
+ if (vm.count("PCFG")) {
+ options.pcfg = true;
+ }
if (vm.count("UnpairedExtractFormat")) {
options.unpairedExtractFormat = true;
}
diff --git a/scripts/training/phrase-extract/extract-ghkm/Node.h b/scripts/training/phrase-extract/extract-ghkm/Node.h
index 228fdc812..775473362 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Node.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Node.h
@@ -41,8 +41,7 @@ class Node
Node(const std::string &label, NodeType type)
: m_label(label)
, m_type(type)
- , m_children()
- , m_parents() {}
+ , m_pcfgScore(0.0f) {}
~Node();
@@ -50,12 +49,14 @@ class Node
NodeType GetType() const { return m_type; }
const std::vector<Node*> &GetChildren() const { return m_children; }
const std::vector<Node*> &GetParents() const { return m_parents; }
+ float GetPcfgScore() const { return m_pcfgScore; }
const Span &GetSpan() const { return m_span; }
const Span &GetComplementSpan() const { return m_complementSpan; }
const std::vector<const Subgraph*> &GetRules() const { return m_rules; }
void SetChildren(const std::vector<Node*> &c) { m_children = c; }
void SetParents(const std::vector<Node*> &p) { m_parents = p; }
+ void SetPcfgScore(float s) { m_pcfgScore = s; }
void SetSpan(const Span &s) { m_span = s; }
void SetComplementSpan(const Span &cs) { m_complementSpan = cs; }
@@ -92,6 +93,7 @@ class Node
NodeType m_type;
std::vector<Node*> m_children;
std::vector<Node*> m_parents;
+ float m_pcfgScore;
Span m_span;
Span m_complementSpan;
std::vector<const Subgraph*> m_rules;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h
index 108e19d66..c4b57f311 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Options.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Options.h
@@ -36,6 +36,7 @@ struct Options {
, maxRuleSize(3)
, maxScope(3)
, minimal(false)
+ , pcfg(false)
, unpairedExtractFormat(false) {}
// Positional options
@@ -53,6 +54,7 @@ struct Options {
int maxRuleSize;
int maxScope;
bool minimal;
+ bool pcfg;
bool unpairedExtractFormat;
std::string unknownWordFile;
};
diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
index ec6fc147a..273e2e04e 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
@@ -32,17 +32,19 @@ class ParseTree
public:
ParseTree(const std::string &label)
: m_label(label)
- , m_children()
- , m_parent() {}
+ , m_parent(0)
+ , m_pcfgScore(0.0) {}
~ParseTree();
const std::string &GetLabel() const { return m_label; }
const std::vector<ParseTree*> &GetChildren() const { return m_children; }
const ParseTree *GetParent() const { return m_parent; }
+ float GetPcfgScore() const { return m_pcfgScore; }
void SetParent(ParseTree *);
void SetChildren(const std::vector<ParseTree*> &);
+ void SetPcfgScore(float score) { m_pcfgScore = score; }
void AddChild(ParseTree *);
@@ -59,6 +61,7 @@ class ParseTree
std::string m_label;
std::vector<ParseTree*> m_children;
ParseTree *m_parent;
+ float m_pcfgScore; // log probability
};
template<typename OutputIterator>
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
index 8473e4283..5dc70052c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -30,6 +30,7 @@ namespace GHKM {
ScfgRule::ScfgRule(const Subgraph &fragment)
: m_sourceLHS("X", NonTerminal)
, m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
+ , m_pcfgScore(fragment.GetPcfgScore())
{
// Source RHS
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
index 1ed534d9e..2405d8fa3 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h
@@ -57,6 +57,7 @@ class ScfgRule
const std::vector<Symbol> &GetSourceRHS() const { return m_sourceRHS; }
const std::vector<Symbol> &GetTargetRHS() const { return m_targetRHS; }
const Alignment &GetAlignment() const { return m_alignment; }
+ float GetPcfgScore() const { return m_pcfgScore; }
int Scope() const;
@@ -68,6 +69,7 @@ class ScfgRule
std::vector<Symbol> m_sourceRHS;
std::vector<Symbol> m_targetRHS;
Alignment m_alignment;
+ float m_pcfgScore;
};
} // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index 4be3f048d..d5d16b790 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -24,6 +24,7 @@
#include "ScfgRule.h"
#include <cassert>
+#include <cmath>
#include <ostream>
#include <map>
#include <sstream>
@@ -34,14 +35,43 @@ namespace GHKM {
void ScfgRuleWriter::Write(const ScfgRule &rule)
{
+ std::ostringstream sourceSS;
+ std::ostringstream targetSS;
+
if (m_options.unpairedExtractFormat) {
- WriteUnpairedFormat(rule);
+ WriteUnpairedFormat(rule, sourceSS, targetSS);
} else {
- WriteStandardFormat(rule);
+ WriteStandardFormat(rule, sourceSS, targetSS);
+ }
+
+ // Write the rule to the forward and inverse extract files.
+ m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+ m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+
+ const Alignment &alignment = rule.GetAlignment();
+ for (Alignment::const_iterator p = alignment.begin();
+ p != alignment.end(); ++p) {
+ m_fwd << " " << p->first << "-" << p->second;
+ m_inv << " " << p->second << "-" << p->first;
+ }
+
+ // Write a count of 1 and an empty NT length column to the forward extract
+ // file.
+ // TODO Add option to write NT length?
+ m_fwd << " ||| 1 ||| |||";
+ if (m_options.pcfg) {
+ // Write the PCFG score.
+ m_fwd << " " << std::exp(rule.GetPcfgScore());
}
+ m_fwd << std::endl;
+
+ // Write a count of 1 to the inverse extract file.
+ m_inv << " ||| 1" << std::endl;
}
-void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
+ std::ostream &sourceSS,
+ std::ostream &targetSS)
{
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
@@ -60,9 +90,6 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
}
}
- std::ostringstream sourceSS;
- std::ostringstream targetSS;
-
// Write the source side of the rule to sourceSS.
int i = 0;
for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
@@ -88,27 +115,14 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);
-
- // Write the rule to the forward and inverse extract files.
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
- m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
- for (Alignment::const_iterator p(alignment.begin());
- p != alignment.end(); ++p) {
- m_fwd << " " << p->first << "-" << p->second;
- m_inv << " " << p->second << "-" << p->first;
- }
- m_fwd << " ||| 1" << std::endl;
- m_inv << " ||| 1" << std::endl;
}
-void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
+void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
+ std::ostream &sourceSS,
+ std::ostream &targetSS)
{
const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
- const Alignment &alignment = rule.GetAlignment();
-
- std::ostringstream sourceSS;
- std::ostringstream targetSS;
// Write the source side of the rule to sourceSS.
int i = 0;
@@ -127,17 +141,6 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
targetSS << " ";
}
WriteSymbol(rule.GetTargetLHS(), targetSS);
-
- // Write the rule to the forward and inverse extract files.
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
- m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
- for (Alignment::const_iterator p(alignment.begin());
- p != alignment.end(); ++p) {
- m_fwd << " " << p->first << "-" << p->second;
- m_inv << " " << p->second << "-" << p->first;
- }
- m_fwd << " ||| 1" << std::endl;
- m_inv << " ||| 1" << std::endl;
}
void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index 738d09ce9..b92a432a1 100644
--- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -45,8 +45,8 @@ class ScfgRuleWriter
ScfgRuleWriter(const ScfgRuleWriter &);
ScfgRuleWriter &operator=(const ScfgRuleWriter &);
- void WriteStandardFormat(const ScfgRule &);
- void WriteUnpairedFormat(const ScfgRule &);
+ void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &);
+ void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &);
void WriteSymbol(const Symbol &, std::ostream &);
std::ostream &m_fwd;
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
index e5aedbb16..e048f2c55 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
@@ -101,5 +101,21 @@ int Subgraph::CalcDepth(const Node *n) const
return maxChildDepth + 1;
}
+float Subgraph::CalcPcfgScore() const
+{
+ if (m_root->GetType() != TREE || m_leaves.empty()) {
+ return 0.0f;
+ }
+ float score = m_root->GetPcfgScore();
+ for (std::set<const Node *>::const_iterator p = m_leaves.begin();
+ p != m_leaves.end(); ++p) {
+ const Node *leaf = *p;
+ if (leaf->GetType() == TREE) {
+ score -= leaf->GetPcfgScore();
+ }
+ }
+ return score;
+}
+
} // namespace Moses
} // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
index e84903502..ede1233e9 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
+++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
@@ -38,7 +38,8 @@ class Subgraph
: m_root(root)
, m_depth(0)
, m_size(root->GetType() == TREE ? 1 : 0)
- , m_nodeCount(1) {}
+ , m_nodeCount(1)
+ , m_pcfgScore(0.0f) {}
Subgraph(const Node *root, const std::set<const Node *> &leaves)
: m_root(root)
@@ -46,10 +47,12 @@ class Subgraph
, m_depth(-1)
, m_size(-1)
, m_nodeCount(-1)
+ , m_pcfgScore(0.0f)
{
m_depth = CalcDepth(m_root);
m_size = CalcSize(m_root);
m_nodeCount = CountNodes(m_root);
+ m_pcfgScore = CalcPcfgScore();
}
const Node *GetRoot() const { return m_root; }
@@ -57,6 +60,7 @@ class Subgraph
int GetDepth() const { return m_depth; }
int GetSize() const { return m_size; }
int GetNodeCount() const { return m_nodeCount; }
+ float GetPcfgScore() const { return m_pcfgScore; }
bool IsTrivial() const { return m_leaves.empty(); }
@@ -66,6 +70,7 @@ class Subgraph
void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
int CalcDepth(const Node *) const;
int CalcSize(const Node *) const;
+ float CalcPcfgScore() const;
int CountNodes(const Node *) const;
const Node *m_root;
@@ -73,6 +78,7 @@ class Subgraph
int m_depth;
int m_size;
int m_nodeCount;
+ float m_pcfgScore;
};
} // namespace GHKM
diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
index 31c0e3843..cc961dc0c 100644
--- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@@ -61,6 +61,7 @@ std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree(
const std::vector<std::string> &words)
{
std::auto_ptr<ParseTree> root(new ParseTree(tree.GetLabel()));
+ root->SetPcfgScore(tree.GetPcfgScore());
const std::vector<SyntaxNode*> &children = tree.GetChildren();
if (children.empty()) {
if (tree.GetStart() != tree.GetEnd()) {
diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp
index 2cc9dc54d..a00667b82 100644
--- a/scripts/training/phrase-extract/extract-rules.cpp
+++ b/scripts/training/phrase-extract/extract-rules.cpp
@@ -90,7 +90,7 @@ void addHieroRule( int startT, int endT, int startS, int endS
void printHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, LabelIndex &labelIndex);
string printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex);
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
string printSourceHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, const LabelIndex &labelIndex);
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
@@ -257,6 +257,8 @@ int main(int argc, char* argv[])
// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
options.fractionalCounting = false;
+ } else if (strcmp(argv[i],"--PCFG") == 0) {
+ options.pcfgScore = true;
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
options.outputNTLengths = true;
#ifdef WITH_THREADS
@@ -517,7 +519,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
}
string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex)
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
{
HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
assert(iterHoleList != holeColl.GetHoles().end());
@@ -545,6 +547,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
out += "[" + sourceLabel + "][" + targetLabel + "] ";
+ if (m_options.pcfgScore) {
+ double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+ logPCFGScore -= score;
+ }
+
currPos = hole.GetEnd(1);
hole.SetPos(outPos, 1);
++iterHoleList;
@@ -658,8 +665,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
// target
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex)
+ if (m_options.pcfgScore) {
+ double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ + " [" + targetLabel + "]";
+ rule.pcfgScore = std::exp(logPCFGScore);
+ } else {
+ double logPCFGScore = 0.0f;
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ " [" + targetLabel + "]";
+ }
// source
// holeColl.SortSourceHoles();
@@ -877,6 +892,11 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
rule.target += m_sentence->target[ti] + " ";
rule.target += "[" + targetLabel + "]";
+ if (m_options.pcfgScore) {
+ double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+ rule.pcfgScore = std::exp(logPCFGScore);
+ }
+
// alignment
for(int ti=startT; ti<=endT; ti++) {
for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
@@ -957,11 +977,13 @@ void ExtractTask::writeRulesToFile()
out << rule->source << " ||| "
<< rule->target << " ||| "
<< rule->alignment << " ||| "
- << rule->count;
+ << rule->count << " ||| ";
if (m_options.outputNTLengths) {
- out << " ||| ";
rule->OutputNTLengths(out);
}
+ if (m_options.pcfgScore) {
+ out << " ||| " << rule->pcfgScore;
+ }
out << "\n";
if (!m_options.onlyDirectFlag) {
diff --git a/scripts/training/phrase-extract/extract.cpp b/scripts/training/phrase-extract/extract.cpp
index f6d6cbb9b..16b413da9 100644
--- a/scripts/training/phrase-extract/extract.cpp
+++ b/scripts/training/phrase-extract/extract.cpp
@@ -22,6 +22,7 @@
#include "SentenceAlignment.h"
#include "tables-core.h"
#include "InputFileStream.h"
+#include "OutputFileStream.h"
using namespace std;
@@ -82,15 +83,16 @@ bool hierModel = false;
REO_MODEL_TYPE hierType = REO_MSD;
-ofstream extractFile;
-ofstream extractFileInv;
-ofstream extractFileOrientation;
-ofstream extractFileSentenceId;
+Moses::OutputFileStream extractFile;
+Moses::OutputFileStream extractFileInv;
+Moses::OutputFileStream extractFileOrientation;
+Moses::OutputFileStream extractFileSentenceId;
int maxPhraseLength;
bool orientationFlag = false;
bool translationFlag = true;
bool sentenceIdFlag = false; //create extract file with sentence id
bool onlyOutputSpanInfo = false;
+bool gzOutput = false;
int main(int argc, char* argv[])
{
@@ -116,6 +118,8 @@ int main(int argc, char* argv[])
translationFlag = false;
} else if (strcmp(argv[i], "--SentenceId") == 0) {
sentenceIdFlag = true;
+ } else if (strcmp(argv[i], "--GZOutput") == 0) {
+ gzOutput = true;
} else if(strcmp(argv[i],"--model") == 0) {
if (i+1 >= argc) {
cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
@@ -193,18 +197,18 @@ int main(int argc, char* argv[])
// open output files
if (translationFlag) {
- string fileNameExtractInv = fileNameExtract + ".inv";
- extractFile.open(fileNameExtract.c_str());
- extractFileInv.open(fileNameExtractInv.c_str());
+ string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
+ extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+ extractFileInv.Open(fileNameExtractInv.c_str());
}
if (orientationFlag) {
- string fileNameExtractOrientation = fileNameExtract + ".o";
- extractFileOrientation.open(fileNameExtractOrientation.c_str());
+ string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+ extractFileOrientation.Open(fileNameExtractOrientation.c_str());
}
if (sentenceIdFlag) {
- string fileNameExtractSentenceId = fileNameExtract + ".sid";
- extractFileSentenceId.open(fileNameExtractSentenceId.c_str());
+ string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+ extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
}
int i=0;
@@ -239,12 +243,12 @@ int main(int argc, char* argv[])
//az: only close if we actually opened it
if (!onlyOutputSpanInfo) {
if (translationFlag) {
- extractFile.close();
- extractFileInv.close();
+ extractFile.Close();
+ extractFileInv.Close();
}
- if (orientationFlag) extractFileOrientation.close();
+ if (orientationFlag) extractFileOrientation.Close();
if (sentenceIdFlag) {
- extractFileSentenceId.close();
+ extractFileSentenceId.Close();
}
}
}
diff --git a/scripts/training/phrase-extract/pcfg-common/Jamfile b/scripts/training/phrase-extract/pcfg-common/Jamfile
new file mode 100644
index 000000000..3dc272a56
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/Jamfile
@@ -0,0 +1 @@
+lib pcfg_common : [ glob *.cc ] ..//trees ;
diff --git a/scripts/training/phrase-extract/pcfg-common/exception.h b/scripts/training/phrase-extract/pcfg-common/exception.h
new file mode 100644
index 000000000..3dbd59d0e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/exception.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXCEPTION_H_
+#define PCFG_EXCEPTION_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Lightweight error type that carries a human-readable message.
+class Exception {
+ public:
+  // Builds the exception from a C string.
+  Exception(const char *msg)
+      : msg_(msg) {
+  }
+
+  // Builds the exception from a std::string.
+  Exception(const std::string &msg)
+      : msg_(msg) {
+  }
+
+  // Returns the message given at construction.
+  const std::string &msg() const {
+    return msg_;
+  }
+
+ private:
+  std::string msg_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
new file mode 100644
index 000000000..f88d710ed
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h
@@ -0,0 +1,109 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_NUMBERED_SET_H_
+#define PCFG_NUMBERED_SET_H_
+
+#include "exception.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <limits>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores a set of elements of type T, each of which is allocated an integral
+// ID of type I.  IDs are contiguous starting at 0.  Individual elements cannot
+// be removed once inserted (but the whole set can be cleared).
+//
+// The ID-to-element table holds pointers to the keys stored in the
+// element-to-ID map.  Copies therefore rebuild that table rather than copying
+// the pointers, which would refer to the keys owned by the source object.
+template<typename T, typename I=size_t>
+class NumberedSet {
+ private:
+  typedef boost::unordered_map<T, I> ElementToIdMap;
+  typedef std::vector<const T *> IdToElementMap;
+
+ public:
+  typedef I IdType;
+  typedef typename IdToElementMap::const_iterator const_iterator;
+
+  NumberedSet() {}
+
+  // Copies the elements and points the ID table at the copy's own keys.
+  NumberedSet(const NumberedSet &other)
+      : element_to_id_(other.element_to_id_)
+      , id_to_element_(other.id_to_element_.size()) {
+    RebuildIdToElement();
+  }
+
+  // Replaces the contents with a copy of other's elements and IDs.
+  NumberedSet &operator=(const NumberedSet &other) {
+    if (this != &other) {
+      element_to_id_ = other.element_to_id_;
+      id_to_element_.assign(other.id_to_element_.size(),
+                            static_cast<const T *>(0));
+      RebuildIdToElement();
+    }
+    return *this;
+  }
+
+  // Iterate over pointers to the elements in ID order.
+  const_iterator begin() const { return id_to_element_.begin(); }
+  const_iterator end() const { return id_to_element_.end(); }
+
+  // The ID returned by Lookup(const T &) for an element that is not present.
+  static I NullId() { return std::numeric_limits<I>::max(); }
+
+  bool Empty() const { return id_to_element_.empty(); }
+  size_t Size() const { return id_to_element_.size(); }
+
+  // Insert the given object and return its ID.
+  I Insert(const T &);
+
+  // Element -> ID (NullId() if absent) and ID -> element (throws Exception if
+  // the ID is out of range).
+  I Lookup(const T &) const;
+  const T &Lookup(I) const;
+
+  // Remove all elements.
+  void Clear();
+
+ private:
+  // Fills id_to_element_ (already sized to the number of elements) with
+  // pointers to the keys of element_to_id_, indexed by each key's ID.
+  void RebuildIdToElement() {
+    for (typename ElementToIdMap::const_iterator p = element_to_id_.begin();
+         p != element_to_id_.end(); ++p) {
+      id_to_element_[p->second] = &p->first;
+    }
+  }
+
+  ElementToIdMap element_to_id_;
+  IdToElementMap id_to_element_;
+};
+
+// Returns the ID of the given element, or NullId() if it is not in the set.
+template<typename T, typename I>
+I NumberedSet<T, I>::Lookup(const T &s) const {
+  typename ElementToIdMap::const_iterator found = element_to_id_.find(s);
+  if (found != element_to_id_.end()) {
+    return found->second;
+  }
+  return NullId();
+}
+
+// Returns the element with the given ID.  Throws Exception if the ID is
+// negative or not smaller than the number of stored elements.
+template<typename T, typename I>
+const T &NumberedSet<T, I>::Lookup(I id) const {
+  const bool in_range = !(id < 0) && id < id_to_element_.size();
+  if (in_range) {
+    return *(id_to_element_[id]);
+  }
+  std::ostringstream msg;
+  msg << "Value not found: " << id;
+  throw Exception(msg.str());
+}
+
+// Inserts x if it is not already present and returns its ID.  A new element
+// receives the next unused ID, i.e. the current number of elements.
+template<typename T, typename I>
+I NumberedSet<T, I>::Insert(const T &x) {
+  typedef typename ElementToIdMap::iterator MapIterator;
+  const std::pair<T, I> candidate(x, id_to_element_.size());
+  const std::pair<MapIterator, bool> outcome =
+      element_to_id_.insert(candidate);
+  MapIterator entry = outcome.first;
+  const bool inserted = outcome.second;
+  if (inserted) {
+    // Remember where the map stores the key so it can be found by ID.
+    id_to_element_.push_back(&entry->first);
+  }
+  return entry->second;
+}
+
+// Removes every element.  Insert() derives new IDs from the table size, so
+// numbering starts from 0 again afterwards.
+template<typename T, typename I>
+void NumberedSet<T, I>::Clear() {
+  id_to_element_.clear();
+  element_to_id_.clear();
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
new file mode 100644
index 000000000..d045b820b
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc
@@ -0,0 +1,106 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg.h"
+
+#include "exception.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores the score for the rule identified by key, replacing any score that
+// was previously stored under the same key.
+void Pcfg::Add(const Key &key, double score) {
+  std::pair<Map::iterator, bool> slot =
+      rules_.insert(Map::value_type(key, score));
+  if (!slot.second) {
+    // The key already existed: overwrite its score.
+    slot.first->second = score;
+  }
+}
+
+// Looks up the rule identified by key.  On success stores its score in
+// `score` and returns true; otherwise returns false and leaves `score`
+// unchanged.
+bool Pcfg::Lookup(const Key &key, double &score) const {
+  const Map::const_iterator match = rules_.find(key);
+  const bool found = (match != rules_.end());
+  if (found) {
+    score = match->second;
+  }
+  return found;
+}
+
+// Reads rules from `input`, one per line, in the format
+//   LHS ||| RHS-1 ... RHS-n ||| score
+// Every symbol is inserted into `vocab` and the rule is stored under the key
+// (LHS ID, RHS-1 ID, ..., RHS-n ID).  A later rule with the same key replaces
+// the earlier score.  Throws Exception if a delimiter is missing or the score
+// cannot be parsed as a double.
+void Pcfg::Read(std::istream &input, Vocabulary &vocab) {
+  const std::string delimiter("|||");
+  std::string line;
+  std::string lhs_string;
+  std::vector<std::string> rhs_strings;
+  std::string score_string;
+  Key key;
+  while (std::getline(input, line)) {
+    // Read LHS.
+    size_t pos = line.find(delimiter);
+    if (pos == std::string::npos) {
+      throw Exception("missing first delimiter");
+    }
+    lhs_string = line.substr(0, pos);
+    boost::trim(lhs_string);
+
+    // Read RHS.
+    size_t begin = pos + delimiter.size();
+    pos = line.find(delimiter, begin);
+    if (pos == std::string::npos) {
+      throw Exception("missing second delimiter");
+    }
+    std::string rhs_text = line.substr(begin, pos-begin);
+    boost::trim(rhs_text);
+    rhs_strings.clear();
+    // boost::split produces a single empty token for an empty input, which
+    // would be inserted into the vocabulary as a symbol; an empty RHS must
+    // contribute no symbols at all.
+    if (!rhs_text.empty()) {
+      boost::split(rhs_strings, rhs_text, boost::algorithm::is_space(),
+                   boost::algorithm::token_compress_on);
+    }
+
+    // Read score.
+    score_string = line.substr(pos + delimiter.size());
+    boost::trim(score_string);
+
+    // Construct key.
+    key.clear();
+    key.reserve(rhs_strings.size()+1);
+    key.push_back(vocab.Insert(lhs_string));
+    for (std::vector<std::string>::const_iterator p = rhs_strings.begin();
+         p != rhs_strings.end(); ++p) {
+      key.push_back(vocab.Insert(*p));
+    }
+
+    // Add rule.  A malformed score is reported like the other parse errors
+    // instead of escaping as boost::bad_lexical_cast.
+    double score;
+    try {
+      score = boost::lexical_cast<double>(score_string);
+    } catch (const boost::bad_lexical_cast &) {
+      throw Exception("malformed score: " + score_string);
+    }
+    Add(key, score);
+  }
+}
+
+// Writes every rule to `output`, one per line, in the format
+//   LHS ||| RHS-1 ... RHS-n ||| score
+// Symbol IDs are converted back to strings with `vocab`.  Throws Exception if
+// a rule has an empty key (Add() does not reject one), because such a rule has
+// no LHS to write.
+void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const {
+  for (const_iterator p = begin(); p != end(); ++p) {
+    const Key &key = p->first;
+    if (key.empty()) {
+      // Dereferencing key.begin() below would be undefined behaviour.
+      throw Exception("cannot write rule with empty key");
+    }
+    const double score = p->second;
+    Key::const_iterator q = key.begin();
+    const Key::const_iterator key_end = key.end();
+    // The first ID is the LHS; the remaining IDs form the RHS.
+    output << vocab.Lookup(*q++) << " |||";
+    for (; q != key_end; ++q) {
+      output << " " << vocab.Lookup(*q);
+    }
+    output << " ||| " << score << std::endl;
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h
new file mode 100644
index 000000000..757eea449
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h
@@ -0,0 +1,61 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_H_
+#define PCFG_PCFG_H_
+
+#include "typedef.h"
+
+#include <istream>
+#include <map>
+#include <ostream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Maps rule keys to scores.  A key is a vector of symbol IDs; Read() builds
+// it from the LHS ID followed by the RHS IDs.
+class Pcfg {
+ public:
+  typedef std::vector<size_t> Key;
+  typedef std::map<Key, double> Map;
+  typedef Map::iterator iterator;
+  typedef Map::const_iterator const_iterator;
+
+  Pcfg() {}
+
+  // Iteration over (key, score) pairs in the map's key order.
+  iterator begin() {
+    return rules_.begin();
+  }
+  iterator end() {
+    return rules_.end();
+  }
+  const_iterator begin() const {
+    return rules_.begin();
+  }
+  const_iterator end() const {
+    return rules_.end();
+  }
+
+  // Stores `score` under `key`.
+  void Add(const Key &key, double score);
+
+  // Retrieves the score stored under `key`; returns false if there is none.
+  bool Lookup(const Key &key, double &score) const;
+
+  // Reads rules from `input`, inserting their symbols into `vocab`.
+  void Read(std::istream &input, Vocabulary &vocab);
+
+  // Writes all rules to `output`, using `vocab` to turn IDs into strings.
+  void Write(const Vocabulary &vocab, std::ostream &output) const;
+
+ private:
+  Map rules_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
new file mode 100644
index 000000000..bdac64dfc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h
@@ -0,0 +1,77 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_TREE_H_
+#define PCFG_PCFG_TREE_H_
+
+#include "syntax_tree.h"
+#include "xml_tree_writer.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// A syntax tree with string labels whose nodes also carry a score.  The
+// score is 0.0 until set_score() is called.
+template<typename DerivedType>
+class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
+ public:
+  typedef std::string LabelType;
+  typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
+
+  PcfgTreeBase(const LabelType &label)
+      : BaseType(label)
+      , score_(0.0) {
+  }
+
+  // Score accessors.
+  void set_score(double s) {
+    score_ = s;
+  }
+  double score() const {
+    return score_;
+  }
+
+ private:
+  double score_;
+};
+
+// Concrete tree type: PcfgTreeBase instantiated with itself as the derived
+// type.
+class PcfgTree : public PcfgTreeBase<PcfgTree> {
+ public:
+  typedef PcfgTreeBase<PcfgTree> BaseType;
+
+  PcfgTree(const BaseType::LabelType &label)
+      : BaseType(label) {
+  }
+};
+
+// Specialise XmlOutputHandler for PcfgTree: the label is the node label and
+// the only attribute is "pcfg", which holds the node's score.
+template<>
+class XmlOutputHandler<PcfgTree> {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // Copies the node's label into `label`.
+  void GetLabel(const PcfgTree &tree, std::string &label) const {
+    label = tree.label();
+  }
+
+  // Empties `attribute_map`, then adds a "pcfg" entry holding the score
+  // unless the score compares equal to 0.0.
+  void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
+    attribute_map.clear();
+    const double score = tree.score();
+    if (score == 0.0) {
+      return;
+    }
+    std::ostringstream formatted;
+    formatted << score;
+    attribute_map["pcfg"] = formatted.str();
+  }
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
new file mode 100644
index 000000000..37f72dd58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SYNTAX_TREE_H_
+#define PCFG_SYNTAX_TREE_H_
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Base class for SyntaxTree, AgreementTree, and friends.
+//
+// A node stores a label, pointers to its children and a pointer to its
+// parent.  The destructor deletes every child.
+// NOTE(review): copying is not disabled, so a copied node would hold the same
+// child pointers as the original -- confirm that nodes are never copied.
+template<typename T, typename DerivedType>
+class SyntaxTreeBase {
+ public:
+  // Creates a node with no children and no parent.
+  SyntaxTreeBase(const T &label)
+      : label_(label), children_(), parent_(0) {}
+
+  // Creates a parentless node with the given children.  The children's own
+  // parent pointers are not modified.
+  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
+      : label_(label), children_(children), parent_(0) {}
+
+  virtual ~SyntaxTreeBase();
+
+  // Label.
+  const T &label() const { return label_; }
+  void set_label(const T &label) { label_ = label; }
+
+  // Parent (null for a node that has not been given one).
+  const DerivedType *parent() const { return parent_; }
+  DerivedType *parent() { return parent_; }
+  void set_parent(DerivedType *parent) { parent_ = parent; }
+
+  // Children.  set_children() replaces the vector without deleting the
+  // previous children; AddChild() appends without touching the child's
+  // parent pointer.
+  const std::vector<DerivedType *> &children() const { return children_; }
+  std::vector<DerivedType *> &children() { return children_; }
+  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
+  void AddChild(DerivedType *child) { children_.push_back(child); }
+
+  // A leaf has no children.
+  bool IsLeaf() const { return children_.size() == 0; }
+
+  // A preterminal has exactly one child, and that child is a leaf.
+  bool IsPreterminal() const {
+    if (children_.size() != 1) {
+      return false;
+    }
+    return children_.front()->IsLeaf();
+  }
+
+ private:
+  T label_;
+  std::vector<DerivedType *> children_;
+  DerivedType *parent_;
+};
+
+// Plain syntax tree: SyntaxTreeBase instantiated with itself as the derived
+// type.
+template<typename T>
+class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
+ public:
+  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
+
+  // Creates a node without children.
+  SyntaxTree(const T &label)
+      : BaseType(label) {
+  }
+
+  // Creates a node with the given children.
+  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
+      : BaseType(label, children) {
+  }
+};
+
+// Deletes every child; each child's destructor in turn deletes its own
+// children.
+template<typename T, typename DerivedType>
+SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
+  typedef typename std::vector<DerivedType *>::iterator ChildIterator;
+  for (ChildIterator child = children_.begin();
+       child != children_.end(); ++child) {
+    delete *child;
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.cc b/scripts/training/phrase-extract/pcfg-common/tool.cc
new file mode 100644
index 000000000..bebd220e1
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.cc
@@ -0,0 +1,80 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "tool.h"
+
+#include <sstream>
+
+namespace Moses {
+namespace PCFG {
+
+// Selects the tool's main input: standard input when filename is empty or
+// "-", otherwise the named file.  Calls Error() if the file cannot be opened.
+std::istream &Tool::OpenInputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdin = filename.empty() || filename == "-";
+  if (use_stdin) {
+    input_ptr_ = &(std::cin);
+    return *input_ptr_;
+  }
+  input_file_stream_.open(filename.c_str());
+  if (!input_file_stream_) {
+    std::ostringstream msg;
+    msg << "failed to open input file: " << filename;
+    Error(msg.str());
+  }
+  input_ptr_ = &input_file_stream_;
+  return *input_ptr_;
+}
+
+// Selects the tool's main output: standard output when filename is empty or
+// "-", otherwise the named file.  Calls Error() if the file cannot be opened.
+std::ostream &Tool::OpenOutputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdout = filename.empty() || filename == "-";
+  if (use_stdout) {
+    output_ptr_ = &(std::cout);
+    return *output_ptr_;
+  }
+  output_file_stream_.open(filename.c_str());
+  if (!output_file_stream_) {
+    std::ostringstream msg;
+    msg << "failed to open output file: " << filename;
+    Error(msg.str());
+  }
+  output_ptr_ = &output_file_stream_;
+  return *output_ptr_;
+}
+
+// Opens the named file for reading on the caller's stream.  Calls Error() if
+// the stream is in a failed state afterwards.
+void Tool::OpenNamedInputOrDie(const std::string &filename,
+                               std::ifstream &stream) {
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    std::ostringstream msg;
+    msg << "failed to open input file: " << filename;
+    Error(msg.str());
+  }
+}
+
+// Opens the named file for writing on the caller's stream.  Calls Error() if
+// the stream is in a failed state afterwards.
+void Tool::OpenNamedOutputOrDie(const std::string &filename,
+                                std::ofstream &stream) {
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    std::ostringstream msg;
+    msg << "failed to open output file: " << filename;
+    Error(msg.str());
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/tool.h b/scripts/training/phrase-extract/pcfg-common/tool.h
new file mode 100644
index 000000000..0af342569
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/tool.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TOOL_H_
+#define PCFG_TOOL_H_
+
+#include <boost/program_options/cmdline.hpp>
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Abstract base class for the command-line tools.  It provides the tool name,
+// warning/error reporting on standard error, and helpers for opening the
+// tool's input and output streams.
+class Tool {
+ public:
+  virtual ~Tool() {}
+
+  const std::string &name() const { return name_; }
+
+  // Runs the tool with the given command-line arguments.
+  virtual int Main(int argc, char *argv[]) = 0;
+
+ protected:
+  // The stream pointers start out null so that they never hold an
+  // indeterminate value before OpenInputOrDie() / OpenOutputOrDie() run.
+  Tool(const std::string &name)
+      : name_(name)
+      , input_ptr_(0)
+      , output_ptr_(0) {}
+
+  // Returns the boost::program_options style that should be used by all tools.
+  static int CommonOptionStyle() {
+    namespace cls = boost::program_options::command_line_style;
+    return cls::default_style & (~cls::allow_guessing);
+  }
+
+  // Prints "<name>: warning: <msg>" to standard error.
+  void Warn(const std::string &msg) const {
+    std::cerr << name_ << ": warning: " << msg << std::endl;
+  }
+
+  // Prints "<name>: error: <msg>" to standard error and exits with status 1.
+  void Error(const std::string &msg) const {
+    std::cerr << name_ << ": error: " << msg << std::endl;
+    std::exit(1);
+  }
+
+  // Initialises the tool's main input stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then input is standard input; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for reading.
+  std::istream &OpenInputOrDie(const std::string &filename);
+
+  // Initialises the tool's main output stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then output is standard output; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for writing.
+  std::ostream &OpenOutputOrDie(const std::string &filename);
+
+  // Opens the named input file using the supplied ifstream.  Calls Error() if
+  // the file cannot be opened for reading.
+  void OpenNamedInputOrDie(const std::string &, std::ifstream &);
+
+  // Opens the named output file using the supplied ofstream.  Calls Error() if
+  // the file cannot be opened for writing.
+  void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
+
+ private:
+  std::string name_;
+  std::istream *input_ptr_;   // null until OpenInputOrDie() is called
+  std::ifstream input_file_stream_;
+  std::ostream *output_ptr_;  // null until OpenOutputOrDie() is called
+  std::ofstream output_file_stream_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/typedef.h b/scripts/training/phrase-extract/pcfg-common/typedef.h
new file mode 100644
index 000000000..49a12d681
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/typedef.h
@@ -0,0 +1,37 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TYPEDEF_H_
+#define PCFG_TYPEDEF_H_
+
+#include "numbered_set.h"
+#include "syntax_tree.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+typedef NumberedSet<std::string> Vocabulary;
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
new file mode 100644
index 000000000..5c596a0fb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -0,0 +1,85 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "xml_tree_parser.h"
+
+#include "exception.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Nothing to set up: every member is default-constructed.
+XmlTreeParser::XmlTreeParser() {}
+
+// Parses one line in Moses' XML parse tree format and returns the equivalent
+// PcfgTree.  Throws Exception if the XML tags cannot be processed; an
+// XmlException from the underlying parser is rethrown as an Exception with
+// the same message.
+std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line)
+{
+  m_line = line;
+  m_tree.Clear();
+
+  bool processed = false;
+  try {
+    processed = ProcessAndStripXMLTags(m_line, m_tree, m_labelSet,
+                                       m_topLabelSet);
+  } catch (const XmlException &e) {
+    throw Exception(e.getMsg());
+  }
+  if (!processed) {
+    throw Exception("");
+  }
+
+  m_tree.ConnectNodes();
+  SyntaxNode *top = m_tree.GetTop();
+  assert(top);
+
+  // m_line is tokenised after ProcessAndStripXMLTags() has processed it.
+  m_words = tokenize(m_line.c_str());
+  return ConvertTree(*top, m_words);
+}
+
+// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
+//
+// A SyntaxNode without children becomes a node with a single leaf child
+// labelled with the word at the node's start position.  Any other node is
+// converted recursively.  Throws Exception if a childless node covers more
+// than one word or if its position does not index into `words`.
+std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
+    const SyntaxNode &tree,
+    const std::vector<std::string> &words)
+{
+  std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
+  const std::vector<SyntaxNode*> &children = tree.GetChildren();
+  if (children.empty()) {
+    if (tree.GetStart() != tree.GetEnd()) {
+      std::ostringstream msg;
+      msg << "leaf node covers multiple words (" << tree.GetStart()
+          << "-" << tree.GetEnd() << "): this is currently unsupported";
+      throw Exception(msg.str());
+    }
+    // operator[] is unchecked, so validate the position first.  A negative
+    // position converts to a huge value and is rejected as well.
+    const size_t index = static_cast<size_t>(tree.GetStart());
+    if (index >= words.size()) {
+      std::ostringstream msg;
+      msg << "leaf node position (" << tree.GetStart()
+          << ") is outside the sentence (" << words.size() << " words)";
+      throw Exception(msg.str());
+    }
+    std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[index]));
+    leaf->set_parent(root.get());
+    root->AddChild(leaf.release());
+  } else {
+    for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
+         p != children.end(); ++p) {
+      assert(*p);
+      std::auto_ptr<PcfgTree> child = ConvertTree(**p, words);
+      child->set_parent(root.get());
+      // root takes ownership of the converted subtree.
+      root->AddChild(child.release());
+    }
+  }
+  return root;
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
new file mode 100644
index 000000000..6b418c44e
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h
@@ -0,0 +1,56 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_PARSER_H_
+#define PCFG_XML_TREE_PARSER_H_
+
+#include "pcfg_tree.h"
+#include "SyntaxTree.h"
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Converts strings in Moses' XML parse tree format into PcfgTree objects.
+class XmlTreeParser {
+ public:
+  XmlTreeParser();
+
+  // Parses `line` and returns the resulting tree.
+  std::auto_ptr<PcfgTree> Parse(const std::string &line);
+
+ private:
+  // Recursively converts `tree`, taking leaf labels from `words`.
+  std::auto_ptr<PcfgTree> ConvertTree(const SyntaxNode &tree,
+                                      const std::vector<std::string> &words);
+
+  // Working state that Parse() passes to ProcessAndStripXMLTags() or
+  // overwrites on every call.
+  std::set<std::string> m_labelSet;
+  std::map<std::string, int> m_topLabelSet;
+  std::string m_line;
+  ::SyntaxTree m_tree;
+  std::vector<std::string> m_words;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
new file mode 100644
index 000000000..347c352bb
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h
@@ -0,0 +1,127 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_WRITER_H_
+#define PCFG_XML_TREE_WRITER_H_
+
+#include "syntax_tree.h"
+
+#include "XmlTree.h"
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <vector>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Supplies XmlTreeWriter with the label and attributes of a tree node.  The
+// primary template only declares the two functions.
+template<typename InputTree>
+class XmlOutputHandler {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // Stores the label of `tree` in `label`.
+  void GetLabel(const InputTree &tree, std::string &label) const;
+
+  // Fills `attribute_map` with the attributes of `tree`.
+  void GetAttributes(const InputTree &tree, AttributeMap &attribute_map) const;
+};
+
+// Writes a tree as nested <tree> elements, obtaining labels and attributes
+// from XmlOutputHandler<InputTree>.
+template<typename InputTree>
+class XmlTreeWriter : public XmlOutputHandler<InputTree> {
+ public:
+  typedef XmlOutputHandler<InputTree> Base;
+
+  // Writes `tree` to `out`.
+  void Write(const InputTree &tree, std::ostream &out) const;
+
+ private:
+  // Returns a copy of `s` with XML special characters replaced by entities.
+  std::string Escape(const std::string &s) const;
+};
+
+// Writes `tree` (which must not be a leaf) to `out` as a <tree> element.
+// The label and every attribute value are XML-escaped.  Leaf children are
+// written as escaped text, other children as nested <tree> elements.  A
+// newline is written after the closing tag of a node that has no parent.
+template<typename InputTree>
+void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
+                                     std::ostream &out) const {
+  assert(!tree.IsLeaf());
+
+  // Opening tag
+
+  std::string label;
+  Base::GetLabel(tree, label);
+  out << "<tree label=\"" << Escape(label) << "\"";
+
+  typename Base::AttributeMap attribute_map;
+  Base::GetAttributes(tree, attribute_map);
+
+  for (typename Base::AttributeMap::const_iterator p = attribute_map.begin();
+       p != attribute_map.end(); ++p) {
+    // Escape the value just like the label, so that a quote or '<' in an
+    // attribute value cannot produce malformed XML.
+    out << " " << p->first << "=\"" << Escape(p->second) << "\"";
+  }
+
+  out << ">";
+
+  // Children
+
+  const std::vector<InputTree *> &children = tree.children();
+  for (typename std::vector<InputTree *>::const_iterator p = children.begin();
+       p != children.end(); ++p) {
+    InputTree &child = **p;
+    if (child.IsLeaf()) {
+      Base::GetLabel(child, label);
+      out << " " << Escape(label);
+    } else {
+      out << " ";
+      Write(child, out);
+    }
+  }
+
+  // Closing tag
+  out << " </tree>";
+
+  if (tree.parent() == 0) {
+    out << std::endl;
+  }
+}
+
+// Returns a copy of `s` in which the XML special characters < > & ' " are
+// replaced by the corresponding entities.
+template<typename InputTree>
+std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
+  std::string escaped;
+  escaped.reserve(s.size());
+  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
+    switch (*c) {
+      case '<':
+        escaped += "&lt;";
+        break;
+      case '>':
+        escaped += "&gt;";
+        break;
+      case '&':
+        escaped += "&amp;";
+        break;
+      case '\'':
+        escaped += "&apos;";
+        break;
+      case '"':
+        escaped += "&quot;";
+        break;
+      default:
+        escaped += *c;
+        break;
+    }
+  }
+  return escaped;
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/Jamfile b/scripts/training/phrase-extract/pcfg-extract/Jamfile
new file mode 100644
index 000000000..be91d6d2f
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/Jamfile
@@ -0,0 +1 @@
+exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-extract/main.cc b/scripts/training/phrase-extract/pcfg-extract/main.cc
new file mode 100644
index 000000000..47b45afc3
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+// Program entry point: hands the command line to the pcfg-extract tool and
+// uses the value returned by its Main() as the process exit status.
+int main(int argc, char *argv[]) {
+  Moses::PCFG::PcfgExtract extractor;
+  const int status = extractor.Main(argc, argv);
+  return status;
+}
diff --git a/scripts/training/phrase-extract/pcfg-extract/options.h b/scripts/training/phrase-extract/pcfg-extract/options.h
new file mode 100644
index 000000000..3acb31b58
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_OPTIONS_H_
+#define PCFG_EXTRACT_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+struct Options {  // Option values filled in by PcfgExtract::ProcessOptions.
+ std::string corpus_file;  // NOTE(review): neither set nor read in pcfg_extract.cc (input comes from std::cin) - confirm whether still needed.
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
new file mode 100644
index 000000000..151c9959c
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -0,0 +1,131 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+#include "options.h"
+#include "rule_collection.h"
+#include "rule_extractor.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: parses the command line, reads one XML tree per line from
+// standard input, collects the rules found in every tree, and finally writes
+// the resulting scored PCFG to standard output. Returns 0.
+int PcfgExtract::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Collect rule counts from the corpus on standard input.
+  Vocabulary non_term_vocab;
+  RuleExtractor rule_extractor(non_term_vocab);
+  RuleCollection rule_collection;
+  XmlTreeParser parser;
+  std::string line;
+  std::auto_ptr<PcfgTree> tree;
+  // Line numbers are 1-based and only used in diagnostics.
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      // NOTE(review): if Error() returns instead of terminating, tree still
+      // holds the result for the previous line - confirm.
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+    if (tree.get()) {
+      rule_extractor.Extract(*tree, rule_collection);
+    } else {
+      // The parser produced nothing for this line: warn and move on.
+      std::ostringstream msg;
+      msg << "no tree at line " << line_num;
+      Warn(msg.str());
+    }
+  }
+
+  // Convert the counts into scores and emit the grammar.
+  Pcfg pcfg;
+  rule_collection.CreatePcfg(pcfg);
+  pcfg.Write(non_term_vocab, std::cout);
+
+  return 0;
+}
+
+// Parses the command line. The only recognised option is --help, which
+// prints the usage text to standard output and exits with status 0. A
+// parsing failure is reported through Error() together with the usage text.
+// The options argument is not written to by this function.
+void PcfgExtract::ProcessOptions(int argc, char *argv[],
+                                 Options &options) const {
+  namespace po = boost::program_options;
+
+  // Heading shown above the list of visible options.
+  std::ostringstream usage_top;
+  usage_top << "Usage: " << name() << "\n\n" << "Options";
+
+  // Options that appear in the help text.
+  po::options_description visible(usage_top.str());
+  visible.add_options()
+    ("help", "print help message and exit");
+
+  // Options that are kept out of the help text (positional arguments would
+  // be declared here); currently there are none.
+  po::options_description hidden("Hidden options");
+  hidden.add_options();
+
+  // Everything the parser should accept.
+  po::options_description cmd_line_options;
+  cmd_line_options.add(visible);
+  cmd_line_options.add(hidden);
+
+  // No positional options are registered.
+  po::positional_options_description p;
+
+  // Parse and store the results.
+  po::variables_map vm;
+  try {
+    po::command_line_parser cmd_parser(argc, argv);
+    cmd_parser.style(CommonOptionStyle());
+    cmd_parser.options(cmd_line_options);
+    cmd_parser.positional(p);
+    po::store(cmd_parser.run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") > 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
new file mode 100644
index 000000000..1af6cb4fe
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_
+#define PCFG_EXTRACT_PCFG_EXTRACT_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Forward declaration. Options is defined with the struct keyword in
+// options.h; the same class-key is used here so that compilers do not warn
+// about (or, on some toolchains, mangle differently) a mismatched tag.
+struct Options;
+
+// The pcfg-extract command-line tool.
+class PcfgExtract : public Tool {
+ public:
+  // Registers the tool under the name "pcfg-extract".
+  PcfgExtract() : Tool("pcfg-extract") {}
+
+  // Runs the tool with the given command line and returns its exit status.
+  virtual int Main(int argc, char *argv[]);
+
+ private:
+  // Parses the command line (argc, argv) on behalf of Main().
+  void ProcessOptions(int argc, char *argv[], Options &options) const;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
new file mode 100644
index 000000000..503b1a9e6
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc
@@ -0,0 +1,58 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "rule_collection.h"
+
+#include "pcfg-common/pcfg.h"
+
+#include <cmath>
+
+namespace Moses {
+namespace PCFG {
+
+// Records one more occurrence of the rule lhs -> rhs. Entries for a
+// previously unseen lhs or rhs are created on demand.
+void RuleCollection::Add(size_t lhs, const std::vector<size_t> &rhs) {
+  RhsCountMap &rhs_counts = collection_[lhs];
+  ++rhs_counts[rhs];
+}
+
+// Adds every collected rule to pcfg. The key passed to pcfg.Add is the
+// left-hand side followed by the right-hand side symbols, and the score is
+// the natural log of the rule's count divided by the total count of all
+// rules sharing that left-hand side.
+void RuleCollection::CreatePcfg(Pcfg &pcfg) {
+  std::vector<size_t> key;
+  for (const_iterator entry = begin(); entry != end(); ++entry) {
+    const size_t lhs = entry->first;
+    const RhsCountMap &rhs_counts = entry->second;
+
+    // First pass: total number of occurrences for this left-hand side.
+    size_t total = 0;
+    for (RhsCountMap::const_iterator r = rhs_counts.begin();
+         r != rhs_counts.end(); ++r) {
+      total += r->second;
+    }
+    const double denominator = static_cast<double>(total);
+
+    // Second pass: score each rule relative to that total.
+    for (RhsCountMap::const_iterator r = rhs_counts.begin();
+         r != rhs_counts.end(); ++r) {
+      const std::vector<size_t> &rhs = r->first;
+      const double numerator = static_cast<double>(r->second);
+      const double score = std::log(numerator / denominator);
+      // key = lhs followed by the rhs symbols.
+      key.assign(1, lhs);
+      key.insert(key.end(), rhs.begin(), rhs.end());
+      pcfg.Add(key, score);
+    }
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
new file mode 100644
index 000000000..1b768dd21
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h
@@ -0,0 +1,59 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_
+#define PCFG_EXTRACT_RULE_COLLECTION_H_
+
+#include "pcfg-common/pcfg.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores PCFG rules together with the number of times each was added. Rules
+// are grouped by left-hand side; within a group the right-hand side (a
+// sequence of symbol ids) maps to its count.
+class RuleCollection {
+ public:
+  // rhs symbol sequence -> count
+  typedef boost::unordered_map<std::vector<size_t>, size_t> RhsCountMap;
+  // lhs symbol -> counts of its right-hand sides
+  typedef boost::unordered_map<size_t, RhsCountMap> Map;
+  typedef Map::const_iterator const_iterator;
+  typedef Map::iterator iterator;
+
+  RuleCollection() {}
+
+  // Iteration over the (lhs, RhsCountMap) entries.
+  iterator begin() { return collection_.begin(); }
+  iterator end() { return collection_.end(); }
+  const_iterator begin() const { return collection_.begin(); }
+  const_iterator end() const { return collection_.end(); }
+
+  // Increments the count of the rule lhs -> rhs.
+  void Add(size_t lhs, const std::vector<size_t> &rhs);
+
+  // Adds all stored rules, with scores derived from their counts, to pcfg.
+  void CreatePcfg(Pcfg &pcfg);
+
+ private:
+  Map collection_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
new file mode 100644
index 000000000..48a82a6d0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -0,0 +1,51 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "rule_extractor.h"
+
+#include "pcfg-common/pcfg_tree.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Keeps a reference to non_term_vocab; the labels encountered by Extract()
+// are inserted into it.
+RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
+    : non_term_vocab_(non_term_vocab) {}
+
+// Adds to rc one rule for tree (its label as left-hand side, its children's
+// labels as right-hand side) and, recursively, the rules of every subtree.
+// Preterminal and leaf nodes produce no rule. All labels used are inserted
+// into the vocabulary supplied at construction.
+void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const {
+  if (tree.IsPreterminal() || tree.IsLeaf()) {
+    return;
+  }
+
+  // The parent label is inserted before any child label.
+  const size_t lhs = non_term_vocab_.Insert(tree.label());
+
+  const std::vector<PcfgTree *> &children = tree.children();
+  std::vector<size_t> rhs;
+  rhs.reserve(children.size());
+  for (size_t i = 0; i < children.size(); ++i) {
+    const PcfgTree &child = *children[i];
+    rhs.push_back(non_term_vocab_.Insert(child.label()));
+    // Descend before moving on to the next sibling.
+    Extract(child, rc);
+  }
+
+  rc.Add(lhs, rhs);
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
new file mode 100644
index 000000000..6bcffbc61
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h
@@ -0,0 +1,45 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
+#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
+
+#include "rule_collection.h"
+
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+class PcfgTree;
+
+// Walks syntax trees and records the PCFG rules they contain in a
+// RuleCollection.
+class RuleExtractor {
+ public:
+  // non_term_vocab is held by reference and receives the labels seen during
+  // extraction.
+  RuleExtractor(Vocabulary &non_term_vocab);
+
+  // Adds the rules found in tree to rc.
+  void Extract(const PcfgTree &tree, RuleCollection &rc) const;
+
+ private:
+  Vocabulary &non_term_vocab_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/Jamfile b/scripts/training/phrase-extract/pcfg-score/Jamfile
new file mode 100644
index 000000000..7225381c0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/Jamfile
@@ -0,0 +1 @@
+exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-score/main.cc b/scripts/training/phrase-extract/pcfg-score/main.cc
new file mode 100644
index 000000000..da5392add
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+// Program entry point: hands the command line to the pcfg-score tool and
+// uses the value returned by its Main() as the process exit status.
+int main(int argc, char *argv[]) {
+  Moses::PCFG::PcfgScore scorer_tool;
+  const int status = scorer_tool.Main(argc, argv);
+  return status;
+}
diff --git a/scripts/training/phrase-extract/pcfg-score/options.h b/scripts/training/phrase-extract/pcfg-score/options.h
new file mode 100644
index 000000000..e54b2a0b9
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_OPTIONS_H_
+#define PCFG_SCORE_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+struct Options {  // Option values filled in by PcfgScore::ProcessOptions.
+ std::string pcfg_file;  // Bound to the positional "pcfg-file" argument; PcfgScore::Main opens it as the PCFG input.
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
new file mode 100644
index 000000000..d780200ad
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc
@@ -0,0 +1,152 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+#include "options.h"
+#include "tree_scorer.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs the tool: parses the command line, loads the PCFG named there, then
+// reads one XML tree per line from standard input, scores it and writes it
+// to standard output. For a line that yields no tree, or a tree that cannot
+// be scored, a warning is issued and an empty line is written instead.
+// Returns 0.
+int PcfgScore::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Load the grammar.
+  std::ifstream pcfg_stream;
+  OpenNamedInputOrDie(options.pcfg_file, pcfg_stream);
+  Pcfg pcfg;
+  Vocabulary non_term_vocab;
+  pcfg.Read(pcfg_stream, non_term_vocab);
+
+  // Score the corpus on standard input, one tree per line.
+  TreeScorer scorer(pcfg, non_term_vocab);
+  XmlTreeParser parser;
+  XmlTreeWriter<PcfgTree> writer;
+  std::string line;
+  std::auto_ptr<PcfgTree> tree;
+  // Line numbers are 1-based and only used in diagnostics.
+  for (size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      // NOTE(review): if Error() returns instead of terminating, tree still
+      // holds the result for the previous line - confirm.
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+    if (!tree.get()) {
+      std::ostringstream msg;
+      msg << "no tree at line " << line_num;
+      Warn(msg.str());
+      // Keep output lines aligned with input lines.
+      std::cout << std::endl;
+    } else if (!scorer.Score(*tree)) {
+      std::ostringstream msg;
+      msg << "failed to score tree at line " << line_num;
+      Warn(msg.str());
+      std::cout << std::endl;
+    } else {
+      writer.Write(*tree, std::cout);
+    }
+  }
+
+  return 0;
+}
+
+// Parses the command line into options. One positional argument, the PCFG
+// file, is required and stored in options.pcfg_file. --help prints the usage
+// text to standard output and exits with status 0. A parsing failure or a
+// missing positional argument is reported through Error() together with the
+// usage text.
+void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
+  namespace po = boost::program_options;
+
+  // Heading shown above the list of visible options.
+  std::ostringstream usage_top;
+  usage_top << "Usage: " << name() << " PCFG\n\n"
+            << "Options";
+
+  // Options that appear in the help text.
+  po::options_description visible(usage_top.str());
+  visible.add_options()
+    ("help", "print help message and exit");
+
+  // Options kept out of the help text; they back the positional arguments.
+  po::options_description hidden("Hidden options");
+  hidden.add_options()
+    ("pcfg-file", po::value(&options.pcfg_file), "pcfg file");
+
+  // Everything the parser should accept.
+  po::options_description cmd_line_options;
+  cmd_line_options.add(visible);
+  cmd_line_options.add(hidden);
+
+  // The first (and only) positional argument is the PCFG file.
+  po::positional_options_description p;
+  p.add("pcfg-file", 1);
+
+  // Parse and store the results.
+  po::variables_map vm;
+  try {
+    po::command_line_parser cmd_parser(argc, argv);
+    cmd_parser.style(CommonOptionStyle());
+    cmd_parser.options(cmd_line_options);
+    cmd_parser.positional(p);
+    po::store(cmd_parser.run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") > 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+
+  // The positional argument is mandatory.
+  if (vm.count("pcfg-file") == 0) {
+    std::ostringstream msg;
+    msg << "missing required argument\n\n" << visible << std::endl;
+    Error(msg.str());
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.h b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
new file mode 100644
index 000000000..5e506c39d
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_PCFG_SCORE_H_
+#define PCFG_SCORE_PCFG_SCORE_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Forward declaration. Options is defined with the struct keyword in
+// options.h; the same class-key is used here so that compilers do not warn
+// about (or, on some toolchains, mangle differently) a mismatched tag.
+struct Options;
+
+// The pcfg-score command-line tool.
+class PcfgScore : public Tool {
+ public:
+  // Registers the tool under the name "pcfg-score".
+  PcfgScore() : Tool("pcfg-score") {}
+
+  // Runs the tool with the given command line and returns its exit status.
+  virtual int Main(int argc, char *argv[]);
+
+ private:
+  // Parses the command line (argc, argv) into options.
+  void ProcessOptions(int argc, char *argv[], Options &options) const;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
new file mode 100644
index 000000000..5f695e4fc
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc
@@ -0,0 +1,68 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "tree_scorer.h"
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Keeps references to the grammar and to the vocabulary used to look up
+// tree labels; neither is copied.
+TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
+    : pcfg_(pcfg), non_term_vocab_(non_term_vocab) {}
+
+// Scores the tree rooted at root, bottom-up. The score set on a node is the
+// score of the rule applied at that node plus the scores of its
+// non-preterminal children. Preterminal and leaf nodes are left untouched.
+// Returns false as soon as a rule is not found in the PCFG; subtrees scored
+// before that point keep the scores already set on them.
+bool TreeScorer::Score(PcfgTree &root) const {
+  if (root.IsPreterminal() || root.IsLeaf()) {
+    return true;
+  }
+
+  const std::vector<PcfgTree *> &children = root.children();
+
+  // Lookup key: the node's own symbol followed by its children's symbols.
+  std::vector<size_t> key;
+  key.reserve(children.size() + 1);
+  key.push_back(non_term_vocab_.Lookup(root.label()));
+
+  double children_log_prob = 0.0;
+  for (size_t i = 0; i < children.size(); ++i) {
+    PcfgTree &child = *children[i];
+    assert(!child.IsLeaf());
+    key.push_back(non_term_vocab_.Lookup(child.label()));
+    if (!Score(child)) {
+      return false;
+    }
+    // Preterminals carry no score of their own.
+    if (!child.IsPreterminal()) {
+      children_log_prob += child.score();
+    }
+  }
+
+  double rule_score;
+  if (!pcfg_.Lookup(key, rule_score)) {
+    return false;
+  }
+  root.set_score(children_log_prob + rule_score);
+  return true;
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.h b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
new file mode 100644
index 000000000..36f4e1e99
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h
@@ -0,0 +1,47 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_TREE_SCORER_H_
+#define PCFG_SCORE_TREE_SCORER_H_
+
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Assigns PCFG scores to the nodes of a tree.
+class TreeScorer {
+ public:
+  // Both arguments are held by reference.
+  TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab);
+
+  // Scores tree according to the PCFG. Returns false if scoring fails
+  // because a required rule is missing.
+  bool Score(PcfgTree &tree) const;
+
+ private:
+  const Pcfg &pcfg_;
+  const Vocabulary &non_term_vocab_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp
index af7401132..c5fb0b99f 100644
--- a/scripts/training/phrase-extract/score.cpp
+++ b/scripts/training/phrase-extract/score.cpp
@@ -32,6 +32,7 @@
#include "PhraseAlignment.h"
#include "score.h"
#include "InputFileStream.h"
+#include "OutputFileStream.h"
using namespace std;
@@ -56,7 +57,7 @@ public:
vector<string> tokenize( const char [] );
-void writeCountOfCounts( const char* fileNameCountOfCounts );
+void writeCountOfCounts( const string &fileNameCountOfCounts );
void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile );
@@ -71,6 +72,7 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
LexicalTable lexTable;
bool inverseFlag = false;
bool hierarchicalFlag = false;
+bool pcfgFlag = false;
bool wordAlignmentFlag = false;
bool goodTuringFlag = false;
bool kneserNeyFlag = false;
@@ -91,13 +93,13 @@ int main(int argc, char* argv[])
<< "scoring methods for extracted rules\n";
if (argc < 4) {
- cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n";
+ cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n";
exit(1);
}
char* fileNameExtract = argv[1];
char* fileNameLex = argv[2];
char* fileNamePhraseTable = argv[3];
- char* fileNameCountOfCounts;
+ string fileNameCountOfCounts;
char* fileNameFunctionWords;
for(int i=4; i<argc; i++) {
@@ -107,6 +109,9 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--Hierarchical") == 0) {
hierarchicalFlag = true;
cerr << "processing hierarchical rules\n";
+ } else if (strcmp(argv[i],"--PCFG") == 0) {
+ pcfgFlag = true;
+ cerr << "including PCFG scores\n";
} else if (strcmp(argv[i],"--WordAlignment") == 0) {
wordAlignmentFlag = true;
cerr << "outputing word alignment" << endl;
@@ -115,19 +120,11 @@ int main(int argc, char* argv[])
cerr << "not computing lexical translation score\n";
} else if (strcmp(argv[i],"--GoodTuring") == 0) {
goodTuringFlag = true;
- if (i+1==argc) {
- cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
- exit(1);
- }
- fileNameCountOfCounts = argv[++i];
+ fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
} else if (strcmp(argv[i],"--KneserNey") == 0) {
kneserNeyFlag = true;
- if (i+1==argc) {
- cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
- exit(1);
- }
- fileNameCountOfCounts = argv[++i];
+ fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
} else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
unalignedFlag = true;
@@ -188,9 +185,9 @@ int main(int argc, char* argv[])
phraseTableFile = &cout;
}
else {
- ofstream *outputFile = new ofstream();
- outputFile->open(fileNamePhraseTable);
- if (outputFile->fail()) {
+ Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
+ bool success = outputFile->Open(fileNamePhraseTable);
+ if (!success) {
cerr << "ERROR: could not open file phrase table file "
<< fileNamePhraseTable << endl;
exit(1);
@@ -200,6 +197,7 @@ int main(int argc, char* argv[])
// loop through all extracted phrase translations
float lastCount = 0.0f;
+ float lastPcfgSum = 0.0f;
vector< PhraseAlignment > phrasePairsWithSameF;
int i=0;
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
@@ -214,6 +212,7 @@ int main(int argc, char* argv[])
// identical to last line? just add count
if (strcmp(line,lastLine) == 0) {
lastPhrasePair->count += lastCount;
+ lastPhrasePair->pcfgSum += lastPcfgSum;
continue;
}
strcpy( lastLine, line );
@@ -222,10 +221,12 @@ int main(int argc, char* argv[])
PhraseAlignment phrasePair;
phrasePair.create( line, i );
lastCount = phrasePair.count;
+ lastPcfgSum = phrasePair.pcfgSum;
// only differs in count? just add count
if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
lastPhrasePair->count += phrasePair.count;
+ lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
continue;
}
@@ -245,7 +246,6 @@ int main(int argc, char* argv[])
phraseTableFile->flush();
if (phraseTableFile != &cout) {
- (dynamic_cast<ofstream*>(phraseTableFile))->close();
delete phraseTableFile;
}
@@ -255,12 +255,12 @@ int main(int argc, char* argv[])
}
}
-void writeCountOfCounts( const char* fileNameCountOfCounts )
+void writeCountOfCounts( const string &fileNameCountOfCounts )
{
// open file
- ofstream countOfCountsFile;
- countOfCountsFile.open(fileNameCountOfCounts);
- if (countOfCountsFile.fail()) {
+ Moses::OutputFileStream countOfCountsFile;
+ bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
+ if (!success) {
cerr << "ERROR: could not open count-of-counts file "
<< fileNameCountOfCounts << endl;
return;
@@ -273,7 +273,7 @@ void writeCountOfCounts( const char* fileNameCountOfCounts )
for(int i=1; i<=COC_MAX; i++) {
countOfCountsFile << countOfCounts[ i ] << endl;
}
- countOfCountsFile.close();
+ countOfCountsFile.Close();
}
void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
@@ -446,6 +446,16 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
countOfCounts[ countInt ]++;
}
+ // compute PCFG score
+ float pcfgScore;
+ if (pcfgFlag && !inverseFlag) {
+ float pcfgSum = 0;
+ for(size_t i=0; i<phrasePair.size(); ++i) {
+ pcfgSum += phrasePair[i]->pcfgSum;
+ }
+ pcfgScore = pcfgSum / count;
+ }
+
// output phrases
const PHRASE &phraseS = phrasePair[0]->GetSource();
const PHRASE &phraseT = phrasePair[0]->GetTarget();
@@ -501,6 +511,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
}
+ // target-side PCFG score
+ if (pcfgFlag && !inverseFlag) {
+ phraseTableFile << " " << pcfgScore;
+ }
+
phraseTableFile << " ||| ";
// alignment info for non-terminals
diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir
index d3748fdc9..869f979fc 100755
--- a/scripts/training/train-model.perl.missing_bin_dir
+++ b/scripts/training/train-model.perl.missing_bin_dir
@@ -19,7 +19,7 @@ if ($SCRIPTS_ROOTDIR eq '') {
$SCRIPTS_ROOTDIR =~ s/\/training$//;
$SCRIPTS_ROOTDIR = $ENV{"SCRIPTS_ROOTDIR"} if defined($ENV{"SCRIPTS_ROOTDIR"});
-my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_CORPUS,
+my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_SORT_BUFFER_SIZE, $_SORT_BATCH_SIZE, $_SORT_COMPRESS, $_SORT_PARALLEL, $_CORPUS,
$_CORPUS_COMPRESSION, $_FIRST_STEP, $_LAST_STEP, $_F, $_E, $_MAX_PHRASE_LENGTH,
$_LEXICAL_FILE, $_NO_LEXICAL_WEIGHTING, $_VERBOSE, $_ALIGNMENT,
$_ALIGNMENT_FILE, $_ALIGNMENT_STEM, @_LM, $_EXTRACT_FILE, $_GIZA_OPTION, $_HELP, $_PARTS,
@@ -29,18 +29,19 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
$_DECODING_GRAPH_BACKOFF,
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
- $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_HMM_ALIGN, $_CONFIG,
- $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS,
+ $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG,
+ $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS,
$_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
$_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
$_ADDITIONAL_INI,
$_DICTIONARY, $_EPPEX);
+my $_CORES = 1;
my $debug = 0; # debug this script, do not delete any files in debug mode
# the following line is set installation time by 'make release'. BEWARE!
-my $BINDIR="/Users/hieuhoang/workspace/bin/training-tools";
+my $BINDIR="/home/hieu/workspace/bin/training-tools/";
$_HELP = 1
unless &GetOptions('root-dir=s' => \$_ROOT_DIR,
@@ -57,7 +58,9 @@ $_HELP = 1
'model-dir=s' => \$_MODEL_DIR,
'temp-dir=s' => \$_TEMP_DIR,
'sort-buffer-size=s' => \$_SORT_BUFFER_SIZE,
- 'sort-batch-size=s' => \$_SORT_BATCH_SIZE,
+ 'sort-batch-size=i' => \$_SORT_BATCH_SIZE,
+ 'sort-compress=s' => \$_SORT_COMPRESS,
+ 'sort-parallel=i' => \$_SORT_PARALLEL,
'extract-file=s' => \$_EXTRACT_FILE,
'alignment=s' => \$_ALIGNMENT,
'alignment-file=s' => \$_ALIGNMENT_FILE,
@@ -72,6 +75,7 @@ $_HELP = 1
'help' => \$_HELP,
'mgiza' => \$_MGIZA, # multi-thread
'mgiza-cpus=i' => \$_MGIZA_CPUS, # multi-thread
+ 'snt2cooc=s' => \$_SNT2COOC, # override snt2cooc exe. For when you want to run reduced memory snt2cooc.perl from mgiza
'hmm-align' => \$_HMM_ALIGN,
'final-alignment-model=s' => \$_FINAL_ALIGNMENT_MODEL, # use word alignment model 1/2/hmm/3/4/5 as final (default is 4); value 'hmm' equivalent to the --hmm-align switch
'debug' => \$debug,
@@ -101,6 +105,7 @@ $_HELP = 1
'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
'ghkm' => \$_GHKM,
+ 'pcfg' => \$_PCFG,
'extract-options=s' => \$_EXTRACT_OPTIONS,
'score-options=s' => \$_SCORE_OPTIONS,
'source-syntax' => \$_SOURCE_SYNTAX,
@@ -114,7 +119,8 @@ $_HELP = 1
'force-factored-filenames' => \$_FORCE_FACTORED_FILENAMES,
'dictionary=s' => \$_DICTIONARY,
'eppex:s' => \$_EPPEX,
- 'additional-ini=s' => \$_ADDITIONAL_INI
+ 'additional-ini=s' => \$_ADDITIONAL_INI,
+ 'cores=i' => \$_CORES
);
if ($_HELP) {
@@ -185,29 +191,63 @@ my $SNT2COOC;
if(!defined $_MGIZA ){
$GIZA = "$BINDIR/GIZA++";
if (-x "$BINDIR/snt2cooc.out") {
- $SNT2COOC = "$BINDIR/snt2cooc.out";
+ $SNT2COOC = "$BINDIR/snt2cooc.out";
} elsif (-x "$BINDIR/snt2cooc") { # Since "snt2cooc.out" and "snt2cooc" work the same
$SNT2COOC = "$BINDIR/snt2cooc";
}
print STDERR "Using single-thread GIZA\n";
} else {
- $GIZA = "$BINDIR/mgiza";
+ $GIZA = "$BINDIR/mgiza";
if (-x "$BINDIR/snt2cooc") {
- $SNT2COOC = "$BINDIR/snt2cooc";
- } elsif (-x "$BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $BINDIR
- $SNT2COOC = "$BINDIR/snt2cooc.out";
- }
+ $SNT2COOC = "$BINDIR/snt2cooc";
+ } elsif (-x "$BINDIR/snt2cooc.out") { # Important for users that use MGIZA and copy only the "mgiza" file to $BINDIR
+ $SNT2COOC = "$BINDIR/snt2cooc.out";
+ }
print STDERR "Using multi-thread GIZA\n";
- if (!defined($_MGIZA_CPUS)) {
- $_MGIZA_CPUS=4;
- }
- die("ERROR: Cannot find $MGIZA_MERGE_ALIGN") unless (-x $MGIZA_MERGE_ALIGN);
+ if (!defined($_MGIZA_CPUS)) {
+ $_MGIZA_CPUS=4;
+ }
+ die("ERROR: Cannot find $MGIZA_MERGE_ALIGN") unless (-x $MGIZA_MERGE_ALIGN);
}
+# override
+$SNT2COOC = "$BINDIR/$_SNT2COOC" if defined($_SNT2COOC);
+
my $MKCLS = "$BINDIR/mkcls";
+# parallel extract
+my $SPLIT_EXEC = `gsplit --help 2>/dev/null`;
+if($SPLIT_EXEC) {
+ $SPLIT_EXEC = 'gsplit';
+}
+else {
+ $SPLIT_EXEC = 'split';
+}
+
+my $SORT_EXEC = `gsort --help 2>/dev/null`;
+if($SORT_EXEC) {
+ $SORT_EXEC = 'gsort';
+}
+else {
+ $SORT_EXEC = 'sort';
+}
+
+my $__SORT_BUFFER_SIZE = "";
+$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
+
+my $__SORT_BATCH_SIZE = "";
+$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
+
+my $__SORT_COMPRESS = "";
+$__SORT_COMPRESS = "--compress-program $_SORT_COMPRESS" if $_SORT_COMPRESS;
+
+my $__SORT_PARALLEL = "";
+$__SORT_PARALLEL = "--parallel $_SORT_PARALLEL" if $_SORT_PARALLEL;
+
# supporting scripts/binaries from this package
my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
+$PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_EXTRACT";
+
my $RULE_EXTRACT;
if (defined($_GHKM)) {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm";
@@ -215,12 +255,17 @@ if (defined($_GHKM)) {
else {
$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
}
+$RULE_EXTRACT = "$SCRIPTS_ROOTDIR/generic/extract-parallel.perl $_CORES $SPLIT_EXEC \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $RULE_EXTRACT";
+
my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex";
my $SYMAL = "$SCRIPTS_ROOTDIR/training/symal/symal";
my $GIZA2BAL = "$SCRIPTS_ROOTDIR/training/symal/giza2bal.pl";
+
my $PHRASE_SCORE = "$SCRIPTS_ROOTDIR/training/phrase-extract/score";
+$PHRASE_SCORE = "$SCRIPTS_ROOTDIR/generic/score-parallel.perl $_CORES \"$SORT_EXEC $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE $__SORT_COMPRESS $__SORT_PARALLEL\" $PHRASE_SCORE";
+
my $PHRASE_CONSOLIDATE = "$SCRIPTS_ROOTDIR/training/phrase-extract/consolidate";
# utilities
@@ -308,12 +353,6 @@ $_DONT_ZIP = $___DONT_ZIP unless $___DONT_ZIP;
my $___TEMP_DIR = $___MODEL_DIR;
$___TEMP_DIR = $_TEMP_DIR if $_TEMP_DIR;
-my $__SORT_BUFFER_SIZE = "";
-$__SORT_BUFFER_SIZE = "-S $_SORT_BUFFER_SIZE" if $_SORT_BUFFER_SIZE;
-
-my $__SORT_BATCH_SIZE = "";
-$__SORT_BATCH_SIZE = "--batch-size $_SORT_BATCH_SIZE" if $_SORT_BATCH_SIZE;
-
my $___CONTINUE = 0;
$___CONTINUE = $_CONTINUE if $_CONTINUE;
@@ -1335,6 +1374,7 @@ sub extract_phrase {
$cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file";
$cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
$cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
+ $cmd .= " --PCFG" if $_PCFG;
if (!defined($_GHKM)) {
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
@@ -1362,20 +1402,16 @@ sub extract_phrase {
$cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
}
}
+
+ $cmd .= " --GZOutput ";
+
map { die "File not found: $_" if ! -e $_ } ($alignment_file_e, $alignment_file_f, $alignment_file_a);
print STDERR "$cmd\n";
safesystem("$cmd") or die "ERROR: Phrase extraction failed (missing input files?)";
foreach my $f (@tempfiles) {
unlink $f;
}
- if (! $___DONT_ZIP) {
- safesystem("gzip $extract_file.o") if -e "$extract_file.o";
- safesystem("gzip $extract_file.sid") if -e "$extract_file.sid";
- if ($ttable_flag) {
- safesystem("gzip $extract_file.inv") or die("ERROR");
- safesystem("gzip $extract_file") or die("ERROR");
- }
- }
+
}
### (6) PHRASE SCORING
@@ -1457,41 +1493,32 @@ sub score_phrase_phrase_extract {
$inverse = " --Inverse";
$extract_filename = $extract_file.".inv";
}
- my $extract = "$extract_filename.sorted";
-
- if (!($___CONTINUE && -e "$extract_filename.sorted")) {
- # sorting
- print STDERR "(6.".($substep++).") sorting $direction @ ".`date`;
- if (-e "$extract_filename.gz") {
- safesystem("gunzip < $extract_filename.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_filename.sorted") or die("ERROR");
- }
- else {
- safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $extract_filename > $extract_filename.sorted") or die("ERROR");
- }
- }
+ my $extract = "$extract_filename.sorted.gz";
print STDERR "(6.".($substep++).") creating table half $ttable_file.half.$direction @ ".`date`;
my $cmd = "$PHRASE_SCORE $extract $lexical_file.$direction $ttable_file.half.$direction $inverse";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --WordAlignment" if $_PHRASE_WORD_ALIGNMENT;
- $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
- $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING && $inverse eq "";
+ $cmd .= " --KneserNey" if $KNESER_NEY;
+ $cmd .= " --GoodTuring" if $GOOD_TURING && $inverse eq "";
$cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT;
$cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT;
$cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL;
+ $cmd .= " --PCFG" if $_PCFG;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
- print $cmd."\n";
+
+ # sorting
+ if ($direction eq "e2f") {
+ $cmd .= " 1 ";
+ }
+ else {
+ $cmd .= " 0 ";
+ }
+
+ print $cmd."\n";
safesystem($cmd) or die "ERROR: Scoring of phrases failed";
- if (! $debug) { safesystem("rm -f $extract") or die("ERROR"); }
- # sorting inverse phrase-table-half to sync up with regular one
- if ($direction eq "e2f" && ! ($___CONTINUE && -e "$ttable_file.half.e2f.sorted")) {
- print STDERR "(6." . ($substep++) . ") sorting inverse e2f table@ ".`date`;
- safesystem("LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR $ttable_file.half.e2f > $ttable_file.half.e2f.sorted") or die("ERROR");
- if (! $debug) { safesystem("rm -f $ttable_file.half.e2f") or die("ERROR"); }
- }
-
exit();
}
else
@@ -1516,20 +1543,17 @@ sub score_phrase_phrase_extract {
# merging the two halves
print STDERR "(6.6) consolidating the two halves @ ".`date`;
return if $___CONTINUE && -e "$ttable_file.gz";
- my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e $ttable_file.half.e2f.sorted $ttable_file";
+ my $cmd = "$PHRASE_CONSOLIDATE $ttable_file.half.f2e.gz $ttable_file.half.e2f.sorted.gz $ttable_file.gz";
$cmd .= " --Hierarchical" if $_HIERARCHICAL;
$cmd .= " --LogProb" if $LOG_PROB;
$cmd .= " --NegLogProb" if $NEG_LOG_PROB;
$cmd .= " --OnlyDirect" if $ONLY_DIRECT;
$cmd .= " --NoPhraseCount" unless $PHRASE_COUNT;
$cmd .= " --LowCountFeature" if $LOW_COUNT;
- $cmd .= " --GoodTuring $ttable_file.coc" if $GOOD_TURING;
- $cmd .= " --KneserNey $ttable_file.coc" if $KNESER_NEY;
+ $cmd .= " --GoodTuring $ttable_file.half.f2e.coc" if $GOOD_TURING;
+ $cmd .= " --KneserNey $ttable_file.half.f2e.coc" if $KNESER_NEY;
safesystem($cmd) or die "ERROR: Consolidating the two phrase table halves failed";
if (! $debug) { safesystem("rm -f $ttable_file.half.*") or die("ERROR"); }
- if (! $___DONT_ZIP) {
- safesystem("gzip $ttable_file") || die("ERROR: could not gzip $ttable_file");
- }
}
sub score_phrase_memscore {
@@ -1597,35 +1621,27 @@ sub get_reordering_factored {
}
sub get_reordering {
- my ($extract_file,$reo_model_path) = @_;
- if (-e "$extract_file.o.gz") {
- safesystem("gunzip < $extract_file.o.gz | LC_ALL=C sort $__SORT_BUFFER_SIZE $__SORT_BATCH_SIZE -T $___TEMP_DIR > $extract_file.o.sorted") or die("ERROR");
- }
- else {
- safesystem("LC_ALL=C sort -T $___TEMP_DIR $extract_file.o > $extract_file.o.sorted") or die("ERROR");
- }
-
- my $smooth = $___REORDERING_SMOOTH;
-
- print STDERR "(7.2) building tables @ ".`date`;
-
- #create cmd string for lexical reordering scoring
- my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted $smooth $reo_model_path";
- $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/);
- for my $mtype (keys %REORDERING_MODEL_TYPES) {
- $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}";
- foreach my $model (@REORDERING_MODELS) {
- if ($model->{"type"} eq $mtype) {
- $cmd .= " ".$model->{"filename"};
- }
+ my ($extract_file,$reo_model_path) = @_; # $extract_file: path stem ('.o.sorted.gz' is appended below); $reo_model_path: handed to the scorer as its third argument
+ my $smooth = $___REORDERING_SMOOTH; # smoothing setting, handed to the scorer as its second argument
+
+ print STDERR "(7.2) building tables @ ".`date`;
+
+ # Build the scorer command line. Input is $extract_file.o.sorted.gz; nothing in this sub sorts or compresses it, so it must already exist.
+ my $cmd = "$LEXICAL_REO_SCORER $extract_file.o.sorted.gz $smooth $reo_model_path";
+ $cmd .= " --SmoothWithCounts" if ($smooth =~ /(.+)u$/); # a smoothing value ending in 'u' (after at least one other character) adds this flag
+ for my $mtype (keys %REORDERING_MODEL_TYPES) { # one quoted --model argument per model type
+ $cmd .= " --model \"$mtype $REORDERING_MODEL_TYPES{$mtype}"; # opens the quoted argument: the type, then its hash value
+ foreach my $model (@REORDERING_MODELS) {
+ if ($model->{"type"} eq $mtype) { # append the filename of every model of this type
+ $cmd .= " ".$model->{"filename"};
+ }
+ }
+ $cmd .= "\""; # closes the quoted --model argument
 }
- $cmd .= "\"";
- }
-
- #Call the lexical reordering scorer
- safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
-
- if (! $debug) { safesystem("rm $extract_file.o.sorted") or die("ERROR");}
+
+ # Run the lexical reordering scorer; the script dies if safesystem reports failure.
+ safesystem("$cmd") or die "ERROR: Lexical reordering scoring failed";
+
}
@@ -1788,6 +1804,7 @@ sub create_ini {
$basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/;
$basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature
$basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature
+ $basic_weight_count++ if $_PCFG;
foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) {
$num_of_ttables++;
my $ff = $f;