From 90c0bc9f5ceec4e7d33386ec597fd753e7d23d4a Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Fri, 25 May 2012 17:29:47 +0100 Subject: Add an optional PCFG scoring feature for target syntax models (similar to the p_cfg feature used in Marcu, Wang, Echihabi, and Knight (2006)). --- scripts/Jamfile | 2 + scripts/ems/experiment.meta | 15 +- scripts/ems/experiment.perl | 2 + scripts/training/phrase-extract/ExtractedRule.h | 2 + scripts/training/phrase-extract/Jamfile | 2 + .../training/phrase-extract/PhraseAlignment.cpp | 7 +- scripts/training/phrase-extract/PhraseAlignment.h | 1 + .../phrase-extract/RuleExtractionOptions.h | 2 + scripts/training/phrase-extract/SyntaxTree.cpp | 3 +- scripts/training/phrase-extract/SyntaxTree.h | 11 +- scripts/training/phrase-extract/XmlTree.cpp | 9 +- .../phrase-extract/extract-ghkm/AlignmentGraph.cpp | 4 + .../phrase-extract/extract-ghkm/ExtractGHKM.cpp | 5 + .../training/phrase-extract/extract-ghkm/Node.h | 6 +- .../training/phrase-extract/extract-ghkm/Options.h | 2 + .../phrase-extract/extract-ghkm/ParseTree.h | 7 +- .../phrase-extract/extract-ghkm/ScfgRule.cpp | 1 + .../phrase-extract/extract-ghkm/ScfgRule.h | 2 + .../phrase-extract/extract-ghkm/ScfgRuleWriter.cpp | 69 +++++----- .../phrase-extract/extract-ghkm/ScfgRuleWriter.h | 4 +- .../phrase-extract/extract-ghkm/Subgraph.cpp | 16 +++ .../phrase-extract/extract-ghkm/Subgraph.h | 8 +- .../phrase-extract/extract-ghkm/XmlTreeParser.cpp | 1 + scripts/training/phrase-extract/extract-rules.cpp | 32 ++++- .../training/phrase-extract/pcfg-common/Jamfile | 1 + .../phrase-extract/pcfg-common/exception.h | 41 ++++++ .../phrase-extract/pcfg-common/numbered_set.h | 109 +++++++++++++++ .../training/phrase-extract/pcfg-common/pcfg.cc | 106 ++++++++++++++ scripts/training/phrase-extract/pcfg-common/pcfg.h | 61 +++++++++ .../phrase-extract/pcfg-common/pcfg_tree.h | 77 +++++++++++ .../phrase-extract/pcfg-common/syntax_tree.h | 91 ++++++++++++ .../training/phrase-extract/pcfg-common/tool.cc | 
80 +++++++++++ scripts/training/phrase-extract/pcfg-common/tool.h | 91 ++++++++++++ .../training/phrase-extract/pcfg-common/typedef.h | 37 +++++ .../phrase-extract/pcfg-common/xml_tree_parser.cc | 85 ++++++++++++ .../phrase-extract/pcfg-common/xml_tree_parser.h | 56 ++++++++ .../phrase-extract/pcfg-common/xml_tree_writer.h | 127 +++++++++++++++++ .../training/phrase-extract/pcfg-extract/Jamfile | 1 + .../training/phrase-extract/pcfg-extract/main.cc | 25 ++++ .../training/phrase-extract/pcfg-extract/options.h | 36 +++++ .../phrase-extract/pcfg-extract/pcfg_extract.cc | 131 ++++++++++++++++++ .../phrase-extract/pcfg-extract/pcfg_extract.h | 42 ++++++ .../phrase-extract/pcfg-extract/rule_collection.cc | 58 ++++++++ .../phrase-extract/pcfg-extract/rule_collection.h | 59 ++++++++ .../phrase-extract/pcfg-extract/rule_extractor.cc | 51 +++++++ .../phrase-extract/pcfg-extract/rule_extractor.h | 45 ++++++ scripts/training/phrase-extract/pcfg-score/Jamfile | 1 + scripts/training/phrase-extract/pcfg-score/main.cc | 25 ++++ .../training/phrase-extract/pcfg-score/options.h | 36 +++++ .../phrase-extract/pcfg-score/pcfg_score.cc | 152 +++++++++++++++++++++ .../phrase-extract/pcfg-score/pcfg_score.h | 42 ++++++ .../phrase-extract/pcfg-score/tree_scorer.cc | 68 +++++++++ .../phrase-extract/pcfg-score/tree_scorer.h | 47 +++++++ scripts/training/phrase-extract/score.cpp | 23 ++++ scripts/training/train-model.perl.missing_bin_dir | 4 + 55 files changed, 1970 insertions(+), 51 deletions(-) create mode 100644 scripts/training/phrase-extract/pcfg-common/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-common/exception.h create mode 100644 scripts/training/phrase-extract/pcfg-common/numbered_set.h create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg.h create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg_tree.h create mode 100644 
scripts/training/phrase-extract/pcfg-common/syntax_tree.h create mode 100644 scripts/training/phrase-extract/pcfg-common/tool.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/tool.h create mode 100644 scripts/training/phrase-extract/pcfg-common/typedef.h create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-extract/main.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/options.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_collection.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_collection.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_extractor.h create mode 100644 scripts/training/phrase-extract/pcfg-score/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-score/main.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/options.h create mode 100644 scripts/training/phrase-extract/pcfg-score/pcfg_score.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/pcfg_score.h create mode 100644 scripts/training/phrase-extract/pcfg-score/tree_scorer.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/tree_scorer.h diff --git a/scripts/Jamfile b/scripts/Jamfile index 6fb9bad39..b9eefcffe 100644 --- a/scripts/Jamfile +++ b/scripts/Jamfile @@ -42,6 +42,8 @@ if $(location) { install compactify : training/compact-rule-table//compactify : 
$(location)/training/compact-rule-table/tools ; install phrase-extract : training/phrase-extract//programs : $(location)/training/phrase-extract ; + install pcfg-extract : training/phrase-extract/pcfg-extract//pcfg-extract : $(location)/training/phrase-extract/pcfg-extract ; + install pcfg-score : training/phrase-extract/pcfg-score//pcfg-score : $(location)/training/phrase-extract/pcfg-score ; install lexical-reordering : training/lexical-reordering//score : $(location)/training/lexical-reordering ; install symal : training/symal//symal : $(location)/training/symal ; diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 51ac0f67a..b33c589d2 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -344,8 +344,21 @@ parse-relax pass-unless: input-parse-relaxer output-parse-relaxer template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension +pcfg-extract + in: parse-relaxed-corpus + out: pcfg + default-name: model/pcfg + ignore-unless: use-pcfg-feature + rerun-on-change: use-pcfg-feature + template: $moses-script-dir/training/phrase-extract/pcfg-extract/pcfg-extract < IN.$output-extension > OUT.$output-extension +pcfg-score + in: parse-relaxed-corpus pcfg + out: scored-corpus + default-name: model/scored-corpus + pass-unless: use-pcfg-feature + template: ln -s IN.$input-extension OUT.$input-extension ; $moses-script-dir/training/phrase-extract/pcfg-score/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension extract-phrases - in: word-alignment parse-relaxed-corpus + in: word-alignment scored-corpus out: extracted-phrases rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm default-name: model/extract diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 59bd2788f..0c61a2a05 100755 --- 
a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2007,6 +2007,7 @@ sub get_training_setting { my $target_syntax = &get("GENERAL:output-parser"); my $score_settings = &get("TRAINING:score-settings"); my $parallel = &get("TRAINING:parallel"); + my $pcfg = &get("TRAINING:use-pcfg-feature"); my $xml = $source_syntax || $target_syntax; @@ -2029,6 +2030,7 @@ sub get_training_setting { $cmd .= "-glue-grammar " if $hierarchical; $cmd .= "-score-options '".$score_settings."' " if $score_settings; $cmd .= "-parallel " if $parallel; + $cmd .= "-pcfg " if $pcfg; # factored training if (&backoff_and_get("TRAINING:input-factors")) { diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h index 170ccf892..be6e30836 100644 --- a/scripts/training/phrase-extract/ExtractedRule.h +++ b/scripts/training/phrase-extract/ExtractedRule.h @@ -43,6 +43,7 @@ public: int startS; int endS; float count; + double pcfgScore; std::map > m_ntLengths; @@ -58,6 +59,7 @@ public: , startS(sS) , endS(eS) , count(0) + , pcfgScore(0.0) {} void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile index 5ed3f20f1..9be67e80a 100644 --- a/scripts/training/phrase-extract/Jamfile +++ b/scripts/training/phrase-extract/Jamfile @@ -33,3 +33,5 @@ alias programs : extract extract-rules extract-lex score consolidate consolidate install legacy : programs : . 
EXE ; build-project extract-ghkm ; +build-project pcfg-extract ; +build-project pcfg-score ; diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp index c0bfbde3e..ceb74f04c 100644 --- a/scripts/training/phrase-extract/PhraseAlignment.cpp +++ b/scripts/training/phrase-extract/PhraseAlignment.cpp @@ -13,6 +13,8 @@ #include "tables-core.h" #include "score.h" +#include + using namespace std; extern Vocabulary vcbT; @@ -111,6 +113,9 @@ void PhraseAlignment::create( char line[], int lineID ) } else if (item == 5) { // non-term lengths addNTLength(token[j]); + } else if (item == 6) { // target syntax PCFG score + float pcfgScore = std::atof(token[j].c_str()); + pcfgSum = pcfgScore * count; } } @@ -119,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID ) if (item == 3) { count = 1.0; } - if (item < 3 || item > 5) { + if (item < 3 || item > 6) { cerr << "ERROR: faulty line " << lineID << ": " << line << endl; } } diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h index 8b8f5115c..8bd83503d 100644 --- a/scripts/training/phrase-extract/PhraseAlignment.h +++ b/scripts/training/phrase-extract/PhraseAlignment.h @@ -25,6 +25,7 @@ protected: void createAlignVec(size_t sourceSize, size_t targetSize); void addNTLength(const std::string &tok); public: + float pcfgSum; float count; std::vector< std::set > alignedToT; std::vector< std::set > alignedToS; diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h index 70bb548c9..f9123de86 100644 --- a/scripts/training/phrase-extract/RuleExtractionOptions.h +++ b/scripts/training/phrase-extract/RuleExtractionOptions.h @@ -45,6 +45,7 @@ public: bool targetSyntax; bool duplicateRules; bool fractionalCounting; + bool pcfgScore; bool outputNTLengths; bool gzOutput; @@ -74,6 +75,7 @@ public: , targetSyntax(false) , 
duplicateRules(true) , fractionalCounting(true) + , pcfgScore(false) , outputNTLengths(false) , gzOutput(false) {} diff --git a/scripts/training/phrase-extract/SyntaxTree.cpp b/scripts/training/phrase-extract/SyntaxTree.cpp index e181b1e8a..f2783ffd2 100644 --- a/scripts/training/phrase-extract/SyntaxTree.cpp +++ b/scripts/training/phrase-extract/SyntaxTree.cpp @@ -42,11 +42,12 @@ void SyntaxTree::Clear() m_index.clear(); } -void SyntaxTree::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); + return newNode; } ParentNodes SyntaxTree::Parse() diff --git a/scripts/training/phrase-extract/SyntaxTree.h b/scripts/training/phrase-extract/SyntaxTree.h index 0ca5ca472..17c106b49 100644 --- a/scripts/training/phrase-extract/SyntaxTree.h +++ b/scripts/training/phrase-extract/SyntaxTree.h @@ -34,12 +34,14 @@ protected: std::string m_label; std::vector< SyntaxNode* > m_children; SyntaxNode* m_parent; + float m_pcfgScore; public: SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) ,m_label(label) ,m_parent(0) + ,m_pcfgScore(0.0f) {} int GetStart() const { return m_start; @@ -50,6 +52,12 @@ public: std::string GetLabel() const { return m_label; } + float GetPcfgScore() const { + return m_pcfgScore; + } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } SyntaxNode *GetParent() { return m_parent; } @@ -89,11 +97,12 @@ public: } ~SyntaxTree(); + SyntaxNode *AddNode( int startPos, int endPos, std::string label ); + SyntaxNode *GetTop() { return m_top; } - void AddNode( int startPos, int endPos, std::string label ); ParentNodes Parse(); bool HasNode( int startPos, int endPos ) const; const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; diff --git 
a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp index 716414f86..b22c159a1 100644 --- a/scripts/training/phrase-extract/XmlTree.cpp +++ b/scripts/training/phrase-extract/XmlTree.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include "SyntaxTree.h" #include "XmlException.h" @@ -345,13 +345,18 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); + string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg"); + float pcfgScore = pcfgString == "" ? 0.0f + : std::atof(pcfgString.c_str()); + // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } - tree.AddNode( startPos, endPos-1, label ); + SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); + node->SetPcfgScore(pcfgScore); } } } diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 0ecffae5c..6bd32a13b 100644 --- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -212,6 +212,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root) std::auto_ptr n(new Node(root->GetLabel(), nodeType)); + if (nodeType == TREE) { + n->SetPcfgScore(root->GetPcfgScore()); + } + const std::vector &children = root->GetChildren(); std::vector childNodes; childNodes.reserve(children.size()); diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 008026e1a..397ce1e3c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ 
b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -285,6 +285,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], "set maximum allowed scope") ("Minimal", "extract minimal rules only") + ("PCFG", + "include score based on PCFG scores in target corpus") ("UnknownWordLabel", po::value(&options.unknownWordFile), "write unknown word labels to named file") @@ -361,6 +363,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], if (vm.count("Minimal")) { options.minimal = true; } + if (vm.count("PCFG")) { + options.pcfg = true; + } if (vm.count("UnpairedExtractFormat")) { options.unpairedExtractFormat = true; } diff --git a/scripts/training/phrase-extract/extract-ghkm/Node.h b/scripts/training/phrase-extract/extract-ghkm/Node.h index 228fdc812..775473362 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Node.h +++ b/scripts/training/phrase-extract/extract-ghkm/Node.h @@ -41,8 +41,7 @@ class Node Node(const std::string &label, NodeType type) : m_label(label) , m_type(type) - , m_children() - , m_parents() {} + , m_pcfgScore(0.0f) {} ~Node(); @@ -50,12 +49,14 @@ class Node NodeType GetType() const { return m_type; } const std::vector &GetChildren() const { return m_children; } const std::vector &GetParents() const { return m_parents; } + float GetPcfgScore() const { return m_pcfgScore; } const Span &GetSpan() const { return m_span; } const Span &GetComplementSpan() const { return m_complementSpan; } const std::vector &GetRules() const { return m_rules; } void SetChildren(const std::vector &c) { m_children = c; } void SetParents(const std::vector &p) { m_parents = p; } + void SetPcfgScore(float s) { m_pcfgScore = s; } void SetSpan(const Span &s) { m_span = s; } void SetComplementSpan(const Span &cs) { m_complementSpan = cs; } @@ -92,6 +93,7 @@ class Node NodeType m_type; std::vector m_children; std::vector m_parents; + float m_pcfgScore; Span m_span; Span m_complementSpan; std::vector m_rules; diff --git 
a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h index 108e19d66..c4b57f311 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Options.h +++ b/scripts/training/phrase-extract/extract-ghkm/Options.h @@ -36,6 +36,7 @@ struct Options { , maxRuleSize(3) , maxScope(3) , minimal(false) + , pcfg(false) , unpairedExtractFormat(false) {} // Positional options @@ -53,6 +54,7 @@ struct Options { int maxRuleSize; int maxScope; bool minimal; + bool pcfg; bool unpairedExtractFormat; std::string unknownWordFile; }; diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h index ec6fc147a..273e2e04e 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h +++ b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h @@ -32,17 +32,19 @@ class ParseTree public: ParseTree(const std::string &label) : m_label(label) - , m_children() - , m_parent() {} + , m_parent(0) + , m_pcfgScore(0.0) {} ~ParseTree(); const std::string &GetLabel() const { return m_label; } const std::vector &GetChildren() const { return m_children; } const ParseTree *GetParent() const { return m_parent; } + float GetPcfgScore() const { return m_pcfgScore; } void SetParent(ParseTree *); void SetChildren(const std::vector &); + void SetPcfgScore(float score) { m_pcfgScore = score; } void AddChild(ParseTree *); @@ -59,6 +61,7 @@ class ParseTree std::string m_label; std::vector m_children; ParseTree *m_parent; + float m_pcfgScore; // log probability }; template diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp index 8473e4283..5dc70052c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -30,6 +30,7 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment) : m_sourceLHS("X", NonTerminal) , 
m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) + , m_pcfgScore(fragment.GetPcfgScore()) { // Source RHS diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h index 1ed534d9e..2405d8fa3 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h @@ -57,6 +57,7 @@ class ScfgRule const std::vector &GetSourceRHS() const { return m_sourceRHS; } const std::vector &GetTargetRHS() const { return m_targetRHS; } const Alignment &GetAlignment() const { return m_alignment; } + float GetPcfgScore() const { return m_pcfgScore; } int Scope() const; @@ -68,6 +69,7 @@ class ScfgRule std::vector m_sourceRHS; std::vector m_targetRHS; Alignment m_alignment; + float m_pcfgScore; }; } // namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index 4be3f048d..d5d16b790 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -24,6 +24,7 @@ #include "ScfgRule.h" #include +#include #include #include #include @@ -34,14 +35,43 @@ namespace GHKM { void ScfgRuleWriter::Write(const ScfgRule &rule) { + std::ostringstream sourceSS; + std::ostringstream targetSS; + if (m_options.unpairedExtractFormat) { - WriteUnpairedFormat(rule); + WriteUnpairedFormat(rule, sourceSS, targetSS); } else { - WriteStandardFormat(rule); + WriteStandardFormat(rule, sourceSS, targetSS); + } + + // Write the rule to the forward and inverse extract files. 
+ m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; + m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; + + const Alignment &alignment = rule.GetAlignment(); + for (Alignment::const_iterator p = alignment.begin(); + p != alignment.end(); ++p) { + m_fwd << " " << p->first << "-" << p->second; + m_inv << " " << p->second << "-" << p->first; + } + + // Write a count of 1 and an empty NT length column to the forward extract + // file. + // TODO Add option to write NT length? + m_fwd << " ||| 1 ||| |||"; + if (m_options.pcfg) { + // Write the PCFG score. + m_fwd << " " << std::exp(rule.GetPcfgScore()); } + m_fwd << std::endl; + + // Write a count of 1 to the inverse extract file. + m_inv << " ||| 1" << std::endl; } -void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) +void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule, + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); @@ -60,9 +90,6 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) } } - std::ostringstream sourceSS; - std::ostringstream targetSS; - // Write the source side of the rule to sourceSS. int i = 0; for (std::vector::const_iterator p(sourceRHS.begin()); @@ -88,27 +115,14 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) targetSS << " "; } WriteSymbol(rule.GetTargetLHS(), targetSS); - - // Write the rule to the forward and inverse extract files. 
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; - m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; - for (Alignment::const_iterator p(alignment.begin()); - p != alignment.end(); ++p) { - m_fwd << " " << p->first << "-" << p->second; - m_inv << " " << p->second << "-" << p->first; - } - m_fwd << " ||| 1" << std::endl; - m_inv << " ||| 1" << std::endl; } -void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule) +void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule, + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); - const Alignment &alignment = rule.GetAlignment(); - - std::ostringstream sourceSS; - std::ostringstream targetSS; // Write the source side of the rule to sourceSS. int i = 0; @@ -127,17 +141,6 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule) targetSS << " "; } WriteSymbol(rule.GetTargetLHS(), targetSS); - - // Write the rule to the forward and inverse extract files. 
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; - m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; - for (Alignment::const_iterator p(alignment.begin()); - p != alignment.end(); ++p) { - m_fwd << " " << p->first << "-" << p->second; - m_inv << " " << p->second << "-" << p->first; - } - m_fwd << " ||| 1" << std::endl; - m_inv << " ||| 1" << std::endl; } void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h index 738d09ce9..b92a432a1 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h @@ -45,8 +45,8 @@ class ScfgRuleWriter ScfgRuleWriter(const ScfgRuleWriter &); ScfgRuleWriter &operator=(const ScfgRuleWriter &); - void WriteStandardFormat(const ScfgRule &); - void WriteUnpairedFormat(const ScfgRule &); + void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &); + void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &); void WriteSymbol(const Symbol &, std::ostream &); std::ostream &m_fwd; diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp index e5aedbb16..e048f2c55 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp @@ -101,5 +101,21 @@ int Subgraph::CalcDepth(const Node *n) const return maxChildDepth + 1; } +float Subgraph::CalcPcfgScore() const +{ + if (m_root->GetType() != TREE || m_leaves.empty()) { + return 0.0f; + } + float score = m_root->GetPcfgScore(); + for (std::set::const_iterator p = m_leaves.begin(); + p != m_leaves.end(); ++p) { + const Node *leaf = *p; + if (leaf->GetType() == TREE) { + score -= leaf->GetPcfgScore(); + } + } + return score; +} + } // namespace Moses } // 
namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h index e84903502..ede1233e9 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h +++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h @@ -38,7 +38,8 @@ class Subgraph : m_root(root) , m_depth(0) , m_size(root->GetType() == TREE ? 1 : 0) - , m_nodeCount(1) {} + , m_nodeCount(1) + , m_pcfgScore(0.0f) {} Subgraph(const Node *root, const std::set &leaves) : m_root(root) @@ -46,10 +47,12 @@ class Subgraph , m_depth(-1) , m_size(-1) , m_nodeCount(-1) + , m_pcfgScore(0.0f) { m_depth = CalcDepth(m_root); m_size = CalcSize(m_root); m_nodeCount = CountNodes(m_root); + m_pcfgScore = CalcPcfgScore(); } const Node *GetRoot() const { return m_root; } @@ -57,6 +60,7 @@ class Subgraph int GetDepth() const { return m_depth; } int GetSize() const { return m_size; } int GetNodeCount() const { return m_nodeCount; } + float GetPcfgScore() const { return m_pcfgScore; } bool IsTrivial() const { return m_leaves.empty(); } @@ -66,6 +70,7 @@ class Subgraph void GetTargetLeaves(const Node *, std::vector &) const; int CalcDepth(const Node *) const; int CalcSize(const Node *) const; + float CalcPcfgScore() const; int CountNodes(const Node *) const; const Node *m_root; @@ -73,6 +78,7 @@ class Subgraph int m_depth; int m_size; int m_nodeCount; + float m_pcfgScore; }; } // namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 31c0e3843..cc961dc0c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -61,6 +61,7 @@ std::auto_ptr XmlTreeParser::ConvertTree( const std::vector &words) { std::auto_ptr root(new ParseTree(tree.GetLabel())); + root->SetPcfgScore(tree.GetPcfgScore()); const std::vector &children = tree.GetChildren(); if 
(children.empty()) { if (tree.GetStart() != tree.GetEnd()) { diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp index 2cc9dc54d..a00667b82 100644 --- a/scripts/training/phrase-extract/extract-rules.cpp +++ b/scripts/training/phrase-extract/extract-rules.cpp @@ -90,7 +90,7 @@ void addHieroRule( int startT, int endT, int startS, int endS void printHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, LabelIndex &labelIndex); string printTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex); + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore); string printSourceHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, const LabelIndex &labelIndex); void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS @@ -257,6 +257,8 @@ int main(int argc, char* argv[]) // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 else if (strcmp(argv[i],"--NoFractionalCounting") == 0) { options.fractionalCounting = false; + } else if (strcmp(argv[i],"--PCFG") == 0) { + options.pcfgScore = true; } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { options.outputNTLengths = true; #ifdef WITH_THREADS @@ -517,7 +519,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, } string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex) + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore) { HoleList::iterator iterHoleList = holeColl.GetHoles().begin(); assert(iterHoleList != holeColl.GetHoles().end()); @@ -545,6 +547,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in out += "[" + 
sourceLabel + "][" + targetLabel + "] "; + if (m_options.pcfgScore) { + double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore(); + logPCFGScore -= score; + } + currPos = hole.GetEnd(1); hole.SetPos(outPos, 1); ++iterHoleList; @@ -658,8 +665,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); // target - rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex) + if (m_options.pcfgScore) { + double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); + rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore) + + " [" + targetLabel + "]"; + rule.pcfgScore = std::exp(logPCFGScore); + } else { + double logPCFGScore = 0.0f; + rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore) + " [" + targetLabel + "]"; + } // source // holeColl.SortSourceHoles(); @@ -877,6 +892,11 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist rule.target += m_sentence->target[ti] + " "; rule.target += "[" + targetLabel + "]"; + if (m_options.pcfgScore) { + double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore(); + rule.pcfgScore = std::exp(logPCFGScore); + } + // alignment for(int ti=startT; ti<=endT; ti++) { for(unsigned int i=0; ialignedToT[ti].size(); i++) { @@ -957,11 +977,13 @@ void ExtractTask::writeRulesToFile() out << rule->source << " ||| " << rule->target << " ||| " << rule->alignment << " ||| " - << rule->count; + << rule->count << " ||| "; if (m_options.outputNTLengths) { - out << " ||| "; rule->OutputNTLengths(out); } + if (m_options.pcfgScore) { + out << " ||| " << rule->pcfgScore; + } out << "\n"; if (!m_options.onlyDirectFlag) { diff --git 
a/scripts/training/phrase-extract/pcfg-common/Jamfile b/scripts/training/phrase-extract/pcfg-common/Jamfile new file mode 100644 index 000000000..3dc272a56 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/Jamfile @@ -0,0 +1 @@ +lib pcfg_common : [ glob *.cc ] ..//trees ; diff --git a/scripts/training/phrase-extract/pcfg-common/exception.h b/scripts/training/phrase-extract/pcfg-common/exception.h new file mode 100644 index 000000000..3dbd59d0e --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/exception.h @@ -0,0 +1,41 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXCEPTION_H_ +#define PCFG_EXCEPTION_H_ + +#include + +namespace Moses { +namespace PCFG { + +class Exception { + public: + Exception(const char *msg) : msg_(msg) {} + Exception(const std::string &msg) : msg_(msg) {} + const std::string &msg() const { return msg_; } + private: + std::string msg_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h new file mode 100644 index 000000000..f88d710ed --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h @@ -0,0 +1,109 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_NUMBERED_SET_H_ +#define PCFG_NUMBERED_SET_H_ + +#include "exception.h" + +#include + +#include +#include +#include + +namespace Moses { +namespace PCFG { + +// Stores a set of elements of type T, each of which is allocated an integral +// ID of type I. IDs are contiguous starting at 0. Individual elements cannot +// be removed once inserted (but the whole set can be cleared). +template +class NumberedSet { + private: + typedef boost::unordered_map ElementToIdMap; + typedef std::vector IdToElementMap; + + public: + typedef I IdType; + typedef typename IdToElementMap::const_iterator const_iterator; + + NumberedSet() {} + + const_iterator begin() const { return id_to_element_.begin(); } + const_iterator end() const { return id_to_element_.end(); } + + // Static value + static I NullId() { return std::numeric_limits::max(); } + + bool Empty() const { return id_to_element_.empty(); } + size_t Size() const { return id_to_element_.size(); } + + // Insert the given object and return its ID. + I Insert(const T &); + + I Lookup(const T &) const; + const T &Lookup(I) const; + + void Clear(); + + private: + ElementToIdMap element_to_id_; + IdToElementMap id_to_element_; +}; + +template +I NumberedSet::Lookup(const T &s) const { + typename ElementToIdMap::const_iterator p = element_to_id_.find(s); + return (p == element_to_id_.end()) ? 
NullId() : p->second; +} + +template +const T &NumberedSet::Lookup(I id) const { + if (id < 0 || id >= id_to_element_.size()) { + std::ostringstream msg; + msg << "Value not found: " << id; + throw Exception(msg.str()); + } + return *(id_to_element_[id]); +} + +template +I NumberedSet::Insert(const T &x) { + std::pair value(x, id_to_element_.size()); + std::pair result = + element_to_id_.insert(value); + if (result.second) { + // x is a new element. + id_to_element_.push_back(&result.first->first); + } + return result.first->second; +} + +template +void NumberedSet::Clear() { + element_to_id_.clear(); + id_to_element_.clear(); +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc new file mode 100644 index 000000000..d045b820b --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc @@ -0,0 +1,106 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg.h" + +#include "exception.h" + +#include +#include + +#include + +namespace Moses { +namespace PCFG { + +void Pcfg::Add(const Key &key, double score) { + rules_[key] = score; +} + +bool Pcfg::Lookup(const Key &key, double &score) const { + Map::const_iterator p = rules_.find(key); + if (p == rules_.end()) { + return false; + } + score = p->second; + return true; +} + +void Pcfg::Read(std::istream &input, Vocabulary &vocab) { + std::string line; + std::string lhs_string; + std::vector rhs_strings; + std::string score_string; + Key key; + while (std::getline(input, line)) { + // Read LHS. + size_t pos = line.find("|||"); + if (pos == std::string::npos) { + throw Exception("missing first delimiter"); + } + lhs_string = line.substr(0, pos); + boost::trim(lhs_string); + + // Read RHS. + size_t begin = pos+3; + pos = line.find("|||", begin); + if (pos == std::string::npos) { + throw Exception("missing second delimiter"); + } + std::string rhs_text = line.substr(begin, pos-begin); + boost::trim(rhs_text); + rhs_strings.clear(); + boost::split(rhs_strings, rhs_text, boost::algorithm::is_space(), + boost::algorithm::token_compress_on); + + // Read score. + score_string = line.substr(pos+3); + boost::trim(score_string); + + // Construct key. + key.clear(); + key.reserve(rhs_strings.size()+1); + key.push_back(vocab.Insert(lhs_string)); + for (std::vector::const_iterator p = rhs_strings.begin(); + p != rhs_strings.end(); ++p) { + key.push_back(vocab.Insert(*p)); + } + + // Add rule. 
+ double score = boost::lexical_cast(score_string); + Add(key, score); + } +} + +void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const { + for (const_iterator p = begin(); p != end(); ++p) { + const Key &key = p->first; + double score = p->second; + std::vector::const_iterator q = key.begin(); + std::vector::const_iterator end = key.end(); + output << vocab.Lookup(*q++) << " |||"; + while (q != end) { + output << " " << vocab.Lookup(*q++); + } + output << " ||| " << score << std::endl; + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h new file mode 100644 index 000000000..757eea449 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h @@ -0,0 +1,61 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_PCFG_H_ +#define PCFG_PCFG_H_ + +#include "typedef.h" + +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +class Pcfg { + public: + typedef std::vector Key; + typedef std::map Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + Pcfg() {} + + iterator begin() { return rules_.begin(); } + const_iterator begin() const { return rules_.begin(); } + + iterator end() { return rules_.end(); } + const_iterator end() const { return rules_.end(); } + + void Add(const Key &, double); + bool Lookup(const Key &, double &) const; + void Read(std::istream &, Vocabulary &); + void Write(const Vocabulary &, std::ostream &) const; + + private: + Map rules_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h new file mode 100644 index 000000000..bdac64dfc --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h @@ -0,0 +1,77 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_PCFG_TREE_H_ +#define PCFG_PCFG_TREE_H_ + +#include "syntax_tree.h" +#include "xml_tree_writer.h" + +#include + +namespace Moses { +namespace PCFG { + +template +class PcfgTreeBase : public SyntaxTreeBase { + public: + typedef std::string LabelType; + typedef SyntaxTreeBase BaseType; + + PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} + + double score() const { return score_; } + void set_score(double s) { score_ = s; } + + private: + double score_; +}; + +class PcfgTree : public PcfgTreeBase { + public: + typedef PcfgTreeBase BaseType; + PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} +}; + +// Specialise XmlOutputHandler for PcfgTree. 
+template<> +class XmlOutputHandler { + public: + typedef std::map AttributeMap; + + void GetLabel(const PcfgTree &tree, std::string &label) const { + label = tree.label(); + } + + void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const { + attribute_map.clear(); + double score = tree.score(); + if (score != 0.0) { + std::ostringstream out; + out << tree.score(); + attribute_map["pcfg"] = out.str(); + } + } +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h new file mode 100644 index 000000000..37f72dd58 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h @@ -0,0 +1,91 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SYNTAX_TREE_H_ +#define PCFG_SYNTAX_TREE_H_ + +#include +#include + +namespace Moses { +namespace PCFG { + +// Base class for SyntaxTree, AgreementTree, and friends. 
+template +class SyntaxTreeBase { + public: + // Constructors + SyntaxTreeBase(const T &label) + : label_(label) + , children_() + , parent_(0) {} + + SyntaxTreeBase(const T &label, const std::vector &children) + : label_(label) + , children_(children) + , parent_(0) {} + + // Destructor + virtual ~SyntaxTreeBase(); + + const T &label() const { return label_; } + const DerivedType *parent() const { return parent_; } + DerivedType *parent() { return parent_; } + const std::vector &children() const { return children_; } + std::vector &children() { return children_; } + + void set_label(const T &label) { label_ = label; } + void set_parent(DerivedType *parent) { parent_ = parent; } + void set_children(const std::vector &c) { children_ = c; } + + bool IsLeaf() const { return children_.empty(); } + + bool IsPreterminal() const { + return children_.size() == 1 && children_[0]->IsLeaf(); + } + + void AddChild(DerivedType *child) { children_.push_back(child); } + + private: + T label_; + std::vector children_; + DerivedType *parent_; +}; + +template +class SyntaxTree : public SyntaxTreeBase > { + public: + typedef SyntaxTreeBase > BaseType; + SyntaxTree(const T &label) : BaseType(label) {} + SyntaxTree(const T &label, const std::vector &children) + : BaseType(label, children) {} +}; + +template +SyntaxTreeBase::~SyntaxTreeBase() { + for (size_t i = 0; i < children_.size(); ++i) { + delete children_[i]; + } +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/tool.cc b/scripts/training/phrase-extract/pcfg-common/tool.cc new file mode 100644 index 000000000..bebd220e1 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/tool.cc @@ -0,0 +1,80 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the 
terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "tool.h" + +#include + +namespace Moses { +namespace PCFG { + +std::istream &Tool::OpenInputOrDie(const std::string &filename) { + // TODO Check that function is only called once? + if (filename.empty() || filename == "-") { + input_ptr_ = &(std::cin); + } else { + input_file_stream_.open(filename.c_str()); + if (!input_file_stream_) { + std::ostringstream msg; + msg << "failed to open input file: " << filename; + Error(msg.str()); + } + input_ptr_ = &input_file_stream_; + } + return *input_ptr_; +} + +std::ostream &Tool::OpenOutputOrDie(const std::string &filename) { + // TODO Check that function is only called once? 
+ if (filename.empty() || filename == "-") { + output_ptr_ = &(std::cout); + } else { + output_file_stream_.open(filename.c_str()); + if (!output_file_stream_) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } + output_ptr_ = &output_file_stream_; + } + return *output_ptr_; +} + +void Tool::OpenNamedInputOrDie(const std::string &filename, + std::ifstream &stream) { + stream.open(filename.c_str()); + if (!stream) { + std::ostringstream msg; + msg << "failed to open input file: " << filename; + Error(msg.str()); + } +} + +void Tool::OpenNamedOutputOrDie(const std::string &filename, + std::ofstream &stream) { + stream.open(filename.c_str()); + if (!stream) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/tool.h b/scripts/training/phrase-extract/pcfg-common/tool.h new file mode 100644 index 000000000..0af342569 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/tool.h @@ -0,0 +1,91 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_TOOL_H_ +#define PCFG_TOOL_H_ + +#include + +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +class Tool { + public: + virtual ~Tool() {} + + const std::string &name() const { return name_; } + + virtual int Main(int argc, char *argv[]) = 0; + + protected: + Tool(const std::string &name) : name_(name) {} + + // Returns the boost::program_options style that should be used by all tools. + static int CommonOptionStyle() { + namespace cls = boost::program_options::command_line_style; + return cls::default_style & (~cls::allow_guessing); + } + + void Warn(const std::string &msg) const { + std::cerr << name_ << ": warning: " << msg << std::endl; + } + + void Error(const std::string &msg) const { + std::cerr << name_ << ": error: " << msg << std::endl; + std::exit(1); + } + + // Initialises the tool's main input stream and returns a reference that is + // valid for the remainder of the tool's lifetime. If filename is empty or + // "-" then input is standard input; otherwise it is the named file. Calls + // Error() if the file cannot be opened for reading. + std::istream &OpenInputOrDie(const std::string &filename); + + // Initialises the tool's main output stream and returns a reference that is + // valid for the remainder of the tool's lifetime. If filename is empty or + // "-" then output is standard output; otherwise it is the named file. Calls + // Error() if the file cannot be opened for writing. + std::ostream &OpenOutputOrDie(const std::string &filename); + + // Opens the named input file using the supplied ifstream. Calls Error() if + // the file cannot be opened for reading. 
+ void OpenNamedInputOrDie(const std::string &, std::ifstream &); + + // Opens the named output file using the supplied ofstream. Calls Error() if + // the file cannot be opened for writing. + void OpenNamedOutputOrDie(const std::string &, std::ofstream &); + + private: + std::string name_; + std::istream *input_ptr_; + std::ifstream input_file_stream_; + std::ostream *output_ptr_; + std::ofstream output_file_stream_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/typedef.h b/scripts/training/phrase-extract/pcfg-common/typedef.h new file mode 100644 index 000000000..49a12d681 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/typedef.h @@ -0,0 +1,37 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_TYPEDEF_H_ +#define PCFG_TYPEDEF_H_ + +#include "numbered_set.h" +#include "syntax_tree.h" + +#include + +namespace Moses { +namespace PCFG { + +typedef NumberedSet Vocabulary; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc new file mode 100644 index 000000000..5c596a0fb --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -0,0 +1,85 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "xml_tree_parser.h" + +#include "exception.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + +#include +#include + +namespace Moses { +namespace PCFG { + +XmlTreeParser::XmlTreeParser() +{ +} + +std::auto_ptr XmlTreeParser::Parse(const std::string &line) +{ + m_line = line; + m_tree.Clear(); + try { + if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) { + throw Exception(""); + } + } catch (const XmlException &e) { + throw Exception(e.getMsg()); + } + m_tree.ConnectNodes(); + SyntaxNode *root = m_tree.GetTop(); + assert(root); + m_words = tokenize(m_line.c_str()); + return ConvertTree(*root, m_words); +} + +// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree. 
+std::auto_ptr XmlTreeParser::ConvertTree( + const SyntaxNode &tree, + const std::vector &words) +{ + std::auto_ptr root(new PcfgTree(tree.GetLabel())); + const std::vector &children = tree.GetChildren(); + if (children.empty()) { + if (tree.GetStart() != tree.GetEnd()) { + std::ostringstream msg; + msg << "leaf node covers multiple words (" << tree.GetStart() + << "-" << tree.GetEnd() << "): this is currently unsupported"; + throw Exception(msg.str()); + } + std::auto_ptr leaf(new PcfgTree(words[tree.GetStart()])); + leaf->set_parent(root.get()); + root->AddChild(leaf.release()); + } else { + for (std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + assert(*p); + std::auto_ptr child = ConvertTree(**p, words); + child->set_parent(root.get()); + root->AddChild(child.release()); + } + } + return root; +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h new file mode 100644 index 000000000..6b418c44e --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h @@ -0,0 +1,56 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_XML_TREE_PARSER_H_ +#define PCFG_XML_TREE_PARSER_H_ + +#include "pcfg_tree.h" +#include "SyntaxTree.h" + +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +// Parses a string in Moses' XML parse tree format and returns a PcfgTree +// object. +class XmlTreeParser { + public: + XmlTreeParser(); + std::auto_ptr Parse(const std::string &); + private: + std::auto_ptr ConvertTree(const SyntaxNode &, + const std::vector &); + + std::set m_labelSet; + std::map m_topLabelSet; + std::string m_line; + ::SyntaxTree m_tree; + std::vector m_words; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h new file mode 100644 index 000000000..347c352bb --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h @@ -0,0 +1,127 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_XML_TREE_WRITER_H_ +#define PCFG_XML_TREE_WRITER_H_ + +#include "syntax_tree.h" + +#include "XmlTree.h" + +#include +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +template +class XmlOutputHandler { + public: + typedef std::map AttributeMap; + + void GetLabel(const InputTree &, std::string &) const; + void GetAttributes(const InputTree &, AttributeMap &) const; +}; + +template +class XmlTreeWriter : public XmlOutputHandler { + public: + typedef XmlOutputHandler Base; + void Write(const InputTree &, std::ostream &) const; + private: + std::string Escape(const std::string &) const; +}; + +template +void XmlTreeWriter::Write(const InputTree &tree, + std::ostream &out) const { + assert(!tree.IsLeaf()); + + // Opening tag + + std::string label; + Base::GetLabel(tree, label); + out << "first << "=\"" << p->second << "\""; + } + + out << ">"; + + // Children + + const std::vector &children = tree.children(); + for (typename std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + InputTree &child = **p; + if (child.IsLeaf()) { + Base::GetLabel(child, label); + out << " " << Escape(label); + } else { + out << " "; + Write(**p, out); + } + } + + // Closing tag + out << " "; + + if (tree.parent() == 0) { + out << std::endl; + } +} + +// Escapes XML special characters. 
+template +std::string XmlTreeWriter::Escape(const std::string &s) const { + std::string t; + size_t len = s.size(); + t.reserve(len); + for (size_t i = 0; i < len; ++i) { + if (s[i] == '<') { + t += "<"; + } else if (s[i] == '>') { + t += ">"; + } else if (s[i] == '&') { + t += "&"; + } else if (s[i] == '\'') { + t += "'"; + } else if (s[i] == '"') { + t += """; + } else { + t += s[i]; + } + } + return t; +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/Jamfile b/scripts/training/phrase-extract/pcfg-extract/Jamfile new file mode 100644 index 000000000..be91d6d2f --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/Jamfile @@ -0,0 +1 @@ +exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ; diff --git a/scripts/training/phrase-extract/pcfg-extract/main.cc b/scripts/training/phrase-extract/pcfg-extract/main.cc new file mode 100644 index 000000000..47b45afc3 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/main.cc @@ -0,0 +1,25 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_extract.h" + +int main(int argc, char *argv[]) { + Moses::PCFG::PcfgExtract tool; + return tool.Main(argc, argv); +} diff --git a/scripts/training/phrase-extract/pcfg-extract/options.h b/scripts/training/phrase-extract/pcfg-extract/options.h new file mode 100644 index 000000000..3acb31b58 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/options.h @@ -0,0 +1,36 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_OPTIONS_H_ +#define PCFG_EXTRACT_OPTIONS_H_ + +#include <string> + +namespace Moses { +namespace PCFG { + +struct Options { + std::string corpus_file; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc new file mode 100644 index 000000000..151c9959c --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -0,0 +1,131 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_extract.h" + +#include "options.h" +#include "rule_collection.h" +#include "rule_extractor.h" + +#include "pcfg-common/exception.h" +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/syntax_tree.h" +#include "pcfg-common/typedef.h" +#include "pcfg-common/xml_tree_parser.h" + +#include <boost/program_options.hpp> + +#include <cassert> +#include <cstdlib> +#include <fstream> +#include <iostream> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <vector> + +namespace Moses { +namespace PCFG { + +int PcfgExtract::Main(int argc, char *argv[]) { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); + + // Extract PCFG rules from corpus. + Vocabulary non_term_vocab; + RuleExtractor rule_extractor(non_term_vocab); + RuleCollection rule_collection; + XmlTreeParser parser; + std::string line; + size_t line_num = 0; + std::auto_ptr<PcfgTree> tree; + while (std::getline(std::cin, line)) { + ++line_num; + try { + tree = parser.Parse(line); + } catch (Exception &e) { + std::ostringstream msg; + msg << "line " << line_num << ": " << e.msg(); + Error(msg.str()); + } + if (!tree.get()) { + std::ostringstream msg; + msg << "no tree at line " << line_num; + Warn(msg.str()); + continue; + } + rule_extractor.Extract(*tree, rule_collection); + } + + // Score rules and write PCFG to output. 
+ Pcfg pcfg; + rule_collection.CreatePcfg(pcfg); + pcfg.Write(non_term_vocab, std::cout); + + return 0; +} + +void PcfgExtract::ProcessOptions(int argc, char *argv[], + Options &options) const { + namespace po = boost::program_options; + + std::ostringstream usage_top; + usage_top << "Usage: " << name() << "\n\n" << "Options"; + + // Declare the command line options that are visible to the user. 
+ po::options_description visible(usage_top.str()); + visible.add_options() + ("help", "print help message and exit") + ; + + // Declare the command line options that are hidden from the user + // (these are used as positional options). + po::options_description hidden("Hidden options"); + hidden.add_options(); + + // Compose the full set of command-line options. + po::options_description cmd_line_options; + cmd_line_options.add(visible).add(hidden); + + // Register the positional options. + po::positional_options_description p; + + // Process the command-line. + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + options(cmd_line_options).positional(p).run(), vm); + po::notify(vm); + } catch (const std::exception &e) { + std::ostringstream msg; + msg << e.what() << "\n\n" << visible; + Error(msg.str()); + } + + if (vm.count("help")) { + std::cout << visible << std::endl; + std::exit(0); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h new file mode 100644 index 000000000..1af6cb4fe --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h @@ -0,0 +1,42 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_ +#define PCFG_EXTRACT_PCFG_EXTRACT_H_ + +#include "pcfg-common/tool.h" + +namespace Moses { +namespace PCFG { + +class Options; + +class PcfgExtract : public Tool { + public: + PcfgExtract() : Tool("pcfg-extract") {} + virtual int Main(int, char *[]); + private: + void ProcessOptions(int, char *[], Options &) const; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc new file mode 100644 index 000000000..503b1a9e6 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc @@ -0,0 +1,58 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "rule_collection.h" + +#include "pcfg-common/pcfg.h" + +#include <cmath> + +namespace Moses { +namespace PCFG { + +void RuleCollection::Add(size_t lhs, const std::vector<size_t> &rhs) { + ++collection_[lhs][rhs]; +} + +void RuleCollection::CreatePcfg(Pcfg &pcfg) { + std::vector<size_t> key; + for (const_iterator p = begin(); p != end(); ++p) { + size_t lhs = p->first; + const RhsCountMap &rhs_counts = p->second; + size_t total = 0; + for (RhsCountMap::const_iterator q = rhs_counts.begin(); + q != rhs_counts.end(); ++q) { + total += q->second; + } + for (RhsCountMap::const_iterator q = rhs_counts.begin(); + q != rhs_counts.end(); ++q) { + const std::vector<size_t> &rhs = q->first; + size_t count = q->second; + double score = std::log(static_cast<double>(count) / + static_cast<double>(total)); + key.clear(); + key.push_back(lhs); + key.insert(key.end(), rhs.begin(), rhs.end()); + pcfg.Add(key, score); + } + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h new file mode 100644 index 000000000..1b768dd21 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h @@ -0,0 +1,59 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later 
version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_ +#define PCFG_EXTRACT_RULE_COLLECTION_H_ + +#include "pcfg-common/pcfg.h" + +#include <boost/unordered_map.hpp> + +#include <vector> + +namespace Moses { +namespace PCFG { + +// Contains PCFG rules and their counts. +class RuleCollection { + public: + typedef boost::unordered_map<std::vector<size_t>, size_t> RhsCountMap; + typedef boost::unordered_map<size_t, RhsCountMap> Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + RuleCollection() {} + + iterator begin() { return collection_.begin(); } + const_iterator begin() const { return collection_.begin(); } + + iterator end() { return collection_.end(); } + const_iterator end() const { return collection_.end(); } + + void Add(size_t, const std::vector<size_t> &); + void CreatePcfg(Pcfg &); + + private: + Map collection_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc new file mode 100644 index 000000000..48a82a6d0 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc @@ -0,0 +1,51 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can 
redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as 
published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "rule_extractor.h" + +#include "pcfg-common/pcfg_tree.h" + +namespace Moses { +namespace PCFG { + +RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab) + : non_term_vocab_(non_term_vocab) { +} + +void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const { + if (tree.IsPreterminal() || tree.IsLeaf()) { + return; + } + + size_t lhs = non_term_vocab_.Insert(tree.label()); + std::vector<size_t> rhs; + + const std::vector<PcfgTree *> &children = tree.children(); + rhs.reserve(children.size()); + for (std::vector<PcfgTree *>::const_iterator p(children.begin()); + p != children.end(); ++p) { + const PcfgTree &child = **p; + rhs.push_back(non_term_vocab_.Insert(child.label())); + Extract(child, rc); + } + rc.Add(lhs, rhs); +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h new file mode 100644 index 000000000..6bcffbc61 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h @@ -0,0 +1,45 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it 
and/or + modify it under the terms of the GNU 
Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_ +#define PCFG_EXTRACT_RULE_EXTRACTOR_H_ + +#include "rule_collection.h" + +#include "pcfg-common/typedef.h" + +namespace Moses { +namespace PCFG { + +class PcfgTree; + +// Extracts PCFG rules from syntax trees and adds them to a RuleCollection. +class RuleExtractor { + public: + RuleExtractor(Vocabulary &); + void Extract(const PcfgTree &, RuleCollection &) const; + private: + Vocabulary &non_term_vocab_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/Jamfile b/scripts/training/phrase-extract/pcfg-score/Jamfile new file mode 100644 index 000000000..7225381c0 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/Jamfile @@ -0,0 +1 @@ +exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ; diff --git a/scripts/training/phrase-extract/pcfg-score/main.cc b/scripts/training/phrase-extract/pcfg-score/main.cc new file mode 100644 index 000000000..da5392add --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/main.cc @@ -0,0 +1,25 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library 
is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_score.h" + +int main(int argc, char *argv[]) { + Moses::PCFG::PcfgScore tool; + return tool.Main(argc, argv); +} diff --git a/scripts/training/phrase-extract/pcfg-score/options.h b/scripts/training/phrase-extract/pcfg-score/options.h new file mode 100644 index 000000000..e54b2a0b9 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/options.h @@ -0,0 +1,36 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_OPTIONS_H_ +#define PCFG_SCORE_OPTIONS_H_ + +#include <string> + +namespace Moses { +namespace PCFG { + +struct Options { + std::string pcfg_file; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc new file mode 100644 index 000000000..d780200ad --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc @@ -0,0 +1,152 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_score.h" + +#include "options.h" +#include "tree_scorer.h" + +#include "pcfg-common/exception.h" +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/syntax_tree.h" +#include "pcfg-common/typedef.h" +#include "pcfg-common/xml_tree_parser.h" + +#include <boost/program_options.hpp> + +#include <cassert> +#include <cstdlib> +#include <fstream> +#include <iostream> +#include <map> +#include <memory> +#include <set> +#include <string> +#include <vector> + +namespace Moses { +namespace PCFG { + +int PcfgScore::Main(int argc, char *argv[]) { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); + + // Open PCFG stream. + std::ifstream pcfg_stream; + OpenNamedInputOrDie(options.pcfg_file, pcfg_stream); + + // Read PCFG. + Pcfg pcfg; + Vocabulary non_term_vocab; + pcfg.Read(pcfg_stream, non_term_vocab); + + // Score corpus according to PCFG. 
+ TreeScorer scorer(pcfg, non_term_vocab); + XmlTreeParser parser; + XmlTreeWriter<PcfgTree> writer; + std::string line; + size_t line_num = 0; + std::auto_ptr<PcfgTree> tree; + while (std::getline(std::cin, line)) { + ++line_num; + try { + tree = parser.Parse(line); + } catch (Exception &e) { + std::ostringstream msg; + msg << "line " << line_num << ": " << e.msg(); + Error(msg.str()); + } + if (!tree.get()) { + std::ostringstream msg; + msg << "no tree at line " << line_num; + Warn(msg.str()); + std::cout << std::endl; + continue; + } + if (!scorer.Score(*tree)) { + std::ostringstream msg; + msg << "failed to score tree at line " << line_num; + Warn(msg.str()); + std::cout << std::endl; + continue; + } + writer.Write(*tree, std::cout); + } + + return 0; +} + +void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const { + namespace po = boost::program_options; + + std::ostringstream usage_top; + usage_top << "Usage: " << name() << " PCFG\n\n" + << "Options"; + + // Declare the command line options that are visible to the user. + po::options_description visible(usage_top.str()); + visible.add_options() + ("help", "print help message and exit") + ; + + // Declare the command line options that are hidden from the user + // (these are used as positional options). + po::options_description hidden("Hidden options"); + hidden.add_options() + ("pcfg-file", po::value(&options.pcfg_file), "pcfg file") + ; + + // Compose the full set of command-line options. + po::options_description cmd_line_options; + cmd_line_options.add(visible).add(hidden); + + // Register the positional options. + po::positional_options_description p; + p.add("pcfg-file", 1); + + // Process the command-line. + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). 
+ options(cmd_line_options).positional(p).run(), vm); + po::notify(vm); + } catch (const std::exception &e) { + std::ostringstream msg; + msg << e.what() << "\n\n" << visible; + Error(msg.str()); + } + + if (vm.count("help")) { + std::cout << visible << std::endl; + std::exit(0); + } + + // Check positional options were given. + + if (!vm.count("pcfg-file")) { + std::ostringstream msg; + msg << "missing required argument\n\n" << visible << std::endl; + Error(msg.str()); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.h b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h new file mode 100644 index 000000000..5e506c39d --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h @@ -0,0 +1,42 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_PCFG_SCORE_H_ +#define PCFG_SCORE_PCFG_SCORE_H_ + +#include "pcfg-common/tool.h" + +namespace Moses { +namespace PCFG { + +class Options; + +class PcfgScore : public Tool { + public: + PcfgScore() : Tool("pcfg-score") {} + virtual int Main(int, char *[]); + private: + void ProcessOptions(int, char *[], Options &) const; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc new file mode 100644 index 000000000..5f695e4fc --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc @@ -0,0 +1,68 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "tree_scorer.h" + +#include <cassert> + +namespace Moses { +namespace PCFG { + +TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab) + : pcfg_(pcfg) + , non_term_vocab_(non_term_vocab) { +} + +bool TreeScorer::Score(PcfgTree &root) const { + if (root.IsPreterminal() || root.IsLeaf()) { + return true; + } + + const std::vector<PcfgTree *> &children = root.children(); + + double log_prob = 0.0; + + std::vector<size_t> key; + key.reserve(children.size()+1); + key.push_back(non_term_vocab_.Lookup(root.label())); + + for (std::vector<PcfgTree *>::const_iterator p(children.begin()); + p != children.end(); ++p) { + PcfgTree *child = *p; + assert(!child->IsLeaf()); + key.push_back(non_term_vocab_.Lookup(child->label())); + if (!Score(*child)) { + return false; + } + if (!child->IsPreterminal()) { + log_prob += child->score(); + } + } + double rule_score; + bool found = pcfg_.Lookup(key, rule_score); + if (!found) { + return false; + } + log_prob += rule_score; + root.set_score(log_prob); + return true; +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.h b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h new file mode 100644 index 000000000..36f4e1e99 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h @@ -0,0 +1,47 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the 
Free Software Foundation; either + version 2.1 of the 
License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_TREE_SCORER_H_ +#define PCFG_SCORE_TREE_SCORER_H_ + +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/typedef.h" + +namespace Moses { +namespace PCFG { + +class TreeScorer { + public: + TreeScorer(const Pcfg &, const Vocabulary &); + + // Score tree according to PCFG. Returns false if unsuccessful (due to + // missing rule). + bool Score(PcfgTree &) const; + + private: + const Pcfg &pcfg_; + const Vocabulary &non_term_vocab_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp index 8bcc9be3b..c5fb0b99f 100644 --- a/scripts/training/phrase-extract/score.cpp +++ b/scripts/training/phrase-extract/score.cpp @@ -72,6 +72,7 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs LexicalTable lexTable; bool inverseFlag = false; bool hierarchicalFlag = false; +bool pcfgFlag = false; bool wordAlignmentFlag = false; bool goodTuringFlag = false; bool kneserNeyFlag = false; @@ -108,6 +109,9 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; + } else if (strcmp(argv[i],"--PCFG") == 0) { + pcfgFlag = true; + cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { 
wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; @@ -193,6 +197,7 @@ int main(int argc, char* argv[]) // loop through all extracted phrase translations float lastCount = 0.0f; + float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; @@ -207,6 +212,7 @@ int main(int argc, char* argv[]) // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; + lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); @@ -215,10 +221,12 @@ int main(int argc, char* argv[]) PhraseAlignment phrasePair; phrasePair.create( line, i ); lastCount = phrasePair.count; + lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; + lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } @@ -438,6 +446,16 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo countOfCounts[ countInt ]++; } + // compute PCFG score + float pcfgScore; + if (pcfgFlag && !inverseFlag) { + float pcfgSum = 0; + for(size_t i=0; i<phrasePair.size(); i++) { + pcfgSum += phrasePair[i]->pcfgSum; + } + pcfgScore = pcfgSum / count; + } + // output phrases const PHRASE &phraseS = phrasePair[0]->GetSource(); const PHRASE &phraseT = phrasePair[0]->GetTarget(); @@ -493,6 +511,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo phraseTableFile << " " << ( logProbFlag ? 
negLogProb*log(penalty) : penalty ); } + // target-side PCFG score + if (pcfgFlag && !inverseFlag) { + phraseTableFile << " " << pcfgScore; + } + phraseTableFile << " ||| "; // alignment info for non-terminals diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index 1a7cb3a39..41ea2d682 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -105,6 +105,7 @@ $_HELP = 1 'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE, 'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE, 'ghkm' => \$_GHKM, + 'pcfg' => \$_PCFG, 'extract-options=s' => \$_EXTRACT_OPTIONS, 'score-options=s' => \$_SCORE_OPTIONS, 'source-syntax' => \$_SOURCE_SYNTAX, @@ -1373,6 +1374,7 @@ sub extract_phrase { $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file"; $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR; $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE); + $cmd .= " --PCFG" if $_PCFG; if (!defined($_GHKM)) { $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX; $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX; @@ -1503,6 +1505,7 @@ sub score_phrase_phrase_extract { $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT; $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? 
$UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT; $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL; + $cmd .= " --PCFG" if $_PCFG; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); # sorting @@ -1801,6 +1804,7 @@ sub create_ini { $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/; $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature + $basic_weight_count++ if $_PCFG; foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) { $num_of_ttables++; my $ff = $f; -- cgit v1.2.3