From e7fc7852e986ce7ec9c76598b01060cde6ca29b9 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Thu, 24 May 2012 23:26:19 +0100 Subject: Add an optional PCFG scoring feature for target syntax models (similar to the p_cfg feature used in Marcu, Wang, Echihabi, and Knight (2006)). --- scripts/Jamfile | 2 + scripts/ems/experiment.meta | 15 +- scripts/ems/experiment.perl | 2 + scripts/training/phrase-extract/ExtractedRule.h | 2 + scripts/training/phrase-extract/Jamfile | 2 + .../training/phrase-extract/PhraseAlignment.cpp | 7 +- scripts/training/phrase-extract/PhraseAlignment.h | 1 + .../phrase-extract/RuleExtractionOptions.h | 2 + scripts/training/phrase-extract/SyntaxTree.cpp | 3 +- scripts/training/phrase-extract/SyntaxTree.h | 11 +- scripts/training/phrase-extract/XmlTree.cpp | 9 +- .../phrase-extract/extract-ghkm/AlignmentGraph.cpp | 4 + .../phrase-extract/extract-ghkm/ExtractGHKM.cpp | 5 + .../training/phrase-extract/extract-ghkm/Node.h | 6 +- .../training/phrase-extract/extract-ghkm/Options.h | 2 + .../phrase-extract/extract-ghkm/ParseTree.h | 7 +- .../phrase-extract/extract-ghkm/ScfgRule.cpp | 1 + .../phrase-extract/extract-ghkm/ScfgRule.h | 2 + .../phrase-extract/extract-ghkm/ScfgRuleWriter.cpp | 69 +++++----- .../phrase-extract/extract-ghkm/ScfgRuleWriter.h | 4 +- .../phrase-extract/extract-ghkm/Subgraph.cpp | 16 +++ .../phrase-extract/extract-ghkm/Subgraph.h | 8 +- .../phrase-extract/extract-ghkm/XmlTreeParser.cpp | 1 + scripts/training/phrase-extract/extract-rules.cpp | 32 ++++- .../training/phrase-extract/pcfg-common/Jamfile | 1 + .../phrase-extract/pcfg-common/exception.h | 41 ++++++ .../phrase-extract/pcfg-common/numbered_set.h | 109 +++++++++++++++ .../training/phrase-extract/pcfg-common/pcfg.cc | 106 ++++++++++++++ scripts/training/phrase-extract/pcfg-common/pcfg.h | 61 +++++++++ .../phrase-extract/pcfg-common/pcfg_tree.h | 77 +++++++++++ .../phrase-extract/pcfg-common/syntax_tree.h | 91 ++++++++++++ .../training/phrase-extract/pcfg-common/tool.cc | 
80 +++++++++++ scripts/training/phrase-extract/pcfg-common/tool.h | 91 ++++++++++++ .../training/phrase-extract/pcfg-common/typedef.h | 37 +++++ .../phrase-extract/pcfg-common/xml_tree_parser.cc | 85 ++++++++++++ .../phrase-extract/pcfg-common/xml_tree_parser.h | 56 ++++++++ .../phrase-extract/pcfg-common/xml_tree_writer.h | 127 +++++++++++++++++ .../training/phrase-extract/pcfg-extract/Jamfile | 1 + .../training/phrase-extract/pcfg-extract/main.cc | 25 ++++ .../training/phrase-extract/pcfg-extract/options.h | 36 +++++ .../phrase-extract/pcfg-extract/pcfg_extract.cc | 131 ++++++++++++++++++ .../phrase-extract/pcfg-extract/pcfg_extract.h | 42 ++++++ .../phrase-extract/pcfg-extract/rule_collection.cc | 58 ++++++++ .../phrase-extract/pcfg-extract/rule_collection.h | 59 ++++++++ .../phrase-extract/pcfg-extract/rule_extractor.cc | 51 +++++++ .../phrase-extract/pcfg-extract/rule_extractor.h | 45 ++++++ scripts/training/phrase-extract/pcfg-score/Jamfile | 1 + scripts/training/phrase-extract/pcfg-score/main.cc | 25 ++++ .../training/phrase-extract/pcfg-score/options.h | 36 +++++ .../phrase-extract/pcfg-score/pcfg_score.cc | 152 +++++++++++++++++++++ .../phrase-extract/pcfg-score/pcfg_score.h | 42 ++++++ .../phrase-extract/pcfg-score/tree_scorer.cc | 68 +++++++++ .../phrase-extract/pcfg-score/tree_scorer.h | 47 +++++++ scripts/training/phrase-extract/score.cpp | 23 ++++ scripts/training/train-model.perl.missing_bin_dir | 6 +- 55 files changed, 1971 insertions(+), 52 deletions(-) create mode 100644 scripts/training/phrase-extract/pcfg-common/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-common/exception.h create mode 100644 scripts/training/phrase-extract/pcfg-common/numbered_set.h create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg.h create mode 100644 scripts/training/phrase-extract/pcfg-common/pcfg_tree.h create mode 100644 
scripts/training/phrase-extract/pcfg-common/syntax_tree.h create mode 100644 scripts/training/phrase-extract/pcfg-common/tool.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/tool.h create mode 100644 scripts/training/phrase-extract/pcfg-common/typedef.h create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h create mode 100644 scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-extract/main.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/options.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_collection.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_collection.h create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc create mode 100644 scripts/training/phrase-extract/pcfg-extract/rule_extractor.h create mode 100644 scripts/training/phrase-extract/pcfg-score/Jamfile create mode 100644 scripts/training/phrase-extract/pcfg-score/main.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/options.h create mode 100644 scripts/training/phrase-extract/pcfg-score/pcfg_score.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/pcfg_score.h create mode 100644 scripts/training/phrase-extract/pcfg-score/tree_scorer.cc create mode 100644 scripts/training/phrase-extract/pcfg-score/tree_scorer.h diff --git a/scripts/Jamfile b/scripts/Jamfile index 6fb9bad39..b9eefcffe 100644 --- a/scripts/Jamfile +++ b/scripts/Jamfile @@ -42,6 +42,8 @@ if $(location) { install compactify : training/compact-rule-table//compactify : 
$(location)/training/compact-rule-table/tools ; install phrase-extract : training/phrase-extract//programs : $(location)/training/phrase-extract ; + install pcfg-extract : training/phrase-extract/pcfg-extract//pcfg-extract : $(location)/training/phrase-extract/pcfg-extract ; + install pcfg-score : training/phrase-extract/pcfg-score//pcfg-score : $(location)/training/phrase-extract/pcfg-score ; install lexical-reordering : training/lexical-reordering//score : $(location)/training/lexical-reordering ; install symal : training/symal//symal : $(location)/training/symal ; diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 51ac0f67a..b33c589d2 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -344,8 +344,21 @@ parse-relax pass-unless: input-parse-relaxer output-parse-relaxer template-if: input-parse-relaxer IN.$input-extension OUT.$input-extension template-if: output-parse-relaxer IN.$output-extension OUT.$output-extension +pcfg-extract + in: parse-relaxed-corpus + out: pcfg + default-name: model/pcfg + ignore-unless: use-pcfg-feature + rerun-on-change: use-pcfg-feature + template: $moses-script-dir/training/phrase-extract/pcfg-extract/pcfg-extract < IN.$output-extension > OUT.$output-extension +pcfg-score + in: parse-relaxed-corpus pcfg + out: scored-corpus + default-name: model/scored-corpus + pass-unless: use-pcfg-feature + template: ln -s IN.$input-extension OUT.$input-extension ; $moses-script-dir/training/phrase-extract/pcfg-score/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension extract-phrases - in: word-alignment parse-relaxed-corpus + in: word-alignment scored-corpus out: extracted-phrases rerun-on-change: max-phrase-length translation-factors reordering-factors hierarchical-rule-set extract-settings training-options script use-ghkm default-name: model/extract diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 59bd2788f..0c61a2a05 100755 --- 
a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2007,6 +2007,7 @@ sub get_training_setting { my $target_syntax = &get("GENERAL:output-parser"); my $score_settings = &get("TRAINING:score-settings"); my $parallel = &get("TRAINING:parallel"); + my $pcfg = &get("TRAINING:use-pcfg-feature"); my $xml = $source_syntax || $target_syntax; @@ -2029,6 +2030,7 @@ sub get_training_setting { $cmd .= "-glue-grammar " if $hierarchical; $cmd .= "-score-options '".$score_settings."' " if $score_settings; $cmd .= "-parallel " if $parallel; + $cmd .= "-pcfg " if $pcfg; # factored training if (&backoff_and_get("TRAINING:input-factors")) { diff --git a/scripts/training/phrase-extract/ExtractedRule.h b/scripts/training/phrase-extract/ExtractedRule.h index 170ccf892..be6e30836 100644 --- a/scripts/training/phrase-extract/ExtractedRule.h +++ b/scripts/training/phrase-extract/ExtractedRule.h @@ -43,6 +43,7 @@ public: int startS; int endS; float count; + double pcfgScore; std::map > m_ntLengths; @@ -58,6 +59,7 @@ public: , startS(sS) , endS(eS) , count(0) + , pcfgScore(0.0) {} void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) diff --git a/scripts/training/phrase-extract/Jamfile b/scripts/training/phrase-extract/Jamfile index 0872130f9..f8644be80 100644 --- a/scripts/training/phrase-extract/Jamfile +++ b/scripts/training/phrase-extract/Jamfile @@ -33,3 +33,5 @@ alias programs : extract extract-rules extract-lex score consolidate consolidate install legacy : programs : . 
EXE ; build-project extract-ghkm ; +build-project pcfg-extract ; +build-project pcfg-score ; diff --git a/scripts/training/phrase-extract/PhraseAlignment.cpp b/scripts/training/phrase-extract/PhraseAlignment.cpp index c0bfbde3e..ceb74f04c 100644 --- a/scripts/training/phrase-extract/PhraseAlignment.cpp +++ b/scripts/training/phrase-extract/PhraseAlignment.cpp @@ -13,6 +13,8 @@ #include "tables-core.h" #include "score.h" +#include + using namespace std; extern Vocabulary vcbT; @@ -111,6 +113,9 @@ void PhraseAlignment::create( char line[], int lineID ) } else if (item == 5) { // non-term lengths addNTLength(token[j]); + } else if (item == 6) { // target syntax PCFG score + float pcfgScore = std::atof(token[j].c_str()); + pcfgSum = pcfgScore * count; } } @@ -119,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID ) if (item == 3) { count = 1.0; } - if (item < 3 || item > 5) { + if (item < 3 || item > 6) { cerr << "ERROR: faulty line " << lineID << ": " << line << endl; } } diff --git a/scripts/training/phrase-extract/PhraseAlignment.h b/scripts/training/phrase-extract/PhraseAlignment.h index 8b8f5115c..8bd83503d 100644 --- a/scripts/training/phrase-extract/PhraseAlignment.h +++ b/scripts/training/phrase-extract/PhraseAlignment.h @@ -25,6 +25,7 @@ protected: void createAlignVec(size_t sourceSize, size_t targetSize); void addNTLength(const std::string &tok); public: + float pcfgSum; float count; std::vector< std::set > alignedToT; std::vector< std::set > alignedToS; diff --git a/scripts/training/phrase-extract/RuleExtractionOptions.h b/scripts/training/phrase-extract/RuleExtractionOptions.h index 70bb548c9..f9123de86 100644 --- a/scripts/training/phrase-extract/RuleExtractionOptions.h +++ b/scripts/training/phrase-extract/RuleExtractionOptions.h @@ -45,6 +45,7 @@ public: bool targetSyntax; bool duplicateRules; bool fractionalCounting; + bool pcfgScore; bool outputNTLengths; bool gzOutput; @@ -74,6 +75,7 @@ public: , targetSyntax(false) , 
duplicateRules(true) , fractionalCounting(true) + , pcfgScore(false) , outputNTLengths(false) , gzOutput(false) {} diff --git a/scripts/training/phrase-extract/SyntaxTree.cpp b/scripts/training/phrase-extract/SyntaxTree.cpp index e181b1e8a..f2783ffd2 100644 --- a/scripts/training/phrase-extract/SyntaxTree.cpp +++ b/scripts/training/phrase-extract/SyntaxTree.cpp @@ -42,11 +42,12 @@ void SyntaxTree::Clear() m_index.clear(); } -void SyntaxTree::AddNode( int startPos, int endPos, std::string label ) +SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label ) { SyntaxNode* newNode = new SyntaxNode( startPos, endPos, label ); m_nodes.push_back( newNode ); m_index[ startPos ][ endPos ].push_back( newNode ); + return newNode; } ParentNodes SyntaxTree::Parse() diff --git a/scripts/training/phrase-extract/SyntaxTree.h b/scripts/training/phrase-extract/SyntaxTree.h index 0ca5ca472..17c106b49 100644 --- a/scripts/training/phrase-extract/SyntaxTree.h +++ b/scripts/training/phrase-extract/SyntaxTree.h @@ -34,12 +34,14 @@ protected: std::string m_label; std::vector< SyntaxNode* > m_children; SyntaxNode* m_parent; + float m_pcfgScore; public: SyntaxNode( int startPos, int endPos, std::string label ) :m_start(startPos) ,m_end(endPos) ,m_label(label) ,m_parent(0) + ,m_pcfgScore(0.0f) {} int GetStart() const { return m_start; @@ -50,6 +52,12 @@ public: std::string GetLabel() const { return m_label; } + float GetPcfgScore() const { + return m_pcfgScore; + } + void SetPcfgScore(float score) { + m_pcfgScore = score; + } SyntaxNode *GetParent() { return m_parent; } @@ -89,11 +97,12 @@ public: } ~SyntaxTree(); + SyntaxNode *AddNode( int startPos, int endPos, std::string label ); + SyntaxNode *GetTop() { return m_top; } - void AddNode( int startPos, int endPos, std::string label ); ParentNodes Parse(); bool HasNode( int startPos, int endPos ) const; const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const; diff --git 
a/scripts/training/phrase-extract/XmlTree.cpp b/scripts/training/phrase-extract/XmlTree.cpp index 716414f86..b22c159a1 100644 --- a/scripts/training/phrase-extract/XmlTree.cpp +++ b/scripts/training/phrase-extract/XmlTree.cpp @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include "SyntaxTree.h" #include "XmlException.h" @@ -345,13 +345,18 @@ bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &label string label = ParseXmlTagAttribute(tagContent,"label"); labelCollection.insert( label ); + string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg"); + float pcfgScore = pcfgString == "" ? 0.0f + : std::atof(pcfgString.c_str()); + // report what we have processed so far if (0) { cerr << "XML TAG NAME IS: '" << tagName << "'" << endl; cerr << "XML TAG LABEL IS: '" << label << "'" << endl; cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl; } - tree.AddNode( startPos, endPos-1, label ); + SyntaxNode *node = tree.AddNode( startPos, endPos-1, label ); + node->SetPcfgScore(pcfgScore); } } } diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 0ecffae5c..6bd32a13b 100644 --- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -212,6 +212,10 @@ Node *AlignmentGraph::CopyParseTree(const ParseTree *root) std::auto_ptr n(new Node(root->GetLabel(), nodeType)); + if (nodeType == TREE) { + n->SetPcfgScore(root->GetPcfgScore()); + } + const std::vector &children = root->GetChildren(); std::vector childNodes; childNodes.reserve(children.size()); diff --git a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp index 008026e1a..397ce1e3c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ 
b/scripts/training/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -285,6 +285,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], "set maximum allowed scope") ("Minimal", "extract minimal rules only") + ("PCFG", + "include score based on PCFG scores in target corpus") ("UnknownWordLabel", po::value(&options.unknownWordFile), "write unknown word labels to named file") @@ -361,6 +363,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], if (vm.count("Minimal")) { options.minimal = true; } + if (vm.count("PCFG")) { + options.pcfg = true; + } if (vm.count("UnpairedExtractFormat")) { options.unpairedExtractFormat = true; } diff --git a/scripts/training/phrase-extract/extract-ghkm/Node.h b/scripts/training/phrase-extract/extract-ghkm/Node.h index 228fdc812..775473362 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Node.h +++ b/scripts/training/phrase-extract/extract-ghkm/Node.h @@ -41,8 +41,7 @@ class Node Node(const std::string &label, NodeType type) : m_label(label) , m_type(type) - , m_children() - , m_parents() {} + , m_pcfgScore(0.0f) {} ~Node(); @@ -50,12 +49,14 @@ class Node NodeType GetType() const { return m_type; } const std::vector &GetChildren() const { return m_children; } const std::vector &GetParents() const { return m_parents; } + float GetPcfgScore() const { return m_pcfgScore; } const Span &GetSpan() const { return m_span; } const Span &GetComplementSpan() const { return m_complementSpan; } const std::vector &GetRules() const { return m_rules; } void SetChildren(const std::vector &c) { m_children = c; } void SetParents(const std::vector &p) { m_parents = p; } + void SetPcfgScore(float s) { m_pcfgScore = s; } void SetSpan(const Span &s) { m_span = s; } void SetComplementSpan(const Span &cs) { m_complementSpan = cs; } @@ -92,6 +93,7 @@ class Node NodeType m_type; std::vector m_children; std::vector m_parents; + float m_pcfgScore; Span m_span; Span m_complementSpan; std::vector m_rules; diff --git 
a/scripts/training/phrase-extract/extract-ghkm/Options.h b/scripts/training/phrase-extract/extract-ghkm/Options.h index 108e19d66..c4b57f311 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Options.h +++ b/scripts/training/phrase-extract/extract-ghkm/Options.h @@ -36,6 +36,7 @@ struct Options { , maxRuleSize(3) , maxScope(3) , minimal(false) + , pcfg(false) , unpairedExtractFormat(false) {} // Positional options @@ -53,6 +54,7 @@ struct Options { int maxRuleSize; int maxScope; bool minimal; + bool pcfg; bool unpairedExtractFormat; std::string unknownWordFile; }; diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h index ec6fc147a..273e2e04e 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h +++ b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h @@ -32,17 +32,19 @@ class ParseTree public: ParseTree(const std::string &label) : m_label(label) - , m_children() - , m_parent() {} + , m_parent(0) + , m_pcfgScore(0.0) {} ~ParseTree(); const std::string &GetLabel() const { return m_label; } const std::vector &GetChildren() const { return m_children; } const ParseTree *GetParent() const { return m_parent; } + float GetPcfgScore() const { return m_pcfgScore; } void SetParent(ParseTree *); void SetChildren(const std::vector &); + void SetPcfgScore(float score) { m_pcfgScore = score; } void AddChild(ParseTree *); @@ -59,6 +61,7 @@ class ParseTree std::string m_label; std::vector m_children; ParseTree *m_parent; + float m_pcfgScore; // log probability }; template diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp index 8473e4283..5dc70052c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -30,6 +30,7 @@ namespace GHKM { ScfgRule::ScfgRule(const Subgraph &fragment) : m_sourceLHS("X", NonTerminal) , 
m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal) + , m_pcfgScore(fragment.GetPcfgScore()) { // Source RHS diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h index 1ed534d9e..2405d8fa3 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRule.h @@ -57,6 +57,7 @@ class ScfgRule const std::vector &GetSourceRHS() const { return m_sourceRHS; } const std::vector &GetTargetRHS() const { return m_targetRHS; } const Alignment &GetAlignment() const { return m_alignment; } + float GetPcfgScore() const { return m_pcfgScore; } int Scope() const; @@ -68,6 +69,7 @@ class ScfgRule std::vector m_sourceRHS; std::vector m_targetRHS; Alignment m_alignment; + float m_pcfgScore; }; } // namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index 4be3f048d..d5d16b790 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -24,6 +24,7 @@ #include "ScfgRule.h" #include +#include #include #include #include @@ -34,14 +35,43 @@ namespace GHKM { void ScfgRuleWriter::Write(const ScfgRule &rule) { + std::ostringstream sourceSS; + std::ostringstream targetSS; + if (m_options.unpairedExtractFormat) { - WriteUnpairedFormat(rule); + WriteUnpairedFormat(rule, sourceSS, targetSS); } else { - WriteStandardFormat(rule); + WriteStandardFormat(rule, sourceSS, targetSS); + } + + // Write the rule to the forward and inverse extract files. 
+ m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; + m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; + + const Alignment &alignment = rule.GetAlignment(); + for (Alignment::const_iterator p = alignment.begin(); + p != alignment.end(); ++p) { + m_fwd << " " << p->first << "-" << p->second; + m_inv << " " << p->second << "-" << p->first; + } + + // Write a count of 1 and an empty NT length column to the forward extract + // file. + // TODO Add option to write NT length? + m_fwd << " ||| 1 ||| |||"; + if (m_options.pcfg) { + // Write the PCFG score. + m_fwd << " " << std::exp(rule.GetPcfgScore()); } + m_fwd << std::endl; + + // Write a count of 1 to the inverse extract file. + m_inv << " ||| 1" << std::endl; } -void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) +void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule, + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); @@ -60,9 +90,6 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) } } - std::ostringstream sourceSS; - std::ostringstream targetSS; - // Write the source side of the rule to sourceSS. int i = 0; for (std::vector::const_iterator p(sourceRHS.begin()); @@ -88,27 +115,14 @@ void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule) targetSS << " "; } WriteSymbol(rule.GetTargetLHS(), targetSS); - - // Write the rule to the forward and inverse extract files. 
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; - m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; - for (Alignment::const_iterator p(alignment.begin()); - p != alignment.end(); ++p) { - m_fwd << " " << p->first << "-" << p->second; - m_inv << " " << p->second << "-" << p->first; - } - m_fwd << " ||| 1" << std::endl; - m_inv << " ||| 1" << std::endl; } -void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule) +void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule, + std::ostream &sourceSS, + std::ostream &targetSS) { const std::vector &sourceRHS = rule.GetSourceRHS(); const std::vector &targetRHS = rule.GetTargetRHS(); - const Alignment &alignment = rule.GetAlignment(); - - std::ostringstream sourceSS; - std::ostringstream targetSS; // Write the source side of the rule to sourceSS. int i = 0; @@ -127,17 +141,6 @@ void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule) targetSS << " "; } WriteSymbol(rule.GetTargetLHS(), targetSS); - - // Write the rule to the forward and inverse extract files. 
- m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||"; - m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||"; - for (Alignment::const_iterator p(alignment.begin()); - p != alignment.end(); ++p) { - m_fwd << " " << p->first << "-" << p->second; - m_inv << " " << p->second << "-" << p->first; - } - m_fwd << " ||| 1" << std::endl; - m_inv << " ||| 1" << std::endl; } void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) diff --git a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h index 738d09ce9..b92a432a1 100644 --- a/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h +++ b/scripts/training/phrase-extract/extract-ghkm/ScfgRuleWriter.h @@ -45,8 +45,8 @@ class ScfgRuleWriter ScfgRuleWriter(const ScfgRuleWriter &); ScfgRuleWriter &operator=(const ScfgRuleWriter &); - void WriteStandardFormat(const ScfgRule &); - void WriteUnpairedFormat(const ScfgRule &); + void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &); + void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &); void WriteSymbol(const Symbol &, std::ostream &); std::ostream &m_fwd; diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp index e5aedbb16..e048f2c55 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp @@ -101,5 +101,21 @@ int Subgraph::CalcDepth(const Node *n) const return maxChildDepth + 1; } +float Subgraph::CalcPcfgScore() const +{ + if (m_root->GetType() != TREE || m_leaves.empty()) { + return 0.0f; + } + float score = m_root->GetPcfgScore(); + for (std::set::const_iterator p = m_leaves.begin(); + p != m_leaves.end(); ++p) { + const Node *leaf = *p; + if (leaf->GetType() == TREE) { + score -= leaf->GetPcfgScore(); + } + } + return score; +} + } // namespace Moses } // 
namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h index e84903502..ede1233e9 100644 --- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h +++ b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h @@ -38,7 +38,8 @@ class Subgraph : m_root(root) , m_depth(0) , m_size(root->GetType() == TREE ? 1 : 0) - , m_nodeCount(1) {} + , m_nodeCount(1) + , m_pcfgScore(0.0f) {} Subgraph(const Node *root, const std::set &leaves) : m_root(root) @@ -46,10 +47,12 @@ class Subgraph , m_depth(-1) , m_size(-1) , m_nodeCount(-1) + , m_pcfgScore(0.0f) { m_depth = CalcDepth(m_root); m_size = CalcSize(m_root); m_nodeCount = CountNodes(m_root); + m_pcfgScore = CalcPcfgScore(); } const Node *GetRoot() const { return m_root; } @@ -57,6 +60,7 @@ class Subgraph int GetDepth() const { return m_depth; } int GetSize() const { return m_size; } int GetNodeCount() const { return m_nodeCount; } + float GetPcfgScore() const { return m_pcfgScore; } bool IsTrivial() const { return m_leaves.empty(); } @@ -66,6 +70,7 @@ class Subgraph void GetTargetLeaves(const Node *, std::vector &) const; int CalcDepth(const Node *) const; int CalcSize(const Node *) const; + float CalcPcfgScore() const; int CountNodes(const Node *) const; const Node *m_root; @@ -73,6 +78,7 @@ class Subgraph int m_depth; int m_size; int m_nodeCount; + float m_pcfgScore; }; } // namespace GHKM diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp index 31c0e3843..cc961dc0c 100644 --- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp +++ b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp @@ -61,6 +61,7 @@ std::auto_ptr XmlTreeParser::ConvertTree( const std::vector &words) { std::auto_ptr root(new ParseTree(tree.GetLabel())); + root->SetPcfgScore(tree.GetPcfgScore()); const std::vector &children = tree.GetChildren(); if 
(children.empty()) { if (tree.GetStart() != tree.GetEnd()) { diff --git a/scripts/training/phrase-extract/extract-rules.cpp b/scripts/training/phrase-extract/extract-rules.cpp index 2cc9dc54d..a00667b82 100644 --- a/scripts/training/phrase-extract/extract-rules.cpp +++ b/scripts/training/phrase-extract/extract-rules.cpp @@ -90,7 +90,7 @@ void addHieroRule( int startT, int endT, int startS, int endS void printHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, LabelIndex &labelIndex); string printTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex); + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore); string printSourceHieroPhrase( int startT, int endT, int startS, int endS , HoleCollection &holeColl, const LabelIndex &labelIndex); void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS @@ -257,6 +257,8 @@ int main(int argc, char* argv[]) // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 else if (strcmp(argv[i],"--NoFractionalCounting") == 0) { options.fractionalCounting = false; + } else if (strcmp(argv[i],"--PCFG") == 0) { + options.pcfgScore = true; } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { options.outputNTLengths = true; #ifdef WITH_THREADS @@ -517,7 +519,7 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, } string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS - , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex) + , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore) { HoleList::iterator iterHoleList = holeColl.GetHoles().begin(); assert(iterHoleList != holeColl.GetHoles().end()); @@ -545,6 +547,11 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in out += "[" + 
sourceLabel + "][" + targetLabel + "] "; + if (m_options.pcfgScore) { + double score = m_sentence->targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetPcfgScore(); + logPCFGScore -= score; + } + currPos = hole.GetEnd(1); hole.SetPos(outPos, 1); ++iterHoleList; @@ -658,8 +665,16 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex); // target - rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex) + if (m_options.pcfgScore) { + double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore(); + rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore) + + " [" + targetLabel + "]"; + rule.pcfgScore = std::exp(logPCFGScore); + } else { + double logPCFGScore = 0.0f; + rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore) + " [" + targetLabel + "]"; + } // source // holeColl.SortSourceHoles(); @@ -877,6 +892,11 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist rule.target += m_sentence->target[ti] + " "; rule.target += "[" + targetLabel + "]"; + if (m_options.pcfgScore) { + double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore(); + rule.pcfgScore = std::exp(logPCFGScore); + } + // alignment for(int ti=startT; ti<=endT; ti++) { for(unsigned int i=0; ialignedToT[ti].size(); i++) { @@ -957,11 +977,13 @@ void ExtractTask::writeRulesToFile() out << rule->source << " ||| " << rule->target << " ||| " << rule->alignment << " ||| " - << rule->count; + << rule->count << " ||| "; if (m_options.outputNTLengths) { - out << " ||| "; rule->OutputNTLengths(out); } + if (m_options.pcfgScore) { + out << " ||| " << rule->pcfgScore; + } out << "\n"; if (!m_options.onlyDirectFlag) { diff --git 
a/scripts/training/phrase-extract/pcfg-common/Jamfile b/scripts/training/phrase-extract/pcfg-common/Jamfile new file mode 100644 index 000000000..3dc272a56 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/Jamfile @@ -0,0 +1 @@ +lib pcfg_common : [ glob *.cc ] ..//trees ; diff --git a/scripts/training/phrase-extract/pcfg-common/exception.h b/scripts/training/phrase-extract/pcfg-common/exception.h new file mode 100644 index 000000000..3dbd59d0e --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/exception.h @@ -0,0 +1,41 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXCEPTION_H_ +#define PCFG_EXCEPTION_H_ + +#include + +namespace Moses { +namespace PCFG { + +class Exception { + public: + Exception(const char *msg) : msg_(msg) {} + Exception(const std::string &msg) : msg_(msg) {} + const std::string &msg() const { return msg_; } + private: + std::string msg_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/numbered_set.h b/scripts/training/phrase-extract/pcfg-common/numbered_set.h new file mode 100644 index 000000000..f88d710ed --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/numbered_set.h @@ -0,0 +1,109 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_NUMBERED_SET_H_ +#define PCFG_NUMBERED_SET_H_ + +#include "exception.h" + +#include + +#include +#include +#include + +namespace Moses { +namespace PCFG { + +// Stores a set of elements of type T, each of which is allocated an integral +// ID of type I. IDs are contiguous starting at 0. Individual elements cannot +// be removed once inserted (but the whole set can be cleared). +template +class NumberedSet { + private: + typedef boost::unordered_map ElementToIdMap; + typedef std::vector IdToElementMap; + + public: + typedef I IdType; + typedef typename IdToElementMap::const_iterator const_iterator; + + NumberedSet() {} + + const_iterator begin() const { return id_to_element_.begin(); } + const_iterator end() const { return id_to_element_.end(); } + + // Static value + static I NullId() { return std::numeric_limits::max(); } + + bool Empty() const { return id_to_element_.empty(); } + size_t Size() const { return id_to_element_.size(); } + + // Insert the given object and return its ID. + I Insert(const T &); + + I Lookup(const T &) const; + const T &Lookup(I) const; + + void Clear(); + + private: + ElementToIdMap element_to_id_; + IdToElementMap id_to_element_; +}; + +template +I NumberedSet::Lookup(const T &s) const { + typename ElementToIdMap::const_iterator p = element_to_id_.find(s); + return (p == element_to_id_.end()) ? 
NullId() : p->second; +} + +template +const T &NumberedSet::Lookup(I id) const { + if (id < 0 || id >= id_to_element_.size()) { + std::ostringstream msg; + msg << "Value not found: " << id; + throw Exception(msg.str()); + } + return *(id_to_element_[id]); +} + +template +I NumberedSet::Insert(const T &x) { + std::pair value(x, id_to_element_.size()); + std::pair result = + element_to_id_.insert(value); + if (result.second) { + // x is a new element. + id_to_element_.push_back(&result.first->first); + } + return result.first->second; +} + +template +void NumberedSet::Clear() { + element_to_id_.clear(); + id_to_element_.clear(); +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.cc b/scripts/training/phrase-extract/pcfg-common/pcfg.cc new file mode 100644 index 000000000..d045b820b --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.cc @@ -0,0 +1,106 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg.h" + +#include "exception.h" + +#include +#include + +#include + +namespace Moses { +namespace PCFG { + +void Pcfg::Add(const Key &key, double score) { + rules_[key] = score; +} + +bool Pcfg::Lookup(const Key &key, double &score) const { + Map::const_iterator p = rules_.find(key); + if (p == rules_.end()) { + return false; + } + score = p->second; + return true; +} + +void Pcfg::Read(std::istream &input, Vocabulary &vocab) { + std::string line; + std::string lhs_string; + std::vector rhs_strings; + std::string score_string; + Key key; + while (std::getline(input, line)) { + // Read LHS. + size_t pos = line.find("|||"); + if (pos == std::string::npos) { + throw Exception("missing first delimiter"); + } + lhs_string = line.substr(0, pos); + boost::trim(lhs_string); + + // Read RHS. + size_t begin = pos+3; + pos = line.find("|||", begin); + if (pos == std::string::npos) { + throw Exception("missing second delimiter"); + } + std::string rhs_text = line.substr(begin, pos-begin); + boost::trim(rhs_text); + rhs_strings.clear(); + boost::split(rhs_strings, rhs_text, boost::algorithm::is_space(), + boost::algorithm::token_compress_on); + + // Read score. + score_string = line.substr(pos+3); + boost::trim(score_string); + + // Construct key. + key.clear(); + key.reserve(rhs_strings.size()+1); + key.push_back(vocab.Insert(lhs_string)); + for (std::vector::const_iterator p = rhs_strings.begin(); + p != rhs_strings.end(); ++p) { + key.push_back(vocab.Insert(*p)); + } + + // Add rule. 
+ double score = boost::lexical_cast(score_string); + Add(key, score); + } +} + +void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const { + for (const_iterator p = begin(); p != end(); ++p) { + const Key &key = p->first; + double score = p->second; + std::vector::const_iterator q = key.begin(); + std::vector::const_iterator end = key.end(); + output << vocab.Lookup(*q++) << " |||"; + while (q != end) { + output << " " << vocab.Lookup(*q++); + } + output << " ||| " << score << std::endl; + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg.h b/scripts/training/phrase-extract/pcfg-common/pcfg.h new file mode 100644 index 000000000..757eea449 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg.h @@ -0,0 +1,61 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_PCFG_H_ +#define PCFG_PCFG_H_ + +#include "typedef.h" + +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +class Pcfg { + public: + typedef std::vector Key; + typedef std::map Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + Pcfg() {} + + iterator begin() { return rules_.begin(); } + const_iterator begin() const { return rules_.begin(); } + + iterator end() { return rules_.end(); } + const_iterator end() const { return rules_.end(); } + + void Add(const Key &, double); + bool Lookup(const Key &, double &) const; + void Read(std::istream &, Vocabulary &); + void Write(const Vocabulary &, std::ostream &) const; + + private: + Map rules_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h new file mode 100644 index 000000000..bdac64dfc --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/pcfg_tree.h @@ -0,0 +1,77 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_PCFG_TREE_H_ +#define PCFG_PCFG_TREE_H_ + +#include "syntax_tree.h" +#include "xml_tree_writer.h" + +#include + +namespace Moses { +namespace PCFG { + +template +class PcfgTreeBase : public SyntaxTreeBase { + public: + typedef std::string LabelType; + typedef SyntaxTreeBase BaseType; + + PcfgTreeBase(const LabelType &label) : BaseType(label), score_(0.0) {} + + double score() const { return score_; } + void set_score(double s) { score_ = s; } + + private: + double score_; +}; + +class PcfgTree : public PcfgTreeBase { + public: + typedef PcfgTreeBase BaseType; + PcfgTree(const BaseType::LabelType &label) : BaseType(label) {} +}; + +// Specialise XmlOutputHandler for PcfgTree. 
+template<> +class XmlOutputHandler { + public: + typedef std::map AttributeMap; + + void GetLabel(const PcfgTree &tree, std::string &label) const { + label = tree.label(); + } + + void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const { + attribute_map.clear(); + double score = tree.score(); + if (score != 0.0) { + std::ostringstream out; + out << tree.score(); + attribute_map["pcfg"] = out.str(); + } + } +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/syntax_tree.h b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h new file mode 100644 index 000000000..37f72dd58 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/syntax_tree.h @@ -0,0 +1,91 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SYNTAX_TREE_H_ +#define PCFG_SYNTAX_TREE_H_ + +#include +#include + +namespace Moses { +namespace PCFG { + +// Base class for SyntaxTree, AgreementTree, and friends. 
+template +class SyntaxTreeBase { + public: + // Constructors + SyntaxTreeBase(const T &label) + : label_(label) + , children_() + , parent_(0) {} + + SyntaxTreeBase(const T &label, const std::vector &children) + : label_(label) + , children_(children) + , parent_(0) {} + + // Destructor + virtual ~SyntaxTreeBase(); + + const T &label() const { return label_; } + const DerivedType *parent() const { return parent_; } + DerivedType *parent() { return parent_; } + const std::vector &children() const { return children_; } + std::vector &children() { return children_; } + + void set_label(const T &label) { label_ = label; } + void set_parent(DerivedType *parent) { parent_ = parent; } + void set_children(const std::vector &c) { children_ = c; } + + bool IsLeaf() const { return children_.empty(); } + + bool IsPreterminal() const { + return children_.size() == 1 && children_[0]->IsLeaf(); + } + + void AddChild(DerivedType *child) { children_.push_back(child); } + + private: + T label_; + std::vector children_; + DerivedType *parent_; +}; + +template +class SyntaxTree : public SyntaxTreeBase > { + public: + typedef SyntaxTreeBase > BaseType; + SyntaxTree(const T &label) : BaseType(label) {} + SyntaxTree(const T &label, const std::vector &children) + : BaseType(label, children) {} +}; + +template +SyntaxTreeBase::~SyntaxTreeBase() { + for (size_t i = 0; i < children_.size(); ++i) { + delete children_[i]; + } +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/tool.cc b/scripts/training/phrase-extract/pcfg-common/tool.cc new file mode 100644 index 000000000..bebd220e1 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/tool.cc @@ -0,0 +1,80 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the 
terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "tool.h" + +#include + +namespace Moses { +namespace PCFG { + +std::istream &Tool::OpenInputOrDie(const std::string &filename) { + // TODO Check that function is only called once? + if (filename.empty() || filename == "-") { + input_ptr_ = &(std::cin); + } else { + input_file_stream_.open(filename.c_str()); + if (!input_file_stream_) { + std::ostringstream msg; + msg << "failed to open input file: " << filename; + Error(msg.str()); + } + input_ptr_ = &input_file_stream_; + } + return *input_ptr_; +} + +std::ostream &Tool::OpenOutputOrDie(const std::string &filename) { + // TODO Check that function is only called once? 
+ if (filename.empty() || filename == "-") { + output_ptr_ = &(std::cout); + } else { + output_file_stream_.open(filename.c_str()); + if (!output_file_stream_) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } + output_ptr_ = &output_file_stream_; + } + return *output_ptr_; +} + +void Tool::OpenNamedInputOrDie(const std::string &filename, + std::ifstream &stream) { + stream.open(filename.c_str()); + if (!stream) { + std::ostringstream msg; + msg << "failed to open input file: " << filename; + Error(msg.str()); + } +} + +void Tool::OpenNamedOutputOrDie(const std::string &filename, + std::ofstream &stream) { + stream.open(filename.c_str()); + if (!stream) { + std::ostringstream msg; + msg << "failed to open output file: " << filename; + Error(msg.str()); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/tool.h b/scripts/training/phrase-extract/pcfg-common/tool.h new file mode 100644 index 000000000..0af342569 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/tool.h @@ -0,0 +1,91 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_TOOL_H_ +#define PCFG_TOOL_H_ + +#include + +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +class Tool { + public: + virtual ~Tool() {} + + const std::string &name() const { return name_; } + + virtual int Main(int argc, char *argv[]) = 0; + + protected: + Tool(const std::string &name) : name_(name) {} + + // Returns the boost::program_options style that should be used by all tools. + static int CommonOptionStyle() { + namespace cls = boost::program_options::command_line_style; + return cls::default_style & (~cls::allow_guessing); + } + + void Warn(const std::string &msg) const { + std::cerr << name_ << ": warning: " << msg << std::endl; + } + + void Error(const std::string &msg) const { + std::cerr << name_ << ": error: " << msg << std::endl; + std::exit(1); + } + + // Initialises the tool's main input stream and returns a reference that is + // valid for the remainder of the tool's lifetime. If filename is empty or + // "-" then input is standard input; otherwise it is the named file. Calls + // Error() if the file cannot be opened for reading. + std::istream &OpenInputOrDie(const std::string &filename); + + // Initialises the tool's main output stream and returns a reference that is + // valid for the remainder of the tool's lifetime. If filename is empty or + // "-" then output is standard output; otherwise it is the named file. Calls + // Error() if the file cannot be opened for writing. + std::ostream &OpenOutputOrDie(const std::string &filename); + + // Opens the named input file using the supplied ifstream. Calls Error() if + // the file cannot be opened for reading. 
+ void OpenNamedInputOrDie(const std::string &, std::ifstream &); + + // Opens the named output file using the supplied ofstream. Calls Error() if + // the file cannot be opened for writing. + void OpenNamedOutputOrDie(const std::string &, std::ofstream &); + + private: + std::string name_; + std::istream *input_ptr_; + std::ifstream input_file_stream_; + std::ostream *output_ptr_; + std::ofstream output_file_stream_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/typedef.h b/scripts/training/phrase-extract/pcfg-common/typedef.h new file mode 100644 index 000000000..49a12d681 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/typedef.h @@ -0,0 +1,37 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_TYPEDEF_H_ +#define PCFG_TYPEDEF_H_ + +#include "numbered_set.h" +#include "syntax_tree.h" + +#include + +namespace Moses { +namespace PCFG { + +typedef NumberedSet Vocabulary; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc new file mode 100644 index 000000000..5c596a0fb --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.cc @@ -0,0 +1,85 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "xml_tree_parser.h" + +#include "exception.h" +#include "tables-core.h" +#include "XmlException.h" +#include "XmlTree.h" + +#include +#include + +namespace Moses { +namespace PCFG { + +XmlTreeParser::XmlTreeParser() +{ +} + +std::auto_ptr XmlTreeParser::Parse(const std::string &line) +{ + m_line = line; + m_tree.Clear(); + try { + if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) { + throw Exception(""); + } + } catch (const XmlException &e) { + throw Exception(e.getMsg()); + } + m_tree.ConnectNodes(); + SyntaxNode *root = m_tree.GetTop(); + assert(root); + m_words = tokenize(m_line.c_str()); + return ConvertTree(*root, m_words); +} + +// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree. 
+std::auto_ptr XmlTreeParser::ConvertTree( + const SyntaxNode &tree, + const std::vector &words) +{ + std::auto_ptr root(new PcfgTree(tree.GetLabel())); + const std::vector &children = tree.GetChildren(); + if (children.empty()) { + if (tree.GetStart() != tree.GetEnd()) { + std::ostringstream msg; + msg << "leaf node covers multiple words (" << tree.GetStart() + << "-" << tree.GetEnd() << "): this is currently unsupported"; + throw Exception(msg.str()); + } + std::auto_ptr leaf(new PcfgTree(words[tree.GetStart()])); + leaf->set_parent(root.get()); + root->AddChild(leaf.release()); + } else { + for (std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + assert(*p); + std::auto_ptr child = ConvertTree(**p, words); + child->set_parent(root.get()); + root->AddChild(child.release()); + } + } + return root; +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h new file mode 100644 index 000000000..6b418c44e --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_parser.h @@ -0,0 +1,56 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_XML_TREE_PARSER_H_ +#define PCFG_XML_TREE_PARSER_H_ + +#include "pcfg_tree.h" +#include "SyntaxTree.h" + +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +// Parses a string in Moses' XML parse tree format and returns a PcfgTree +// object. +class XmlTreeParser { + public: + XmlTreeParser(); + std::auto_ptr Parse(const std::string &); + private: + std::auto_ptr ConvertTree(const SyntaxNode &, + const std::vector &); + + std::set m_labelSet; + std::map m_topLabelSet; + std::string m_line; + ::SyntaxTree m_tree; + std::vector m_words; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h new file mode 100644 index 000000000..347c352bb --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-common/xml_tree_writer.h @@ -0,0 +1,127 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_XML_TREE_WRITER_H_ +#define PCFG_XML_TREE_WRITER_H_ + +#include "syntax_tree.h" + +#include "XmlTree.h" + +#include +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +template +class XmlOutputHandler { + public: + typedef std::map AttributeMap; + + void GetLabel(const InputTree &, std::string &) const; + void GetAttributes(const InputTree &, AttributeMap &) const; +}; + +template +class XmlTreeWriter : public XmlOutputHandler { + public: + typedef XmlOutputHandler Base; + void Write(const InputTree &, std::ostream &) const; + private: + std::string Escape(const std::string &) const; +}; + +template +void XmlTreeWriter::Write(const InputTree &tree, + std::ostream &out) const { + assert(!tree.IsLeaf()); + + // Opening tag + + std::string label; + Base::GetLabel(tree, label); + out << "first << "=\"" << p->second << "\""; + } + + out << ">"; + + // Children + + const std::vector &children = tree.children(); + for (typename std::vector::const_iterator p = children.begin(); + p != children.end(); ++p) { + InputTree &child = **p; + if (child.IsLeaf()) { + Base::GetLabel(child, label); + out << " " << Escape(label); + } else { + out << " "; + Write(**p, out); + } + } + + // Closing tag + out << " "; + + if (tree.parent() == 0) { + out << std::endl; + } +} + +// Escapes XML special characters. 
+template +std::string XmlTreeWriter::Escape(const std::string &s) const { + std::string t; + size_t len = s.size(); + t.reserve(len); + for (size_t i = 0; i < len; ++i) { + if (s[i] == '<') { + t += "<"; + } else if (s[i] == '>') { + t += ">"; + } else if (s[i] == '&') { + t += "&"; + } else if (s[i] == '\'') { + t += "'"; + } else if (s[i] == '"') { + t += """; + } else { + t += s[i]; + } + } + return t; +} + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/Jamfile b/scripts/training/phrase-extract/pcfg-extract/Jamfile new file mode 100644 index 000000000..be91d6d2f --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/Jamfile @@ -0,0 +1 @@ +exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ; diff --git a/scripts/training/phrase-extract/pcfg-extract/main.cc b/scripts/training/phrase-extract/pcfg-extract/main.cc new file mode 100644 index 000000000..47b45afc3 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/main.cc @@ -0,0 +1,25 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_extract.h" + +int main(int argc, char *argv[]) { + Moses::PCFG::PcfgExtract tool; + return tool.Main(argc, argv); +} diff --git a/scripts/training/phrase-extract/pcfg-extract/options.h b/scripts/training/phrase-extract/pcfg-extract/options.h new file mode 100644 index 000000000..3acb31b58 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/options.h @@ -0,0 +1,36 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_OPTIONS_H_ +#define PCFG_EXTRACT_OPTIONS_H_ + +#include + +namespace Moses { +namespace PCFG { + +struct Options { + std::string corpus_file; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc new file mode 100644 index 000000000..151c9959c --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.cc @@ -0,0 +1,131 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_extract.h" + +#include "options.h" +#include "rule_collection.h" +#include "rule_extractor.h" + +#include "pcfg-common/exception.h" +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/syntax_tree.h" +#include "pcfg-common/typedef.h" +#include "pcfg-common/xml_tree_parser.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +int PcfgExtract::Main(int argc, char *argv[]) { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); + + // Extract PCFG rules from corpus. + Vocabulary non_term_vocab; + RuleExtractor rule_extractor(non_term_vocab); + RuleCollection rule_collection; + XmlTreeParser parser; + std::string line; + size_t line_num = 0; + std::auto_ptr tree; + while (std::getline(std::cin, line)) { + ++line_num; + try { + tree = parser.Parse(line); + } catch (Exception &e) { + std::ostringstream msg; + msg << "line " << line_num << ": " << e.msg(); + Error(msg.str()); + } + if (!tree.get()) { + std::ostringstream msg; + msg << "no tree at line " << line_num; + Warn(msg.str()); + continue; + } + rule_extractor.Extract(*tree, rule_collection); + } + + // Score rules and write PCFG to output. + Pcfg pcfg; + rule_collection.CreatePcfg(pcfg); + pcfg.Write(non_term_vocab, std::cout); + + return 0; +} + +void PcfgExtract::ProcessOptions(int argc, char *argv[], + Options &options) const { + namespace po = boost::program_options; + + std::ostringstream usage_top; + usage_top << "Usage: " << name() << "\n\n" << "Options"; + + // Declare the command line options that are visible to the user. 
+ po::options_description visible(usage_top.str()); + visible.add_options() + ("help", "print help message and exit") + ; + + // Declare the command line options that are hidden from the user + // (these are used as positional options). + po::options_description hidden("Hidden options"); + hidden.add_options(); + + // Compose the full set of command-line options. + po::options_description cmd_line_options; + cmd_line_options.add(visible).add(hidden); + + // Register the positional options. + po::positional_options_description p; + + // Process the command-line. + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). + options(cmd_line_options).positional(p).run(), vm); + po::notify(vm); + } catch (const std::exception &e) { + std::ostringstream msg; + msg << e.what() << "\n\n" << visible; + Error(msg.str()); + } + + if (vm.count("help")) { + std::cout << visible << std::endl; + std::exit(0); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h new file mode 100644 index 000000000..1af6cb4fe --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/pcfg_extract.h @@ -0,0 +1,42 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_ +#define PCFG_EXTRACT_PCFG_EXTRACT_H_ + +#include "pcfg-common/tool.h" + +namespace Moses { +namespace PCFG { + +class Options; + +class PcfgExtract : public Tool { + public: + PcfgExtract() : Tool("pcfg-extract") {} + virtual int Main(int, char *[]); + private: + void ProcessOptions(int, char *[], Options &) const; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc new file mode 100644 index 000000000..503b1a9e6 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.cc @@ -0,0 +1,58 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "rule_collection.h" + +#include "pcfg-common/pcfg.h" + +#include + +namespace Moses { +namespace PCFG { + +void RuleCollection::Add(size_t lhs, const std::vector &rhs) { + ++collection_[lhs][rhs]; +} + +void RuleCollection::CreatePcfg(Pcfg &pcfg) { + std::vector key; + for (const_iterator p = begin(); p != end(); ++p) { + size_t lhs = p->first; + const RhsCountMap &rhs_counts = p->second; + size_t total = 0; + for (RhsCountMap::const_iterator q = rhs_counts.begin(); + q != rhs_counts.end(); ++q) { + total += q->second; + } + for (RhsCountMap::const_iterator q = rhs_counts.begin(); + q != rhs_counts.end(); ++q) { + const std::vector &rhs = q->first; + size_t count = q->second; + double score = std::log(static_cast(count) / + static_cast(total)); + key.clear(); + key.push_back(lhs); + key.insert(key.end(), rhs.begin(), rhs.end()); + pcfg.Add(key, score); + } + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_collection.h b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h new file mode 100644 index 000000000..1b768dd21 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_collection.h @@ -0,0 +1,59 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. 
+ + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_ +#define PCFG_EXTRACT_RULE_COLLECTION_H_ + +#include "pcfg-common/pcfg.h" + +#include + +#include + +namespace Moses { +namespace PCFG { + +// Contains PCFG rules and their counts. +class RuleCollection { + public: + typedef boost::unordered_map, size_t> RhsCountMap; + typedef boost::unordered_map Map; + typedef Map::iterator iterator; + typedef Map::const_iterator const_iterator; + + RuleCollection() {} + + iterator begin() { return collection_.begin(); } + const_iterator begin() const { return collection_.begin(); } + + iterator end() { return collection_.end(); } + const_iterator end() const { return collection_.end(); } + + void Add(size_t, const std::vector &); + void CreatePcfg(Pcfg &); + + private: + Map collection_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc new file mode 100644 index 000000000..48a82a6d0 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.cc @@ -0,0 +1,51 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as 
published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "rule_extractor.h" + +#include "pcfg-common/pcfg_tree.h" + +namespace Moses { +namespace PCFG { + +RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab) + : non_term_vocab_(non_term_vocab) { +} + +void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const { + if (tree.IsPreterminal() || tree.IsLeaf()) { + return; + } + + size_t lhs = non_term_vocab_.Insert(tree.label()); + std::vector rhs; + + const std::vector &children = tree.children(); + rhs.reserve(children.size()); + for (std::vector::const_iterator p(children.begin()); + p != children.end(); ++p) { + const PcfgTree &child = **p; + rhs.push_back(non_term_vocab_.Insert(child.label())); + Extract(child, rc); + } + rc.Add(lhs, rhs); +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h new file mode 100644 index 000000000..6bcffbc61 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-extract/rule_extractor.h @@ -0,0 +1,45 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU 
Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
+#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
+
+#include "rule_collection.h"
+
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+class PcfgTree;
+
+// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
+class RuleExtractor {
+ public:
+  // The vocabulary is held by reference; Extract() inserts node labels
+  // into it.
+  RuleExtractor(Vocabulary &);
+  // Adds the rules found in the given tree to the given collection.
+  void Extract(const PcfgTree &, RuleCollection &) const;
+ private:
+  Vocabulary &non_term_vocab_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/pcfg-score/Jamfile b/scripts/training/phrase-extract/pcfg-score/Jamfile
new file mode 100644
index 000000000..7225381c0
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/Jamfile
@@ -0,0 +1 @@
+exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../../../..//boost_program_options ;
diff --git a/scripts/training/phrase-extract/pcfg-score/main.cc b/scripts/training/phrase-extract/pcfg-score/main.cc
new file mode 100644
index 000000000..da5392add
--- /dev/null
+++ b/scripts/training/phrase-extract/pcfg-score/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library
is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_score.h" + +int main(int argc, char *argv[]) { + Moses::PCFG::PcfgScore tool; + return tool.Main(argc, argv); +} diff --git a/scripts/training/phrase-extract/pcfg-score/options.h b/scripts/training/phrase-extract/pcfg-score/options.h new file mode 100644 index 000000000..e54b2a0b9 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/options.h @@ -0,0 +1,36 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_OPTIONS_H_ +#define PCFG_SCORE_OPTIONS_H_ + +#include + +namespace Moses { +namespace PCFG { + +struct Options { + std::string pcfg_file; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc new file mode 100644 index 000000000..d780200ad --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.cc @@ -0,0 +1,152 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "pcfg_score.h" + +#include "options.h" +#include "tree_scorer.h" + +#include "pcfg-common/exception.h" +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/syntax_tree.h" +#include "pcfg-common/typedef.h" +#include "pcfg-common/xml_tree_parser.h" + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace Moses { +namespace PCFG { + +int PcfgScore::Main(int argc, char *argv[]) { + // Process command-line options. + Options options; + ProcessOptions(argc, argv, options); + + // Open PCFG stream. + std::ifstream pcfg_stream; + OpenNamedInputOrDie(options.pcfg_file, pcfg_stream); + + // Read PCFG. + Pcfg pcfg; + Vocabulary non_term_vocab; + pcfg.Read(pcfg_stream, non_term_vocab); + + // Score corpus according to PCFG. 
+ TreeScorer scorer(pcfg, non_term_vocab); + XmlTreeParser parser; + XmlTreeWriter writer; + std::string line; + size_t line_num = 0; + std::auto_ptr tree; + while (std::getline(std::cin, line)) { + ++line_num; + try { + tree = parser.Parse(line); + } catch (Exception &e) { + std::ostringstream msg; + msg << "line " << line_num << ": " << e.msg(); + Error(msg.str()); + } + if (!tree.get()) { + std::ostringstream msg; + msg << "no tree at line " << line_num; + Warn(msg.str()); + std::cout << std::endl; + continue; + } + if (!scorer.Score(*tree)) { + std::ostringstream msg; + msg << "failed to score tree at line " << line_num; + Warn(msg.str()); + std::cout << std::endl; + continue; + } + writer.Write(*tree, std::cout); + } + + return 0; +} + +void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const { + namespace po = boost::program_options; + + std::ostringstream usage_top; + usage_top << "Usage: " << name() << " PCFG\n\n" + << "Options"; + + // Declare the command line options that are visible to the user. + po::options_description visible(usage_top.str()); + visible.add_options() + ("help", "print help message and exit") + ; + + // Declare the command line options that are hidden from the user + // (these are used as positional options). + po::options_description hidden("Hidden options"); + hidden.add_options() + ("pcfg-file", po::value(&options.pcfg_file), "pcfg file") + ; + + // Compose the full set of command-line options. + po::options_description cmd_line_options; + cmd_line_options.add(visible).add(hidden); + + // Register the positional options. + po::positional_options_description p; + p.add("pcfg-file", 1); + + // Process the command-line. + po::variables_map vm; + try { + po::store(po::command_line_parser(argc, argv).style(CommonOptionStyle()). 
+ options(cmd_line_options).positional(p).run(), vm); + po::notify(vm); + } catch (const std::exception &e) { + std::ostringstream msg; + msg << e.what() << "\n\n" << visible; + Error(msg.str()); + } + + if (vm.count("help")) { + std::cout << visible << std::endl; + std::exit(0); + } + + // Check positional options were given. + + if (!vm.count("pcfg-file")) { + std::ostringstream msg; + msg << "missing required argument\n\n" << visible << std::endl; + Error(msg.str()); + } +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-score/pcfg_score.h b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h new file mode 100644 index 000000000..5e506c39d --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/pcfg_score.h @@ -0,0 +1,42 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_PCFG_SCORE_H_ +#define PCFG_SCORE_PCFG_SCORE_H_ + +#include "pcfg-common/tool.h" + +namespace Moses { +namespace PCFG { + +class Options; + +class PcfgScore : public Tool { + public: + PcfgScore() : Tool("pcfg-score") {} + virtual int Main(int, char *[]); + private: + void ProcessOptions(int, char *[], Options &) const; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc new file mode 100644 index 000000000..5f695e4fc --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.cc @@ -0,0 +1,68 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "tree_scorer.h" + +#include + +namespace Moses { +namespace PCFG { + +TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab) + : pcfg_(pcfg) + , non_term_vocab_(non_term_vocab) { +} + +bool TreeScorer::Score(PcfgTree &root) const { + if (root.IsPreterminal() || root.IsLeaf()) { + return true; + } + + const std::vector &children = root.children(); + + double log_prob = 0.0; + + std::vector key; + key.reserve(children.size()+1); + key.push_back(non_term_vocab_.Lookup(root.label())); + + for (std::vector::const_iterator p(children.begin()); + p != children.end(); ++p) { + PcfgTree *child = *p; + assert(!child->IsLeaf()); + key.push_back(non_term_vocab_.Lookup(child->label())); + if (!Score(*child)) { + return false; + } + if (!child->IsPreterminal()) { + log_prob += child->score(); + } + } + double rule_score; + bool found = pcfg_.Lookup(key, rule_score); + if (!found) { + return false; + } + log_prob += rule_score; + root.set_score(log_prob); + return true; +} + +} // namespace PCFG +} // namespace Moses diff --git a/scripts/training/phrase-extract/pcfg-score/tree_scorer.h b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h new file mode 100644 index 000000000..36f4e1e99 --- /dev/null +++ b/scripts/training/phrase-extract/pcfg-score/tree_scorer.h @@ -0,0 +1,47 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2012 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the 
License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once +#ifndef PCFG_SCORE_TREE_SCORER_H_ +#define PCFG_SCORE_TREE_SCORER_H_ + +#include "pcfg-common/pcfg.h" +#include "pcfg-common/pcfg_tree.h" +#include "pcfg-common/typedef.h" + +namespace Moses { +namespace PCFG { + +class TreeScorer { + public: + TreeScorer(const Pcfg &, const Vocabulary &); + + // Score tree according to PCFG. Returns false if unsuccessful (due to + // missing rule). + bool Score(PcfgTree &) const; + + private: + const Pcfg &pcfg_; + const Vocabulary &non_term_vocab_; +}; + +} // namespace PCFG +} // namespace Moses + +#endif diff --git a/scripts/training/phrase-extract/score.cpp b/scripts/training/phrase-extract/score.cpp index af7401132..944face48 100644 --- a/scripts/training/phrase-extract/score.cpp +++ b/scripts/training/phrase-extract/score.cpp @@ -71,6 +71,7 @@ void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs LexicalTable lexTable; bool inverseFlag = false; bool hierarchicalFlag = false; +bool pcfgFlag = false; bool wordAlignmentFlag = false; bool goodTuringFlag = false; bool kneserNeyFlag = false; @@ -107,6 +108,9 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; + } else if (strcmp(argv[i],"--PCFG") == 0) { + pcfgFlag = true; + cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { 
wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; @@ -200,6 +204,7 @@ int main(int argc, char* argv[]) // loop through all extracted phrase translations float lastCount = 0.0f; + float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; @@ -214,6 +219,7 @@ int main(int argc, char* argv[]) // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; + lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); @@ -222,10 +228,12 @@ int main(int argc, char* argv[]) PhraseAlignment phrasePair; phrasePair.create( line, i ); lastCount = phrasePair.count; + lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; + lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } @@ -446,6 +454,16 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo countOfCounts[ countInt ]++; } + // compute PCFG score + float pcfgScore; + if (pcfgFlag && !inverseFlag) { + float pcfgSum = 0; + for(size_t i=0; ipcfgSum; + } + pcfgScore = pcfgSum / count; + } + // output phrases const PHRASE &phraseS = phrasePair[0]->GetSource(); const PHRASE &phraseT = phrasePair[0]->GetTarget(); @@ -501,6 +519,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo phraseTableFile << " " << ( logProbFlag ? 
negLogProb*log(penalty) : penalty ); } + // target-side PCFG score + if (pcfgFlag && !inverseFlag) { + phraseTableFile << " " << pcfgScore; + } + phraseTableFile << " ||| "; // alignment info for non-terminals diff --git a/scripts/training/train-model.perl.missing_bin_dir b/scripts/training/train-model.perl.missing_bin_dir index d3748fdc9..5b43a938f 100755 --- a/scripts/training/train-model.perl.missing_bin_dir +++ b/scripts/training/train-model.perl.missing_bin_dir @@ -30,7 +30,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_ $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_HMM_ALIGN, $_CONFIG, - $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS, + $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_PCFG,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS, $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS, @@ -101,6 +101,7 @@ $_HELP = 1 'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE, 'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE, 'ghkm' => \$_GHKM, + 'pcfg' => \$_PCFG, 'extract-options=s' => \$_EXTRACT_OPTIONS, 'score-options=s' => \$_SCORE_OPTIONS, 'source-syntax' => \$_SOURCE_SYNTAX, @@ -1335,6 +1336,7 @@ sub extract_phrase { $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file"; $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR; $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE); + $cmd .= " --PCFG" if $_PCFG; if (!defined($_GHKM)) { $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX; $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX; @@ 
-1480,6 +1482,7 @@ sub score_phrase_phrase_extract { $cmd .= " --UnalignedPenalty" if $UNALIGNED_COUNT; $cmd .= " --UnalignedFunctionWordPenalty ".($inverse ? $UNALIGNED_FW_F : $UNALIGNED_FW_E) if $UNALIGNED_FW_COUNT; $cmd .= " --MinCountHierarchical $MIN_COUNT_HIERARCHICAL" if $MIN_COUNT_HIERARCHICAL; + $cmd .= " --PCFG" if $_PCFG; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); print $cmd."\n"; safesystem($cmd) or die "ERROR: Scoring of phrases failed"; @@ -1788,6 +1791,7 @@ sub create_ini { $basic_weight_count /= 2 if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /OnlyDirect/; $basic_weight_count++ unless defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoPhraseCount/; # phrase count feature $basic_weight_count++ if defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /LowCountFeature/; # low count feature + $basic_weight_count++ if $_PCFG; foreach my $f (split(/\+/,$___TRANSLATION_FACTORS)) { $num_of_ttables++; my $ff = $f; -- cgit v1.2.3 From 8bb49c9053af9e61bb248b04ede98c51095eb071 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Wed, 26 Sep 2012 22:57:15 +0100 Subject: chart decoder search graph viz and other fixes to web interface of ems --- scripts/ems/web/analysis.php | 12 +- scripts/ems/web/analysis_diff.php | 253 +++++- scripts/ems/web/base64.js | 108 +++ scripts/ems/web/favicon.ico | Bin 0 -> 1676 bytes scripts/ems/web/index.php | 5 + scripts/ems/web/lib.php | 10 +- scripts/ems/web/overview.php | 43 +- scripts/ems/web/sgviz.js | 1703 +++++++++++++++++++++++++++++++++++++ scripts/ems/web/sgviz.php | 65 ++ 9 files changed, 2175 insertions(+), 24 deletions(-) create mode 100644 scripts/ems/web/base64.js create mode 100644 scripts/ems/web/favicon.ico create mode 100644 scripts/ems/web/sgviz.js create mode 100644 scripts/ems/web/sgviz.php diff --git a/scripts/ems/web/analysis.php b/scripts/ems/web/analysis.php index 489bdc50d..f4beaa274 100644 --- a/scripts/ems/web/analysis.php +++ b/scripts/ems/web/analysis.php @@ -869,9 +869,12 @@ function 
rule_summary() { printf("tree depth: %.2f
\n",$depth); printf("nt/rule: %.2f
\n",$nt_count/$total); print "\n"; + arsort($count_nt); + $i=0; foreach ($count_nt as $rule => $count) { - printf("\n",$rule,$count,$count/$total*100,'%'); + if ($i++ < 5) { printf("\n",$rule,$count,$count/$total*100,'%'); } } + if (count($count_nt)>5) { print "\n"; } print "
%s%d%.1f%s
%s%d%.1f%s
.........
\n"; } @@ -920,6 +923,7 @@ function bleu_show() { if ($filter != "") { print "; filter: '$filter'"; } + sentence_annotation($count,$filter); print "

5 more | "; print "10 more | "; @@ -1126,6 +1130,12 @@ function sentence_annotation($count,$filter) { //print "

$sort / $offset
"; for($i=$offset;$i<$count+$offset && $ishow search graph
\n"; + } + if ($hierarchical) { annotation_hierarchical($line["id"],$segmentation[$line["id"]],$segmentation_out[$line["id"]],$node[$line["id"]]); } diff --git a/scripts/ems/web/analysis_diff.php b/scripts/ems/web/analysis_diff.php index 9cb853030..51c8e50ef 100644 --- a/scripts/ems/web/analysis_diff.php +++ b/scripts/ems/web/analysis_diff.php @@ -116,8 +116,10 @@ function precision_by_coverage_diff() { $log_info[$log_count]["length"] -= $item[3]; } + print "

By log2-count in the training corpus

"; precision_by_coverage_diff_graph("byCoverage",$log_info,$log_info_new,$total,$img_width,SORT_NUMERIC); + precision_by_coverage_diff_matrix(); // load factored data $d = dir("$dir/evaluation/$set.analysis.".get_precision_analysis_version($dir,$set,$id)); @@ -290,6 +292,244 @@ function precision_by_word_diff($type) { print "\n"; } +function precision_by_coverage_diff_matrix() { + global $id,$id2; + print "

Impact of Change in Coverage

"; + print "Coverage in run $id is the x-axis, change in coverage in run $id2 is the y-axis. Size of box reflects how many output words are produced, yellow is the number of correct translations, green indicates increase, green decrease. The bleu rectangle below each box indicates number of words dropped, and increase (cyan) or decrease (purple).

("; + $scale = 30; + for($i=1; $i<=5; $i++) { + $size = (int)(sqrt($i*$scale)); + $name = "size-$i"; + print " = $i word"; + if ($i>1) { print "s"; } + if ($i<5) { print ", "; } + } + print ")

"; + # get base data + $data = file(get_current_analysis_filename("precision","precision-by-input-word")); + $word = array(); $class = array(); + for($i=0;$i $max_base_log_count) { $max_base_log_count = $log_count; } + } + + # get alternative data + $data = file(get_current_analysis_filename2("precision","precision-by-input-word")); + for($i=0;$i $max_alt_log_count) { $max_alt_log_count = $alt; } + if ($alt < $min_alt_log_count) { $min_alt_log_count = $alt; } + + if (!array_key_exists($alt,$matrix[$base])) { + $matrix[$base][$alt] = array(); + $matrix[$base][$alt]["precision1"] = 0; + $matrix[$base][$alt]["delete1"] = 0; + $matrix[$base][$alt]["total1"] = 0; + $matrix[$base][$alt]["coverage1"] = 0; + $matrix[$base][$alt]["precision2"] = 0; + $matrix[$base][$alt]["delete2"] = 0; + $matrix[$base][$alt]["total2"] = 0; + $matrix[$base][$alt]["coverage2"] = 0; + } + # ignore mismatches in source words due to tokenization / casing + if (array_key_exists($surface,$word)) { + $matrix[$base][$alt]["precision1"] += $word[$surface]["precision"]; + $matrix[$base][$alt]["delete1"] += $word[$surface]["delete"]; + $matrix[$base][$alt]["total1"] += $word[$surface]["total"]; + $matrix[$base][$alt]["coverage1"] += $word[$surface]["count"]; + $matrix[$base][$alt]["precision2"] += $item[0]; + $matrix[$base][$alt]["delete2"] += $item[1]; + $matrix[$base][$alt]["total2"] += $item[2]; + $matrix[$base][$alt]["coverage2"] += $item[4]; + } + } + + # make table + print ""; + for($base=-1;$base<=$max_base_log_count;$base++) { + print ""; + } + print ""; + for($alt=$max_alt_log_count;$alt>=$min_alt_log_count;$alt--) { + print ""; + for($base=-1;$base<=$max_base_log_count;$base++) { + print ""; + } + print ""; + } + print ""; + for($base=-1;$base<=$max_base_log_count;$base++) { + print ""; + } + print "
 $base
$alt"; + if (array_key_exists($base,$matrix) && + array_key_exists($alt,$matrix[$base])) { + #print $matrix[$base][$alt]["precision1"]."->". + # $matrix[$base][$alt]["precision2"]."
"; + #print $matrix[$base][$alt]["delete1"]."->". + # $matrix[$base][$alt]["delete2"]."
"; + #print $matrix[$base][$alt]["total1"]."->". + # $matrix[$base][$alt]["total2"]."
"; + $scale = 30; + $total = $matrix[$base][$alt]["total1"]; + if ($matrix[$base][$alt]["total2"] > $total) { + $total = $matrix[$base][$alt]["total2"]; + } + $total = (int)(sqrt($total*$scale)); + if ($total>0) { + $prec1 = $matrix[$base][$alt]["precision1"]*$scale; + $prec2 = $matrix[$base][$alt]["precision2"]*$scale; + if ($prec1 > $prec2) { + $prec_base = (int)(sqrt($prec1)); + $prec_imp = (int)(sqrt($prec1-$prec2)); + $prec_color = "255,100,100"; + } + else { + $prec_base = (int)(sqrt($prec2)); + $prec_imp = (int)(sqrt($prec2-$prec1)); + $prec_color = "100,255,100"; + } + $prec_base_top = (int)(($total-$prec_base)/2); + $prec_imp_top = (int)(($total-$prec_imp)/2); + + $del1 = $matrix[$base][$alt]["delete1"]*$scale; + $del2 = $matrix[$base][$alt]["delete2"]*$scale; + if ($del1 > $del2) { + $del_base = $del1; + $del_imp = $del1-$del2; + $del_color = "150,100,255"; + } + else { + $del_base = $del2; + $del_imp = $del2-$del1; + $del_color = "100,200,200"; + } + $del_base_height = (int)($del_base/$total); + $del_imp_height = (int)($del_imp/$total); + + $name = "matrix-$base-$alt"; + #print "$total/$prec1/$prec2 -> $prec_base/$prec_imp
"; + print ""; + print ""; + print ""; + } + } + print "
$alt
$base

"; +} + +function precision_by_coverage_diff_matrix_details() { + $alt = $_GET["alt"]; + $base = $_GET["base"]; + + $impact_total = 0; + $data = file(get_current_analysis_filename("precision","precision-by-input-word")); + $word = array(); $class = array(); + for($i=0;$i PrecisionPrecision ImpactDeleteDelete Impact\n"; + + # get alternative data + $data = file(get_current_analysis_filename2("precision","precision-by-input-word")); + for($i=0;$i%.1f%s%+.1f%s%+.1f/%d", + $precision/$total*100,"%", + ($precision-$word[$surface]["precision"])/$total*100,"%", + $precision-$word[$surface]["precision"],$total); + $out .= sprintf("%+.2f%s%+.1f/%d", + ($precision-$word[$surface]["precision"])/$impact_total*100,"%", + $precision-$word[$surface]["precision"],$impact_total); + $out .= sprintf("%.1f%s%+.1f%s%+.1f/%d", + $delete/$total*100,"%", + ($delete-$word[$surface]["delete"])/$total*100,"%", + $delete-$word[$surface]["delete"],$total); + $out .= sprintf("%+.2f%s%+.1f/%d", + ($delete-$word[$surface]["delete"])/$impact_total*100,"%", + $delete-$word[$surface]["delete"],$impact_total); + $out .= ""; + $all_out[] = $out; + } + } + sort($all_out); + foreach($all_out as $out) { $o = explode("\t",$out); print $o[1]; } + print ""; +} function precision_by_coverage_diff_graph($name,$log_info,$log_info_new,$total,$img_width,$sort_type) { $keys = array_keys($log_info); @@ -502,12 +742,22 @@ function bleu_diff() { print "
\n"; bleu_diff_annotation(); + print ""; + print "more "; + print "
\n"; } function bleu_diff_annotation() { global $set,$id,$id2,$dir; - // load data + // load input + $input_annotation = file(get_analysis_filename($dir,$set,$id,"coverage","input-annotation")); + for($i=0;$i[src] ".$input[$line["id"]]."
"; $word_with_score1 = split(" ",$line["system1"]); $word_with_score0 = split(" ",$line["system0"]); diff --git a/scripts/ems/web/base64.js b/scripts/ems/web/base64.js new file mode 100644 index 000000000..e0e94d765 --- /dev/null +++ b/scripts/ems/web/base64.js @@ -0,0 +1,108 @@ +var END_OF_INPUT = -1; + +var base64Chars = new Array( + 'A','B','C','D','E','F','G','H', + 'I','J','K','L','M','N','O','P', + 'Q','R','S','T','U','V','W','X', + 'Y','Z','a','b','c','d','e','f', + 'g','h','i','j','k','l','m','n', + 'o','p','q','r','s','t','u','v', + 'w','x','y','z','0','1','2','3', + '4','5','6','7','8','9','+','/' +); + +var reverseBase64Chars = new Array(); +for (var i=0; i < base64Chars.length; i++){ + reverseBase64Chars[base64Chars[i]] = i; +} + +var base64Str; +var base64Count; +function setBase64Str(str){ + base64Str = str; + base64Count = 0; +} +function readBase64(){ + if (!base64Str) return END_OF_INPUT; + if (base64Count >= base64Str.length) return END_OF_INPUT; + var c = base64Str.charCodeAt(base64Count) & 0xff; + base64Count++; + return c; +} +function encodeBase64(str){ + setBase64Str(str); + var result = ''; + var inBuffer = new Array(3); + var lineCount = 0; + var done = false; + while (!done && (inBuffer[0] = readBase64()) != END_OF_INPUT){ + inBuffer[1] = readBase64(); + inBuffer[2] = readBase64(); + result += (base64Chars[ inBuffer[0] >> 2 ]); + if (inBuffer[1] != END_OF_INPUT){ + result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30) | (inBuffer[1] >> 4) ]); + if (inBuffer[2] != END_OF_INPUT){ + result += (base64Chars [((inBuffer[1] << 2) & 0x3c) | (inBuffer[2] >> 6) ]); + result += (base64Chars [inBuffer[2] & 0x3F]); + } else { + result += (base64Chars [((inBuffer[1] << 2) & 0x3c)]); + result += ('='); + done = true; + } + } else { + result += (base64Chars [(( inBuffer[0] << 4 ) & 0x30)]); + result += ('='); + result += ('='); + done = true; + } + lineCount += 4; + if (lineCount >= 76){ + result += ('\n'); + lineCount = 0; + } + } + return result; +} 
+function readReverseBase64(){ + if (!base64Str) return END_OF_INPUT; + while (true){ + if (base64Count >= base64Str.length) return END_OF_INPUT; + var nextCharacter = base64Str.charAt(base64Count); + base64Count++; + if (reverseBase64Chars[nextCharacter]){ + return reverseBase64Chars[nextCharacter]; + } + if (nextCharacter == 'A') return 0; + } + return END_OF_INPUT; +} +function ntos(n){ + n=n.toString(16); + if (n.length == 1) n="0"+n; + n="%"+n; + return unescape(n); +} + +function decodeBase64(str){ + setBase64Str(str); + var result = ""; + var inBuffer = new Array(4); + var done = false; + while (!done && (inBuffer[0] = readReverseBase64()) != END_OF_INPUT + && (inBuffer[1] = readReverseBase64()) != END_OF_INPUT){ + inBuffer[2] = readReverseBase64(); + inBuffer[3] = readReverseBase64(); + result += ntos((((inBuffer[0] << 2) & 0xff)| inBuffer[1] >> 4)); + if (inBuffer[2] != END_OF_INPUT){ + result += ntos((((inBuffer[1] << 4) & 0xff)| inBuffer[2] >> 2)); + if (inBuffer[3] != END_OF_INPUT){ + result += ntos((((inBuffer[2] << 6) & 0xff) | inBuffer[3])); + } else { + done = true; + } + } else { + done = true; + } + } + return result; +} diff --git a/scripts/ems/web/favicon.ico b/scripts/ems/web/favicon.ico new file mode 100644 index 000000000..d93c5b031 Binary files /dev/null and b/scripts/ems/web/favicon.ico differ diff --git a/scripts/ems/web/index.php b/scripts/ems/web/index.php index 466af9013..099c9078f 100644 --- a/scripts/ems/web/index.php +++ b/scripts/ems/web/index.php @@ -5,10 +5,12 @@ require("overview.php"); require("analysis.php"); require("analysis_diff.php"); require("diff.php"); +require("sgviz.php"); function head($title) { print ' '.$title.' 
+ @@ -43,8 +45,11 @@ if (array_key_exists("setup",$_POST) || array_key_exists("setup",$_GET)) { else if (preg_match("/PrecisionByWordDiff(.+)_show/",$action,$match)) { precision_by_word_diff($match[1]); } else if (preg_match("/PrecisionByWord(.+)_show/",$action,$match)) { precision_by_word($match[1]); } else if ($action == "CoverageDetails_show") { coverage_details(); } + else if ($action == "CoverageMatrixDetails_show") { precision_by_coverage_diff_matrix_details(); } else if ($action == "SegmentationSummary_show") { segmentation_summary(); } else if ($action == "biconcor") { biconcor(base64_decode($_GET["phrase"])); } + else if ($action == "sgviz") { sgviz($_GET["sentence"]); } + else if ($action == "sgviz_data") { sgviz_data($_GET["sentence"]); } else { print "ERROR! $action"; } } else if (array_key_exists("analysis_diff_home",$_GET)) { diff --git a/scripts/ems/web/lib.php b/scripts/ems/web/lib.php index 440940d9c..c1011e7df 100644 --- a/scripts/ems/web/lib.php +++ b/scripts/ems/web/lib.php @@ -65,7 +65,13 @@ function load_experiment_info() { } krsort($experiment); - ksort($evalset); + uksort($evalset,"evalsetsort"); +} + +function evalsetsort($a,$b) { + if ($a == "avg") { return -1; } + if ($b == "avg") { return 1; } + return strcmp($a,$b); } function load_parameter($run) { @@ -187,7 +193,7 @@ function get_analysis_version($dir,$set,$id) { if (file_exists("$dir/steps/$id/REPORTING_report.$id")) { $report = file("$dir/steps/$id/REPORTING_report.$id.INFO"); foreach ($report as $line) { - if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis/",$line,$match) && + if (preg_match("/\# reuse run (\d+) for EVALUATION:(.+):analysis$/",$line,$match) && $match[2] == $set) { if (file_exists("$prefix.$match[1]/summary")) { $analysis_version[$id][$set]["basic"] = $match[1]; diff --git a/scripts/ems/web/overview.php b/scripts/ems/web/overview.php index 179dda464..47d3f8850 100644 --- a/scripts/ems/web/overview.php +++ b/scripts/ems/web/overview.php @@ -13,7 +13,7 
@@ function setup() { print "$dir[0]$dir[1]$dir[2]$dir[3]\n"; } print "\n"; - print "

To add experiment, edit setup in web directory"; + print "

To add experiment, edit the file 'setup' in the web directory."; } function overview() { @@ -134,7 +134,9 @@ function overview() { print "var best_score = [];\n"; reset($evalset); while (list($set,$dummy) = each($evalset)) { - print "best_score[\"$set\"] = ".$best[$set].";\n"; + if ($best[$set] != "" && $best[$set]>0) { + print "best_score[\"$set\"] = ".$best[$set].";\n"; + } } ?> @@ -282,28 +284,29 @@ function output_score($id,$info) { $each_score = explode(" ; ",$score); for($i=0;$i0) { print " "; } - $opened_a_tag = 0; - if ($set != "avg") { - if (file_exists("$dir/evaluation/$set.cleaned.$id")) { - print ""; - $opened_a_tag = 1; - } - else if (file_exists("$dir/evaluation/$set.output.$id")) { - print ""; - $opened_a_tag = 1; - } - } - if ($set == "avg" && count($each_score)>1) { print $match[2].": "; } - print "

"; - if ($opened_a_tag) { print ""; } + if (preg_match('/([\d\(\)\.\s]+) (BLEU[\-c]*)/',$each_score[$i],$match) || + preg_match('/([\d\(\)\.\s]+) (IBM[\-c]*)/',$each_score[$i],$match)) { + if ($i>0) { print "
"; } + $opened_a_tag = 0; + if ($set != "avg") { + if (file_exists("$dir/evaluation/$set.cleaned.$id")) { + print ""; + $opened_a_tag = 1; + } + else if (file_exists("$dir/evaluation/$set.output.$id")) { + print ""; + $opened_a_tag = 1; + } + } + if ($set == "avg" && count($each_score)>1) { print $match[2].": "; } + print $match[1]; + if ($opened_a_tag) { print ""; } } else { - print "-"; + print "-"; } } - + print ""; if ($has_analysis && array_key_exists($set,$has_analysis)) { print ""; diff --git a/scripts/ems/web/sgviz.js b/scripts/ems/web/sgviz.js new file mode 100644 index 000000000..03ad4741a --- /dev/null +++ b/scripts/ems/web/sgviz.js @@ -0,0 +1,1703 @@ +var xmlns="http://www.w3.org/2000/svg"; +var RECOMBINED = 0; +var FROM = 1; +var TO = 2; +var OUTPUT = 3; +var ALIGNMENT = 4; +var CHILDREN = 5; +var RULE_SCORE = 6; +var HEURISTIC_RULE_SCORE = 7; +var HYP_SCORE = 8; +var LHS = 9; +var DERIVATION_SCORE = 10; +var CHART_WIDTH = window.innerWidth * 0.8; +var CHART_HEIGHT = window.innerHeight; +var CELL_WIDTH = CHART_WIDTH/input.length; +var CELL_HEIGHT = CHART_HEIGHT/(input.length+1); +var CELL_MARGIN = 4; +var CELL_BORDER = 2; +var CELL_PADDING = 2; +if (input.length < 6) { CELL_MARGIN = 5; CELL_BORDER = 3; CELL_PADDING = 3; } +if (input.length > 10) { CELL_MARGIN = 1; CELL_BORDER = 1; CELL_PADDING = 2; } +if (input.length > 20) { CELL_MARGIN = 0; CELL_BORDER = 0; CELL_PADDING = 1; } +var BUTTON_WIDTH = 170; +var BUTTON_HEIGHT = 30; +var OPTION_WIDTH = 60; +var OPTION_HEIGHT = BUTTON_HEIGHT; +var CELL_HIGHLIGHT_COLOR = "#c0ffc0"; +var CELL_REGULAR_COLOR = "#ffff80"; +var INPUT_HIGHLIGHT_COLOR = "#c0c0c0"; +var INPUT_REGULAR_COLOR = "#ffffff"; +var SORT_OPTION = 2; +var ZOOM = 0; +var ZOOM_FROM = 0; +var ZOOM_TO = input.length+1; +var ZOOM_WIDTH = input.length; + +var length = input.length; +var chart = document.getElementById("chart"); +var reachable = new Array(); +var cell_hyps = new Array(length); +var cell_derivation_score = Array(); + +// init basic 
layout +draw_chart(); +draw_menu(); +draw_options(); + +// process hypotheses +function process_hypotheses() { + index_hypotheses_by_cell(); + find_reachable_hypotheses(); + compute_best_derivation_scores(); +} + +// +// INITIALIZATION +// + +function index_hypotheses_by_cell() { + // init edge_lists + for(var from=0; from=0; width-- ) { + for(var from=0; from cell_max_score) { + cell_max_score = edge[id][DERIVATION_SCORE]; + } + } + cell_derivation_score[from][to] = cell_max_score; + } + } +} + +// +// MENU +// + +function draw_menu() { + draw_menu_button(1,"Best Derivation"); + draw_menu_button(2,"Number of Hypotheses"); + draw_menu_button(3,"Number of Rule Cubes"); + draw_menu_button(4,"Derivation Score"); + draw_menu_button(5,"Non-Terminals") + draw_menu_button(6,"Hypotheses") +} +var MENU_POSITION_HYPOTHESES = 6; // where is "Hypotheses" in the menu? + +var current_menu_selection = 0; +var menu_processing = 0; +function click_menu( id, force_flag ) { + if (!force_flag && (menu_processing || current_menu_selection == id)) { + return; + } + menu_processing = 1; + + if (current_menu_selection == 1) { best_derivation(0); } + if (current_menu_selection == 2) { unannotate_cells(); } + if (current_menu_selection == 3) { unannotate_cells(); } + if (current_menu_selection == 4) { unannotate_cells(); } + if (current_menu_selection == 5) { remove_non_terminal_treemap(0); } + if (current_menu_selection == 6 && SORT_OPTION != 3) { remove_hypothesis_overview(); } + if (current_menu_selection == 6 && SORT_OPTION == 3) { remove_hypothesis_overview(); remove_non_terminal_treemap(); } + if (current_menu_selection > 0) { + highlight_menu_button( current_menu_selection, 0 ); + } + + if (id == 1) { best_derivation(1); } + if (id == 2) { annotate_cells_with_hypcount(); } + if (id == 3) { annotate_cells_with_rulecount(); } + if (id == 4) { annotate_cells_with_derivation_score(); } + if (id == 5) { non_terminal_treemap(); } + if (id == 6 && SORT_OPTION != 3) { hypothesis_overview(); 
} + if (id == 6 && SORT_OPTION == 3) { draw_hypothesis_sort_buttons(); non_terminal_treemap(1); } + highlight_menu_button( id, 1 ); + current_menu_selection = id; + menu_processing = 0; +} + +function draw_menu_button( id, label ) { + var button = document.createElementNS(xmlns,"rect"); + button.setAttribute("id", "button-" + id); + button.setAttribute("x", 5); + button.setAttribute("y", 5 + BUTTON_HEIGHT*(id-1)); + button.setAttribute("rx", 3); + button.setAttribute("ry", 3); + button.setAttribute("width", BUTTON_WIDTH-10); + button.setAttribute("height", BUTTON_HEIGHT-10); + //button.setAttribute("opacity",.75); + button.setAttribute("fill", "#c0c0ff"); + button.setAttribute("stroke", "black"); + button.setAttribute("stroke-width", "1"); + button.setAttribute("onclick","click_menu(" + id + ",0);") + chart.appendChild( button ); + + var button_label = document.createElementNS(xmlns,"text"); + button_label.setAttribute("x", BUTTON_WIDTH/2); + button_label.setAttribute("y", 4+BUTTON_HEIGHT/2 + BUTTON_HEIGHT*(id-1)); + button_label.setAttribute("style", "font-size: 12; font-family: Verdana, Arial;"); + button_label.setAttribute("text-anchor", "middle"); + button_label.setAttribute("pointer-events", "none"); + var content = document.createTextNode( label ); + button_label.appendChild( content ); + button_label.setAttribute("onclick","click_menu(" + id + ",0);") + + chart.appendChild( button_label ); +} + +function highlight_menu_button( id, on_off ) { + var button = document.getElementById("button-" + id); + if (on_off) { + button.setAttribute("fill", "#8080ff"); + } + else { + button.setAttribute("fill", "#c0c0ff"); + } +} + +// OPTIONS + +function draw_options() { + draw_option_button(0,1,"score"); + draw_option_button(0,2,"deriv."); + draw_option_button(0,3,"id"); +} + +function draw_rule_options() { + draw_option_button(1,1,"score"); + draw_option_button(1,2,"deriv."); + draw_option_button(1,3,"zoom"); + highlight_option_button(1,1,show_hyp_score); + 
highlight_option_button(1,2,show_derivation_score); +} + +function draw_option_button( rule_option, id, label ) { + var button = document.createElementNS(xmlns,"rect"); + button.setAttribute("id", (rule_option?"rule-":"") + "option-" + id); + button.setAttribute("x", rule_option ? CHART_WIDTH-BUTTON_WIDTH-OPTION_WIDTH : BUTTON_WIDTH+10); + button.setAttribute("y", 5 + OPTION_HEIGHT*(id-1)); + button.setAttribute("rx", 3); + button.setAttribute("ry", 3); + button.setAttribute("width", OPTION_WIDTH-10); + button.setAttribute("height", OPTION_HEIGHT-10); + button.setAttribute("fill", "#fdd017"); + button.setAttribute("stroke", "black"); + button.setAttribute("stroke-width", "1"); + button.setAttribute("onclick","click_"+(rule_option?"rule_":"")+"option(" + id + ");") + chart.appendChild( button ); + + var button_label = document.createElementNS(xmlns,"text"); + var distance_from_side = BUTTON_WIDTH+5+OPTION_WIDTH/2; + button_label.setAttribute("id", (rule_option?"rule-":"") + "option-label-" + id); + button_label.setAttribute("x", rule_option ? 
CHART_WIDTH-distance_from_side : distance_from_side); + button_label.setAttribute("y", 4+OPTION_HEIGHT/2 + OPTION_HEIGHT*(id-1)); + button_label.setAttribute("style", "font-size: 12; font-family: Verdana, Arial;"); + button_label.setAttribute("text-anchor", "middle"); + button_label.setAttribute("pointer-events", "none"); + var content = document.createTextNode( label ); + button_label.appendChild( content ); + + chart.appendChild( button_label ); +} + +function draw_sort_button( id, label ) { + var BASE_X = 5 + id/SORT_BUTTON_COUNT * (BUTTON_WIDTH-10+5); + var BASE_Y = -5 + BUTTON_HEIGHT * MENU_POSITION_HYPOTHESES; + var WIDTH = ((BUTTON_WIDTH-10+5)/SORT_BUTTON_COUNT)-5; + + var button = document.createElementNS(xmlns,"rect"); + + button.setAttribute("id", "sort-" + id); + button.setAttribute("x", BASE_X); + button.setAttribute("y", BASE_Y); + button.setAttribute("width", WIDTH); + button.setAttribute("height", BUTTON_HEIGHT-12); + if (id==0) { + button.setAttribute("fill", "none"); + } + else { + button.setAttribute("rx", 3); + button.setAttribute("ry", 3); + if (SORT_OPTION == id) { + button.setAttribute("fill", "#6080ff"); + } + else { + button.setAttribute("fill", "#a0c0ff"); + } + button.setAttribute("onclick","click_sort(" + id + ");") + button.setAttribute("stroke", "black"); + button.setAttribute("stroke-width", "1"); + } + chart.appendChild( button ); + + var button_label = document.createElementNS(xmlns,"text"); + button_label.setAttribute("id", "sort-label-" + id); + button_label.setAttribute("x", BASE_X + WIDTH/2); + button_label.setAttribute("y", BASE_Y + 12); + button_label.setAttribute("style", "font-size: 10; font-family: Verdana, Arial;"); + button_label.setAttribute("text-anchor", "middle"); + button_label.setAttribute("pointer-events", "none"); + var content = document.createTextNode( label ); + button_label.appendChild( content ); + + chart.appendChild( button_label ); +} + +function click_sort( id ) { + if (SORT_OPTION == 3) { + 
remove_non_terminal_treemap(1) + } + remove_hypothesis_overview(); + + SORT_OPTION = id; + + if (SORT_OPTION == 3) { + non_terminal_treemap(1); + draw_hypothesis_sort_buttons(); + } + else { + hypothesis_overview(); + } +} + +var show_scores = 0; +var show_id = 0; +var show_derivation = 0; +function click_option( id ) { + if (id == 1) { + show_scores = !show_scores; + highlight_option_button( 0, 1, show_scores ); + } + if (id == 2) { + show_derivation = !show_derivation; + color_cells(); + highlight_option_button( 0, 2, show_derivation ); + } + if (id == 3) { + show_id = !show_id; + highlight_option_button( 0, 3, show_id ); + } + if (current_menu_selection > 0) { + click_menu( current_menu_selection, 1 ); + } +} + +var show_hyp_score = 0; +var show_derivation_score = 0; +function click_rule_option( id ) { + if (id == 1) { + show_hyp_score = !show_hyp_score; + highlight_option_button( 1, 1, show_hyp_score ); + } + if (id == 2) { + show_derivation_score = !show_derivation_score; + highlight_option_button( 1, 2, show_derivation_score ); + } + if (id == 3) { + if (ZOOM > 0) { + ZOOM = 0; + } + else { + ZOOM = 0.3; + } + assign_chart_coordinates(); + highlight_option_button( 1, 3, ZOOM ); + } + draw_rule_cube(current_z_pos_string); +} + +function highlight_option_button( rule_option, id, on_off ) { + var button = document.getElementById((rule_option?"rule-":"") + "option-" + id); + if (on_off) { + button.setAttribute("fill", "#cd853f"); + } + else { + button.setAttribute("fill", "#fdd017"); + } +} + +// INITIALIZE THE CHART + +function draw_chart() { + for (var from=0;from= 0) { + highlight_input( current_from, current_to, 0) + } + highlight_input( from, to, 1) + current_from = from; + current_to = to; +} + +function click_cell( from, to ) { + if (from == current_rule_from && to == current_rule_to) { + unshow_rules(); + current_rule_from = -1; + ZOOM = 0; + } + else { + show_rules( from, to ); + ZOOM_FROM = current_rule_from; + ZOOM_TO = current_rule_to; + ZOOM_WIDTH = 
to-from+1; + } + assign_chart_coordinates(); +} + +function highlight_input( from, to, on_off ) { + for(var i=from; i<=to; i++) { + var input_box = document.getElementById("inputbox-" + i); + input_box.setAttribute("fill", on_off ? INPUT_HIGHLIGHT_COLOR : INPUT_REGULAR_COLOR); + } +} + +// +// VISUALIZATION OF CHART CELLS +// + +// BASIC ANNOTATION WITH NUMBERS + +function annotate_cells_with_hypcount() { + for (var from=0;from"); + for(var i=0; i= next_worst) { + current_worst = next_worst; // ... and keep going + } + else { + // compute rectangles... + var sum = 0; + for(var j=start; j 20) { font_size = 20; } + if (font_size >= 3) { + var rect_label = document.createElementNS(xmlns,"text"); + rect_label.setAttribute("id", "rect-label-" + from + "-" + to + "-" + j); + rect_label.setAttribute("x", CELL_MARGIN + (offset_x + cum_x + this_width/2) * scale_factor); + rect_label.setAttribute("y", CELL_MARGIN + (offset_y + cum_y + this_height/2) * scale_factor + font_size/2 -2); + rect_label.setAttribute("style", "font-size: " + font_size + "; font-family: Verdana, Arial; font-weight:900;"); + rect_label.setAttribute("fill", "#00f"); + rect_label.setAttribute("opacity", .3); + rect_label.setAttribute("text-anchor", "middle"); + rect_label.setAttribute("pointer-events", "none"); + var content = document.createTextNode( label[j] ); + rect_label.appendChild( content ); + cell.appendChild( rect_label ); + } + if (adding_on_left) { cum_y += this_height; } + else { cum_x += this_width; } + } + if (adding_on_left) { offset_x += this_width; } + else { offset_y += this_height; } + + // move to next sequence + if (i != label.length) { + start = i; + extend = Math.min( width-offset_x, height-offset_y ); + current_worst = squarify_worst( label, count, i, i, extend ); + } + } + } +} + +function squarify_worst( label, count, start, end, extend ) { + var sum = 0; + for(var i=start; i<=end; i++) { + sum += count[label[i]]; + } + var max_ratio = 0; + for(var i=start; i<=end; i++) { + var 
ratio = count[label[i]] * extend*extend /sum/sum; + if (ratio < 1) { ratio = 1/ratio; } + max_ratio = Math.max( ratio, max_ratio ); + } + return max_ratio; +} + +// HIGHLIGHT BEST DERIVATION + +function best_derivation( on_off ) { + var best_score = -9e9; + var best_id = -1; + for (var i=0;i best_score) { + best_score = edge[id][HYP_SCORE]; + best_id = id; + } + } + best_derivation_recurse( best_id, on_off, -1, -1, 0 ); +} + +function best_derivation_recurse( id, on_off, parent_from, parent_to, child_pos ) { + var from = edge[id][FROM]; + var to = edge[id][TO]; + + // highlight cell and annotate with rule + highlight_cell( from, to, on_off ); + if (on_off) { + var annotation = ""; + if (show_id) { annotation += id + "
"; } + annotation += edge[id][LHS] + "\u2192"; + annotation += edge[id][OUTPUT]; + if (show_scores) { annotation += "
" + edge[id][HYP_SCORE]; } + annotate_cell( from, to, annotation, 10 ); + } + else { + unannotate_cell( from, to ); + } + + // highlight hyp + highlight_hyp( id, on_off ); + + // arrow to parent + if (parent_from >= 0) { + if (on_off) { + make_arrow( id, parent_from, parent_to, from, to, 0, child_pos ); + } + else { + var arrow = document.getElementById("arrow-" + id); + chart.removeChild(arrow); + } + } + + var child_order = Array(); + if (edge[id][ALIGNMENT] != "") { + var alignment = edge[id][ALIGNMENT].split(" "); + // sorting: array position is source nonterminal pos + alignment.sort(); + // alignment target sympol pos -> source nonterminal pos + var reversed_alignment = Array(); + for(var i=0; i target nonterminal pos + for(var i=0; i255) { dec = 255; } + var color = Math.round(dec).toString(16); + return "#ffff"+color; +} + +function get_children( id ) { + if (edge[id][CHILDREN] == "") { + return []; + } + return edge[id][CHILDREN].split(" "); +} + +// OVERVIEW ALL HYPOTHESES +function hypothesis_overview() { + for (var from=0;from= row_size) { + column = 0; + y += diameter; + x = 0; + } + } +} + +function remove_hypothesis_overview() { + for (var id in edge) { + var cell = document.getElementById("cell-" + edge[id][FROM] + "-" + edge[id][TO]); + var hyp = document.getElementById("hyp-" + id); + cell.removeChild(hyp); + } + // remove sort buttons + for(var i=0; i<4; i++) { + var old = document.getElementById("sort-" + i); + chart.removeChild( old ); + var old = document.getElementById("sort-label-" + i); + chart.removeChild( old ); + } +} + +function hover_hyp( id ) { + best_derivation_recurse( id, 1, -1, -1 ); +} + +function unhover_hyp( id ) { + best_derivation_recurse( id, 0, -1, -1 ); +} + +function hover_rule_hyp( id ) { + highlight_rule_hyp( id, 1 ); + if (current_menu_selection == 1) { + best_derivation( 0 ); + } + if (current_menu_selection <= 2) { + best_derivation_recurse( id, 1, -1, -1 ); + } +} + +function unhover_rule_hyp( id ) { + 
highlight_rule_hyp( id, 0 ); + if (current_menu_selection <= 2) { + best_derivation_recurse( id, 0, -1, -1 ); + } + if (current_menu_selection == 1) { + best_derivation( 1 ); + } +} + +function highlight_hyp( id, on_off ) { + var hyp = document.getElementById("hyp-" + id); + if (hyp == null) { return; } + hyp.setAttribute("fill", hyp_color(id, on_off)); +} + +function highlight_rule_hyp( id, on_off ) { + var hyp = document.getElementById("rule-hyp-" + id); + if (hyp == null) { return; } + hyp.setAttribute("fill", rule_hyp_color(id, on_off)); +} + +function hyp_color( id, on_off ) { + if (on_off) { + var color = "#ff0000"; + if (edge[id][RECOMBINED]>0) { color = "#808080"; } + else if (id in reachable) { color = "#00c000"; } + return color; + } + var color = "#ffc0c0"; + if (edge[id][RECOMBINED]>0) { color = "#c0c0c0"; } + else if (id in reachable) { color = "#80ff80"; } + return color; +} + +// RULES + +function get_rule( id ) { + // get non-terminal labels + if (edge[id] === undefined) { alert("unknown edge "+id); return ""; } + var output = edge[id][OUTPUT].split(" "); + var alignment = edge[id][ALIGNMENT].split(" "); + alignment.sort(); + var nt_label = Array(); + for(var i=0;i best_hyp_score) { + best_hyp_score = edge[id][HYP_SCORE]; + } + } + function sortByRuleCount( a, b ) { + return rule_count[rule_hash[b]] - rule_count[rule_hash[a]]; + } + rule_list = rule_list.sort(sortByRuleCount); + + RULE_HEIGHT = 15; + RULE_FONT_SIZE = 11; + // squeeze if too many rules + if (rule_list.length * RULE_HEIGHT > (CHART_HEIGHT-50)) { + var factor = (CHART_HEIGHT-50)/rule_list.length/RULE_HEIGHT; + RULE_HEIGHT = Math.floor( RULE_HEIGHT * factor ); + RULE_FONT_SIZE = Math.ceil( RULE_FONT_SIZE * factor ); + } + + draw_rule_options(); + for(var i=-1; i 0) { + click_rule( from, to, 0 ); + } +} + +function unshow_rules() { + if (current_rule_from >= 0) { + var cell = document.getElementById("cellbox-" + current_rule_from + "-" + current_rule_to); + cell.setAttribute("stroke", 
"black"); + cell.setAttribute("stroke-width", "1"); + } + var finished = 0; + for(var i=-1; !finished; i++) { + var old = document.getElementById("rule-" + i); + if (old != null) { chart.removeChild( old ); } + else { finished = 1; } + } + var old = document.getElementById("rule-message"); + if (old != null) { chart.removeChild( old ); } + old = document.getElementById("rule-cube"); + if (old != null) { chart.removeChild( old ); } + finished = 0; + for(var i=1; !finished; i++) { + var old = document.getElementById("rule-option-" + i); + if (old != null) { + chart.removeChild( old ); + var old = document.getElementById("rule-option-label-" + i); + chart.removeChild( old ); + } + else { finished = 1; } + } +} + +function draw_rule( from, to, rule_id ) { + var rule_label = document.createElementNS(xmlns,"text"); + rule_label.setAttribute("id", "rule-" + rule_id); + rule_label.setAttribute("x", CHART_WIDTH-120); + rule_label.setAttribute("y", 10 + RULE_HEIGHT*(rule_id+1)); + rule_label.setAttribute("text-anchor", "middle"); + if (rule_id>-1) { + rule_label.setAttribute("style", "font-size: "+RULE_FONT_SIZE+"; font-family: Verdana, Arial;"); + rule_label.setAttribute("onclick","click_rule(" + from + "," + to + "," + rule_id + ");"); + var content = document.createTextNode( rule_list[rule_id] ); + rule_label.appendChild( content ); + } + else { + rule_label.setAttribute("style", "font-size: "+(RULE_FONT_SIZE-2)+"; font-family: Verdana, Arial; font-weight: bold;"); + var content = document.createTextNode( rule_list.length == 0 ? 
"NO RULES" : "RULES" ); + rule_label.appendChild( content ); + } + chart.appendChild( rule_label ); +} + +function draw_rule_message( message ) { + var old = document.getElementById("rule-message"); + if (old != null) { chart.removeChild( old ); } + + var rule_message_group = document.createElementNS(xmlns,"svg"); + rule_message_group.setAttribute("id","rule-message"); + rule_message_group.setAttribute("x", 0); + rule_message_group.setAttribute("y", 250); + var line = message.split("
"); + for(var i=0;i=0) { + var rule_label = document.getElementById("rule-"+current_rule_id); + rule_label.setAttribute("style", "font-size: "+RULE_FONT_SIZE+"; font-family: Verdana, Arial;"); + } + var rule_label = document.getElementById("rule-"+rule_id); + rule_label.setAttribute("style", "font-size: "+RULE_FONT_SIZE+"; font-family: Verdana, Arial; font-weight: bold;"); + current_rule_id = rule_id; + + // first get all the data + output_list = Array(); + var output_hash = Array(); + children_list = Array(); + var children_hash = Array(); + current_edge = Array(); + for (var i=0;i children_list.length-1) { + children_hash.push([]); + children_list.push([]); + } + // build index + var child = ""+children[j]; + if (children_hash[j][child] === undefined) { + children_hash[j][child] = children_list[j].length; + children_list[j].push(parseInt(child)); + } + } + } + } + + // sort + function sortBySecond(a,b) { + asplit = a.split("|"); + bsplit = b.split("|"); + return bsplit[1] - asplit[1]; + } + output_list = output_list.sort(sortBySecond); + + function sortHypByScore(a,b) { + return edge[b][HYP_SCORE] - edge[a][HYP_SCORE]; + } + for(var i=0;i1 && axis[dimension_order[1]].length > max_length) { + // max_length = axis[dimension_order[1]].length; + //} + + // space for additional dimensions + var z_dimension_length = 0; + if (dimension_order.length > 2) { + z_dimension_length = -2; + for(var i=2; i 2) { + // for(var i=2; i max_z_dimension_length) { + // max_z_dimension_length = axis[dimension_order[i]].length; + // } + // } + //} + //if (max_z_dimension_length > 10) { + // max_z_dimension_length = 10; + //} + //var y_length = axis[dimension_order[0]].length; + //if (max_z_dimension_length > 0) { + // y_length += max_z_dimension_length + 2; + // if (y_length > max_length) { + // max_length = y_length; + // } + //} + + // calculate table cell and font size + if (max_length+8 <= CHART_HEIGHT/15) { + RULE_CUBE_HYP_SIZE = 15; + RULE_CUBE_FONT_SIZE = 11; + } + else if 
(max_length+8 > CHART_HEIGHT/9) { + RULE_CUBE_HYP_SIZE = 9; + RULE_CUBE_FONT_SIZE = 7; + } + else { + RULE_CUBE_HYP_SIZE = CHART_HEIGHT/(max_length+8); + RULE_CUBE_FONT_SIZE = (RULE_CUBE_HYP_SIZE * 12/15).toFixed(0); + } + var Z_HEIGHT = 0; + if (dimension_order.length > 2) { + Z_HEIGHT = (z_dimension_length + 2) * RULE_CUBE_HYP_SIZE; + } + + var rule_cube = document.createElementNS(xmlns,"svg"); + rule_cube.setAttribute("id","rule-cube"); + rule_cube.setAttribute("x", CHART_WIDTH - 30); + rule_cube.setAttribute("y", 0); + chart.appendChild( rule_cube ); + + // draw y axis + var label = get_rule_axis_name(dimension_order[0]); + draw_rule_row(-1,label); + for(var y=0; y (CHART_HEIGHT-Z_HEIGHT)/9-10) { + draw_rule_row(Math.ceil(CHART_HEIGHT/9-10),"(more, "+axis[dimension_order[0]].length+" total)"); + } + + // draw x axis + if (axis.length > 1) { + var label = get_rule_axis_name(dimension_order[1]); + draw_rule_column(-1,label); + for(var x=0; x CHART_HEIGHT/9-10) { + draw_rule_column(Math.ceil(CHART_HEIGHT/9-10),"(more, "+axis[dimension_order[1]].length+" total)"); + } + } + + // draw hyps + for(var y=0; y=0) { + rule_label.setAttribute("style", "font-size: "+RULE_CUBE_FONT_SIZE+"; font-family: Verdana, Arial;"); + rule_label.setAttribute("x", RULE_CUBE_FONT_SIZE*10+5); + } + else { + rule_label.setAttribute("style", "font-size: "+(RULE_CUBE_FONT_SIZE-2)+"; font-family: Verdana, Arial; font-weight: bold;"); + rule_label.setAttribute("x", RULE_CUBE_FONT_SIZE*10-30); + } + rule_label.setAttribute("text-anchor", "end"); + var content = document.createTextNode( label ); + rule_label.appendChild( content ); + var rule_cube = document.getElementById("rule-cube"); + rule_cube.appendChild( rule_label ); +} + +function draw_rule_column( pos, label ) { + var rule_label = document.createElementNS(xmlns,"text"); + rule_label.setAttribute("id", "rule-column-" + pos); + rule_label.setAttribute("x", RULE_CUBE_FONT_SIZE*10 -3 + RULE_CUBE_HYP_SIZE*(1+pos) ); + 
rule_label.setAttribute("y", RULE_CUBE_FONT_SIZE*10 -12); + rule_label.setAttribute("transform", "rotate(60 "+ (RULE_CUBE_FONT_SIZE*10-3+RULE_CUBE_HYP_SIZE*(1+pos)) +" "+(RULE_CUBE_FONT_SIZE*10 - 12)+")") + if (pos>=0) { + rule_label.setAttribute("style", "font-size: "+RULE_CUBE_FONT_SIZE+"; font-family: Verdana, Arial;"); + } + else { + rule_label.setAttribute("style", "font-size: "+(RULE_CUBE_FONT_SIZE-2)+"; font-family: Verdana, Arial; font-weight: bold;"); + } + rule_label.setAttribute("text-anchor", "end"); + var content = document.createTextNode( label ); + rule_label.appendChild( content ); + var rule_cube = document.getElementById("rule-cube"); + rule_cube.appendChild( rule_label ); +} + +function draw_rule_z( z,total_z, z_pos, pos,pos_offset, label ) { + var rule_label = document.createElementNS(xmlns,"text"); + rule_label.setAttribute("id", "rule-z-" + z + "-" + pos); + //rule_label.setAttribute("x", RULE_CUBE_FONT_SIZE*10+10 + CHART_HEIGHT*z/(total_z+1) ); + rule_label.setAttribute("x", RULE_CUBE_FONT_SIZE*10+10 ); + rule_label.setAttribute("y", RULE_CUBE_FONT_SIZE*10 + RULE_CUBE_HYP_SIZE*(pos+pos_offset)); + if (pos >= 0) { + rule_label.setAttribute("style", "font-size: "+RULE_CUBE_FONT_SIZE+"; font-family: Verdana, Arial;" + +((z_pos[z] == pos)?" 
font-weight: bold;":"")); + z_pos_copy = z_pos.join(",").split(","); + z_pos_copy[z] = pos; + rule_label.setAttribute("onclick","draw_rule_cube(\"" + z_pos_copy.join(",") + "\");"); + } + else { + rule_label.setAttribute("style", "font-size: "+(RULE_CUBE_FONT_SIZE-2)+"; font-family: Verdana, Arial; font-weight: bold;"); + } + + var content = document.createTextNode( label ); + rule_label.appendChild( content ); + var rule_cube = document.getElementById("rule-cube"); + rule_cube.appendChild( rule_label ); +} + +function draw_rule_hyp( xpos, ypos, id ) { + if (id == -1) { return; } + var diameter = RULE_CUBE_HYP_SIZE-2; + var hyp = document.createElementNS(xmlns,"circle"); + hyp.setAttribute("id", "rule-hyp-" + id); + hyp.setAttribute("cx", RULE_CUBE_FONT_SIZE*10+10 + RULE_CUBE_HYP_SIZE*xpos + diameter/2); + hyp.setAttribute("cy", RULE_CUBE_FONT_SIZE*10-2 + RULE_CUBE_HYP_SIZE*(ypos-0.5) + diameter/2); + hyp.setAttribute("r", diameter/2); + hyp.setAttribute("fill", rule_hyp_color(id, 0)); + //hyp.setAttribute("opacity",.5); + hyp.setAttribute("onmouseover","hover_rule_hyp(" + id + ");") + hyp.setAttribute("onmouseout","unhover_rule_hyp(" + id + ");") + var rule_cube = document.getElementById("rule-cube"); + rule_cube.appendChild( hyp ); +} + +function rule_hyp_color( id, on_off ) { + if (!show_hyp_score && !show_derivation_score) { + return hyp_color( id, on_off ); + } + var inactive_color = on_off ? 
"80" : "00"; + var hyp_score_color = inactive_color; + var derivation_score_color = inactive_color; + if (show_hyp_score) { + hyp_score_color = get_score_from_color(best_hyp_score-edge[id][HYP_SCORE]); + } + if (show_derivation_score) { + if (edge[id][DERIVATION_SCORE] == null) { + derivation_score_color = "00"; + } + else { + derivation_score_color = get_score_from_color(best_derivation_score-edge[id][DERIVATION_SCORE]); + } + } + return "#" + inactive_color + derivation_score_color + hyp_score_color; +} + +function get_score_from_color( score, on_off ) { + if (score == null) { return "00"; } + var dec = 255 - 255 * (score/8); + if (dec < 0) { dec = 0; } + if (on_off) { dec = dec/2+128; } + dec = Math.floor(dec/16)*16+15; + var color = dec.toString(16); + if (dec < 16) { color = "0"+color; } + return color; +} diff --git a/scripts/ems/web/sgviz.php b/scripts/ems/web/sgviz.php new file mode 100644 index 000000000..a2a4f7fc8 --- /dev/null +++ b/scripts/ems/web/sgviz.php @@ -0,0 +1,65 @@ +Search Graph Visualization, Sentence <?php $sentence ?> + + + + + + + Date: Thu, 27 Sep 2012 18:02:23 +0100 Subject: Use absolute path so regtests still work --- .gitmodules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitmodules b/.gitmodules index 0470c63fd..19ceb74f6 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,3 @@ [submodule "regression-testing/tests"] path = regression-testing/tests - url = ../moses-regression-tests.git + url = git@github.com:moses-smt/moses-regression-tests.git -- cgit v1.2.3 From 287836438cf3209fda3172ca9a6a74dc30193b89 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 28 Sep 2012 13:14:25 +0100 Subject: Tweak Boost command line --- BUILD-INSTRUCTIONS.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/BUILD-INSTRUCTIONS.txt b/BUILD-INSTRUCTIONS.txt index 5b4ec2565..0ddd16070 100644 --- a/BUILD-INSTRUCTIONS.txt +++ b/BUILD-INSTRUCTIONS.txt @@ -73,7 +73,7 @@ you're ready to install packages in 
non-standard paths: #For Boost: ./bootstrap.sh -./b2 --prefix=$PREFIX --libdir=$PREFIX/lib64 --layout=tagged link=static,shared threading=multi install +./b2 --prefix=$PREFIX --libdir=$LIBDIR --layout=tagged link=static,shared threading=multi,single install -------------------------------------------------------------------------- -- cgit v1.2.3 From 78f295c0a012503b68be7567910e1118fc8f3a28 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Fri, 28 Sep 2012 15:04:48 +0100 Subject: KenLM c34d00 --- lm/Jamfile | 2 +- lm/bhiksha.cc | 2 +- lm/bhiksha.hh | 4 ++-- lm/binary_format.cc | 4 ++-- lm/binary_format.hh | 4 ++-- lm/build_binary.cc | 10 ++++++---- lm/left.hh | 1 + lm/max_order.cc | 1 + lm/max_order.hh | 12 ++++++++++++ lm/model.cc | 25 +++++++++++++++++++------ lm/model.hh | 2 +- lm/quantize.hh | 9 +++++---- lm/read_arpa.cc | 22 ++++++++++++++-------- lm/search_hashed.hh | 6 +++--- lm/search_trie.cc | 3 ++- lm/search_trie.hh | 4 ++-- lm/state.hh | 1 + lm/trie.cc | 4 ++-- lm/trie.hh | 8 ++++---- lm/trie_sort.cc | 20 +++++++------------- lm/trie_sort.hh | 3 +-- lm/vocab.cc | 4 ++-- lm/vocab.hh | 4 ++-- util/ersatz_progress.cc | 10 +++++----- util/ersatz_progress.hh | 10 ++++++---- util/exception.cc | 3 +++ util/exception.hh | 22 ++++++++++++++++++++++ util/file.cc | 19 ++++++++++++++++++- util/file.hh | 3 +++ util/file_piece.cc | 2 ++ util/probing_hash_table.hh | 5 +++-- 31 files changed, 155 insertions(+), 74 deletions(-) create mode 100644 lm/max_order.hh diff --git a/lm/Jamfile b/lm/Jamfile index 88455709b..fd169b000 100644 --- a/lm/Jamfile +++ b/lm/Jamfile @@ -17,4 +17,4 @@ run model_test.cc ../util//kenutil kenlm ..//boost_unit_test_framework : : test. exe query : ngram_query.cc kenlm ../util//kenutil ; exe build_binary : build_binary.cc kenlm ../util//kenutil ; -exe kenlm_max_order : max_order.cc : $(max-order) ; +exe kenlm_max_order : max_order.cc : .. 
$(max-order) ; diff --git a/lm/bhiksha.cc b/lm/bhiksha.cc index 870a4eee5..088ea98d4 100644 --- a/lm/bhiksha.cc +++ b/lm/bhiksha.cc @@ -50,7 +50,7 @@ std::size_t ArrayCount(uint64_t max_offset, uint64_t max_next, const Config &con } } // namespace -std::size_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) { +uint64_t ArrayBhiksha::Size(uint64_t max_offset, uint64_t max_next, const Config &config) { return sizeof(uint64_t) * (1 /* header */ + ArrayCount(max_offset, max_next, config)) + 7 /* 8-byte alignment */; } diff --git a/lm/bhiksha.hh b/lm/bhiksha.hh index 9734f3abd..8ff88654d 100644 --- a/lm/bhiksha.hh +++ b/lm/bhiksha.hh @@ -33,7 +33,7 @@ class DontBhiksha { static void UpdateConfigFromBinary(int /*fd*/, Config &/*config*/) {} - static std::size_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } + static uint64_t Size(uint64_t /*max_offset*/, uint64_t /*max_next*/, const Config &/*config*/) { return 0; } static uint8_t InlineBits(uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) { return util::RequiredBits(max_next); @@ -67,7 +67,7 @@ class ArrayBhiksha { static void UpdateConfigFromBinary(int fd, Config &config); - static std::size_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); + static uint64_t Size(uint64_t max_offset, uint64_t max_next, const Config &config); static uint8_t InlineBits(uint64_t max_offset, uint64_t max_next, const Config &config); diff --git a/lm/binary_format.cc b/lm/binary_format.cc index a56e998ef..fd841e592 100644 --- a/lm/binary_format.cc +++ b/lm/binary_format.cc @@ -200,10 +200,10 @@ void SeekPastHeader(int fd, const Parameters ¶ms) { util::SeekOrThrow(fd, TotalHeaderSize(params.counts.size())); } -uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing) { +uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing) { 
const uint64_t file_size = util::SizeFile(backing.file.get()); // The header is smaller than a page, so we have to map the whole header as well. - std::size_t total_map = TotalHeaderSize(params.counts.size()) + memory_size; + std::size_t total_map = util::CheckOverflow(TotalHeaderSize(params.counts.size()) + memory_size); if (file_size != util::kBadSize && static_cast(file_size) < total_map) UTIL_THROW(FormatLoadException, "Binary file has size " << file_size << " but the headers say it should be at least " << total_map); diff --git a/lm/binary_format.hh b/lm/binary_format.hh index dd795f620..bf699d5f4 100644 --- a/lm/binary_format.hh +++ b/lm/binary_format.hh @@ -70,7 +70,7 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet void SeekPastHeader(int fd, const Parameters ¶ms); -uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, std::size_t memory_size, Backing &backing); +uint8_t *SetupBinary(const Config &config, const Parameters ¶ms, uint64_t memory_size, Backing &backing); void ComplainAboutARPA(const Config &config, ModelType model_type); @@ -90,7 +90,7 @@ template void LoadLM(const char *file, const Config &config, To &to) new_config.probing_multiplier = params.fixed.probing_multiplier; detail::SeekPastHeader(backing.file.get(), params); To::UpdateConfigFromBinary(backing.file.get(), params.counts, new_config); - std::size_t memory_size = To::Size(params.counts, new_config); + uint64_t memory_size = To::Size(params.counts, new_config); uint8_t *start = detail::SetupBinary(new_config, params, memory_size, backing); to.InitializeFromBinary(start, params, new_config, backing.file.get()); } else { diff --git a/lm/build_binary.cc b/lm/build_binary.cc index 49901c9ea..2b8c9d5b2 100644 --- a/lm/build_binary.cc +++ b/lm/build_binary.cc @@ -11,6 +11,8 @@ #ifdef WIN32 #include "util/getopt.hh" +#else +#include #endif namespace lm { @@ -85,16 +87,16 @@ void ShowSizes(const char *file, const lm::ngram::Config &config) { 
std::vector counts; util::FilePiece f(file); lm::ReadARPACounts(f, counts); - std::size_t sizes[6]; + uint64_t sizes[6]; sizes[0] = ProbingModel::Size(counts, config); sizes[1] = RestProbingModel::Size(counts, config); sizes[2] = TrieModel::Size(counts, config); sizes[3] = QuantTrieModel::Size(counts, config); sizes[4] = ArrayTrieModel::Size(counts, config); sizes[5] = QuantArrayTrieModel::Size(counts, config); - std::size_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(size_t)); - std::size_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(size_t)); - std::size_t divide; + uint64_t max_length = *std::max_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); + uint64_t min_length = *std::min_element(sizes, sizes + sizeof(sizes) / sizeof(uint64_t)); + uint64_t divide; char prefix; if (min_length < (1 << 10) * 10) { prefix = ' '; diff --git a/lm/left.hh b/lm/left.hh index 751984c5e..8c27232e5 100644 --- a/lm/left.hh +++ b/lm/left.hh @@ -38,6 +38,7 @@ #ifndef LM_LEFT__ #define LM_LEFT__ +#include "lm/max_order.hh" #include "lm/state.hh" #include "lm/return.hh" diff --git a/lm/max_order.cc b/lm/max_order.cc index 6d4895bd4..94221201c 100644 --- a/lm/max_order.cc +++ b/lm/max_order.cc @@ -1,3 +1,4 @@ +#include "lm/max_order.hh" #include int main(int argc, char *argv[]) { diff --git a/lm/max_order.hh b/lm/max_order.hh new file mode 100644 index 000000000..e89f36a18 --- /dev/null +++ b/lm/max_order.hh @@ -0,0 +1,12 @@ +/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM. + * If not, this is the default maximum order. + * Having this limit means that State can be + * (kMaxOrder - 1) * sizeof(float) bytes instead of + * sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead + */ +#ifndef KENLM_MAX_ORDER +#define KENLM_MAX_ORDER 6 +#endif +#ifndef KENLM_ORDER_MESSAGE +#define KENLM_ORDER_MESSAGE "Recompile with e.g. `bjam --kenlm-max-order=6 -a' to change the maximum order." 
+#endif diff --git a/lm/model.cc b/lm/model.cc index aace40df9..40af8a637 100644 --- a/lm/model.cc +++ b/lm/model.cc @@ -5,12 +5,14 @@ #include "lm/search_hashed.hh" #include "lm/search_trie.hh" #include "lm/read_arpa.hh" +#include "util/have.hh" #include "util/murmur_hash.hh" #include #include #include #include +#include namespace lm { namespace ngram { @@ -18,17 +20,18 @@ namespace detail { template const ModelType GenericModel::kModelType = Search::kModelType; -template size_t GenericModel::Size(const std::vector &counts, const Config &config) { +template uint64_t GenericModel::Size(const std::vector &counts, const Config &config) { return VocabularyT::Size(counts[0], config) + Search::Size(counts, config); } template void GenericModel::SetupMemory(void *base, const std::vector &counts, const Config &config) { + size_t goal_size = util::CheckOverflow(Size(counts, config)); uint8_t *start = static_cast(base); size_t allocated = VocabularyT::Size(counts[0], config); vocab_.SetupMemory(start, allocated, counts[0], config); start += allocated; start = search_.SetupMemory(start, counts, config); - if (static_cast(start - static_cast(base)) != Size(counts, config)) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast(base)) << " but Size says they should take " << Size(counts, config)); + if (static_cast(start - static_cast(base)) != goal_size) UTIL_THROW(FormatLoadException, "The data structures took " << (start - static_cast(base)) << " but Size says they should take " << goal_size); } template GenericModel::GenericModel(const char *file, const Config &config) { @@ -47,8 +50,19 @@ template GenericModel::Ge P::Init(begin_sentence, null_context, vocab_, search_.Order()); } +namespace { +void CheckCounts(const std::vector &counts) { + UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << " but KenLM was compiled to support up to " << KENLM_MAX_ORDER << ". 
" << KENLM_ORDER_MESSAGE); + if (sizeof(uint64_t) > sizeof(std::size_t)) { + for (std::vector::const_iterator i = counts.begin(); i != counts.end(); ++i) { + UTIL_THROW_IF(*i > static_cast(std::numeric_limits::max()), util::OverflowException, "This model has " << *i << " " << (i - counts.begin() + 1) << "-grams which is too many for 32-bit machines."); + } + } +} +} // namespace + template void GenericModel::InitializeFromBinary(void *start, const Parameters ¶ms, const Config &config, int fd) { - UTIL_THROW_IF(params.counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << params.counts.size() << ". Re-compile (use -a), passing a number at least this large to bjam's --max-kenlm-order flag."); + CheckCounts(params.counts); SetupMemory(start, params.counts, config); vocab_.LoadedBinary(params.fixed.has_vocabulary, fd, config.enumerate_vocab); search_.LoadedBinary(); @@ -61,12 +75,11 @@ template void GenericModel counts; // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. ReadARPACounts(f, counts); - - UTIL_THROW_IF(counts.size() > KENLM_MAX_ORDER, FormatLoadException, "This model has order " << counts.size() << ". Re-compile (use -a), passing a number at least this large to bjam's --max-kenlm-order flag."); + CheckCounts(counts); if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); - std::size_t vocab_size = VocabularyT::Size(counts[0], config); + std::size_t vocab_size = util::CheckOverflow(VocabularyT::Size(counts[0], config)); // Setup the binary file for writing the vocab lookup table. The search_ is responsible for growing the binary file to its needs. 
vocab_.SetupMemory(SetupJustVocab(config, counts.size(), vocab_size, backing_), vocab_size, counts[0], config); diff --git a/lm/model.hh b/lm/model.hh index 6dee94196..13ff864e1 100644 --- a/lm/model.hh +++ b/lm/model.hh @@ -41,7 +41,7 @@ template class GenericModel : public base::Mod * does not include small non-mapped control structures, such as this class * itself. */ - static size_t Size(const std::vector &counts, const Config &config = Config()); + static uint64_t Size(const std::vector &counts, const Config &config = Config()); /* Load the model from a file. It may be an ARPA or binary file. Binary * files must have the format expected by this class or you'll get an diff --git a/lm/quantize.hh b/lm/quantize.hh index 36c427272..8ce2378a7 100644 --- a/lm/quantize.hh +++ b/lm/quantize.hh @@ -3,6 +3,7 @@ #include "lm/blank.hh" #include "lm/config.hh" +#include "lm/max_order.hh" #include "lm/model_type.hh" #include "util/bit_packing.hh" @@ -23,7 +24,7 @@ class DontQuantize { public: static const ModelType kModelTypeAdd = static_cast(0); static void UpdateConfigFromBinary(int, const std::vector &, Config &) {} - static std::size_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } + static uint64_t Size(uint8_t /*order*/, const Config &/*config*/) { return 0; } static uint8_t MiddleBits(const Config &/*config*/) { return 63; } static uint8_t LongestBits(const Config &/*config*/) { return 31; } @@ -137,9 +138,9 @@ class SeparatelyQuantize { static void UpdateConfigFromBinary(int fd, const std::vector &counts, Config &config); - static std::size_t Size(uint8_t order, const Config &config) { - size_t longest_table = (static_cast(1) << static_cast(config.prob_bits)) * sizeof(float); - size_t middle_table = (static_cast(1) << static_cast(config.backoff_bits)) * sizeof(float) + longest_table; + static uint64_t Size(uint8_t order, const Config &config) { + uint64_t longest_table = (static_cast(1) << static_cast(config.prob_bits)) * sizeof(float); + uint64_t 
middle_table = (static_cast(1) << static_cast(config.backoff_bits)) * sizeof(float) + longest_table; // unigrams are currently not quantized so no need for a table. return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding) */ 8; } diff --git a/lm/read_arpa.cc b/lm/read_arpa.cc index 70727e4cb..b709fef94 100644 --- a/lm/read_arpa.cc +++ b/lm/read_arpa.cc @@ -2,12 +2,13 @@ #include "lm/blank.hh" +#include #include #include +#include #include #include -#include #include #include @@ -31,6 +32,15 @@ bool IsEntirelyWhiteSpace(const StringPiece &line) { const char kBinaryMagic[] = "mmap lm http://kheafield.com/code"; +// strtoull isn't portable enough :-( +uint64_t ReadCount(const std::string &from) { + std::stringstream stream(from); + uint64_t ret; + stream >> ret; + UTIL_THROW_IF(!stream, FormatLoadException, "Bad count " << from); + return ret; +} + } // namespace void ReadARPACounts(util::FilePiece &in, std::vector &number) { @@ -52,15 +62,11 @@ void ReadARPACounts(util::FilePiece &in, std::vector &number) { // So strtol doesn't go off the end of line. 
std::string remaining(line.data() + 6, line.size() - 6); char *end_ptr; - unsigned long int length = std::strtol(remaining.c_str(), &end_ptr, 10); + unsigned int length = std::strtol(remaining.c_str(), &end_ptr, 10); if ((end_ptr == remaining.c_str()) || (length - 1 != number.size())) UTIL_THROW(FormatLoadException, "ngram count lengths should be consecutive starting with 1: " << line); if (*end_ptr != '=') UTIL_THROW(FormatLoadException, "Expected = immediately following the first number in the count line " << line); ++end_ptr; - const char *start = end_ptr; - long int count = std::strtol(start, &end_ptr, 10); - if (count < 0) UTIL_THROW(FormatLoadException, "Negative n-gram count " << count); - if (start == end_ptr) UTIL_THROW(FormatLoadException, "Couldn't parse n-gram count from " << line); - number.push_back(count); + number.push_back(ReadCount(end_ptr)); } } @@ -103,7 +109,7 @@ void ReadBackoff(util::FilePiece &in, float &backoff) { int float_class = _fpclass(backoff); UTIL_THROW_IF(float_class == _FPCLASS_SNAN || float_class == _FPCLASS_QNAN || float_class == _FPCLASS_NINF || float_class == _FPCLASS_PINF, FormatLoadException, "Bad backoff " << backoff); #else - int float_class = fpclassify(backoff); + int float_class = std::fpclassify(backoff); UTIL_THROW_IF(float_class == FP_NAN || float_class == FP_INFINITE, FormatLoadException, "Bad backoff " << backoff); #endif } diff --git a/lm/search_hashed.hh b/lm/search_hashed.hh index 7e8c12206..3bcde9214 100644 --- a/lm/search_hashed.hh +++ b/lm/search_hashed.hh @@ -74,8 +74,8 @@ template class HashedSearch { // TODO: move probing_multiplier here with next binary file format update. 
static void UpdateConfigFromBinary(int, const std::vector &, Config &) {} - static std::size_t Size(const std::vector &counts, const Config &config) { - std::size_t ret = Unigram::Size(counts[0]); + static uint64_t Size(const std::vector &counts, const Config &config) { + uint64_t ret = Unigram::Size(counts[0]); for (unsigned char n = 1; n < counts.size() - 1; ++n) { ret += Middle::Size(counts[n], config.probing_multiplier); } @@ -160,7 +160,7 @@ template class HashedSearch { #endif {} - static std::size_t Size(uint64_t count) { + static uint64_t Size(uint64_t count) { return (count + 1) * sizeof(ProbBackoff); // +1 for hallucinate } diff --git a/lm/search_trie.cc b/lm/search_trie.cc index 9a3e96916..debcfd077 100644 --- a/lm/search_trie.cc +++ b/lm/search_trie.cc @@ -5,6 +5,7 @@ #include "lm/binary_format.hh" #include "lm/blank.hh" #include "lm/lm_exception.hh" +#include "lm/max_order.hh" #include "lm/quantize.hh" #include "lm/trie.hh" #include "lm/trie_sort.hh" @@ -88,7 +89,7 @@ class BackoffMessages { if (!HasExtension(weights.backoff)) { weights.backoff = kExtensionBackoff; UTIL_THROW_IF(fseek(unigrams, -sizeof(weights), SEEK_CUR), util::ErrnoException, "Seeking backwards to denote unigram extension failed."); - WriteOrThrow(unigrams, &weights, sizeof(weights)); + util::WriteOrThrow(unigrams, &weights, sizeof(weights)); } const ProbPointer &write_to = *reinterpret_cast(current_ + sizeof(WordIndex)); base[write_to.array][write_to.index] += weights.backoff; diff --git a/lm/search_trie.hh b/lm/search_trie.hh index 10b22ab18..1264baf5a 100644 --- a/lm/search_trie.hh +++ b/lm/search_trie.hh @@ -44,8 +44,8 @@ template class TrieSearch { Bhiksha::UpdateConfigFromBinary(fd, config); } - static std::size_t Size(const std::vector &counts, const Config &config) { - std::size_t ret = Quant::Size(counts.size(), config) + Unigram::Size(counts[0]); + static uint64_t Size(const std::vector &counts, const Config &config) { + uint64_t ret = Quant::Size(counts.size(), config) + 
Unigram::Size(counts[0]); for (unsigned char i = 1; i < counts.size() - 1; ++i) { ret += Middle::Size(Quant::MiddleBits(config), counts[i], counts[0], counts[i+1], config); } diff --git a/lm/state.hh b/lm/state.hh index 3dbf617bf..830e40aa2 100644 --- a/lm/state.hh +++ b/lm/state.hh @@ -1,6 +1,7 @@ #ifndef LM_STATE__ #define LM_STATE__ +#include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/murmur_hash.hh" diff --git a/lm/trie.cc b/lm/trie.cc index 0f1ca574b..d9895f89d 100644 --- a/lm/trie.cc +++ b/lm/trie.cc @@ -36,7 +36,7 @@ bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_ } } // namespace -std::size_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { +uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) { uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits; // Extra entry for next pointer at the end. // +7 then / 8 to round up bits and convert to bytes @@ -57,7 +57,7 @@ void BitPacked::BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits) max_vocab_ = max_vocab; } -template std::size_t BitPackedMiddle::Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const Config &config) { +template uint64_t BitPackedMiddle::Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_ptr, const Config &config) { return Bhiksha::Size(entries + 1, max_ptr, config) + BaseSize(entries, max_vocab, quant_bits + Bhiksha::InlineBits(entries + 1, max_ptr, config)); } diff --git a/lm/trie.hh b/lm/trie.hh index 034a14144..9ea3c5466 100644 --- a/lm/trie.hh +++ b/lm/trie.hh @@ -49,7 +49,7 @@ class Unigram { unigram_ = static_cast(start); } - static std::size_t Size(uint64_t count) { + static uint64_t Size(uint64_t count) { // +1 in case unknown doesn't appear. +1 for the final next. 
return (count + 2) * sizeof(UnigramValue); } @@ -84,7 +84,7 @@ class BitPacked { } protected: - static std::size_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); + static uint64_t BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits); void BaseInit(void *base, uint64_t max_vocab, uint8_t remaining_bits); @@ -99,7 +99,7 @@ class BitPacked { template class BitPackedMiddle : public BitPacked { public: - static std::size_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config); // next_source need not be initialized. BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config); @@ -128,7 +128,7 @@ template class BitPackedMiddle : public BitPacked { class BitPackedLongest : public BitPacked { public: - static std::size_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { + static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab) { return BaseSize(entries, max_vocab, quant_bits); } diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc index 0d83221e2..8663e94e1 100644 --- a/lm/trie_sort.cc +++ b/lm/trie_sort.cc @@ -22,12 +22,6 @@ namespace lm { namespace ngram { namespace trie { - -void WriteOrThrow(FILE *to, const void *data, size_t size) { - assert(size); - if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); -} - namespace { typedef util::SizedIterator NGramIter; @@ -95,12 +89,12 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const util::TempMaker &make // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator. 
if (context_begin == context_end) return out.release(); PartialIter i(context_begin); - WriteOrThrow(out.get(), i->Data(), context_size); + util::WriteOrThrow(out.get(), i->Data(), context_size); const void *previous = i->Data(); ++i; for (; i != context_end; ++i) { if (memcmp(previous, i->Data(), context_size)) { - WriteOrThrow(out.get(), i->Data(), context_size); + util::WriteOrThrow(out.get(), i->Data(), context_size); previous = i->Data(); } } @@ -116,7 +110,7 @@ struct ThrowCombine { // Useful for context files that just contain records with no value. struct FirstCombine { void operator()(std::size_t entry_size, const void *first, const void * /*second*/, FILE *out) const { - WriteOrThrow(out, first, entry_size); + util::WriteOrThrow(out, first, entry_size); } }; @@ -129,10 +123,10 @@ template FILE *MergeSortedFiles(FILE *first_file, FILE *second_f EntryCompare less(order); while (first && second) { if (less(first.Data(), second.Data())) { - WriteOrThrow(out_file.get(), first.Data(), entry_size); + util::WriteOrThrow(out_file.get(), first.Data(), entry_size); ++first; } else if (less(second.Data(), first.Data())) { - WriteOrThrow(out_file.get(), second.Data(), entry_size); + util::WriteOrThrow(out_file.get(), second.Data(), entry_size); ++second; } else { combine(entry_size, first.Data(), second.Data(), out_file.get()); @@ -140,7 +134,7 @@ template FILE *MergeSortedFiles(FILE *first_file, FILE *second_f } } for (RecordReader &remains = (first ? 
first : second); remains; ++remains) { - WriteOrThrow(out_file.get(), remains.Data(), entry_size); + util::WriteOrThrow(out_file.get(), remains.Data(), entry_size); } return out_file.release(); } @@ -164,7 +158,7 @@ void RecordReader::Init(FILE *file, std::size_t entry_size) { void RecordReader::Overwrite(const void *start, std::size_t amount) { long internal = (uint8_t*)start - (uint8_t*)data_.get(); UTIL_THROW_IF(fseek(file_, internal - entry_size_, SEEK_CUR), util::ErrnoException, "Couldn't seek backwards for revision"); - WriteOrThrow(file_, start, amount); + util::WriteOrThrow(file_, start, amount); long forward = entry_size_ - internal - amount; #if !defined(_WIN32) && !defined(_WIN64) if (forward) diff --git a/lm/trie_sort.hh b/lm/trie_sort.hh index c1be9bfc4..2197b80ce 100644 --- a/lm/trie_sort.hh +++ b/lm/trie_sort.hh @@ -3,6 +3,7 @@ #ifndef LM_TRIE_SORT__ #define LM_TRIE_SORT__ +#include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/file.hh" @@ -28,8 +29,6 @@ struct Config; namespace trie { -void WriteOrThrow(FILE *to, const void *data, size_t size); - class EntryCompare : public std::binary_function { public: explicit EntryCompare(unsigned char order) : order_(order) {} diff --git a/lm/vocab.cc b/lm/vocab.cc index 5de68f16e..398475bee 100644 --- a/lm/vocab.cc +++ b/lm/vocab.cc @@ -87,7 +87,7 @@ void WriteWordsWrapper::Write(int fd) { SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {} -std::size_t SortedVocabulary::Size(std::size_t entries, const Config &/*config*/) { +uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) { // Lead with the number of entries. 
return sizeof(uint64_t) + sizeof(uint64_t) * entries; } @@ -165,7 +165,7 @@ struct ProbingVocabularyHeader { ProbingVocabulary::ProbingVocabulary() : enumerate_(NULL) {} -std::size_t ProbingVocabulary::Size(std::size_t entries, const Config &config) { +uint64_t ProbingVocabulary::Size(uint64_t entries, const Config &config) { return ALIGN8(sizeof(detail::ProbingVocabularyHeader)) + Lookup::Size(entries, config.probing_multiplier); } diff --git a/lm/vocab.hh b/lm/vocab.hh index a25432f97..074cd446e 100644 --- a/lm/vocab.hh +++ b/lm/vocab.hh @@ -62,7 +62,7 @@ class SortedVocabulary : public base::Vocabulary { } // Size for purposes of file writing - static size_t Size(std::size_t entries, const Config &config); + static uint64_t Size(uint64_t entries, const Config &config); // Vocab words are [0, Bound()) Only valid after FinishedLoading/LoadedBinary. WordIndex Bound() const { return bound_; } @@ -129,7 +129,7 @@ class ProbingVocabulary : public base::Vocabulary { return lookup_.Find(detail::HashForVocab(str), i) ? i->value : 0; } - static size_t Size(std::size_t entries, const Config &config); + static uint64_t Size(uint64_t entries, const Config &config); // Vocab words are [0, Bound()). 
WordIndex Bound() const { return bound_; } diff --git a/util/ersatz_progress.cc b/util/ersatz_progress.cc index 07b14e26d..eb635ad8a 100644 --- a/util/ersatz_progress.cc +++ b/util/ersatz_progress.cc @@ -9,16 +9,16 @@ namespace util { namespace { const unsigned char kWidth = 100; } -ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits::max()), complete_(next_), out_(NULL) {} +ErsatzProgress::ErsatzProgress() : current_(0), next_(std::numeric_limits::max()), complete_(next_), out_(NULL) {} ErsatzProgress::~ErsatzProgress() { if (out_) Finished(); } -ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std::string &message) +ErsatzProgress::ErsatzProgress(uint64_t complete, std::ostream *to, const std::string &message) : current_(0), next_(complete / kWidth), complete_(complete), stones_written_(0), out_(to) { if (!out_) { - next_ = std::numeric_limits::max(); + next_ = std::numeric_limits::max(); return; } if (!message.empty()) *out_ << message << '\n'; @@ -28,14 +28,14 @@ ErsatzProgress::ErsatzProgress(std::size_t complete, std::ostream *to, const std void ErsatzProgress::Milestone() { if (!out_) { current_ = 0; return; } if (!complete_) return; - unsigned char stone = std::min(static_cast(kWidth), (current_ * kWidth) / complete_); + unsigned char stone = std::min(static_cast(kWidth), (current_ * kWidth) / complete_); for (; stones_written_ < stone; ++stones_written_) { (*out_) << '*'; } if (stone == kWidth) { (*out_) << std::endl; - next_ = std::numeric_limits::max(); + next_ = std::numeric_limits::max(); out_ = NULL; } else { next_ = std::max(next_, (stone * complete_) / kWidth); diff --git a/util/ersatz_progress.hh b/util/ersatz_progress.hh index f709dc516..ff4d590ff 100644 --- a/util/ersatz_progress.hh +++ b/util/ersatz_progress.hh @@ -4,6 +4,8 @@ #include #include +#include + // Ersatz version of boost::progress so core language model doesn't depend on // boost. Also adds option to print nothing. 
@@ -14,7 +16,7 @@ class ErsatzProgress { ErsatzProgress(); // Null means no output. The null value is useful for passing along the ostream pointer from another caller. - explicit ErsatzProgress(std::size_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); + explicit ErsatzProgress(uint64_t complete, std::ostream *to = &std::cerr, const std::string &message = ""); ~ErsatzProgress(); @@ -23,12 +25,12 @@ class ErsatzProgress { return *this; } - ErsatzProgress &operator+=(std::size_t amount) { + ErsatzProgress &operator+=(uint64_t amount) { if ((current_ += amount) >= next_) Milestone(); return *this; } - void Set(std::size_t to) { + void Set(uint64_t to) { if ((current_ = to) >= next_) Milestone(); Milestone(); } @@ -40,7 +42,7 @@ class ErsatzProgress { private: void Milestone(); - std::size_t current_, next_, complete_; + uint64_t current_, next_, complete_; unsigned char stones_written_; std::ostream *out_; diff --git a/util/exception.cc b/util/exception.cc index c4f8c04ce..3806e6de4 100644 --- a/util/exception.cc +++ b/util/exception.cc @@ -84,4 +84,7 @@ EndOfFileException::EndOfFileException() throw() { } EndOfFileException::~EndOfFileException() throw() {} +OverflowException::OverflowException() throw() {} +OverflowException::~OverflowException() throw() {} + } // namespace util diff --git a/util/exception.hh b/util/exception.hh index 6d6a37cb1..83f99cd6f 100644 --- a/util/exception.hh +++ b/util/exception.hh @@ -2,9 +2,12 @@ #define UTIL_EXCEPTION__ #include +#include #include #include +#include + namespace util { template typename Except::template ExceptionTag::Identity operator<<(Except &e, const Data &data); @@ -111,6 +114,25 @@ class EndOfFileException : public Exception { ~EndOfFileException() throw(); }; +class OverflowException : public Exception { + public: + OverflowException() throw(); + ~OverflowException() throw(); +}; + +template inline std::size_t CheckOverflowInternal(uint64_t value) { + UTIL_THROW_IF(value > 
static_cast(std::numeric_limits::max()), OverflowException, "Integer overflow detected. This model is too big for 32-bit code."); + return value; +} + +template <> inline std::size_t CheckOverflowInternal<8>(uint64_t value) { + return value; +} + +inline std::size_t CheckOverflow(uint64_t value) { + return CheckOverflowInternal(value); +} + } // namespace util #endif // UTIL_EXCEPTION__ diff --git a/util/file.cc b/util/file.cc index 4899e5ac3..834237c8d 100644 --- a/util/file.cc +++ b/util/file.cc @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -111,6 +112,11 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) { } } +void WriteOrThrow(FILE *to, const void *data, std::size_t size) { + assert(size); + if (1 != std::fwrite(data, size, 1, to)) UTIL_THROW(util::ErrnoException, "Short write; requested size " << size); +} + void FSyncOrThrow(int fd) { // Apparently windows doesn't have fsync? #if !defined(_WIN32) && !defined(_WIN64) @@ -119,8 +125,13 @@ void FSyncOrThrow(int fd) { } namespace { -void InternalSeek(int fd, off_t off, int whence) { +void InternalSeek(int fd, int64_t off, int whence) { +#if defined(_WIN32) || defined(_WIN64) + UTIL_THROW_IF((__int64)-1 == _lseeki64(fd, off, whence), ErrnoException, "Windows seek failed"); + +#else UTIL_THROW_IF((off_t)-1 == lseek(fd, off, whence), ErrnoException, "Seek failed"); +#endif } } // namespace @@ -143,6 +154,12 @@ std::FILE *FDOpenOrThrow(scoped_fd &file) { return ret; } +std::FILE *FOpenOrThrow(const char *path, const char *mode) { + std::FILE *ret; + UTIL_THROW_IF(!(ret = fopen(path, mode)), util::ErrnoException, "Could not fopen " << path << " for " << mode); + return ret; +} + TempMaker::TempMaker(const std::string &prefix) : base_(prefix) { base_ += "XXXXXX"; } diff --git a/util/file.hh b/util/file.hh index 8af1ff4ff..0108acc95 100644 --- a/util/file.hh +++ b/util/file.hh @@ -80,6 +80,7 @@ void ReadOrThrow(int fd, void *to, std::size_t size); std::size_t ReadOrEOF(int 
fd, void *to_void, std::size_t amount); void WriteOrThrow(int fd, const void *data_void, std::size_t size); +void WriteOrThrow(FILE *to, const void *data, std::size_t size); void FSyncOrThrow(int fd); @@ -90,6 +91,8 @@ void SeekEnd(int fd); std::FILE *FDOpenOrThrow(scoped_fd &file); +std::FILE *FOpenOrThrow(const char *path, const char *mode); + class TempMaker { public: explicit TempMaker(const std::string &prefix); diff --git a/util/file_piece.cc b/util/file_piece.cc index 19a68728a..280f438c3 100644 --- a/util/file_piece.cc +++ b/util/file_piece.cc @@ -5,6 +5,8 @@ #include "util/mmap.hh" #ifdef WIN32 #include +#else +#include #endif // WIN32 #include diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh index 3354b68ef..770faa7e3 100644 --- a/util/probing_hash_table.hh +++ b/util/probing_hash_table.hh @@ -8,6 +8,7 @@ #include #include +#include namespace util { @@ -42,8 +43,8 @@ template (multiplier * static_cast(entries))); + static uint64_t Size(uint64_t entries, float multiplier) { + uint64_t buckets = std::max(entries + 1, static_cast(multiplier * static_cast(entries))); return buckets * sizeof(Entry); } -- cgit v1.2.3 From a323c8daf7793a14d618885c0db781b38a8648f1 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 28 Sep 2012 14:37:53 -0400 Subject: Send stderr to /dev/null when looking for pawd. This cleans up the logs a bit for those of us who don't have pawd. Otherwise, messages like the following show up in the logs: /usr/bin/which: no pawd in ... 
bash: pawd: command not found --- scripts/generic/qsub-wrapper.pl | 2 +- scripts/training/absolutize_moses_model.pl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/generic/qsub-wrapper.pl b/scripts/generic/qsub-wrapper.pl index 65158c00f..e34c84a74 100755 --- a/scripts/generic/qsub-wrapper.pl +++ b/scripts/generic/qsub-wrapper.pl @@ -248,7 +248,7 @@ sub safesystem { sub getPwdCmd(){ my $pwdcmd="pwd"; my $a; - chomp($a=`which pawd | head -1 | awk '{print $1}'`); + chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print $1}'`); if ($a && -e $a){ $pwdcmd=$a; } return $pwdcmd; } diff --git a/scripts/training/absolutize_moses_model.pl b/scripts/training/absolutize_moses_model.pl index 1b485a01f..99efafe8e 100755 --- a/scripts/training/absolutize_moses_model.pl +++ b/scripts/training/absolutize_moses_model.pl @@ -98,7 +98,7 @@ sub ensure_absolute { my $target = shift; my $originfile = shift; - my $cwd = `pawd`; + my $cwd = `pawd 2> /dev/null`; $cwd = `pwd` if ! defined $cwd; # not everyone has pawd! die "Failed to absolutize $target. Failing to get cwd!" if ! defined $cwd; chomp $cwd; -- cgit v1.2.3 From 7b042edc6c8318febc2e97f856f187634275ac6a Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Fri, 28 Sep 2012 14:55:09 -0400 Subject: Send stderr to /dev/null when looking for pawd. This cleans up the logs a bit for those of us who don't have pawd. Otherwise, messages like the following show up in the logs: /usr/bin/which: no pawd in ... 
--- scripts/generic/moses-parallel.pl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/generic/moses-parallel.pl b/scripts/generic/moses-parallel.pl index 0a3354183..d1840fc55 100755 --- a/scripts/generic/moses-parallel.pl +++ b/scripts/generic/moses-parallel.pl @@ -965,7 +965,7 @@ sub safesystem { sub getPwdCmd(){ my $pwdcmd="pwd"; my $a; - chomp($a=`which pawd | head -1 | awk '{print $1}'`); + chomp($a=`which pawd 2> /dev/null | head -1 | awk '{print $1}'`); if ($a && -e $a){ $pwdcmd=$a; } return $pwdcmd; } -- cgit v1.2.3 From 618e2d51a672fe794b5d3ca405549cc2cfbca07c Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Mon, 1 Oct 2012 17:18:49 +0100 Subject: Apparently nobody uses hash.{h,cpp} --- moses/src/Hypothesis.cpp | 1 - moses/src/hash.cpp | 70 ------------------------------------------------ moses/src/hash.h | 8 ------ 3 files changed, 79 deletions(-) delete mode 100644 moses/src/hash.cpp delete mode 100644 moses/src/hash.h diff --git a/moses/src/Hypothesis.cpp b/moses/src/Hypothesis.cpp index b9e40b75f..530665f60 100644 --- a/moses/src/Hypothesis.cpp +++ b/moses/src/Hypothesis.cpp @@ -37,7 +37,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA #include "InputType.h" #include "LMList.h" #include "Manager.h" -#include "hash.h" using namespace std; diff --git a/moses/src/hash.cpp b/moses/src/hash.cpp deleted file mode 100644 index 9afac463d..000000000 --- a/moses/src/hash.cpp +++ /dev/null @@ -1,70 +0,0 @@ -// $Id$ - -#define mix(a,b,c) \ -{ \ - a -= b; a -= c; a ^= (c>>13); \ - b -= c; b -= a; b ^= (a<<8); \ - c -= a; c -= b; c ^= (b>>13); \ - a -= b; a -= c; a ^= (c>>12); \ - b -= c; b -= a; b ^= (a<<16); \ - c -= a; c -= b; c ^= (b>>5); \ - a -= b; a -= c; a ^= (c>>3); \ - b -= c; b -= a; b ^= (a<<10); \ - c -= a; c -= b; c ^= (b>>15); \ -} - -/* the key */ -/* the length of the key */ -/* the previous hash, or an arbitrary value */ -unsigned int quick_hash(register const char *k, register unsigned 
int length, register unsigned int initval) -{ - register unsigned int a,b,c,len; - - /* Set up the internal state */ - len = length; - a = b = 0x9e3779b9; /* the golden ratio; an arbitrary value */ - c = initval; /* the previous hash value */ - - /*---------------------------------------- handle most of the key */ - while (len >= 12) { - a += (k[0] +((unsigned int)k[1]<<8) +((unsigned int)k[2]<<16) +((unsigned int)k[3]<<24)); - b += (k[4] +((unsigned int)k[5]<<8) +((unsigned int)k[6]<<16) +((unsigned int)k[7]<<24)); - c += (k[8] +((unsigned int)k[9]<<8) +((unsigned int)k[10]<<16)+((unsigned int)k[11]<<24)); - mix(a,b,c); - k += 12; - len -= 12; - } - - /*------------------------------------- handle the last 11 bytes */ - c += length; - switch(len) { /* all the case statements fall through */ - case 11: - c+=((unsigned int)k[10]<<24); - case 10: - c+=((unsigned int)k[9]<<16); - case 9 : - c+=((unsigned int)k[8]<<8); - /* the first byte of c is reserved for the length */ - case 8 : - b+=((unsigned int)k[7]<<24); - case 7 : - b+=((unsigned int)k[6]<<16); - case 6 : - b+=((unsigned int)k[5]<<8); - case 5 : - b+=k[4]; - case 4 : - a+=((unsigned int)k[3]<<24); - case 3 : - a+=((unsigned int)k[2]<<16); - case 2 : - a+=((unsigned int)k[1]<<8); - case 1 : - a+=k[0]; - /* case 0: nothing left to add */ - } - mix(a,b,c); - /*-------------------------------------------- report the result */ - return c; -} - diff --git a/moses/src/hash.h b/moses/src/hash.h deleted file mode 100644 index 227f3b90a..000000000 --- a/moses/src/hash.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef moses_hash_h -#define moses_hash_h - -// taken from burtleburtle.net/bob/hash/doobs.html -unsigned int quick_hash(register const char *k, register unsigned int length, register unsigned int initval); - -#endif - -- cgit v1.2.3 From 82ab7c1507b73a5005382a29ca54885974cbc9d1 Mon Sep 17 00:00:00 2001 From: Lane Schwartz Date: Tue, 2 Oct 2012 09:24:20 -0400 Subject: Force ems to pass -S /bin/bash to qsub. 
For reasons that defy comprehension, when qsub runs scripts, it blatantly ignores the shebang line that specifies a shell to use. Instead, SGE has its own config variable that defines what shell to use when running scripts via qsub. The -S /bin/bash option to qsub forces SGE to launch your script using bash. The scripts created by experiment.perl all assume they will be run with bash, so it is incumbent upon experiment.perl to ensure that SGE uses bash to run them. --- scripts/ems/experiment.perl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index 00b7da721..7dd83151d 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -1059,7 +1059,7 @@ sub execute_steps { $DO{$i}++; print "\texecuting $step via qsub ($active active)\n"; my $qsub_args = &get_qsub_args($DO_STEP[$i]); - `qsub $qsub_args -e $step.STDERR -o $step.STDOUT $step`; + `qsub $qsub_args -S /bin/bash -e $step.STDERR -o $step.STDOUT $step`; } # execute in fork -- cgit v1.2.3 From f92d63c9a211f22e0fc184b481b9e54064edecfa Mon Sep 17 00:00:00 2001 From: wlin12 Date: Tue, 2 Oct 2012 18:54:51 +0100 Subject: input type bug fix --- contrib/relent-filter/scripts/prunePT.pl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/contrib/relent-filter/scripts/prunePT.pl b/contrib/relent-filter/scripts/prunePT.pl index 37dc30bad..b443cc14e 100755 --- a/contrib/relent-filter/scripts/prunePT.pl +++ b/contrib/relent-filter/scripts/prunePT.pl @@ -7,8 +7,8 @@ my $threshold = -1; use Getopt::Long; $_HELP = 1 if (@ARGV < 1 or !GetOptions ("table=s" => \$table, #table to filter "scores=s" => \$scores_file, #scores of each phrase pair, should have same size as the table to filter -"percentage=i" => \$percentage, # percentage of phrase table to remain -"threshold=i" => \$threshold)); # threshold (score < threshold equals prune entry) +"percentage=f" => \$percentage, # percentage of phrase table to remain 
+"threshold=f" => \$threshold)); # threshold (score < threshold equals prune entry) # help message if arguments are not correct if ($_HELP) { -- cgit v1.2.3 From 69a0fcde29abb7871706cbf2ce1d7f065401cfb1 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Wed, 3 Oct 2012 14:23:46 +0100 Subject: Merge ShrinkToLimit into ApplyThreshold --- moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.cpp | 2 -- moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp | 2 -- moses/src/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp | 2 -- moses/src/ChartTranslationOptionList.cpp | 6 +----- moses/src/ChartTranslationOptionList.h | 1 - 5 files changed, 1 insertion(+), 12 deletions(-) diff --git a/moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.cpp b/moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.cpp index 904d536e1..57ab61b29 100644 --- a/moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.cpp +++ b/moses/src/CYKPlusParser/ChartRuleLookupManagerMemory.cpp @@ -165,8 +165,6 @@ void ChartRuleLookupManagerMemory::GetChartRuleCollection( } dottedRuleCol.Clear(relEndPos+1); - - outColl.ShrinkToLimit(); } // Given a partial rule application ending at startPos-1 and given the sets of diff --git a/moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp b/moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp index e504a24d7..ca2e1d395 100644 --- a/moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp +++ b/moses/src/CYKPlusParser/ChartRuleLookupManagerMemoryPerSentence.cpp @@ -165,8 +165,6 @@ void ChartRuleLookupManagerMemoryPerSentence::GetChartRuleCollection( } dottedRuleCol.Clear(relEndPos+1); - - outColl.ShrinkToLimit(); } // Given a partial rule application ending at startPos-1 and given the sets of diff --git a/moses/src/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp b/moses/src/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp index 08db10472..f8270d113 100644 --- a/moses/src/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp 
+++ b/moses/src/CYKPlusParser/ChartRuleLookupManagerOnDisk.cpp @@ -268,8 +268,6 @@ void ChartRuleLookupManagerOnDisk::GetChartRuleCollection( } } // for (size_t ind = 0; ind < savedNodeColl.size(); ++ind) - outColl.ShrinkToLimit(); - //cerr << numDerivations << " "; } diff --git a/moses/src/ChartTranslationOptionList.cpp b/moses/src/ChartTranslationOptionList.cpp index 41b059239..eb2add105 100644 --- a/moses/src/ChartTranslationOptionList.cpp +++ b/moses/src/ChartTranslationOptionList.cpp @@ -98,8 +98,7 @@ void ChartTranslationOptionList::Add(const TargetPhraseCollection &tpc, } } -void ChartTranslationOptionList::ShrinkToLimit() -{ +void ChartTranslationOptionList::ApplyThreshold() { if (m_size > m_ruleLimit) { // Something's gone wrong if the list has grown to m_ruleLimit * 2 // without being pruned. @@ -112,10 +111,7 @@ void ChartTranslationOptionList::ShrinkToLimit() ChartTranslationOptionOrderer()); m_size = m_ruleLimit; } -} -void ChartTranslationOptionList::ApplyThreshold() -{ // keep only those over best + threshold float scoreThreshold = -std::numeric_limits::infinity(); diff --git a/moses/src/ChartTranslationOptionList.h b/moses/src/ChartTranslationOptionList.h index 3bd56c2a3..3197037c6 100644 --- a/moses/src/ChartTranslationOptionList.h +++ b/moses/src/ChartTranslationOptionList.h @@ -46,7 +46,6 @@ class ChartTranslationOptionList const WordsRange &); void Clear(); - void ShrinkToLimit(); void ApplyThreshold(); private: -- cgit v1.2.3 From 7f5b6b231000f52cf9280ee92262933ce270e4ff Mon Sep 17 00:00:00 2001 From: Wilker Aziz Date: Wed, 3 Oct 2012 20:04:48 +0200 Subject: better doc --- contrib/python/README.md | 10 ++++++-- contrib/python/binpt/binpt.cpp | 58 +++++++++++++++++++++--------------------- contrib/python/binpt/binpt.pyx | 8 +++--- contrib/python/setup.py | 21 ++++++++++----- 4 files changed, 55 insertions(+), 42 deletions(-) diff --git a/contrib/python/README.md b/contrib/python/README.md index fa7d270c8..d9ef1cd27 100644 --- 
a/contrib/python/README.md +++ b/contrib/python/README.md @@ -10,9 +10,15 @@ The idea is to have some of Moses' internals exposed to Python (inspired on pycd ## Building -1. Build the python extension +1. Build the python extension: - python setup.py build_ext -i [--with-cmph] + You need to compile Moses with link=shared and (for while) without SRILM (for some reason SRILM prevents the compiler from generating libLM.so) + + ./bjam --libdir=path cxxflags=-fPIC link=shared + + Then you can build the extension (in case you used --libdir=path above, use --moses-lib=path below) + + python setup.py build_ext -i [--with-cmph] [--moses-lib=path] 3. Check the example code diff --git a/contrib/python/binpt/binpt.cpp b/contrib/python/binpt/binpt.cpp index 7de3058fc..0abf86ab1 100644 --- a/contrib/python/binpt/binpt.cpp +++ b/contrib/python/binpt/binpt.cpp @@ -1,4 +1,4 @@ -/* Generated by Cython 0.16 on Tue Sep 18 11:36:58 2012 */ +/* Generated by Cython 0.16 on Fri Sep 21 10:28:51 2012 */ #define PY_SSIZE_T_CLEAN #include "Python.h" @@ -692,7 +692,7 @@ static PyObject *__pyx_pf_5binpt_11QueryResult_2words(struct __pyx_obj_5binpt_Qu static PyObject *__pyx_pf_5binpt_11QueryResult_4scores(struct __pyx_obj_5binpt_QueryResult *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_5binpt_11QueryResult_6wa(struct __pyx_obj_5binpt_QueryResult *__pyx_v_self); /* proto */ static PyObject *__pyx_lambda_funcdef_lambda1(CYTHON_UNUSED PyObject *__pyx_self, PyObject *__pyx_v_r); /* proto */ -static PyObject *__pyx_pf_5binpt_11QueryResult_8desc(PyObject *__pyx_v_x, PyObject *__pyx_v_y, PyObject *__pyx_v_keys); /* proto */ +static PyObject *__pyx_pf_5binpt_11QueryResult_8desc(PyObject *__pyx_v_x, PyObject *__pyx_v_y, PyObject *__pyx_v_key); /* proto */ static PyObject *__pyx_pf_5binpt_11QueryResult_10__str__(struct __pyx_obj_5binpt_QueryResult *__pyx_v_self); /* proto */ static PyObject *__pyx_pf_5binpt_11QueryResult_12__repr__(struct __pyx_obj_5binpt_QueryResult *__pyx_v_self); /* proto */ 
static int __pyx_pf_5binpt_17BinaryPhraseTable___cinit__(struct __pyx_obj_5binpt_BinaryPhraseTable *__pyx_v_self, PyObject *__pyx_v_path, unsigned int __pyx_v_nscores, int __pyx_v_wa, PyObject *__pyx_v_delimiters); /* proto */ @@ -722,10 +722,10 @@ static char __pyx_k__y[] = "y"; static char __pyx_k__os[] = "os"; static char __pyx_k__wa[] = "wa"; static char __pyx_k__cmp[] = "cmp"; +static char __pyx_k__key[] = "key"; static char __pyx_k__top[] = "top"; static char __pyx_k__desc[] = "desc"; static char __pyx_k__join[] = "join"; -static char __pyx_k__keys[] = "keys"; static char __pyx_k__line[] = "line"; static char __pyx_k__path[] = "path"; static char __pyx_k__sort[] = "sort"; @@ -771,7 +771,7 @@ static PyObject *__pyx_n_s__encode; static PyObject *__pyx_n_s__isValidBinaryTable; static PyObject *__pyx_n_s__isfile; static PyObject *__pyx_n_s__join; -static PyObject *__pyx_n_s__keys; +static PyObject *__pyx_n_s__key; static PyObject *__pyx_n_s__line; static PyObject *__pyx_n_s__nscores; static PyObject *__pyx_n_s__os; @@ -1276,13 +1276,13 @@ static PyObject *__pyx_pf_5binpt_11QueryResult_6wa(struct __pyx_obj_5binpt_Query /* Python wrapper */ static PyObject *__pyx_pw_5binpt_11QueryResult_9desc(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds); /*proto*/ -static char __pyx_doc_5binpt_11QueryResult_8desc[] = "Returns the sign of keys(y) - keys(x).\n Can only be used if scores is not an empty vector as\n keys defaults to scores[0]"; +static char __pyx_doc_5binpt_11QueryResult_8desc[] = "Returns the sign of key(y) - key(x).\n Can only be used if scores is not an empty vector as\n keys defaults to scores[0]"; static PyMethodDef __pyx_mdef_5binpt_11QueryResult_9desc = {__Pyx_NAMESTR("desc"), (PyCFunction)__pyx_pw_5binpt_11QueryResult_9desc, METH_VARARGS|METH_KEYWORDS, __Pyx_DOCSTR(__pyx_doc_5binpt_11QueryResult_8desc)}; static PyObject *__pyx_pw_5binpt_11QueryResult_9desc(PyObject *__pyx_self, PyObject *__pyx_args, PyObject *__pyx_kwds) { PyObject 
*__pyx_v_x = 0; PyObject *__pyx_v_y = 0; - PyObject *__pyx_v_keys = 0; - static PyObject **__pyx_pyargnames[] = {&__pyx_n_s__x,&__pyx_n_s__y,&__pyx_n_s__keys,0}; + PyObject *__pyx_v_key = 0; + static PyObject **__pyx_pyargnames[] = {&__pyx_n_s__x,&__pyx_n_s__y,&__pyx_n_s__key,0}; PyObject *__pyx_r = 0; __Pyx_RefNannyDeclarations __Pyx_RefNannySetupContext("desc (wrapper)", 0); @@ -1313,7 +1313,7 @@ static PyObject *__pyx_pw_5binpt_11QueryResult_9desc(PyObject *__pyx_self, PyObj } case 2: if (kw_args > 0) { - PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s__keys); + PyObject* value = PyDict_GetItem(__pyx_kwds, __pyx_n_s__key); if (value) { values[2] = value; kw_args--; } } } @@ -1331,7 +1331,7 @@ static PyObject *__pyx_pw_5binpt_11QueryResult_9desc(PyObject *__pyx_self, PyObj } __pyx_v_x = values[0]; __pyx_v_y = values[1]; - __pyx_v_keys = values[2]; + __pyx_v_key = values[2]; } goto __pyx_L4_argument_unpacking_done; __pyx_L5_argtuple_error:; @@ -1341,7 +1341,7 @@ static PyObject *__pyx_pw_5binpt_11QueryResult_9desc(PyObject *__pyx_self, PyObj __Pyx_RefNannyFinishContext(); return NULL; __pyx_L4_argument_unpacking_done:; - __pyx_r = __pyx_pf_5binpt_11QueryResult_8desc(__pyx_v_x, __pyx_v_y, __pyx_v_keys); + __pyx_r = __pyx_pf_5binpt_11QueryResult_8desc(__pyx_v_x, __pyx_v_y, __pyx_v_key); __Pyx_RefNannyFinishContext(); return __pyx_r; } @@ -1362,8 +1362,8 @@ static PyObject *__pyx_pw_5binpt_11QueryResult_4desc_lambda1(PyObject *__pyx_sel /* "binpt.pyx":52 * * @staticmethod - * def desc(x, y, keys = lambda r: r.scores[0]): # <<<<<<<<<<<<<< - * '''Returns the sign of keys(y) - keys(x). + * def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<< + * '''Returns the sign of key(y) - key(x). 
* Can only be used if scores is not an empty vector as */ @@ -1399,7 +1399,7 @@ static PyObject *__pyx_lambda_funcdef_lambda1(CYTHON_UNUSED PyObject *__pyx_self return __pyx_r; } -static PyObject *__pyx_pf_5binpt_11QueryResult_8desc(PyObject *__pyx_v_x, PyObject *__pyx_v_y, PyObject *__pyx_v_keys) { +static PyObject *__pyx_pf_5binpt_11QueryResult_8desc(PyObject *__pyx_v_x, PyObject *__pyx_v_y, PyObject *__pyx_v_key) { PyObject *__pyx_r = NULL; __Pyx_RefNannyDeclarations PyObject *__pyx_t_1 = NULL; @@ -1414,7 +1414,7 @@ static PyObject *__pyx_pf_5binpt_11QueryResult_8desc(PyObject *__pyx_v_x, PyObje /* "binpt.pyx":56 * Can only be used if scores is not an empty vector as * keys defaults to scores[0]''' - * return fsign(keys(y) - keys(x)) # <<<<<<<<<<<<<< + * return fsign(key(y) - key(x)) # <<<<<<<<<<<<<< * * def __str__(self): */ @@ -1424,7 +1424,7 @@ static PyObject *__pyx_pf_5binpt_11QueryResult_8desc(PyObject *__pyx_v_x, PyObje __Pyx_INCREF(__pyx_v_y); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_y); __Pyx_GIVEREF(__pyx_v_y); - __pyx_t_2 = PyObject_Call(__pyx_v_keys, ((PyObject *)__pyx_t_1), NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_2 = PyObject_Call(__pyx_v_key, ((PyObject *)__pyx_t_1), NULL); if (unlikely(!__pyx_t_2)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_2); __Pyx_DECREF(((PyObject *)__pyx_t_1)); __pyx_t_1 = 0; __pyx_t_1 = PyTuple_New(1); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -1432,7 +1432,7 @@ static PyObject *__pyx_pf_5binpt_11QueryResult_8desc(PyObject *__pyx_v_x, PyObje __Pyx_INCREF(__pyx_v_x); PyTuple_SET_ITEM(__pyx_t_1, 0, __pyx_v_x); __Pyx_GIVEREF(__pyx_v_x); - __pyx_t_3 = PyObject_Call(__pyx_v_keys, ((PyObject *)__pyx_t_1), NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; 
__pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} + __pyx_t_3 = PyObject_Call(__pyx_v_key, ((PyObject *)__pyx_t_1), NULL); if (unlikely(!__pyx_t_3)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_3); __Pyx_DECREF(((PyObject *)__pyx_t_1)); __pyx_t_1 = 0; __pyx_t_1 = PyNumber_Subtract(__pyx_t_2, __pyx_t_3); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 56; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -1475,7 +1475,7 @@ static PyObject *__pyx_pw_5binpt_11QueryResult_11__str__(PyObject *__pyx_v_self) } /* "binpt.pyx":58 - * return fsign(keys(y) - keys(x)) + * return fsign(key(y) - key(x)) * * def __str__(self): # <<<<<<<<<<<<<< * '''Returns a string such as: ||| [||| word-alignment info]''' @@ -3724,7 +3724,7 @@ static __Pyx_StringTabEntry __pyx_string_tab[] = { {&__pyx_n_s__isValidBinaryTable, __pyx_k__isValidBinaryTable, sizeof(__pyx_k__isValidBinaryTable), 0, 0, 1, 1}, {&__pyx_n_s__isfile, __pyx_k__isfile, sizeof(__pyx_k__isfile), 0, 0, 1, 1}, {&__pyx_n_s__join, __pyx_k__join, sizeof(__pyx_k__join), 0, 0, 1, 1}, - {&__pyx_n_s__keys, __pyx_k__keys, sizeof(__pyx_k__keys), 0, 0, 1, 1}, + {&__pyx_n_s__key, __pyx_k__key, sizeof(__pyx_k__key), 0, 0, 1, 1}, {&__pyx_n_s__line, __pyx_k__line, sizeof(__pyx_k__line), 0, 0, 1, 1}, {&__pyx_n_s__nscores, __pyx_k__nscores, sizeof(__pyx_k__nscores), 0, 0, 1, 1}, {&__pyx_n_s__os, __pyx_k__os, sizeof(__pyx_k__os), 0, 0, 1, 1}, @@ -3774,8 +3774,8 @@ static int __Pyx_InitCachedConstants(void) { /* "binpt.pyx":52 * * @staticmethod - * def desc(x, y, keys = lambda r: r.scores[0]): # <<<<<<<<<<<<<< - * '''Returns the sign of keys(y) - keys(x). + * def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<< + * '''Returns the sign of key(y) - key(x). 
* Can only be used if scores is not an empty vector as */ __pyx_k_tuple_16 = PyTuple_New(3); if (unlikely(!__pyx_k_tuple_16)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -3786,9 +3786,9 @@ static int __Pyx_InitCachedConstants(void) { __Pyx_INCREF(((PyObject *)__pyx_n_s__y)); PyTuple_SET_ITEM(__pyx_k_tuple_16, 1, ((PyObject *)__pyx_n_s__y)); __Pyx_GIVEREF(((PyObject *)__pyx_n_s__y)); - __Pyx_INCREF(((PyObject *)__pyx_n_s__keys)); - PyTuple_SET_ITEM(__pyx_k_tuple_16, 2, ((PyObject *)__pyx_n_s__keys)); - __Pyx_GIVEREF(((PyObject *)__pyx_n_s__keys)); + __Pyx_INCREF(((PyObject *)__pyx_n_s__key)); + PyTuple_SET_ITEM(__pyx_k_tuple_16, 2, ((PyObject *)__pyx_n_s__key)); + __Pyx_GIVEREF(((PyObject *)__pyx_n_s__key)); __Pyx_GIVEREF(((PyObject *)__pyx_k_tuple_16)); __pyx_k_codeobj_17 = (PyObject*)__Pyx_PyCode_New(3, 0, 3, 0, 0, __pyx_empty_bytes, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_k_tuple_16, __pyx_empty_tuple, __pyx_empty_tuple, __pyx_kp_s_18, __pyx_n_s__desc, 52, __pyx_empty_bytes); if (unlikely(!__pyx_k_codeobj_17)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -3987,8 +3987,8 @@ PyMODINIT_FUNC PyInit_binpt(void) /* "binpt.pyx":52 * * @staticmethod - * def desc(x, y, keys = lambda r: r.scores[0]): # <<<<<<<<<<<<<< - * '''Returns the sign of keys(y) - keys(x). + * def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<< + * '''Returns the sign of key(y) - key(x). 
* Can only be used if scores is not an empty vector as */ __pyx_t_1 = __Pyx_CyFunction_NewEx(&__pyx_mdef_5binpt_11QueryResult_4desc_lambda1, 0, NULL, __pyx_n_s__binpt, NULL); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} @@ -4001,8 +4001,8 @@ PyMODINIT_FUNC PyInit_binpt(void) * return self._wa * * @staticmethod # <<<<<<<<<<<<<< - * def desc(x, y, keys = lambda r: r.scores[0]): - * '''Returns the sign of keys(y) - keys(x). + * def desc(x, y, key = lambda r: r.scores[0]): + * '''Returns the sign of key(y) - key(x). */ __pyx_t_1 = PyCFunction_NewEx(&__pyx_mdef_5binpt_11QueryResult_9desc, NULL, __pyx_n_s__binpt); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} __Pyx_GOTREF(__pyx_t_1); @@ -4021,8 +4021,8 @@ PyMODINIT_FUNC PyInit_binpt(void) /* "binpt.pyx":52 * * @staticmethod - * def desc(x, y, keys = lambda r: r.scores[0]): # <<<<<<<<<<<<<< - * '''Returns the sign of keys(y) - keys(x). + * def desc(x, y, key = lambda r: r.scores[0]): # <<<<<<<<<<<<<< + * '''Returns the sign of key(y) - key(x). * Can only be used if scores is not an empty vector as */ __pyx_t_1 = __Pyx_GetName((PyObject *)__pyx_ptype_5binpt_QueryResult, __pyx_n_s__desc); if (unlikely(!__pyx_t_1)) {__pyx_filename = __pyx_f[0]; __pyx_lineno = 52; __pyx_clineno = __LINE__; goto __pyx_L1_error;} diff --git a/contrib/python/binpt/binpt.pyx b/contrib/python/binpt/binpt.pyx index e66981df6..137b0bc8c 100644 --- a/contrib/python/binpt/binpt.pyx +++ b/contrib/python/binpt/binpt.pyx @@ -49,11 +49,11 @@ cdef class QueryResult(object): return self._wa @staticmethod - def desc(x, y, keys = lambda r: r.scores[0]): - '''Returns the sign of keys(y) - keys(x). + def desc(x, y, key = lambda r: r.scores[0]): + '''Returns the sign of key(y) - key(x). 
Can only be used if scores is not an empty vector as keys defaults to scores[0]''' - return fsign(keys(y) - keys(x)) + return fsign(key(y) - key(x)) def __str__(self): '''Returns a string such as: ||| [||| word-alignment info]''' @@ -138,7 +138,7 @@ cdef class BinaryPhraseTable(object): def delimiters(self): return self._delimiters - def query(self, line, cmp = None, top = 0): + def query(self, line, cmp = None, key = lambda x: x.scores[0], top = 0): '''Queries the phrase table and returns a list of matches. Each match is a QueryResult. If 'cmp' is defined the return list is sorted. diff --git a/contrib/python/setup.py b/contrib/python/setup.py index 66042fbc8..bc501b360 100644 --- a/contrib/python/setup.py +++ b/contrib/python/setup.py @@ -3,31 +3,38 @@ from distutils.extension import Extension import os import sys -available_switches = ['--with-cmph'] +available_switches = ['--with-cmph', '--moses-lib'] with_cmph = False -while sys.argv[-1] in available_switches: - switch = sys.argv.pop() - if switch == '--with-cmph': +mosesdir = os.path.abspath('../../') +includes = [mosesdir, os.path.join(mosesdir, 'moses/src'), os.path.join(mosesdir, 'util')] +libdir = os.path.join(mosesdir, 'lib') + +while sys.argv[-1].split('=')[0] in available_switches: + param = sys.argv.pop().split('=') + if param[0] == '--with-cmph': with_cmph = True + if param[0] == '--moses-lib': + libdir = param[1] +print >> sys.stderr, 'mosesdir=%s\nincludes=%s\nlibdir=%s\ncmph=%s' % (mosesdir, includes, libdir, with_cmph) #### From here you probably don't need to change anything #### unless a new dependency shows up in Moses -mosesdir = os.path.abspath('../../') -includes = [mosesdir, os.path.join(mosesdir, 'moses/src'), os.path.join(mosesdir, 'util')] -libdir = os.path.join(mosesdir, 'lib') basic=['z', 'stdc++', 'pthread', 'm', 'gcc_s', 'c', 'boost_system', 'boost_thread', 'boost_filesystem', 'rt'] moses=['OnDiskPt', 'kenutil', 'kenlm', 'LM', 'mert_lib', 'moses_internal', 'CYKPlusParser', 
'Scope3Parser', 'fuzzy-match', 'RuleTable', 'CompactPT', 'moses', 'dynsa', 'pcfg_common' ] additional=[] + if with_cmph: additional.append('cmph') exobj = [os.path.join(libdir, 'lib' + l + '.so') for l in moses] +print >> sys.stderr, 'basic=%s\nmoses=%s\nadditional=%s\nextra=%s' % (basic, moses, additional, exobj) + ext_modules = [ Extension(name = 'binpt', sources = ['binpt/binpt.cpp'], -- cgit v1.2.3 From 289a9ea54f01080f8605a74620bfb64116a7008a Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Oct 2012 19:57:51 +0100 Subject: experiment.meta: update pcfg-extract and pcfg-score EMS now looks for binaries in $moses-bin-dir instead of their old location in $moses-script-dir. --- scripts/ems/experiment.meta | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 9706e2a82..b7ad61235 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -361,13 +361,13 @@ pcfg-extract default-name: model/pcfg ignore-unless: use-pcfg-feature rerun-on-change: use-pcfg-feature - template: $moses-script-dir/training/phrase-extract/pcfg-extract/pcfg-extract < IN.$output-extension > OUT.$output-extension + template: $moses-bin-dir/pcfg-extract < IN.$output-extension > OUT.$output-extension pcfg-score in: parse-relaxed-corpus pcfg out: scored-corpus default-name: model/scored-corpus pass-unless: use-pcfg-feature - template: ln -s IN.$input-extension OUT.$input-extension ; $moses-script-dir/training/phrase-extract/pcfg-score/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension + template: ln -s IN.$input-extension OUT.$input-extension ; $moses-bin-dir/pcfg-score IN1.$output-extension < IN.$output-extension > OUT.$output-extension extract-phrases in: word-alignment scored-corpus out: extracted-phrases -- cgit v1.2.3 From 0851a4d1136a345b2ac4472da087c1114a77f255 Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Wed, 3 Oct 2012 20:04:09 +0100 Subject: 
extract-ghkm: add --SentenceOffset option This should behave the same as the --SentenceOffset option for extract-rules. The extract-parallel.perl script expects the rule extractor to have this option. --- phrase-extract/extract-ghkm/ExtractGHKM.cpp | 5 ++++- phrase-extract/extract-ghkm/Options.h | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index dae876116..94e565085 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -90,7 +90,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string alignmentLine; XmlTreeParser xmlTreeParser(labelSet, topLabelSet); ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options); - size_t lineNum = 0; + size_t lineNum = options.sentenceOffset; while (true) { std::getline(targetStream, targetLine); std::getline(sourceStream, sourceLine); @@ -289,6 +289,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], "extract minimal rules only") ("PCFG", "include score based on PCFG scores in target corpus") + ("SentenceOffset", + po::value(&options.sentenceOffset)->default_value(options.sentenceOffset), + "set sentence number offset if processing split corpus") ("UnknownWordLabel", po::value(&options.unknownWordFile), "write unknown word labels to named file") diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h index 362fc95d2..acad2040c 100644 --- a/phrase-extract/extract-ghkm/Options.h +++ b/phrase-extract/extract-ghkm/Options.h @@ -38,6 +38,7 @@ struct Options { , maxScope(3) , minimal(false) , pcfg(false) + , sentenceOffset(0) , unpairedExtractFormat(false) {} // Positional options @@ -57,6 +58,7 @@ struct Options { int maxScope; bool minimal; bool pcfg; + int sentenceOffset; bool unpairedExtractFormat; std::string unknownWordFile; }; -- cgit v1.2.3 From 9ae55243e25560338095c5e87e191f22019625de Mon Sep 17 00:00:00 
2001 From: Hieu Hoang Date: Thu, 4 Oct 2012 15:08:22 +0100 Subject: remove dependency on boost filesystem object --- .../moses-cmd.xcodeproj/project.pbxproj | 3 --- .../other-builds/moses.xcodeproj/project.pbxproj | 8 ------- jam-files/sanity.jam | 1 - moses/src/Jamfile | 2 +- moses/src/StaticData.cpp | 27 +++++++++++++--------- regression-testing/tests | 2 +- 6 files changed, 18 insertions(+), 25 deletions(-) diff --git a/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj b/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj index aac225ced..5bf91e2b8 100644 --- a/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj +++ b/contrib/other-builds/moses-cmd.xcodeproj/project.pbxproj @@ -337,7 +337,6 @@ "-lflm", "-llattice", "-lboost_thread-mt", - "-lboost_filesystem-mt", "-lboost_system-mt", "-lcmph", ); @@ -385,7 +384,6 @@ "-lflm", "-llattice", "-lboost_thread-mt", - "-lboost_filesystem-mt", "-lboost_system-mt", "-lcmph", ); @@ -430,7 +428,6 @@ "-lflm", "-llattice", "-lboost_thread-mt", - "-lboost_filesystem-mt", "-lboost_system-mt", "-lcmph", ); diff --git a/contrib/other-builds/moses.xcodeproj/project.pbxproj b/contrib/other-builds/moses.xcodeproj/project.pbxproj index 2864615c6..91d3ee250 100644 --- a/contrib/other-builds/moses.xcodeproj/project.pbxproj +++ b/contrib/other-builds/moses.xcodeproj/project.pbxproj @@ -135,8 +135,6 @@ 1EC737A814B977AB00238410 /* GlobalLexicalModel.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC7363F14B977AA00238410 /* GlobalLexicalModel.cpp */; }; 1EC737A914B977AB00238410 /* GlobalLexicalModel.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC7364014B977AA00238410 /* GlobalLexicalModel.h */; }; 1EC737AA14B977AB00238410 /* gzfilebuf.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC7364114B977AA00238410 /* gzfilebuf.h */; }; - 1EC737AB14B977AB00238410 /* hash.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC7364214B977AA00238410 /* hash.cpp */; }; - 1EC737AC14B977AB00238410 /* hash.h in Headers */ = {isa = 
PBXBuildFile; fileRef = 1EC7364314B977AA00238410 /* hash.h */; }; 1EC737AD14B977AB00238410 /* Hypothesis.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC7364514B977AA00238410 /* Hypothesis.cpp */; }; 1EC737AE14B977AB00238410 /* Hypothesis.h in Headers */ = {isa = PBXBuildFile; fileRef = 1EC7364614B977AA00238410 /* Hypothesis.h */; }; 1EC737AF14B977AB00238410 /* HypothesisStack.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EC7364714B977AA00238410 /* HypothesisStack.cpp */; }; @@ -465,8 +463,6 @@ 1EC7363F14B977AA00238410 /* GlobalLexicalModel.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = GlobalLexicalModel.cpp; path = ../../moses/src/GlobalLexicalModel.cpp; sourceTree = ""; }; 1EC7364014B977AA00238410 /* GlobalLexicalModel.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = GlobalLexicalModel.h; path = ../../moses/src/GlobalLexicalModel.h; sourceTree = ""; }; 1EC7364114B977AA00238410 /* gzfilebuf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = gzfilebuf.h; path = ../../moses/src/gzfilebuf.h; sourceTree = ""; }; - 1EC7364214B977AA00238410 /* hash.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = hash.cpp; path = ../../moses/src/hash.cpp; sourceTree = ""; }; - 1EC7364314B977AA00238410 /* hash.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = hash.h; path = ../../moses/src/hash.h; sourceTree = ""; }; 1EC7364414B977AA00238410 /* hypergraph.proto */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = text; name = hypergraph.proto; path = ../../moses/src/hypergraph.proto; sourceTree = ""; }; 1EC7364514B977AA00238410 /* Hypothesis.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; name = Hypothesis.cpp; path = ../../moses/src/Hypothesis.cpp; sourceTree = ""; }; 
1EC7364614B977AA00238410 /* Hypothesis.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; name = Hypothesis.h; path = ../../moses/src/Hypothesis.h; sourceTree = ""; }; @@ -772,8 +768,6 @@ 1EC7363F14B977AA00238410 /* GlobalLexicalModel.cpp */, 1EC7364014B977AA00238410 /* GlobalLexicalModel.h */, 1EC7364114B977AA00238410 /* gzfilebuf.h */, - 1EC7364214B977AA00238410 /* hash.cpp */, - 1EC7364314B977AA00238410 /* hash.h */, 1EC7364414B977AA00238410 /* hypergraph.proto */, 1EF8F2C3159A61970047B613 /* HypoList.h */, 1EC7364514B977AA00238410 /* Hypothesis.cpp */, @@ -1155,7 +1149,6 @@ 1EC737A714B977AB00238410 /* GenerationDictionary.h in Headers */, 1EC737A914B977AB00238410 /* GlobalLexicalModel.h in Headers */, 1EC737AA14B977AB00238410 /* gzfilebuf.h in Headers */, - 1EC737AC14B977AB00238410 /* hash.h in Headers */, 1EC737AE14B977AB00238410 /* Hypothesis.h in Headers */, 1EC737B014B977AB00238410 /* HypothesisStack.h in Headers */, 1EC737B214B977AB00238410 /* HypothesisStackCubePruning.h in Headers */, @@ -1378,7 +1371,6 @@ 1EC737A414B977AB00238410 /* FloydWarshall.cpp in Sources */, 1EC737A614B977AB00238410 /* GenerationDictionary.cpp in Sources */, 1EC737A814B977AB00238410 /* GlobalLexicalModel.cpp in Sources */, - 1EC737AB14B977AB00238410 /* hash.cpp in Sources */, 1EC737AD14B977AB00238410 /* Hypothesis.cpp in Sources */, 1EC737AF14B977AB00238410 /* HypothesisStack.cpp in Sources */, 1EC737B114B977AB00238410 /* HypothesisStackCubePruning.cpp in Sources */, diff --git a/jam-files/sanity.jam b/jam-files/sanity.jam index 6beec3f94..2aca84e4d 100644 --- a/jam-files/sanity.jam +++ b/jam-files/sanity.jam @@ -152,7 +152,6 @@ rule boost ( min-version ) { boost-lib program_options PROGRAM_OPTIONS_DYN_LINK ; boost-lib unit_test_framework TEST_DYN_LINK ; boost-lib iostreams IOSTREAMS_DYN_LINK ; - boost-lib filesystem FILE_SYSTEM_DYN_LINK : boost_system ; } #Link normally to a library, but sometimes static isn't installed so fall back to dynamic. 
diff --git a/moses/src/Jamfile b/moses/src/Jamfile index 7589e5098..1ea22eb81 100644 --- a/moses/src/Jamfile +++ b/moses/src/Jamfile @@ -24,6 +24,6 @@ lib moses_internal : [ glob *.cpp DynSAInclude/*.cpp : PhraseDictionary.cpp ThreadPool.cpp SyntacticLanguageModel.cpp ] synlm ThreadPool headers rt ; -lib moses : PhraseDictionary.cpp moses_internal CYKPlusParser//CYKPlusParser CompactPT//CompactPT LM//LM RuleTable//RuleTable Scope3Parser//Scope3Parser fuzzy-match//fuzzy-match headers ../..//z ../../OnDiskPt//OnDiskPt ../..//boost_filesystem ; +lib moses : PhraseDictionary.cpp moses_internal CYKPlusParser//CYKPlusParser CompactPT//CompactPT LM//LM RuleTable//RuleTable Scope3Parser//Scope3Parser fuzzy-match//fuzzy-match headers ../..//z ../../OnDiskPt//OnDiskPt ; alias headers-to-install : [ glob-tree *.h ] ; diff --git a/moses/src/StaticData.cpp b/moses/src/StaticData.cpp index 07a8005f7..48eef17b4 100644 --- a/moses/src/StaticData.cpp +++ b/moses/src/StaticData.cpp @@ -20,9 +20,6 @@ License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -#include -#include - #include #include "util/check.hh" #include "PhraseDictionaryMemory.h" @@ -1365,16 +1362,24 @@ void StaticData::ClearTransOptionCache() const { void StaticData::SetExecPath(const std::string &path) { - namespace fs = boost::filesystem; - - fs::path full_path( fs::initial_path() ); + /* + namespace fs = boost::filesystem; + + fs::path full_path( fs::initial_path() ); + + full_path = fs::system_complete( fs::path( path ) ); + + //Without file name + m_binPath = full_path.parent_path().string(); + */ - full_path = fs::system_complete( fs::path( path ) ); - - //Without file name - m_binPath = full_path.parent_path().string(); + // NOT TESTED + size_t pos = path.rfind("/"); + if (pos != string::npos) + { + m_binPath = path.substr(0, pos); + } cerr << 
m_binPath << endl; - } const string &StaticData::GetBinDirectory() const diff --git a/regression-testing/tests b/regression-testing/tests index f0bb48dad..5e518a84c 160000 --- a/regression-testing/tests +++ b/regression-testing/tests @@ -1 +1 @@ -Subproject commit f0bb48dad05db4a12b41f5c8def9c2d097b2e3b4 +Subproject commit 5e518a84c07263e387268e9db1004a134e40f6f6 -- cgit v1.2.3 From f95ee6e00cf75c1237ded278b33a287125fcbdb3 Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 4 Oct 2012 16:08:37 +0100 Subject: MERT regtest should depend on extractor. --- regression-testing/Jamfile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/regression-testing/Jamfile b/regression-testing/Jamfile index 61157eaf3..a27908e4c 100644 --- a/regression-testing/Jamfile +++ b/regression-testing/Jamfile @@ -13,10 +13,10 @@ if $(with-regtest) { if $(with-regtest) { test-dir = $(with-regtest)/tests ; - rule reg_test ( name : tests * : program : action ) { + rule reg_test ( name : tests * : programs * : action ) { alias $(name) : $(tests:D=).passed ; for test in $(tests) { - make $(test:D=).passed : $(program) : $(action) ; + make $(test:D=).passed : $(programs) : $(action) ; alias $(test) : $(test:D=).passed ; } } @@ -43,11 +43,11 @@ if $(with-regtest) { } reg_test extractrules : [ glob $(test-dir)/extract-rules.* : $(with-regtest)/extract-rules.hierarchical ] : ../phrase-extract//extract-rules : @reg_test_extractrules ; - + pwd = [ path.pwd ] ; actions reg_test_mert { - $(TOP)/regression-testing/run-test-mert.perl --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) && touch $(<) + $(TOP)/regression-testing/run-test-mert.perl --test=$(<:B) --data-dir=$(with-regtest) --test-dir=$(test-dir) --bin-dir=$(pwd)/$(>:D) && touch $(<) } - reg_test mert : [ glob $(test-dir)/mert.* ] : ../mert//mert : @reg_test_mert ; + reg_test mert : [ glob $(test-dir)/mert.* ] : ../mert//mert ../mert//extractor : @reg_test_mert ; alias all : phrase chart mert score 
extract extractrules ; } -- cgit v1.2.3 From 021fda966a5478703fdc6496b5ab69d06662f98d Mon Sep 17 00:00:00 2001 From: Kenneth Heafield Date: Thu, 4 Oct 2012 16:14:46 +0100 Subject: Unbork regression tests pointer --- regression-testing/tests | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/regression-testing/tests b/regression-testing/tests index 5e518a84c..f0bb48dad 160000 --- a/regression-testing/tests +++ b/regression-testing/tests @@ -1 +1 @@ -Subproject commit 5e518a84c07263e387268e9db1004a134e40f6f6 +Subproject commit f0bb48dad05db4a12b41f5c8def9c2d097b2e3b4 -- cgit v1.2.3 From ab2e18466f665ea913354493a81e7183bc5bb9b7 Mon Sep 17 00:00:00 2001 From: marcinj Date: Thu, 4 Oct 2012 19:14:42 +0200 Subject: Bugfix in queryPhraseTableMin, works now with target phrase factors --- misc/queryPhraseTableMin.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/misc/queryPhraseTableMin.cpp b/misc/queryPhraseTableMin.cpp index 02d889598..f8f2d06f7 100644 --- a/misc/queryPhraseTableMin.cpp +++ b/misc/queryPhraseTableMin.cpp @@ -49,6 +49,16 @@ int main(int argc, char **argv) LMList lmList; + Parameter *parameter = new Parameter(); + const_cast&>(parameter->GetParam("factor-delimiter")).resize(1, "||dummy_string||"); + const_cast&>(parameter->GetParam("input-factors")).resize(1, "0"); + const_cast&>(parameter->GetParam("verbose")).resize(1, "0"); + const_cast&>(parameter->GetParam("weight-w")).resize(1, "0"); + const_cast&>(parameter->GetParam("weight-d")).resize(1, "0"); + + const_cast(StaticData::Instance()).LoadData(parameter); + + PhraseDictionaryFeature pdf(Compact, nscores, nscores, input, output, ttable, weight, 0, "", ""); PhraseDictionaryCompact pdc(nscores, Compact, &pdf, false, useAlignments); bool ret = pdc.Load(input, output, ttable, weight, 0, lmList, 0); @@ -74,7 +84,8 @@ int main(int argc, char **argv) if(useAlignments) std::cout << " " << tp.GetAlignmentInfo() << "|||"; - for(size_t i = 0; i < 
tp.GetScoreBreakdown().size(); i++) + size_t offset = tp.GetScoreBreakdown().size() - nscores; + for(size_t i = offset; i < tp.GetScoreBreakdown().size(); i++) std::cout << " " << exp(tp.GetScoreBreakdown()[i]); std::cout << std::endl; } -- cgit v1.2.3 From 04544f8bfcd567b8306671b1420341ab7ee7fbf3 Mon Sep 17 00:00:00 2001 From: phikoehn Date: Thu, 4 Oct 2012 23:22:19 +0100 Subject: better error message when reference file not found --- scripts/generic/multi-bleu.perl | 1 + 1 file changed, 1 insertion(+) diff --git a/scripts/generic/multi-bleu.perl b/scripts/generic/multi-bleu.perl index 137117647..94da1504f 100755 --- a/scripts/generic/multi-bleu.perl +++ b/scripts/generic/multi-bleu.perl @@ -25,6 +25,7 @@ while(-e "$stem$ref") { $ref++; } &add_to_ref($stem,\@REF) if -e $stem; +die("ERROR: could not find reference file $stem") unless scalar @REF; sub add_to_ref { my ($file,$REF) = @_; -- cgit v1.2.3 From 83374e281041dda1a8d75210a8e9527a3e6c6f69 Mon Sep 17 00:00:00 2001 From: Hieu Hoang Date: Fri, 5 Oct 2012 17:17:27 +0100 Subject: eclipse project files --- contrib/other-builds/moses-chart-cmd/.cproject | 140 ++ contrib/other-builds/moses-chart-cmd/.project | 199 +++ contrib/other-builds/moses/.cproject | 13 +- contrib/other-builds/moses/.project | 2222 ++++++++++++++++-------- 4 files changed, 1796 insertions(+), 778 deletions(-) create mode 100644 contrib/other-builds/moses-chart-cmd/.cproject create mode 100644 contrib/other-builds/moses-chart-cmd/.project diff --git a/contrib/other-builds/moses-chart-cmd/.cproject b/contrib/other-builds/moses-chart-cmd/.cproject new file mode 100644 index 000000000..dfebdd577 --- /dev/null +++ b/contrib/other-builds/moses-chart-cmd/.cproject @@ -0,0 +1,140 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git 
a/contrib/other-builds/moses-chart-cmd/.project b/contrib/other-builds/moses-chart-cmd/.project new file mode 100644 index 000000000..094b33db1 --- /dev/null +++ b/contrib/other-builds/moses-chart-cmd/.project @@ -0,0 +1,199 @@ + + + moses-chart-cmd + + + lm + moses + OnDiskPt + util + + + + org.eclipse.cdt.managedbuilder.core.genmakebuilder + clean,full,incremental, + + + ?name? + + + + org.eclipse.cdt.make.core.append_environment + true + + + org.eclipse.cdt.make.core.autoBuildTarget + all + + + org.eclipse.cdt.make.core.buildArguments + + + + org.eclipse.cdt.make.core.buildCommand + make + + + org.eclipse.cdt.make.core.buildLocation + ${workspace_loc:/moses-chart-cmd/Debug} + + + org.eclipse.cdt.make.core.cleanBuildTarget + clean + + + org.eclipse.cdt.make.core.contents + org.eclipse.cdt.make.core.activeConfigSettings + + + org.eclipse.cdt.make.core.enableAutoBuild + false + + + org.eclipse.cdt.make.core.enableCleanBuild + true + + + org.eclipse.cdt.make.core.enableFullBuild + true + + + org.eclipse.cdt.make.core.fullBuildTarget + all + + + org.eclipse.cdt.make.core.stopOnError + true + + + org.eclipse.cdt.make.core.useDefaultBuildCmd + true + + + + + org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder + full,incremental, + + + + + + org.eclipse.cdt.core.cnature + org.eclipse.cdt.core.ccnature + org.eclipse.cdt.managedbuilder.core.managedBuildNature + org.eclipse.cdt.managedbuilder.core.ScannerConfigNature + + + + IOWrapper.cpp + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/IOWrapper.cpp + + + IOWrapper.h + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/IOWrapper.h + + + Jamfile + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/Jamfile + + + Main.cpp + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/Main.cpp + + + Main.h + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/Main.h + + + TranslationAnalysis.cpp + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/TranslationAnalysis.cpp + + + TranslationAnalysis.h + 1 + 
PARENT-3-PROJECT_LOC/moses-chart-cmd/src/TranslationAnalysis.h + + + bin + 2 + virtual:/virtual + + + mbr.cpp + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/mbr.cpp + + + mbr.h + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/mbr.h + + + moses_chart + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/moses_chart + + + bin/gcc-4.6 + 2 + virtual:/virtual + + + bin/gcc-4.6/release + 2 + virtual:/virtual + + + bin/gcc-4.6/release/debug-symbols-on + 2 + virtual:/virtual + + + bin/gcc-4.6/release/debug-symbols-on/link-static + 2 + virtual:/virtual + + + bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi + 2 + virtual:/virtual + + + bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/IOWrapper.o + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/IOWrapper.o + + + bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/Main.o + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/Main.o + + + bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/PhraseDictionary.o + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/PhraseDictionary.o + + + bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/TranslationAnalysis.o + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/TranslationAnalysis.o + + + bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/mbr.o + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/mbr.o + + + bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/moses_chart + 1 + PARENT-3-PROJECT_LOC/moses-chart-cmd/src/bin/gcc-4.6/release/debug-symbols-on/link-static/threading-multi/moses_chart + + + diff --git a/contrib/other-builds/moses/.cproject b/contrib/other-builds/moses/.cproject index 
0148cc6f2..05eb2df40 100644 --- a/contrib/other-builds/moses/.cproject +++ b/contrib/other-builds/moses/.cproject @@ -31,7 +31,6 @@