From e7228ec9fb09941593fa09329d421ca7b951f12e Mon Sep 17 00:00:00 2001 From: Phil Williams Date: Mon, 6 Jul 2015 14:41:34 +0100 Subject: extract-ghkm: minor refactoring --- phrase-extract/extract-ghkm/Alignment.cpp | 5 +- phrase-extract/extract-ghkm/Alignment.h | 4 +- phrase-extract/extract-ghkm/AlignmentGraph.cpp | 62 +++++++++++++++-------- phrase-extract/extract-ghkm/AlignmentGraph.h | 4 ++ phrase-extract/extract-ghkm/ComposedRule.cpp | 3 ++ phrase-extract/extract-ghkm/ComposedRule.h | 3 ++ phrase-extract/extract-ghkm/ExtractGHKM.cpp | 13 +++-- phrase-extract/extract-ghkm/ExtractGHKM.h | 5 +- phrase-extract/extract-ghkm/Main.cpp | 2 +- phrase-extract/extract-ghkm/Node.cpp | 3 ++ phrase-extract/extract-ghkm/Node.h | 3 ++ phrase-extract/extract-ghkm/Options.h | 4 +- phrase-extract/extract-ghkm/PhraseOrientation.cpp | 4 +- phrase-extract/extract-ghkm/PhraseOrientation.h | 3 ++ phrase-extract/extract-ghkm/Rule.cpp | 3 ++ phrase-extract/extract-ghkm/Rule.h | 3 ++ phrase-extract/extract-ghkm/ScfgRule.cpp | 3 ++ phrase-extract/extract-ghkm/ScfgRule.h | 3 ++ phrase-extract/extract-ghkm/ScfgRuleWriter.cpp | 3 ++ phrase-extract/extract-ghkm/ScfgRuleWriter.h | 4 +- phrase-extract/extract-ghkm/Span.cpp | 3 ++ phrase-extract/extract-ghkm/Span.h | 3 ++ phrase-extract/extract-ghkm/StsgRule.cpp | 3 ++ phrase-extract/extract-ghkm/StsgRule.h | 3 ++ phrase-extract/extract-ghkm/StsgRuleWriter.cpp | 3 ++ phrase-extract/extract-ghkm/StsgRuleWriter.h | 3 ++ phrase-extract/extract-ghkm/Subgraph.cpp | 3 ++ phrase-extract/extract-ghkm/Subgraph.h | 4 +- 28 files changed, 129 insertions(+), 33 deletions(-) (limited to 'phrase-extract') diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp index ba89a1594..9293a07cf 100644 --- a/phrase-extract/extract-ghkm/Alignment.cpp +++ b/phrase-extract/extract-ghkm/Alignment.cpp @@ -27,6 +27,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -44,7 +46,7 @@ void ReadAlignment(const 
std::string &s, Alignment &a) } int src = std::atoi(s.substr(begin, end-begin).c_str()); if (end+1 == s.size()) { - throw Syntax::Exception("Target index missing"); + throw Exception("Target index missing"); } begin = end+1; @@ -70,4 +72,5 @@ void FlipAlignment(Alignment &a) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h index 154e1fc4f..da1279f8f 100644 --- a/phrase-extract/extract-ghkm/Alignment.h +++ b/phrase-extract/extract-ghkm/Alignment.h @@ -25,6 +25,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -35,5 +37,5 @@ void ReadAlignment(const std::string &, Alignment &); void FlipAlignment(Alignment &); } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining - diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp index 9dba71331..21708bdfc 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp +++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp @@ -34,6 +34,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -242,36 +244,24 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root) return p; } -// Finds the set of frontier nodes. The definition of a frontier node differs -// from Galley et al's (2004) in the following ways: -// -// 1. A node with an empty span is not a frontier node (this excludes -// unaligned target subtrees). -// 2. Target word nodes are not frontier nodes. -// 3. Source word nodes are not frontier nodes. -// 4. Unless the --AllowUnary option is used, a node is not a frontier node if -// it has the same span as its parent. +// Recursively constructs the set of frontier nodes for the tree (or subtree) +// rooted at the given node. void AlignmentGraph::ComputeFrontierSet(Node *root, const Options &options, std::set &frontierSet) const { - // Don't include word nodes or unaligned target subtrees. 
+ // Non-tree nodes and unaligned target subtrees are not frontier nodes (and + // nor are their descendants). See the comment for the function + // AlignmentGraph::IsFrontierNode(). if (root->GetType() != TREE || root->GetSpan().empty()) { return; } - if (!SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()))) { - // Unless unary rules are explicitly allowed, we use Chung et al's (2011) - // modified defintion of a frontier node to eliminate the production of - // non-lexical unary rules. - assert(root->GetParents().size() <= 1); - if (options.allowUnary - || root->GetParents().empty() - || root->GetParents()[0]->GetSpan() != root->GetSpan()) { - frontierSet.insert(root); - } + if (IsFrontierNode(*root, options)) { + frontierSet.insert(root); } + // Recursively check descendants. const std::vector &children = root->GetChildren(); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { @@ -279,6 +269,37 @@ void AlignmentGraph::ComputeFrontierSet(Node *root, } } +// Determines whether the given node is a frontier node or not. The definition +// of a frontier node differs from Galley et al's (2004) in the following ways: +// +// 1. A node with an empty span is not a frontier node (this is to exclude +// unaligned target subtrees). +// 2. Target word nodes are not frontier nodes. +// 3. Source word nodes are not frontier nodes. +// 4. Unless the --AllowUnary option is used, a node is not a frontier node if +// it has the same span as its parent. +bool AlignmentGraph::IsFrontierNode(const Node &n, const Options &options) const +{ + // Don't include word nodes or unaligned target subtrees. + if (n.GetType() != TREE || n.GetSpan().empty()) { + return false; + } + // This is the original GHKM definition of a frontier node. 
+ if (SpansIntersect(n.GetComplementSpan(), Closure(n.GetSpan()))) { + return false; + } + // Unless unary rules are explicitly allowed, we use Chung et al's (2011) + // modified definition of a frontier node to eliminate the production of + // non-lexical unary rules. + assert(n.GetParents().size() <= 1); + if (!options.allowUnary && + !n.GetParents().empty() && + n.GetParents()[0]->GetSpan() == n.GetSpan()) { + return false; + } + return true; +} + void AlignmentGraph::CalcComplementSpans(Node *root) { Span compSpan; @@ -393,4 +414,5 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h index 032b946f0..be1182c16 100644 --- a/phrase-extract/extract-ghkm/AlignmentGraph.h +++ b/phrase-extract/extract-ghkm/AlignmentGraph.h @@ -32,6 +32,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -64,6 +66,7 @@ private: Node *CopyParseTree(const SyntaxTree *); void ComputeFrontierSet(Node *, const Options &, std::set &) const; + bool IsFrontierNode(const Node &, const Options &) const; void CalcComplementSpans(Node *); void GetTargetTreeLeaves(Node *, std::vector &); void AttachUnalignedSourceWords(); @@ -78,6 +81,7 @@ private: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp index d322a255f..b4f6a6fcd 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.cpp +++ b/phrase-extract/extract-ghkm/ComposedRule.cpp @@ -29,6 +29,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -128,4 +130,5 @@ Subgraph ComposedRule::CreateSubgraph() } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h index 
d456fd27c..9ff910293 100644 --- a/phrase-extract/extract-ghkm/ComposedRule.h +++ b/phrase-extract/extract-ghkm/ComposedRule.h @@ -28,6 +28,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -67,6 +69,7 @@ private: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index a4e8afcd3..8a415eb71 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -55,6 +55,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -131,8 +133,8 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - Syntax::XmlTreeParser targetXmlTreeParser; - Syntax::XmlTreeParser sourceXmlTreeParser; + XmlTreeParser targetXmlTreeParser; + XmlTreeParser sourceXmlTreeParser; ScfgRuleWriter scfgWriter(fwdExtractStream, invExtractStream, options); StsgRuleWriter stsgWriter(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -160,7 +162,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) try { targetParseTree = targetXmlTreeParser.Parse(targetLine); assert(targetParseTree.get()); - } catch (const Syntax::Exception &e) { + } catch (const Exception &e) { std::ostringstream oss; oss << "Failed to parse target XML tree at line " << lineNum; if (!e.msg().empty()) { @@ -178,7 +180,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) try { sourceParseTree = sourceXmlTreeParser.Parse(sourceLine); assert(sourceParseTree.get()); - } catch (const Syntax::Exception &e) { + } catch (const Exception &e) { std::ostringstream oss; oss << "Failed to parse source XML tree at line " << lineNum; if (!e.msg().empty()) { @@ -192,7 +194,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read word alignments. 
try { ReadAlignment(alignmentLine, alignment); - } catch (const Syntax::Exception &e) { + } catch (const Exception &e) { std::ostringstream oss; oss << "Failed to read alignment at line " << lineNum << ": "; oss << e.msg(); @@ -896,4 +898,5 @@ void ExtractGHKM::StripBitParLabels( } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h index 0d0fa8bf1..170de7ae9 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.h +++ b/phrase-extract/extract-ghkm/ExtractGHKM.h @@ -32,12 +32,14 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { struct Options; -class ExtractGHKM : public Syntax::Tool +class ExtractGHKM : public Tool { public: ExtractGHKM() : Tool("extract-ghkm") {} @@ -76,4 +78,5 @@ private: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp index 64b3e0f00..f7a2173fb 100644 --- a/phrase-extract/extract-ghkm/Main.cpp +++ b/phrase-extract/extract-ghkm/Main.cpp @@ -21,6 +21,6 @@ int main(int argc, char *argv[]) { - MosesTraining::GHKM::ExtractGHKM tool; + MosesTraining::Syntax::GHKM::ExtractGHKM tool; return tool.Main(argc, argv); } diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp index 384db3306..382fda996 100644 --- a/phrase-extract/extract-ghkm/Node.cpp +++ b/phrase-extract/extract-ghkm/Node.cpp @@ -23,6 +23,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -70,4 +72,5 @@ void Node::GetTargetWords(std::vector &targetWords) const } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h index 71a24b28e..81f4a46b9 100644 --- a/phrase-extract/extract-ghkm/Node.h +++ b/phrase-extract/extract-ghkm/Node.h @@ -30,6 +30,8 @@ namespace MosesTraining { +namespace Syntax 
+{ namespace GHKM { @@ -215,6 +217,7 @@ Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h index f694fb55c..429469883 100644 --- a/phrase-extract/extract-ghkm/Options.h +++ b/phrase-extract/extract-ghkm/Options.h @@ -23,6 +23,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -89,5 +91,5 @@ public: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining - diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp index 57952d580..f07e19a46 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp +++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp @@ -28,6 +28,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -469,5 +471,5 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining - diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h index 572124e61..d956e2bc8 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.h +++ b/phrase-extract/extract-ghkm/PhraseOrientation.h @@ -32,6 +32,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -120,4 +122,5 @@ private: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Rule.cpp b/phrase-extract/extract-ghkm/Rule.cpp index 1b7207c3c..b4b59f8e3 100644 --- a/phrase-extract/extract-ghkm/Rule.cpp +++ b/phrase-extract/extract-ghkm/Rule.cpp @@ -5,6 +5,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -38,4 +40,5 @@ bool Rule::PartitionOrderComp(const Node *a, const Node *b) } } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining diff 
--git a/phrase-extract/extract-ghkm/Rule.h b/phrase-extract/extract-ghkm/Rule.h index b87934735..5317be7c8 100644 --- a/phrase-extract/extract-ghkm/Rule.h +++ b/phrase-extract/extract-ghkm/Rule.h @@ -9,6 +9,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -54,6 +56,7 @@ protected: }; } // namespace GHKM +} // namespace Syntax } // namespace MosesTraining #endif diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp index 1a49c862e..e26b17a87 100644 --- a/phrase-extract/extract-ghkm/ScfgRule.cpp +++ b/phrase-extract/extract-ghkm/ScfgRule.cpp @@ -28,6 +28,8 @@ namespace MosesTraining { +namespace Syntax +{ namespace GHKM { @@ -197,4 +199,5 @@ void ScfgRule::UpdateSourceLabelCoocCounts(std::map< std::string, std::map