1 files changed, 42 insertions, 20 deletions
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
index 9dba71331..21708bdfc 100644
--- a/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -34,6 +34,8 @@
 
 namespace MosesTraining
 {
+namespace Syntax
+{
 namespace GHKM
 {
 
@@ -242,36 +244,24 @@ Node *AlignmentGraph::CopyParseTree(const SyntaxTree *root)
   return p;
 }
 
-// Finds the set of frontier nodes.  The definition of a frontier node differs
-// from Galley et al's (2004) in the following ways:
-//
-// 1. A node with an empty span is not a frontier node (this excludes
-//    unaligned target subtrees).
-// 2. Target word nodes are not frontier nodes.
-// 3. Source word nodes are not frontier nodes.
-// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
-//    it has the same span as its parent.
+// Recursively constructs the set of frontier nodes for the tree (or subtree)
+// rooted at the given node.
 void AlignmentGraph::ComputeFrontierSet(Node *root,
                                         const Options &options,
                                         std::set<Node *> &frontierSet) const
 {
-  // Don't include word nodes or unaligned target subtrees.
+  // Non-tree nodes and unaligned target subtrees are not frontier nodes (and
+  // nor are their descendants).  See the comment for the function
+  // AlignmentGraph::IsFrontierNode().
   if (root->GetType() != TREE || root->GetSpan().empty()) {
     return;
   }
 
-  if (!SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()))) {
-    // Unless unary rules are explicitly allowed, we use Chung et al's (2011)
-    // modified defintion of a frontier node to eliminate the production of
-    // non-lexical unary rules.
-    assert(root->GetParents().size() <= 1);
-    if (options.allowUnary
-        || root->GetParents().empty()
-        || root->GetParents()[0]->GetSpan() != root->GetSpan()) {
-      frontierSet.insert(root);
-    }
+  if (IsFrontierNode(*root, options)) {
+    frontierSet.insert(root);
   }
 
+  // Recursively check descendants.
   const std::vector<Node *> &children = root->GetChildren();
   for (std::vector<Node *>::const_iterator p(children.begin());
        p != children.end(); ++p) {
@@ -279,6 +269,37 @@ void AlignmentGraph::ComputeFrontierSet(Node *root,
   }
 }
 
+// Determines whether the given node is a frontier node or not. The definition
+// of a frontier node differs from Galley et al's (2004) in the following ways:
+//
+// 1. A node with an empty span is not a frontier node (this is to exclude
+//    unaligned target subtrees).
+// 2. Target word nodes are not frontier nodes.
+// 3. Source word nodes are not frontier nodes.
+// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
+//    it has the same span as its parent.
+bool AlignmentGraph::IsFrontierNode(const Node &n, const Options &options) const
+{
+  // Don't include word nodes or unaligned target subtrees.
+  if (n.GetType() != TREE || n.GetSpan().empty()) {
+    return false;
+  }
+  // This is the original GHKM definition of a frontier node.
+  if (SpansIntersect(n.GetComplementSpan(), Closure(n.GetSpan()))) {
+    return false;
+  }
+  // Unless unary rules are explicitly allowed, we use Chung et al's (2011)
+  // modified defintion of a frontier node to eliminate the production of
+  // non-lexical unary rules.
+  assert(n.GetParents().size() <= 1);
+  if (!options.allowUnary &&
+      !n.GetParents().empty() &&
+      n.GetParents()[0]->GetSpan() == n.GetSpan()) {
+    return false;
+  }
+  return true;
+}
+
 void AlignmentGraph::CalcComplementSpans(Node *root)
 {
   Span compSpan;
@@ -393,4 +414,5 @@ Node *AlignmentGraph::DetermineAttachmentPoint(int index)
 }
 
 }  // namespace GHKM
+}  // namespace Syntax
 }  // namespace MosesTraining