/*********************************************************************** Moses - statistical machine translation system Copyright (C) 2006-2011 University of Edinburgh This library is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. You should have received a copy of the GNU Lesser General Public License along with this library; if not, write to the Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ #include "AlignmentGraph.h" #include "ComposedRule.h" #include "Node.h" #include "Options.h" #include "ParseTree.h" #include "Subgraph.h" #include #include #include #include namespace Moses { namespace GHKM { AlignmentGraph::AlignmentGraph(const ParseTree *t, const std::vector &s, const Alignment &a) { // Copy the parse tree nodes and add them to m_targetNodes. m_root = CopyParseTree(t); // Create a node for each source word. m_sourceNodes.reserve(s.size()); for (std::vector::const_iterator p(s.begin()); p != s.end(); ++p) { m_sourceNodes.push_back(new Node(*p, SOURCE)); } // Connect source nodes to parse tree leaves according to the given word // alignment. std::vector targetTreeLeaves; GetTargetTreeLeaves(m_root, targetTreeLeaves); for (Alignment::const_iterator p(a.begin()); p != a.end(); ++p) { Node *src = m_sourceNodes[p->first]; Node *tgt = targetTreeLeaves[p->second]; src->AddParent(tgt); tgt->AddChild(src); } // Attach unaligned source words (if any). AttachUnalignedSourceWords(); // Populate node spans. std::vector::const_iterator p(m_sourceNodes.begin()); for (int i = 0; p != m_sourceNodes.end(); ++p, ++i) { (*p)->PropagateIndex(i); } // Calculate complement spans. CalcComplementSpans(m_root); } AlignmentGraph::~AlignmentGraph() { for (std::vector::iterator p(m_sourceNodes.begin()); p != m_sourceNodes.end(); ++p) { delete *p; } for (std::vector::iterator p(m_targetNodes.begin()); p != m_targetNodes.end(); ++p) { delete *p; } } Subgraph AlignmentGraph::ComputeMinimalFrontierGraphFragment( Node *root, const std::set &frontierSet) { std::stack expandableNodes; std::set expandedNodes; if (root->IsSink()) { expandedNodes.insert(root); } else { expandableNodes.push(root); } while (!expandableNodes.empty()) { Node *n = expandableNodes.top(); expandableNodes.pop(); const std::vector &children = n->GetChildren(); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { Node *child = *p; if (child->IsSink()) { expandedNodes.insert(child); continue; } std::set::const_iterator q = frontierSet.find(child); if (q == frontierSet.end()) { //child is not from the frontier set expandableNodes.push(child); } else if (child->GetType() == TARGET) { // still need source word expandableNodes.push(child); } else { expandedNodes.insert(child); } } } return Subgraph(root, expandedNodes); } void AlignmentGraph::ExtractMinimalRules(const Options &options) { // Determine which nodes are frontier nodes. std::set frontierSet; ComputeFrontierSet(m_root, options, frontierSet); // Form the minimal frontier graph fragment rooted at each frontier node. std::vector fragments; fragments.reserve(frontierSet.size()); for (std::set::iterator p(frontierSet.begin()); p != frontierSet.end(); ++p) { Node *root = *p; Subgraph fragment = ComputeMinimalFrontierGraphFragment(root, frontierSet); assert(!fragment.IsTrivial()); // Can it form an SCFG rule? // FIXME Does this exclude non-lexical unary rules? if (root->GetType() == TREE && !root->GetSpan().empty()) { root->AddRule(new Subgraph(fragment)); } } } void AlignmentGraph::ExtractComposedRules(const Options &options) { ExtractComposedRules(m_root, options); } void AlignmentGraph::ExtractComposedRules(Node *node, const Options &options) { // Extract composed rules for all children first. const std::vector &children = node->GetChildren(); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { ExtractComposedRules(*p, options); } // If there is no minimal rule for this node then there are no composed // rules. const std::vector &rules = node->GetRules(); assert(rules.size() <= 1); if (rules.empty()) { return; } // Construct an initial composition candidate from the minimal rule. ComposedRule cr(*(rules[0])); if (!cr.GetOpenAttachmentPoint()) { // No composition possible. return; } std::queue queue; queue.push(cr); while (!queue.empty()) { ComposedRule cr = queue.front(); queue.pop(); const Node *attachmentPoint = cr.GetOpenAttachmentPoint(); assert(attachmentPoint); assert(attachmentPoint != node); // Create all possible rules by composing this node's minimal rule with the // existing rules (both minimal and composed) rooted at the first open // attachment point. const std::vector &rules = attachmentPoint->GetRules(); for (std::vector::const_iterator p = rules.begin(); p != rules.end(); ++p) { assert((*p)->GetRoot()->GetType() == TREE); ComposedRule *cr2 = cr.AttemptComposition(**p, options); if (cr2) { node->AddRule(new Subgraph(cr2->CreateSubgraph())); if (cr2->GetOpenAttachmentPoint()) { queue.push(*cr2); } delete cr2; } } // Done with this attachment point. Advance to the next, if any. cr.CloseAttachmentPoint(); if (cr.GetOpenAttachmentPoint()) { queue.push(cr); } } } Node *AlignmentGraph::CopyParseTree(const ParseTree *root) { NodeType nodeType = (root->IsLeaf()) ? TARGET : TREE; std::auto_ptr n(new Node(root->GetLabel(), nodeType)); if (nodeType == TREE) { n->SetPcfgScore(root->GetPcfgScore()); } const std::vector &children = root->GetChildren(); std::vector childNodes; childNodes.reserve(children.size()); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { Node *child = CopyParseTree(*p); child->AddParent(n.get()); childNodes.push_back(child); } n->SetChildren(childNodes); Node *p = n.release(); m_targetNodes.push_back(p); return p; } // Finds the set of frontier nodes. The definition of a frontier node differs // from Galley et al's (2004) in the following ways: // // 1. A node with an empty span is not a frontier node (this excludes // unaligned target subtrees). // 2. Target word nodes are not frontier nodes. // 3. Source word nodes are not frontier nodes. // 4. Unless the --AllowUnary option is used, a node is not a frontier node if // it has the same span as its parent. void AlignmentGraph::ComputeFrontierSet(Node *root, const Options &options, std::set &frontierSet) const { // Don't include word nodes or unaligned target subtrees. if (root->GetType() != TREE || root->GetSpan().empty()) { return; } if (!SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()))) { // Unless unary rules are explicitly allowed, we use Chung et al's (2011) // modified defintion of a frontier node to eliminate the production of // non-lexical unary rules. assert(root->GetParents().size() <= 1); if (options.allowUnary || root->GetParents().empty() || root->GetParents()[0]->GetSpan() != root->GetSpan()) { frontierSet.insert(root); } } const std::vector &children = root->GetChildren(); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { ComputeFrontierSet(*p, options, frontierSet); } } void AlignmentGraph::CalcComplementSpans(Node *root) { Span compSpan; std::set siblings; const std::vector &parents = root->GetParents(); for (std::vector::const_iterator p(parents.begin()); p != parents.end(); ++p) { const Span &parentCompSpan = (*p)->GetComplementSpan(); compSpan.insert(parentCompSpan.begin(), parentCompSpan.end()); const std::vector &c = (*p)->GetChildren(); siblings.insert(c.begin(), c.end()); } for (std::set::iterator p(siblings.begin()); p != siblings.end(); ++p) { if (*p == root) { continue; } const Span &siblingSpan = (*p)->GetSpan(); compSpan.insert(siblingSpan.begin(), siblingSpan.end()); } root->SetComplementSpan(compSpan); const std::vector &children = root->GetChildren(); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { CalcComplementSpans(*p); } } void AlignmentGraph::GetTargetTreeLeaves(Node *root, std::vector &leaves) { if (root->IsSink()) { leaves.push_back(root); } else { const std::vector &children = root->GetChildren(); for (std::vector::const_iterator p(children.begin()); p != children.end(); ++p) { GetTargetTreeLeaves(*p, leaves); } } } void AlignmentGraph::AttachUnalignedSourceWords() { // Find the unaligned source words (if any). std::set unaligned; for (size_t i = 0; i < m_sourceNodes.size(); ++i) { const Node &sourceNode = (*m_sourceNodes[i]); if (sourceNode.GetParents().empty()) { unaligned.insert(i); } } // Determine the attachment point for each one and attach it. for (std::set::iterator p = unaligned.begin(); p != unaligned.end(); ++p) { int index = *p; Node *attachmentPoint = DetermineAttachmentPoint(index); Node *sourceNode = m_sourceNodes[index]; attachmentPoint->AddChild(sourceNode); sourceNode->AddParent(attachmentPoint); } } Node *AlignmentGraph::DetermineAttachmentPoint(int index) { // Find the nearest aligned neighbour to the left, if any. int i = index; while (--i >= 0) { if (!m_sourceNodes[i]->GetParents().empty()) { break; } } // No aligned neighbours to the left, so attach to the root. if (i == -1) { return m_root; } // Find the nearest aligned neighbour to the right, if any. size_t j = index; while (++j < m_sourceNodes.size()) { if (!m_sourceNodes[j]->GetParents().empty()) { break; } } // No aligned neighbours to the right, so attach to the root. if (j == m_sourceNodes.size()) { return m_root; } // Construct the set of target nodes that are aligned to the left and right // neighbours. const std::vector &leftParents = m_sourceNodes[i]->GetParents(); assert(!leftParents.empty()); const std::vector &rightParents = m_sourceNodes[j]->GetParents(); assert(!rightParents.empty()); std::set targetSet; targetSet.insert(leftParents.begin(), leftParents.end()); targetSet.insert(rightParents.begin(), rightParents.end()); // The attachment point is the lowest common ancestor of the target word // nodes, unless the LCA is itself a target word, in which case the LCA // is the parent. This is to avoid including introducing new word alignments. // It assumes that the parse tree uses preterminals for parts of speech. Node *lca = Node::LowestCommonAncestor(targetSet.begin(), targetSet.end()); if (lca->GetType() == TARGET) { assert(lca->GetParents().size() == 1); return lca->GetParents()[0]; } return lca; } } // namespace GHKM } // namespace Moses