extract-ghkm: add the features required for use as a drop-in replacement for extract-rules

New features: composed rules, unaligned source word attachment, non-lexical unary rule elimination, glue rule generation, unknown word label generation, and EMS integration.
author: Phil Williams <philip.williams@mac.com> 2011-11-21 20:21:04 +0400
committer: Phil Williams <philip.williams@mac.com> 2011-11-21 20:21:04 +0400
commit: ee0a6dbd5cea7d2a72bbfb0d081a9232d1f60a8b (patch)
tree: a8a57bd866061d0292cfb2f978fa91190dff9f87 /scripts
parent: 463f2e232e9e1907ccd4ef69045e06ff77ad5f3f (diff)
50 files changed, 3848 insertions, 1561 deletions
diff --git a/scripts/Makefile b/scripts/Makefile
index b81b1c10c..dfa0fd3a1 100644
--- a/scripts/Makefile
+++ b/scripts/Makefile
@@ -27,7 +27,7 @@ all: compile
 SUBDIRS=training/phrase-extract training/symal training/mbr training/lexical-reordering ems/biconcor
 SUBDIRS_CLEAN=$(SUBDIRS) training/memscore training/eppex training/compact-rule-table
 
-compile: compile-memscore compile-eppex compile-compact-rule-table
+compile: compile-memscore compile-eppex compile-compact-rule-table compile-extract-ghkm
 	touch release-exclude # No files excluded by default
 	pwd=`pwd`; \
 	for subdir in $(SUBDIRS); do \
@@ -62,6 +62,19 @@ compile-compact-rule-table:
 	  || ( echo "WARNING: Building compact rule table tools failed."; \
 	       echo 'training/compact-rule-table/tools/compactify' >> ../../release-exclude )
 
+compile-extract-ghkm:
+	# Building the GHKM rule extractor may fail e.g. if certain boost libraries are not available.
+	# We ignore this because the tool isn't used by default; on failure the binary is listed in release-exclude (three levels up from the build directory).
+	cd training/phrase-extract/extract-ghkm ; \
+	  if [ ! -f configure ]; then \
+	    autoreconf -i \
+	      || ( echo "WARNING: Bootstrapping extract-ghkm build environment failed."; \
+	           echo 'training/phrase-extract/extract-ghkm/tools/extract-ghkm' >> ../../../release-exclude ); \
+	  fi; \
+	  ./configure && $(MAKE) \
+	  || ( echo "WARNING: Building extract-ghkm failed."; \
+	       echo 'training/phrase-extract/extract-ghkm/tools/extract-ghkm' >> ../../../release-exclude )
+
 clean:
 	pwd=`pwd`; \
 	for subdir in $(SUBDIRS_CLEAN); do \
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index 956a422af..d3e37e31b 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -1662,6 +1662,10 @@ sub define_training_extract_phrases {
 	  my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""));
 	  $cmd .= "-unknown-word-label $unknown_word_label ";
       }
+
+      if (&get("TRAINING:use-ghkm")) {
+        $cmd .= "-ghkm ";
+      }
     }
 
     my $extract_settings = &get("TRAINING:extract-settings");
diff --git a/scripts/released-files b/scripts/released-files
index 7a0ee1fc9..5153a4e52 100644
--- a/scripts/released-files
+++ b/scripts/released-files
@@ -88,6 +88,7 @@ training/zmert-moses.pl
 training/mert-moses.pl
 training/mert-moses-multi.pl
 training/phrase-extract/extract
+training/phrase-extract/extract-ghkm/tools/extract-ghkm
 training/phrase-extract/extract-rules
 training/phrase-extract/score
 training/phrase-extract/consolidate
diff --git a/scripts/training/phrase-extract/XmlTree.h b/scripts/training/phrase-extract/XmlTree.h
index 439828d32..7e6bbecea 100644
--- a/scripts/training/phrase-extract/XmlTree.h
+++ b/scripts/training/phrase-extract/XmlTree.h
@@ -33,3 +33,4 @@ std::string TrimXml(const std::string& str);
 bool isXmlTag(const std::string& tag);
 std::vector<std::string> TokenizeXml(const std::string& str);
 bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection );
+std::string unescape(const std::string &str);
diff --git a/scripts/training/phrase-extract/extract-ghkm/Alignment.h b/scripts/training/phrase-extract/extract-ghkm/Alignment.h
deleted file mode 100644
index c1ca9e586..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Alignment.h
+++ /dev/null
@@ -1,33 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef ALIGNMENT_H_INCLUDED_
-#define ALIGNMENT_H_INCLUDED_
-
-#include <string>
-#include <utility>
-#include <vector>
-
-typedef std::vector<std::pair<int, int> > Alignment;
-
-Alignment
-readAlignment(const std::string &);
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
deleted file mode 100644
index 35c89b5c6..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.cpp
+++ /dev/null
@@ -1,350 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#include "AlignmentGraph.h"
-
-#include "Rule.h"
-#include "Subgraph.h"
-
-#include <algorithm>
-#include <cassert>
-#include <memory>
-
-namespace
-{
-Node *
-copyParseTree(const ParseTree * root, std::vector<Node *> & nodes)
-{
-  NodeType nodeType = (root->isLeaf()) ? TARGET : TREE;
-
-  std::auto_ptr<Node> n(new Node(root->getLabel(), nodeType));
-
-  const std::vector<ParseTree *> & children = root->getChildren();
-  std::vector<Node *> childNodes;
-  childNodes.reserve(children.size());
-  for (std::vector<ParseTree *>::const_iterator p(children.begin());
-       p != children.end(); ++p) {
-    Node * child = copyParseTree(*p, nodes);
-    child->addParent(n.get());
-    childNodes.push_back(child);
-  }
-  n->setChildren(childNodes);
-
-  Node * n2 = n.release();
-  nodes.push_back(n2);
-  return n2;
-}
-
-void
-computeFrontierSet(Node * root, std::set<Node *> & frontierSet)
-{
-  // TODO Return if frontierSet already contains entry for root
-  // TODO Or maintain set of visited nodes?
-
-  if (!spansIntersect(root->getComplementSpan(), closure(root->getSpan()))) {
-    frontierSet.insert(root);
-  }
-
-  const std::vector<Node *> & children = root->getChildren();
-  for (std::vector<Node *>::const_iterator p(children.begin());
-       p != children.end(); ++p) {
-    computeFrontierSet(*p, frontierSet);
-  }
-}
-
-void
-calcComplementSpans(Node * root)
-{
-  Span & compSpan = root->getComplementSpan();
-
-  std::set<Node *> siblings;
-
-  const std::vector<Node *> & parents = root->getParents();
-  for (std::vector<Node *>::const_iterator p(parents.begin());
-       p != parents.end(); ++p) {
-    const Span & parentCompSpan = (*p)->getComplementSpan();
-    compSpan.insert(parentCompSpan.begin(), parentCompSpan.end());
-    const std::vector<Node *> & c = (*p)->getChildren();
-    siblings.insert(c.begin(), c.end());
-  }
-
-  for (std::set<Node *>::iterator p(siblings.begin());
-       p != siblings.end(); ++p) {
-    if (*p == root) {
-      continue;
-    }
-    const Span & siblingSpan = (*p)->getSpan();
-    compSpan.insert(siblingSpan.begin(), siblingSpan.end());
-  }
-
-  const std::vector<Node *> & children = root->getChildren();
-  for (std::vector<Node *>::const_iterator p(children.begin());
-       p != children.end(); ++p) {
-    calcComplementSpans(*p);
-  }
-}
-
-void
-getTargetTreeLeaves(Node * root, std::vector<Node *> & leaves)
-{
-  if (root->isSink()) {
-    leaves.push_back(root);
-  } else {
-    const std::vector<Node *> & children = root->getChildren();
-    for (std::vector<Node *>::const_iterator p(children.begin());
-         p != children.end(); ++p) {
-      getTargetTreeLeaves(*p, leaves);
-    }
-  }
-}
-
-bool
-partitionOrderComp(const Node * a, const Node * b)
-{
-  const Span & aSpan = a->getSpan();
-  const Span & bSpan = b->getSpan();
-
-  assert(!aSpan.empty() && !bSpan.empty());
-
-  return *(aSpan.begin()) < *(bSpan.begin());
-}
-
-Rule
-fragmentToRule(const Subgraph & fragment)
-{
-  // Source RHS
-
-  std::set<Node *> sinkNodes(fragment.getSinkNodes());
-
-  std::vector<Node *> sourceRHSNodes;
-  for (std::set<Node *>::const_iterator p(sinkNodes.begin());
-       p != sinkNodes.end(); ++p) {
-    const Node & sinkNode = **p;
-    if (!sinkNode.getSpan().empty()) {
-      sourceRHSNodes.push_back(*p);
-    }
-  }
-
-  std::sort(sourceRHSNodes.begin(), sourceRHSNodes.end(),
-            partitionOrderComp);
-
-  // Build a mapping from target nodes to source-order indices, so that we
-  // can construct the Alignment object later.
-  std::map<const Node *, std::vector<int> > sourceOrder;
-
-  std::vector<Symbol> sourceRHS;
-  int srcIndex = 0;
-  for (std::vector<Node *>::const_iterator p(sourceRHSNodes.begin());
-       p != sourceRHSNodes.end(); ++p, ++srcIndex) {
-    const Node & sinkNode = **p;
-    if (sinkNode.getType() == TREE) {
-      sourceRHS.push_back(Symbol("X", NonTerminal));
-      sourceOrder[&sinkNode].push_back(srcIndex);
-    } else {
-      assert(sinkNode.getType() == SOURCE);
-      sourceRHS.push_back(Symbol(sinkNode.getLabel(), Terminal));
-      // Add all aligned target words to the sourceOrder map
-      const std::vector<Node *> & parents(sinkNode.getParents());
-      for (std::vector<Node *>::const_iterator q(parents.begin());
-           q != parents.end(); ++q) {
-        assert((*q)->getType() == TARGET);
-        sourceOrder[*q].push_back(srcIndex);
-      }
-    }
-  }
-
-  // Target RHS + alignment
-
-  std::vector<Symbol> targetRHS;
-  Alignment alignment;
-
-  std::vector<Node *> leafNodes(fragment.getLeafNodes());
-
-  alignment.reserve(leafNodes.size());  // might be too much but that's OK
-  targetRHS.reserve(leafNodes.size());
-
-  for (std::vector<Node *>::const_iterator p(leafNodes.begin());
-       p != leafNodes.end(); ++p) {
-    const Node & leaf = **p;
-    if (leaf.getSpan().empty()) {
-      // The node doesn't cover any source words, so we can only add
-      // terminals to the target RHS (not a non-terminal).
-      std::vector<std::string> targetWords(leaf.getTargetWords());
-      for (std::vector<std::string>::const_iterator q(targetWords.begin());
-           q != targetWords.end(); ++q) {
-        targetRHS.push_back(Symbol(*q, Terminal));
-      }
-    } else {
-      SymbolType type = (leaf.getType() == TREE) ? NonTerminal : Terminal;
-      targetRHS.push_back(Symbol(leaf.getLabel(), type));
-
-      int tgtIndex = targetRHS.size()-1;
-      std::map<const Node *, std::vector<int> >::iterator q(sourceOrder.find(&leaf));
-      assert(q != sourceOrder.end());
-      std::vector<int> & sourceNodes = q->second;
-      for (std::vector<int>::iterator r(sourceNodes.begin());
-           r != sourceNodes.end(); ++r) {
-        int srcIndex = *r;
-        alignment.push_back(std::make_pair(srcIndex, tgtIndex));
-      }
-    }
-  }
-
-  assert(!alignment.empty());
-
-  // Source LHS
-  Symbol sourceLHS("X", NonTerminal);
-
-  // Target LHS
-  Symbol targetLHS(fragment.getRoot()->getLabel(), NonTerminal);
-
-  return Rule(sourceLHS, targetLHS, sourceRHS, targetRHS, alignment);
-}
-}
-
-void
-Node::setChildren(const std::vector<Node*> & children)
-{
-  m_children = children;
-}
-
-void
-Node::setParents(const std::vector<Node*> & parents)
-{
-  m_parents = parents;
-}
-
-void
-Node::addChild(Node * child)
-{
-  m_children.push_back(child);
-}
-
-void
-Node::addParent(Node * parent)
-{
-  m_parents.push_back(parent);
-}
-
-bool
-Node::isSink() const
-{
-  return m_children.empty();
-}
-
-void
-Node::propagateIndex(int index)
-{
-  m_span.insert(index);
-  for (std::vector<Node *>::const_iterator p(m_parents.begin());
-       p != m_parents.end(); ++p) {
-    (*p)->propagateIndex(index);
-  }
-}
-
-std::vector<std::string>
-Node::getTargetWords() const
-{
-  std::vector<std::string> targetWords;
-  getTargetWords(targetWords);
-  return targetWords;
-}
-
-void
-Node::getTargetWords(std::vector<std::string> & targetWords) const
-{
-  if (m_type == TARGET) {
-    targetWords.push_back(m_label);
-  } else {
-    for (std::vector<Node *>::const_iterator p(m_children.begin());
-         p != m_children.end(); ++p) {
-      (*p)->getTargetWords(targetWords);
-    }
-  }
-}
-
-AlignmentGraph::AlignmentGraph(const ParseTree * t,
-                               const std::vector<std::string> & s,
-                               const Alignment & a)
-{
-  m_root = copyParseTree(t, m_targetNodes);
-
-  m_sourceNodes.reserve(s.size());
-  for (std::vector<std::string>::const_iterator p(s.begin());
-       p != s.end(); ++p) {
-    m_sourceNodes.push_back(new Node(*p, SOURCE));
-  }
-
-  std::vector<Node *> targetTreeLeaves;
-  getTargetTreeLeaves(m_root, targetTreeLeaves);
-
-  for (Alignment::const_iterator p(a.begin()); p != a.end(); ++p) {
-    Node * src = m_sourceNodes[p->first];
-    Node * tgt = targetTreeLeaves[p->second];
-    src->addParent(tgt);
-    tgt->addChild(src);
-  }
-}
-
-AlignmentGraph::~AlignmentGraph()
-{
-  for (std::vector<Node *>::iterator p(m_sourceNodes.begin());
-       p != m_sourceNodes.end(); ++p) {
-    delete *p;
-  }
-  for (std::vector<Node *>::iterator p(m_targetNodes.begin());
-       p != m_targetNodes.end(); ++p) {
-    delete *p;
-  }
-}
-
-std::vector<Rule>
-AlignmentGraph::inferRules() const
-{
-  size_t i = 0;
-  std::vector<Node *>::const_iterator p(m_sourceNodes.begin());
-  for (; p != m_sourceNodes.end(); ++p, ++i) {
-    (*p)->propagateIndex(i);
-  }
-
-  calcComplementSpans(m_root);
-
-  std::set<Node *> frontierSet;
-  computeFrontierSet(m_root, frontierSet);
-
-  std::vector<Subgraph> fragments;
-  for (std::set<Node *>::iterator p(frontierSet.begin());
-       p != frontierSet.end(); ++p) {
-    Subgraph subgraph(*p);
-    while (!subgraph.expand(frontierSet)) {
-      ;
-    }
-    if (subgraph.canFormSCFGRule()) {
-      fragments.push_back(subgraph);
-    }
-  }
-
-  std::vector<Rule> rules;
-  for (std::vector<Subgraph>::const_iterator p = fragments.begin();
-       p != fragments.end(); ++p) {
-    rules.push_back(fragmentToRule(*p));
-  }
-
-  return rules;
-}
diff --git a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.h b/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.h
deleted file mode 100644
index 32698d5c3..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/AlignmentGraph.h
+++ /dev/null
@@ -1,154 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef ALIGNMENTGRAPH_H_INCLUDED_
-#define ALIGNMENTGRAPH_H_INCLUDED_
-
-#include "Alignment.h"
-#include "ParseTree.h"
-#include "Span.h"
-#include "Rule.h"
-
-#include <string>
-#include <vector>
-
-enum NodeType { SOURCE, TARGET, TREE };
-
-class Node
-{
-public:
-
-  Node(const std::string & label, NodeType type)
-    : m_label(label)
-    , m_type(type)
-    , m_children()
-    , m_parents()
-  {}
-
-  const std::string &
-  getLabel() const {
-    return m_label;
-  }
-
-  NodeType
-  getType() const {
-    return m_type;
-  }
-
-  const std::vector<Node*> &
-  getChildren() const {
-    return m_children;
-  }
-
-  const std::vector<Node*> &
-  getParents() const {
-    return m_parents;
-  }
-
-  void
-  setChildren(const std::vector<Node*> &);
-
-  void
-  setParents(const std::vector<Node*> &);
-
-  void
-  addChild(Node *);
-
-  void
-  addParent(Node *);
-
-  bool
-  isSink() const;
-
-  void
-  propagateIndex(int);
-
-  Span &
-  getSpan() {
-    return m_span;
-  }
-
-  const Span &
-  getSpan() const {
-    return m_span;
-  }
-
-  Span &
-  getComplementSpan() {
-    return m_complementSpan;
-  }
-
-  const Span &
-  getComplementSpan() const {
-    return m_complementSpan;
-  }
-
-  std::vector<std::string>
-  getTargetWords() const;
-
-private:
-  std::string m_label;
-  NodeType m_type;
-  std::vector<Node*> m_children;
-  std::vector<Node*> m_parents;
-  Span m_span;
-  Span m_complementSpan;
-
-  // Disallow copying
-  Node(const Node &);
-  Node & operator=(const Node &);
-
-  void
-  getTargetWords(std::vector<std::string> &) const;
-};
-
-class AlignmentGraph
-{
-public:
-  AlignmentGraph(const ParseTree *,
-                 const std::vector<std::string> &,
-                 const Alignment &);
-
-  ~AlignmentGraph();
-
-  Node *
-  getRoot() {
-    return m_root;
-  }
-
-  std::vector<Node *> &
-  getSourceNodes() {
-    return m_sourceNodes;
-  }
-
-  std::vector<Rule>
-  inferRules() const;
-
-private:
-  Node * m_root;
-  std::vector<Node *> m_sourceNodes;
-  std::vector<Node *> m_targetNodes;
-
-  // Disallow copying
-  AlignmentGraph(const AlignmentGraph &);
-  AlignmentGraph & operator=(const AlignmentGraph &);
-};
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/Exception.h b/scripts/training/phrase-extract/extract-ghkm/Exception.h
deleted file mode 100644
index 56c5f83f0..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Exception.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef EXCEPTION_H_INCLUDED_
-#define EXCEPTION_H_INCLUDED_
-
-#include <string>
-
-class Exception
-{
-public:
-  Exception(const char * msg)
-    : m_msg(msg)
-  {}
-
-  Exception(const std::string & msg)
-    : m_msg(msg)
-  {}
-
-  const std::string &
-  getMsg() const {
-    return m_msg;
-  }
-
-private:
-  std::string m_msg;
-};
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/Makefile b/scripts/training/phrase-extract/extract-ghkm/Makefile
deleted file mode 100644
index 23c0ff104..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Makefile
+++ /dev/null
@@ -1,34 +0,0 @@
-.DELETE_ON_ERROR:
-.PHONY: all clean
-
-CC          = g++
-CFLAGS      = -pedantic -Wall -O -g
-CPPFLAGS    = -I ../
-
-PROG        = extract-ghkm
-SOURCES     = $(addsuffix .cpp,$(PROG)) \
-              Alignment.cpp \
-              AlignmentGraph.cpp \
-              ParseTree.cpp \
-              Span.cpp \
-              Subgraph.cpp \
-              ../SyntaxTree.cpp \
-              ../XmlTree.cpp \
-              XmlTreeParser.cpp
-OBJS        = $(notdir $(SOURCES:.cpp=.o))
-
-all: $(PROG)
-
-clean:
-	@rm -f $(PROG) $(OBJS)
-
-$(PROG):  $(OBJS)
-	$(CC) $(CPPFLAGS) $(CFLAGS) $^ $(LDFLAGS) -o $@
-
-include Makefile.dep
-
-Makefile.dep: $(SOURCES)
-	$(CC) $(CPPFLAGS) -MM $(SOURCES) > $@
-
-%.o:
-	$(CC) -c $(CPPFLAGS) $(CFLAGS) $<
diff --git a/scripts/training/phrase-extract/extract-ghkm/Makefile.am b/scripts/training/phrase-extract/extract-ghkm/Makefile.am
new file mode 100644
index 000000000..88c1c68cc
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/Makefile.am
@@ -0,0 +1,3 @@
+ACLOCAL_AMFLAGS = -I m4
+
+SUBDIRS = tools
diff --git a/scripts/training/phrase-extract/extract-ghkm/Makefile.dep b/scripts/training/phrase-extract/extract-ghkm/Makefile.dep
deleted file mode 100644
index 473d02e06..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Makefile.dep
+++ /dev/null
@@ -1,13 +0,0 @@
-extract-ghkm.o: extract-ghkm.cpp Alignment.h AlignmentGraph.h ParseTree.h \
-  Span.h Rule.h Exception.h XmlTreeParser.h
-Alignment.o: Alignment.cpp Alignment.h Exception.h
-AlignmentGraph.o: AlignmentGraph.cpp AlignmentGraph.h Alignment.h \
-  ParseTree.h Span.h Rule.h Subgraph.h
-ParseTree.o: ParseTree.cpp ParseTree.h
-Span.o: Span.cpp Span.h
-Subgraph.o: Subgraph.cpp Subgraph.h AlignmentGraph.h Alignment.h \
-  ParseTree.h Span.h Rule.h
-SyntaxTree.o: ../SyntaxTree.cpp ../SyntaxTree.h
-XmlTree.o: ../XmlTree.cpp ../SyntaxTree.h
-XmlTreeParser.o: XmlTreeParser.cpp XmlTreeParser.h ParseTree.h \
-  ../XmlTree.h ../SyntaxTree.h
diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.cpp b/scripts/training/phrase-extract/extract-ghkm/ParseTree.cpp
deleted file mode 100644
index 05a5929e4..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.cpp
+++ /dev/null
@@ -1,52 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#include "ParseTree.h"
-
-ParseTree::~ParseTree()
-{
-  for (std::vector<ParseTree*>::iterator p(m_children.begin());
-       p != m_children.end(); ++p) {
-    delete *p;
-  }
-}
-
-void
-ParseTree::setChildren(const std::vector<ParseTree*> & children)
-{
-  m_children = children;
-}
-
-void
-ParseTree::setParent(ParseTree * parent)
-{
-  m_parent = parent;
-}
-
-void
-ParseTree::addChild(ParseTree * child)
-{
-  m_children.push_back(child);
-}
-
-bool
-ParseTree::isLeaf() const
-{
-  return m_children.empty();
-}
diff --git a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
deleted file mode 100644
index bd191aea2..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/ParseTree.h
+++ /dev/null
@@ -1,75 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef PARSETREE_H_INCLUDED_
-#define PARSETREE_H_INCLUDED_
-
-#include <string>
-#include <vector>
-
-class ParseTree
-{
-public:
-  ParseTree(const std::string & label)
-    : m_label(label)
-    , m_children()
-    , m_parent()
-  {}
-
-  ~ParseTree();
-
-  const std::string &
-  getLabel() const {
-    return m_label;
-  }
-
-  const std::vector<ParseTree*> &
-  getChildren() const {
-    return m_children;
-  }
-
-  const ParseTree *
-  getParent() const {
-    return m_parent;
-  }
-
-  void
-  setParent(ParseTree *);
-
-  void
-  setChildren(const std::vector<ParseTree*> &);
-
-  void
-  addChild(ParseTree *);
-
-  bool
-  isLeaf() const;
-
-private:
-  std::string m_label;
-  std::vector<ParseTree*> m_children;
-  ParseTree * m_parent;
-
-  // Disallow copying
-  ParseTree(const ParseTree &);
-  ParseTree & operator=(const ParseTree &);
-};
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/Rule.h b/scripts/training/phrase-extract/extract-ghkm/Rule.h
deleted file mode 100644
index e641c1044..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Rule.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef RULE_H_INCLUDED_
-#define RULE_H_INCLUDED_
-
-#include "Alignment.h"
-
-#include <string>
-#include <vector>
-
-enum SymbolType { Terminal, NonTerminal };
-
-class Symbol
-{
-public:
-  Symbol(const std::string & value, SymbolType type)
-    : m_value(value)
-    , m_type(type)
-  {}
-
-  const std::string &
-  getValue() const {
-    return m_value;
-  }
-
-  SymbolType
-  getType() const {
-    return m_type;
-  }
-
-private:
-  std::string m_value;
-  SymbolType m_type;
-};
-
-class Rule
-{
-public:
-  Rule(const Symbol & sourceLHS,
-       const Symbol & targetLHS,
-       const std::vector<Symbol> & sourceRHS,
-       const std::vector<Symbol> & targetRHS,
-       const Alignment & alignment)
-    : m_sourceLHS(sourceLHS)
-    , m_targetLHS(targetLHS)
-    , m_sourceRHS(sourceRHS)
-    , m_targetRHS(targetRHS)
-    , m_alignment(alignment)
-  {}
-
-  const Symbol &
-  getSourceLHS() const {
-    return m_sourceLHS;
-  }
-
-  const Symbol &
-  getTargetLHS() const {
-    return m_targetLHS;
-  }
-
-  const std::vector<Symbol> &
-  getSourceRHS() const {
-    return m_sourceRHS;
-  }
-
-  const std::vector<Symbol> &
-  getTargetRHS() const {
-    return m_targetRHS;
-  }
-
-  const Alignment &
-  getAlignment() const {
-    return m_alignment;
-  }
-
-private:
-  Symbol m_sourceLHS;
-  Symbol m_targetLHS;
-  std::vector<Symbol> m_sourceRHS;
-  std::vector<Symbol> m_targetRHS;
-  Alignment m_alignment;
-};
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/Span.cpp b/scripts/training/phrase-extract/extract-ghkm/Span.cpp
deleted file mode 100644
index 56b224ee7..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Span.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#include "Span.h"
-
-bool
-spansIntersect(const Span & a, const Span & b)
-{
-  for (Span::const_iterator p(a.begin()); p != a.end(); ++p) {
-    Span::const_iterator q = b.find(*p);
-    if (q != b.end()) {
-      return true;
-    }
-  }
-  return false;
-}
-
-Span
-closure(const Span & s)
-{
-  Span result;
-  if (s.empty()) {
-    return result;
-  }
-  Span::const_iterator p(s.begin());
-  int min = *p;
-  int max = *p;
-  ++p;
-  for (; p != s.end(); ++p) {
-    if (*p < min) {
-      min = *p;
-    }
-    if (*p > max) {
-      max = *p;
-    }
-  }
-  for (int i = min; i <= max; ++i) {
-    result.insert(i);
-  }
-
-  return result;
-}
diff --git a/scripts/training/phrase-extract/extract-ghkm/Span.h b/scripts/training/phrase-extract/extract-ghkm/Span.h
deleted file mode 100644
index a0c14877d..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Span.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef SPAN_H_INCLUDED_
-#define SPAN_H_INCLUDED_
-
-#include <map>
-#include <set>
-
-typedef std::set<int> Span;
-
-bool
-spansIntersect(const Span & a, const Span & b);
-
-Span
-closure(const Span & s);
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
deleted file mode 100644
index 44033aaa7..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.cpp
+++ /dev/null
@@ -1,133 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#include "Subgraph.h"
-
-#include "AlignmentGraph.h"
-
-#include <cassert>
-
-Subgraph::Subgraph(Node * root)
-  : m_root(root)
-{
-  if (root->isSink()) {
-    m_expandedNodes.insert(root);
-  } else {
-    m_expandableNodes.push(root);
-  }
-}
-
-bool
-Subgraph::isFragment() const
-{
-  return !isTrivial();
-}
-
-bool
-Subgraph::canFormSCFGRule() const
-{
-  return isFragment()
-         && (m_root->getType() == TREE)
-         && !(m_root->getSpan().empty());
-}
-
-bool
-Subgraph::isTrivial() const
-{
-  std::set<Node *> sinkNodes = getSinkNodes();
-  return (sinkNodes.size() == 1) &&
-         (sinkNodes.find(m_root) != sinkNodes.end());
-}
-
-bool
-Subgraph::isSinkNode(Node * n) const
-{
-  assert(m_expandableNodes.empty());
-  return m_expandedNodes.find(n) != m_expandedNodes.end();
-}
-
-std::set<Node *>
-Subgraph::getSinkNodes() const
-{
-  std::set<Node *> sinkNodes;
-  std::stack<Node *> expandable(m_expandableNodes);
-  while (!expandable.empty()) {
-    sinkNodes.insert(expandable.top());
-    expandable.pop();
-  }
-  sinkNodes.insert(m_expandedNodes.begin(), m_expandedNodes.end());
-  return sinkNodes;
-}
-
-// Expand a single subgraph node.  Return true if subgraph is fully expanded
-// or false otherwise.
-bool
-Subgraph::expand(const std::set<Node *> & frontierSet)
-{
-  if (m_expandableNodes.empty()) {
-    return true;
-  }
-
-  Node * n = m_expandableNodes.top();
-  m_expandableNodes.pop();
-
-  const std::vector<Node *> & children = n->getChildren();
-  for (std::vector<Node *>::const_iterator p(children.begin());
-       p != children.end(); ++p) {
-    Node * child = *p;
-    if (child->isSink()) {
-      m_expandedNodes.insert(child);
-      continue;
-    }
-    std::set<Node *>::const_iterator q = frontierSet.find(child);
-    if (q == frontierSet.end()) { //child is not from the frontier set
-      m_expandableNodes.push(child);
-    } else if (child->getType() == TARGET) { // still need source word
-      m_expandableNodes.push(child);
-    } else {
-      m_expandedNodes.insert(child);
-    }
-  }
-
-  return m_expandableNodes.empty();
-}
-
-std::vector<Node *>
-Subgraph::getLeafNodes() const
-{
-  std::vector<Node *> leafNodes;
-  std::set<Node *> sinkNodes(getSinkNodes());
-  getLeafNodes(m_root, leafNodes, sinkNodes);
-  return leafNodes;
-}
-
-void
-Subgraph::getLeafNodes(Node * root, std::vector<Node *> & leafNodes,
-                       const std::set<Node *> & sinkNodes) const
-{
-  if (root->getType() == TARGET || sinkNodes.find(root) != sinkNodes.end()) {
-    leafNodes.push_back(root);
-  } else {
-    const std::vector<Node*> & children(root->getChildren());
-    for (std::vector<Node *>::const_iterator p(children.begin());
-         p != children.end(); ++p) {
-      getLeafNodes(*p, leafNodes, sinkNodes);
-    }
-  }
-}
diff --git a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
deleted file mode 100644
index 9b772d73a..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/Subgraph.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef SUBGRAPH_H_INCLUDED_
-#define SUBGRAPH_H_INCLUDED_
-
-class Node;
-
-#include <set>
-#include <stack>
-#include <vector>
-
-class Subgraph
-{
-public:
-  Subgraph(Node * root);
-
-  bool
-  isTrivial() const;
-
-  bool
-  isFragment() const;
-
-  bool
-  canFormSCFGRule() const;
-
-  bool
-  isSinkNode(Node *) const;
-
-  bool
-  expand(const std::set<Node *> & frontierSet);
-
-  const Node *
-  getRoot() const {
-    return m_root;
-  };
-
-  std::set<Node *>
-  getSinkNodes() const;
-
-  std::vector<Node *>
-  getLeafNodes() const;
-
-private:
-  Node * m_root;
-  std::stack<Node *> m_expandableNodes;
-  std::set<Node *> m_expandedNodes;
-
-  void
-  getLeafNodes(Node * root, std::vector<Node *> & leafNodes,
-               const std::set<Node *> & sinkNodes) const;
-};
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
deleted file mode 100644
index 74e0c6e43..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.cpp
+++ /dev/null
@@ -1,86 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#include "XmlTreeParser.h"
-
-#include "ParseTree.h"
-#include "XmlTree.h"
-
-#include <cassert>
-#include <vector>
-
-namespace
-{
-std::auto_ptr<ParseTree>
-parseXmlTree(std::vector<std::string>::const_iterator & p,
-             const std::vector<std::string>::const_iterator & end)
-{
-  std::auto_ptr<ParseTree> t;
-
-  if (p == end) {
-    return t;
-  }
-
-  std::string s(Trim(*p));
-
-  while (s.empty()) {
-    s = Trim(*++p);
-  }
-
-  if (!isXmlTag(s)) {
-    p++;
-    t.reset(new ParseTree(s));
-    return t;
-  }
-
-  const std::string & tag = s;
-
-  if (tag[1] == '/') {
-    // Closing tag.  Don't advance p -- let caller handle it.
-    return t;
-  }
-
-  std::string label = ParseXmlTagAttribute(tag, "label");
-  t.reset(new ParseTree(label));
-
-  if (tag[tag.size()-2] == '/') {
-    // Unary tag.
-    p++;
-    return t;
-  }
-
-  p++;
-  while (ParseTree * c = parseXmlTree(p, end).release()) {
-    t->addChild(c);
-    c->setParent(t.get());
-  }
-  p++;  // Skip over closing tag
-
-  return t;
-}
-}
-
-std::auto_ptr<ParseTree>
-parseXmlTree(const std::string & line)
-{
-  std::vector<std::string> xmlTokens(TokenizeXml(line));
-  std::vector<std::string>::const_iterator begin(xmlTokens.begin());
-  std::vector<std::string>::const_iterator end(xmlTokens.end());
-  return parseXmlTree(begin, end);
-}
diff --git a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.h b/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.h
deleted file mode 100644
index 11c4d4a33..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/XmlTreeParser.h
+++ /dev/null
@@ -1,32 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-#pragma once
-#ifndef XMLTREEPARSER_H_INCLUDED_
-#define XMLTREEPARSER_H_INCLUDED_
-
-#include <memory>
-#include <string>
-
-class ParseTree;
-
-std::auto_ptr<ParseTree>
-parseXmlTree(const std::string &);
-
-#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/configure.ac b/scripts/training/phrase-extract/extract-ghkm/configure.ac
new file mode 100644
index 000000000..bb0a0c916
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/configure.ac
@@ -0,0 +1,33 @@
+#                                               -*- Autoconf -*-
+# Process this file with autoconf to produce a configure script.
+
+AC_PREREQ([2.65])
+AC_INIT([moses-extract-ghkm], [1.0], [moses-support@mit.edu])
+AM_INIT_AUTOMAKE([foreign])
+AC_CONFIG_SRCDIR([tools/ExtractGHKM.cpp])
+AC_CONFIG_HEADERS([config.h])
+AC_CONFIG_MACRO_DIR([m4])
+AC_LANG([C++])
+
+# Checks for programs.
+AC_PROG_CXX
+AC_PROG_INSTALL
+AC_PROG_LIBTOOL
+
+# Checks for libraries.
+BOOST_REQUIRE
+BOOST_PROGRAM_OPTIONS
+BOOST_STRING_ALGO
+BOOST_UNORDERED
+
+# Checks for header files.
+
+# Checks for typedefs, structures, and compiler characteristics.
+AC_HEADER_STDBOOL
+AC_TYPE_SIZE_T
+
+# Checks for library functions.
+AC_FUNC_ERROR_AT_LINE
+
+AC_CONFIG_FILES([Makefile tools/Makefile])
+AC_OUTPUT
diff --git a/scripts/training/phrase-extract/extract-ghkm/extract-ghkm.cpp b/scripts/training/phrase-extract/extract-ghkm/extract-ghkm.cpp
deleted file mode 100644
index 453e810ea..000000000
--- a/scripts/training/phrase-extract/extract-ghkm/extract-ghkm.cpp
+++ /dev/null
@@ -1,263 +0,0 @@
-/***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
-
-////////////////////////////////////////////////////////////////////////////////
-//
-//  extract-ghkm
-//  SCFG grammar rule extractor based on the GHKM algorithm described in:
-//
-//    Galley, M., Hopkins, M., Knight, K., and Marcu, D. (2004)
-//    "What's in a Translation Rule?", In Proceedings of HLT/NAACL 2004.
-//
-////////////////////////////////////////////////////////////////////////////////
-
-#include "Alignment.h"
-#include "AlignmentGraph.h"
-#include "Exception.h"
-#include "ParseTree.h"
-#include "Rule.h"
-#include "Span.h"
-#include "XmlTreeParser.h"
-
-#include <cassert>
-#include <cstdlib>
-#include <fstream>
-#include <iostream>
-#include <string>
-#include <sstream>
-#include <vector>
-
-namespace
-{
-const std::string progName = "extract-ghkm";
-
-void
-printUsage()
-{
-  std::cerr << "Usage: "
-            << progName << " TARGET SOURCE ALIGNMENT EXTRACT"
-            << std::endl;
-}
-
-void
-printErrorMsg(const std::string & errorMsg)
-{
-  std::cerr << progName << ": " << errorMsg << std::endl;
-}
-
-void
-printSymbol(const Symbol & symbol, std::ostream & out)
-{
-  if (symbol.getType() == NonTerminal) {
-    out << "[" << symbol.getValue() << "]";
-  } else {
-    out << symbol.getValue();
-  }
-}
-
-void
-printRule(const Rule & rule, std::ostream & out, std::ostream & invOut)
-{
-  const std::vector<Symbol> & sourceRHS = rule.getSourceRHS();
-  const std::vector<Symbol> & targetRHS = rule.getTargetRHS();
-
-  // TODO Just create maps for NTs (one-to-one)
-  std::map<int, std::vector<int> > sourceToTarget;
-  std::map<int, std::vector<int> > targetToSource;
-
-  const Alignment & alignment = rule.getAlignment();
-
-  for (Alignment::const_iterator p(alignment.begin());
-       p != alignment.end(); ++p) {
-    sourceToTarget[p->first].push_back(p->second);
-    targetToSource[p->second].push_back(p->first);
-  }
-
-  std::ostringstream sourceSS;
-  std::ostringstream targetSS;
-
-  int i = 0;
-  for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
-       p != sourceRHS.end(); ++p, ++i) {
-    printSymbol(*p, sourceSS);
-    if (p->getType() == NonTerminal) {
-      assert(sourceToTarget.find(i) != sourceToTarget.end());
-      const std::vector<int> & targetIndices = sourceToTarget[i];
-      assert(targetIndices.size() == 1);
-      int targetIndex = targetIndices[0];
-      printSymbol(targetRHS[targetIndex], sourceSS);
-    }
-    sourceSS << " ";
-  }
-  printSymbol(rule.getSourceLHS(), sourceSS);
-
-  i = 0;
-  for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
-       p != targetRHS.end(); ++p, ++i) {
-    if (p->getType() == NonTerminal) {
-      assert(targetToSource.find(i) != targetToSource.end());
-      const std::vector<int> & sourceIndices = targetToSource[i];
-      assert(sourceIndices.size() == 1);
-      int sourceIndex = sourceIndices[0];
-      printSymbol(sourceRHS[sourceIndex], targetSS);
-    }
-    printSymbol(*p, targetSS);
-    targetSS << " ";
-  }
-
-  printSymbol(rule.getTargetLHS(), targetSS);
-
-  out << sourceSS.str() << " ||| " << targetSS.str() << " |||";
-  invOut << targetSS.str() << " ||| " << sourceSS.str() << " |||";
-
-  for (Alignment::const_iterator p(alignment.begin());
-       p != alignment.end(); ++p) {
-    out << " " << p->first << "-" << p->second;
-    invOut << " " << p->second << "-" << p->first;
-  }
-
-  out << " ||| 1" << std::endl;
-  invOut << " ||| 1" << std::endl;
-}
-
-std::vector<std::string>
-readTokens(const std::string & s)
-{
-  std::vector<std::string> tokens;
-
-  std::string whitespace = " \t";
-
-  std::string::size_type begin = s.find_first_not_of(whitespace);
-  assert(begin != std::string::npos);
-  while (true) {
-    std::string::size_type end = s.find_first_of(whitespace, begin);
-    std::string token;
-    if (end == std::string::npos) {
-      token = s.substr(begin);
-    } else {
-      token = s.substr(begin, end-begin);
-    }
-    tokens.push_back(token);
-    if (end == std::string::npos) {
-      break;
-    }
-    begin = s.find_first_not_of(whitespace, end);
-    if (begin == std::string::npos) {
-      break;
-    }
-  }
-
-  return tokens;
-}
-}
-
-int
-main(int argc, char * argv[])
-{
-  if (argc != 5) {
-    printUsage();
-    exit(1);
-  }
-
-  std::ifstream targetStream(argv[1]);
-  if (!targetStream) {
-    printErrorMsg("Failed to open file: " + std::string(argv[1]));
-    exit(1);
-  }
-
-  std::ifstream sourceStream(argv[2]);
-  if (!sourceStream) {
-    printErrorMsg("Failed to open file: " + std::string(argv[2]));
-    exit(1);
-  }
-
-  std::ifstream alignmentStream(argv[3]);
-  if (!alignmentStream) {
-    printErrorMsg("Failed to open file: " + std::string(argv[3]));
-    exit(1);
-  }
-
-  std::ofstream extractStream(argv[4]);
-  if (!extractStream) {
-    printErrorMsg("Failed to open file: " + std::string(argv[4]));
-    exit(1);
-  }
-
-  std::string invExtractFileName = std::string(argv[4]) + std::string(".inv");
-  std::ofstream invExtractStream(invExtractFileName.c_str());
-  if (!invExtractStream) {
-    printErrorMsg("Failed to open file: " + invExtractFileName);
-    exit(1);
-  }
-
-  size_t lineNum = 0;
-  while (true) {
-    std::string targetLine;
-    std::getline(targetStream, targetLine);
-
-    std::string sourceLine;
-    std::getline(sourceStream, sourceLine);
-
-    std::string alignmentLine;
-    std::getline(alignmentStream, alignmentLine);
-
-    if (targetStream.eof() && sourceStream.eof() && alignmentStream.eof()) {
-      break;
-    }
-
-    if (targetStream.eof() || sourceStream.eof() || alignmentStream.eof()) {
-      printErrorMsg("Files must contain same number of lines");
-      exit(1);
-    }
-
-    ++lineNum;
-
-    std::auto_ptr<ParseTree> t(parseXmlTree(targetLine));
-    if (!t.get()) {
-      std::ostringstream s;
-      s << "Failed to parse XML tree at line " << lineNum;
-      printErrorMsg(s.str());
-      exit(1);
-    }
-
-    std::vector<std::string> sourceTokens(readTokens(sourceLine));
-
-    Alignment alignment;
-    try {
-      alignment = readAlignment(alignmentLine);
-    } catch (const Exception & e) {
-      std::ostringstream s;
-      s << "Failed to read alignment at line " << lineNum << ": ";
-      s << e.getMsg();
-      printErrorMsg(s.str());
-      exit(1);
-    }
-
-    AlignmentGraph graph(t.get(), sourceTokens, alignment);
-
-    std::vector<Rule> rules(graph.inferRules());
-
-    for (std::vector<Rule>::iterator p(rules.begin());
-         p != rules.end(); ++p) {
-      printRule(*p, extractStream, invExtractStream);
-    }
-  }
-
-  return 0;
-}
diff --git a/scripts/training/phrase-extract/extract-ghkm/m4/boost.m4 b/scripts/training/phrase-extract/extract-ghkm/m4/boost.m4
new file mode 100644
index 000000000..6a9294c70
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/m4/boost.m4
@@ -0,0 +1,1133 @@
+# boost.m4: Locate Boost headers and libraries for autoconf-based projects.
+# Copyright (C) 2007, 2008, 2009, 2010, 2011  Benoit Sigoure <tsuna@lrde.epita.fr>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# Additional permission under section 7 of the GNU General Public
+# License, version 3 ("GPLv3"):
+#
+# If you convey this file as part of a work that contains a
+# configuration script generated by Autoconf, you may do so under
+# terms of your choice.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+m4_define([_BOOST_SERIAL], [m4_translit([
+# serial 16
+], [#
+], [])])
+
+# Original sources can be found at http://github.com/tsuna/boost.m4
+# You can fetch the latest version of the script by doing:
+#   wget http://github.com/tsuna/boost.m4/raw/master/build-aux/boost.m4
+
+# ------ #
+# README #
+# ------ #
+
+# This file provides several macros to use the various Boost libraries.
+# The first macro is BOOST_REQUIRE.  It will simply check if it's possible to
+# find the Boost headers of a given (optional) minimum version and it will
+# define BOOST_CPPFLAGS accordingly.  It will add an option --with-boost to
+# your configure so that users can specify non standard locations.
+# If the user's environment contains BOOST_ROOT and --with-boost was not
+# specified, --with-boost=$BOOST_ROOT is implicitly used.
+# For more README and documentation, go to http://github.com/tsuna/boost.m4
+# Note: THESE MACROS ASSUME THAT YOU USE LIBTOOL.  If you don't, don't worry,
+# simply read the README, it will show you what to do step by step.
+
+m4_pattern_forbid([^_?(BOOST|Boost)_])
+
+
+# _BOOST_SED_CPP(SED-PROGRAM, PROGRAM,
+#                [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND])
+# --------------------------------------------------------
+# Same as AC_EGREP_CPP, but leave the result in conftest.i.
+#
+# SED-PROGRAM is *not* overquoted, as in AC_EGREP_CPP.  It is expanded
+# in double-quotes, so escape your double quotes.
+#
+# It could be useful to turn this into a macro which extracts the
+# value of any macro.
+m4_define([_BOOST_SED_CPP],
+[AC_LANG_PREPROC_REQUIRE()dnl
+AC_REQUIRE([AC_PROG_SED])dnl
+AC_LANG_CONFTEST([AC_LANG_SOURCE([[$2]])])
+AS_IF([dnl eval is necessary to expand ac_cpp.
+dnl Ultrix and Pyramid sh refuse to redirect output of eval, so use subshell.
+dnl Beware of Windows end-of-lines, for instance if we are running
+dnl some Windows programs under Wine.  In that case, boost/version.hpp
+dnl is certainly using "\r\n", but the regular Unix shell will only
+dnl strip `\n' with backquotes, not the `\r'.  This results in
+dnl boost_cv_lib_version='1_37\r' for instance, which breaks
+dnl everything else.
+dnl Cannot use 'dnl' after [$4] because a trailing dnl may break AC_CACHE_CHECK
+(eval "$ac_cpp conftest.$ac_ext") 2>&AS_MESSAGE_LOG_FD |
+  tr -d '\r' |
+  $SED -n -e "$1" >conftest.i 2>&1],
+  [$3],
+  [$4])
+rm -rf conftest*
+])# _BOOST_SED_CPP
+
+
+
+# BOOST_REQUIRE([VERSION], [ACTION-IF-NOT-FOUND])
+# -----------------------------------------------
+# Look for Boost.  If version is given, it must either be a literal of the form
+# "X.Y.Z" where X, Y and Z are integers (the ".Z" part being optional) or a
+# variable "$var".
+# Defines the value BOOST_CPPFLAGS.  This macro only checks for headers with
+# the required version, it does not check for any of the Boost libraries.
+# On success, defines HAVE_BOOST.  On failure, calls the optional
+# ACTION-IF-NOT-FOUND action if one was supplied.
+# Otherwise aborts with an error message.
+AC_DEFUN([BOOST_REQUIRE],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_PROG_GREP])dnl
+echo "$as_me: this is boost.m4[]_BOOST_SERIAL" >&AS_MESSAGE_LOG_FD
+boost_save_IFS=$IFS
+boost_version_req=$1
+IFS=.
+set x $boost_version_req 0 0 0
+IFS=$boost_save_IFS
+shift
+boost_version_req=`expr "$[1]" '*' 100000 + "$[2]" '*' 100 + "$[3]"`
+boost_version_req_string=$[1].$[2].$[3]
+AC_ARG_WITH([boost],
+   [AS_HELP_STRING([--with-boost=DIR],
+                   [prefix of Boost $1 @<:@guess@:>@])])dnl
+AC_ARG_VAR([BOOST_ROOT],[Location of Boost installation])dnl
+# If BOOST_ROOT is set and the user has not provided a value to
+# --with-boost, then treat BOOST_ROOT as if the user supplied it.
+if test x"$BOOST_ROOT" != x; then
+  if test x"$with_boost" = x; then
+    AC_MSG_NOTICE([Detected BOOST_ROOT; continuing with --with-boost=$BOOST_ROOT])
+    with_boost=$BOOST_ROOT
+  else
+    AC_MSG_NOTICE([Detected BOOST_ROOT=$BOOST_ROOT, but overridden by --with-boost=$with_boost])
+  fi
+fi
+AC_SUBST([DISTCHECK_CONFIGURE_FLAGS],
+         ["$DISTCHECK_CONFIGURE_FLAGS '--with-boost=$with_boost'"])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+  AC_CACHE_CHECK([for Boost headers version >= $boost_version_req_string],
+    [boost_cv_inc_path],
+    [boost_cv_inc_path=no
+AC_LANG_PUSH([C++])dnl
+m4_pattern_allow([^BOOST_VERSION$])dnl
+    AC_LANG_CONFTEST([AC_LANG_PROGRAM([[#include <boost/version.hpp>
+#if !defined BOOST_VERSION
+# error BOOST_VERSION is not defined
+#elif BOOST_VERSION < $boost_version_req
+# error Boost headers version < $boost_version_req
+#endif
+]])])
+    # If the user provided a value to --with-boost, use it and only it.
+    case $with_boost in #(
+      ''|yes) set x '' /opt/local/include /usr/local/include /opt/include \
+                 /usr/include C:/Boost/include;; #(
+      *)      set x "$with_boost/include" "$with_boost";;
+    esac
+    shift
+    for boost_dir
+    do
+    # Without --layout=system, Boost (or at least some versions) installs
+    # itself in <prefix>/include/boost-<version>.  This inner loop helps to
+    # find headers in such directories.
+    #
+    # Any ${boost_dir}/boost-x_xx directories are searched in reverse version
+    # order followed by ${boost_dir}.  The final '.' is a sentinel for
+    # searching $boost_dir itself.  Entries are whitespace separated.
+    #
+    # I didn't indent this loop on purpose (to avoid over-indented code)
+    boost_layout_system_search_list=`cd "$boost_dir" 2>/dev/null \
+        && ls -1 | "${GREP}" '^boost-' | sort -rn -t- -k2 \
+        && echo .`
+    for boost_inc in $boost_layout_system_search_list
+    do
+      if test x"$boost_inc" != x.; then
+        boost_inc="$boost_dir/$boost_inc"
+      else
+        boost_inc="$boost_dir" # Uses sentinel in boost_layout_system_search_list
+      fi
+      if test x"$boost_inc" != x; then
+        # We are going to check whether the version of Boost installed
+        # in $boost_inc is usable by running a compilation that
+        # #includes it.  But if we pass a -I/some/path in which Boost
+        # is not installed, the compiler will just skip this -I and
+        # use other locations (either from CPPFLAGS, or from its list
+        # of system include directories).  As a result we would use
+        # header installed on the machine instead of the /some/path
+        # specified by the user.  So in that precise case (trying
+        # $boost_inc), make sure the version.hpp exists.
+        #
+        # Use test -e as there can be symlinks.
+        test -e "$boost_inc/boost/version.hpp" || continue
+        CPPFLAGS="$CPPFLAGS -I$boost_inc"
+      fi
+      AC_COMPILE_IFELSE([], [boost_cv_inc_path=yes], [boost_cv_version=no])
+      if test x"$boost_cv_inc_path" = xyes; then
+        if test x"$boost_inc" != x; then
+          boost_cv_inc_path=$boost_inc
+        fi
+        break 2
+      fi
+    done
+    done
+AC_LANG_POP([C++])dnl
+    ])
+    case $boost_cv_inc_path in #(
+      no)
+        boost_errmsg="cannot find Boost headers version >= $boost_version_req_string"
+        m4_if([$2], [],  [AC_MSG_ERROR([$boost_errmsg])],
+                        [AC_MSG_NOTICE([$boost_errmsg])])
+        $2
+        ;;#(
+      yes)
+        BOOST_CPPFLAGS=
+        ;;#(
+      *)
+        AC_SUBST([BOOST_CPPFLAGS], ["-I$boost_cv_inc_path"])dnl
+        ;;
+    esac
+  if test x"$boost_cv_inc_path" != xno; then
+  AC_DEFINE([HAVE_BOOST], [1],
+            [Defined if the requested minimum BOOST version is satisfied])
+  AC_CACHE_CHECK([for Boost's header version],
+    [boost_cv_lib_version],
+    [m4_pattern_allow([^BOOST_LIB_VERSION$])dnl
+     _BOOST_SED_CPP([/^boost-lib-version = /{s///;s/\"//g;p;q;}],
+                    [#include <boost/version.hpp>
+boost-lib-version = BOOST_LIB_VERSION],
+    [boost_cv_lib_version=`cat conftest.i`])])
+    # e.g. "134" for 1_34_1 or "135" for 1_35
+    boost_major_version=`echo "$boost_cv_lib_version" | sed 's/_//;s/_.*//'`
+    case $boost_major_version in #(
+      '' | *[[!0-9]]*)
+        AC_MSG_ERROR([invalid value: boost_major_version=$boost_major_version])
+        ;;
+    esac
+fi
+CPPFLAGS=$boost_save_CPPFLAGS
+])# BOOST_REQUIRE
+
+# BOOST_STATIC()
+# --------------
+# Add the "--enable-static-boost" configure argument.  Static versions of the
+# libraries are looked up only if it is given; --disable-static-boost keeps shared.
+AC_DEFUN([BOOST_STATIC],
+  [AC_ARG_ENABLE([static-boost],
+     [AS_HELP_STRING([--enable-static-boost],
+               [Prefer the static boost libraries over the shared ones @<:@no@:>@])],
+     [AS_IF([test x"$enableval" = xno], [enable_static_boost=no], [enable_static_boost=yes])],
+     [enable_static_boost=no])])# BOOST_STATIC
+
+# BOOST_FIND_HEADER([HEADER-NAME], [ACTION-IF-NOT-FOUND], [ACTION-IF-FOUND])
+# --------------------------------------------------------------------------
+# Wrapper around AC_CHECK_HEADER for Boost headers.  Useful to check for
+# some parts of the Boost library which are only made of headers and don't
+# require linking (such as Boost.Foreach).
+#
+# Default ACTION-IF-NOT-FOUND: Fail with a fatal error unless Boost couldn't be
+# found in the first place, in which case by default a notice is issued to the
+# user.  Presumably if we haven't died already it's because it's OK to not have
+# Boost, which is why only a notice is issued instead of a hard error.
+#
+# Default ACTION-IF-FOUND: define the preprocessor symbol HAVE_<HEADER-NAME> in
+# case of success (where HEADER-NAME is written LIKE_THIS, e.g.,
+# HAVE_BOOST_FOREACH_HPP).
+AC_DEFUN([BOOST_FIND_HEADER],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+if test x"$boost_cv_inc_path" = xno; then
+  m4_default([$2], [AC_MSG_NOTICE([Boost not available, not searching for $1])])
+else
+AC_LANG_PUSH([C++])dnl
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+AC_CHECK_HEADER([$1],
+  [m4_default([$3], [AC_DEFINE(AS_TR_CPP([HAVE_$1]), [1],
+                               [Define to 1 if you have <$1>])])],
+  [m4_default([$2], [AC_MSG_ERROR([cannot find $1])])])
+CPPFLAGS=$boost_save_CPPFLAGS
+AC_LANG_POP([C++])dnl
+fi
+])# BOOST_FIND_HEADER
+
+
+# BOOST_FIND_LIB([LIB-NAME], [PREFERRED-RT-OPT], [HEADER-NAME], [CXX-TEST],
+#                [CXX-PROLOGUE])
+# -------------------------------------------------------------------------
+# Look for the Boost library LIB-NAME (e.g., LIB-NAME = `thread', for
+# libboost_thread).  Check that HEADER-NAME works and check that
+# libboost_LIB-NAME can link with the code CXX-TEST.  The optional argument
+# CXX-PROLOGUE can be used to include some C++ code before the `main'
+# function.
+#
+# Invokes BOOST_FIND_HEADER([HEADER-NAME]) (see above).
+#
+# Boost libraries typically come compiled with several flavors (with different
+# runtime options) so PREFERRED-RT-OPT is the preferred suffix.  A suffix is one
+# or more of the following letters: sgdpn (in that order).  s = static
+# runtime, d = debug build, g = debug/diagnostic runtime, p = STLPort build,
+# n = (unsure) STLPort build without iostreams from STLPort (it looks like `n'
+# must always be used along with `p').  Additionally, PREFERRED-RT-OPT can
+# start with `mt-' to indicate that there is a preference for multi-thread
+# builds.  Some sample values for PREFERRED-RT-OPT: (nothing), mt, d, mt-d, gdp
+# ...  If you want to make sure you have a specific version of Boost
+# (eg, >= 1.33) you *must* invoke BOOST_REQUIRE before this macro.
+#
+# Results: on success the macro substitutes BOOST_<LIB-NAME>_LIBS,
+# BOOST_<LIB-NAME>_LDFLAGS and BOOST_<LIB-NAME>_LDPATH (LIB-NAME upper-cased)
+# as well as BOOST_LDPATH.  When the library is found through the linker's
+# default search path, the LDFLAGS and LDPATH values are left empty.  If no
+# working combination is found, configure aborts with a fatal error.  When
+# Boost itself is unavailable only a notice is printed.
+#
+# The search relies on the libext shell variable (normally set by Libtool)
+# and honours --enable-static-boost by prepending `s' to the runtime suffix.
+AC_DEFUN([BOOST_FIND_LIB],
+[AC_REQUIRE([BOOST_REQUIRE])dnl
+AC_REQUIRE([_BOOST_FIND_COMPILER_TAG])dnl
+AC_REQUIRE([BOOST_STATIC])dnl
+AC_REQUIRE([_BOOST_GUESS_WHETHER_TO_USE_MT])dnl
+if test x"$boost_cv_inc_path" = xno; then
+  AC_MSG_NOTICE([Boost not available, not searching for the Boost $1 library])
+else
+dnl The else branch is huge and wasn't indented on purpose.
+AC_LANG_PUSH([C++])dnl
+AS_VAR_PUSHDEF([Boost_lib], [boost_cv_lib_$1])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDFLAGS], [boost_cv_lib_$1_LDFLAGS])dnl
+AS_VAR_PUSHDEF([Boost_lib_LDPATH], [boost_cv_lib_$1_LDPATH])dnl
+AS_VAR_PUSHDEF([Boost_lib_LIBS], [boost_cv_lib_$1_LIBS])dnl
+BOOST_FIND_HEADER([$3])
+boost_save_CPPFLAGS=$CPPFLAGS
+CPPFLAGS="$CPPFLAGS $BOOST_CPPFLAGS"
+# Now let's try to find the library.  The algorithm is as follows: first look
+# for a given library name according to the user's PREFERRED-RT-OPT.  For each
+# library name, we prefer to use the ones that carry the tag (toolset name).
+# Each library is searched through the various standard paths where Boost is
+# usually installed.  If we can't find the standard variants, we try to
+# enforce -mt (for instance on MacOSX, libboost_threads.dylib doesn't exist
+# but there's -obviously- libboost_threads-mt.dylib).
+AC_CACHE_CHECK([for the Boost $1 library], [Boost_lib],
+  [Boost_lib=no
+  case "$2" in #(
+    mt | mt-) boost_mt=-mt; boost_rtopt=;; #(
+    mt* | mt-*) boost_mt=-mt; boost_rtopt=`expr "X$2" : 'Xmt-*\(.*\)'`;; #(
+    *) boost_mt=; boost_rtopt=$2;;
+  esac
+  # Quote the variable so that an unset value cannot break the test.
+  if test x"$enable_static_boost" = xyes; then
+    boost_rtopt="s$boost_rtopt"
+  fi
+  # Find the proper debug variant depending on what we've been asked to find.
+  # NOTE(review): boost_rt_d is computed here but the search loops below use
+  # the literal -d suffix instead -- confirm whether it should feed the loop.
+  case $boost_rtopt in #(
+    *d*) boost_rt_d=$boost_rtopt;; #(
+    *[[sgpn]]*) # Insert the `d' at the right place (in between `sg' and `pn')
+      boost_rt_d=`echo "$boost_rtopt" | sed 's/\(s*g*\)\(p*n*\)/\1d\2/'`;; #(
+    *) boost_rt_d='-d';;
+  esac
+  # If the PREFERRED-RT-OPT are not empty, prepend a `-'.
+  test -n "$boost_rtopt" && boost_rtopt="-$boost_rtopt"
+  $boost_guess_use_mt && boost_mt=-mt
+  # Look for the absolute path of the static archive.
+  # $libext is computed by Libtool but let's make sure it's non empty.
+  test -z "$libext" &&
+    AC_MSG_ERROR([the libext variable is empty, did you invoke Libtool?])
+  boost_save_ac_objext=$ac_objext
+  # Generate the test file.
+  AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <$3>
+$5], [$4])])
+dnl Optimization hacks: compiling C++ is slow, especially with Boost.  What
+dnl we're trying to do here is guess the right combination of link flags
+dnl (LIBS / LDFLAGS) to use a given library.  This can take several
+dnl iterations before it succeeds and is thus *very* slow.  So what we do
+dnl instead is that we compile the code first (and thus get an object file,
+dnl typically conftest.o).  Then we try various combinations of link flags
+dnl until we succeed to link conftest.o in an executable.  The problem is
+dnl that the various TRY_LINK / COMPILE_IFELSE macros of Autoconf always
+dnl remove all the temporary files including conftest.o.  So the trick here
+dnl is to temporarily change the value of ac_objext so that conftest.o is
+dnl preserved across tests.  This is obviously fragile and I will burn in
+dnl hell for not respecting Autoconf's documented interfaces, but in the
+dnl mean time, it optimizes the macro by a factor of 5 to 30.
+dnl Another small optimization: the first argument of AC_COMPILE_IFELSE left
+dnl empty because the test file is generated only once above (before we
+dnl start the for loops).
+  AC_COMPILE_IFELSE([],
+    [ac_objext=do_not_rm_me_plz],
+    [AC_MSG_ERROR([cannot compile a test that uses Boost $1])])
+  ac_objext=$boost_save_ac_objext
+  boost_failed_libs=
+# Don't bother to indent the 6 nested for loops, only the 2 innermost ones
+# matter.
+for boost_tag_ in -$boost_cv_lib_tag ''; do
+for boost_ver_ in -$boost_cv_lib_version ''; do
+for boost_mt_ in $boost_mt -mt ''; do
+for boost_rtopt_ in $boost_rtopt '' -d; do
+  for boost_lib in \
+    boost_$1$boost_tag_$boost_mt_$boost_rtopt_$boost_ver_ \
+    boost_$1$boost_tag_$boost_rtopt_$boost_ver_ \
+    boost_$1$boost_tag_$boost_mt_$boost_ver_ \
+    boost_$1$boost_tag_$boost_ver_
+  do
+    # Avoid testing twice the same lib
+    case $boost_failed_libs in #(
+      *@$boost_lib@*) continue;;
+    esac
+    # If with_boost is empty, we'll search in /lib first, which is not quite
+    # right so instead we'll try to a location based on where the headers are.
+    boost_tmp_lib=$with_boost
+    test x"$with_boost" = x && boost_tmp_lib=${boost_cv_inc_path%/include}
+    for boost_ldpath in "$boost_tmp_lib/lib" '' \
+             /opt/local/lib* /usr/local/lib* /opt/lib* /usr/lib* \
+             "$with_boost" C:/Boost/lib /lib*
+    do
+      # Don't waste time with directories that don't exist.  The empty entry
+      # stands for the linker's default search path and must not be skipped.
+      if test x"$boost_ldpath" != x && test ! -e "$boost_ldpath"; then
+        continue
+      fi
+      boost_save_LDFLAGS=$LDFLAGS
+      # Are we looking for a static library?
+      case $boost_ldpath:$boost_rtopt_ in #(
+        *?*:*s*) # Yes (Non empty boost_ldpath + s in rt opt)
+          Boost_lib_LIBS="$boost_ldpath/lib$boost_lib.$libext"
+          test -e "$Boost_lib_LIBS" || continue;; #(
+        *) # No: use -lboost_foo to find the shared library.
+          Boost_lib_LIBS="-l$boost_lib";;
+      esac
+      boost_save_LIBS=$LIBS
+      LIBS="$Boost_lib_LIBS $LIBS"
+      test x"$boost_ldpath" != x && LDFLAGS="$LDFLAGS -L$boost_ldpath"
+dnl First argument of AC_LINK_IFELSE left empty because the test file is
+dnl generated only once above (before we start the for loops).
+      _BOOST_AC_LINK_IFELSE([],
+                            [Boost_lib=yes], [Boost_lib=no])
+      ac_objext=$boost_save_ac_objext
+      LDFLAGS=$boost_save_LDFLAGS
+      LIBS=$boost_save_LIBS
+      if test x"$Boost_lib" = xyes; then
+        # Only emit the -L and -R flags when an explicit directory was needed;
+        # an empty directory would produce dangling flags.
+        Boost_lib_LDFLAGS=
+        test x"$boost_ldpath" != x &&
+          Boost_lib_LDFLAGS="-L$boost_ldpath -Wl,-R$boost_ldpath"
+        Boost_lib_LDPATH="$boost_ldpath"
+        break 6
+      else
+        boost_failed_libs="$boost_failed_libs@$boost_lib@"
+      fi
+    done
+  done
+done
+done
+done
+done
+rm -f conftest.$ac_objext
+])
+case $Boost_lib in #(
+  no) _AC_MSG_LOG_CONFTEST
+    AC_MSG_ERROR([cannot find the flags to link with Boost $1])
+    ;;
+esac
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDFLAGS]), [$Boost_lib_LDFLAGS])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LDPATH]), [$Boost_lib_LDPATH])dnl
+AC_SUBST([BOOST_LDPATH], [$Boost_lib_LDPATH])dnl
+AC_SUBST(AS_TR_CPP([BOOST_$1_LIBS]), [$Boost_lib_LIBS])dnl
+CPPFLAGS=$boost_save_CPPFLAGS
+AS_VAR_POPDEF([Boost_lib])dnl
+AS_VAR_POPDEF([Boost_lib_LDFLAGS])dnl
+AS_VAR_POPDEF([Boost_lib_LDPATH])dnl
+AS_VAR_POPDEF([Boost_lib_LIBS])dnl
+AC_LANG_POP([C++])dnl
+fi
+])# BOOST_FIND_LIB
+
+
+# --------------------------------------- #
+# Checks for the various Boost libraries. #
+# --------------------------------------- #
+
+# List of boost libraries: http://www.boost.org/libs/libraries.htm
+# The page http://beta.boost.org/doc/libs is useful: it gives the first release
+# version of each library (among other things).
+
+# BOOST_DEFUN(LIBRARY, CODE)
+# --------------------------
+# Define BOOST_<LIBRARY-UPPERCASE> as a macro that runs CODE.
+# While CODE is being expanded, the helper macro BOOST_Library is defined to
+# LIBRARY (as written by the caller); it is popped again afterwards.
+#
+# Use indir to avoid the warning on underquoted macro name given to AC_DEFUN.
+m4_define([BOOST_DEFUN],
+[m4_indir([AC_DEFUN],
+          m4_toupper([BOOST_$1]),
+[m4_pushdef([BOOST_Library], [$1])dnl
+$2
+m4_popdef([BOOST_Library])dnl
+])
+])
+
+# BOOST_ARRAY()
+# -------------
+# Look for Boost.Array
+# Header-only check; on success HAVE_BOOST_ARRAY_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Array],
+[BOOST_FIND_HEADER([boost/array.hpp])])
+
+
+# BOOST_ASIO()
+# ------------
+# Look for Boost.Asio (new in Boost 1.35).
+# BOOST_SYSTEM is required first (with an empty PREFERRED-RT-OPT), so the
+# Boost.System library lookup runs before the boost/asio.hpp header check.
+# On success HAVE_BOOST_ASIO_HPP is defined.
+BOOST_DEFUN([Asio],
+[AC_REQUIRE([BOOST_SYSTEM])dnl
+BOOST_FIND_HEADER([boost/asio.hpp])])
+
+
+# BOOST_BIND()
+# ------------
+# Look for Boost.Bind
+# Header-only check; on success HAVE_BOOST_BIND_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Bind],
+[BOOST_FIND_HEADER([boost/bind.hpp])])
+
+
+# BOOST_CONVERSION()
+# ------------------
+# Look for Boost.Conversion (cast / lexical_cast)
+# Header-only check of both headers; on success HAVE_BOOST_CAST_HPP and
+# HAVE_BOOST_LEXICAL_CAST_HPP are defined.
+BOOST_DEFUN([Conversion],
+[BOOST_FIND_HEADER([boost/cast.hpp])
+BOOST_FIND_HEADER([boost/lexical_cast.hpp])
+])# BOOST_CONVERSION
+
+
+# BOOST_DATE_TIME([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.Date_Time.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_DATE_TIME_LIBS, BOOST_DATE_TIME_LDFLAGS and
+# BOOST_DATE_TIME_LDPATH.
+BOOST_DEFUN([Date_Time],
+[BOOST_FIND_LIB([date_time], [$1],
+                [boost/date_time/posix_time/posix_time.hpp],
+                [boost::posix_time::ptime t;])
+])# BOOST_DATE_TIME
+
+
+# BOOST_FILESYSTEM([PREFERRED-RT-OPT])
+# ------------------------------------
+# Look for Boost.Filesystem.  For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIB above.
+# Do not check for boost/filesystem.hpp because this file was introduced in
+# 1.34.
+# For Boost >= 1.35 the Boost.System library is looked up first.
+# BOOST_SYSTEM_LIBS and BOOST_SYSTEM_LDFLAGS are added to LIBS and LDFLAGS for
+# the duration of the check and the original values are restored afterwards.
+# With --enable-static-boost (and Boost >= 1.35) BOOST_SYSTEM_LIBS is also
+# appended to the substituted BOOST_FILESYSTEM_LIBS.
+BOOST_DEFUN([Filesystem],
+[# Do we have to check for Boost.System?  This link-time dependency was
+# added as of 1.35.0.  If we have a version <1.35, we must not attempt to
+# find Boost.System as it didn't exist by then.
+if test $boost_major_version -ge 135; then
+BOOST_SYSTEM([$1])
+fi # end of the Boost.System check.
+boost_filesystem_save_LIBS=$LIBS
+boost_filesystem_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_SYSTEM_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS"
+BOOST_FIND_LIB([filesystem], [$1],
+                [boost/filesystem/path.hpp], [boost::filesystem::path p;])
+if test $enable_static_boost = yes && test $boost_major_version -ge 135; then
+    AC_SUBST([BOOST_FILESYSTEM_LIBS], ["$BOOST_FILESYSTEM_LIBS $BOOST_SYSTEM_LIBS"])
+fi
+LIBS=$boost_filesystem_save_LIBS
+LDFLAGS=$boost_filesystem_save_LDFLAGS
+])# BOOST_FILESYSTEM
+
+
+# BOOST_FOREACH()
+# ---------------
+# Look for Boost.Foreach
+# Header-only check; on success HAVE_BOOST_FOREACH_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Foreach],
+[BOOST_FIND_HEADER([boost/foreach.hpp])])
+
+
+# BOOST_FORMAT()
+# --------------
+# Look for Boost.Format
+# Note: we can't check for boost/format/format_fwd.hpp because the header isn't
+# standalone.  It can't be compiled because it triggers the following error:
+# boost/format/detail/config_macros.hpp:88: error: 'locale' in namespace 'std'
+#                                                  does not name a type
+# Header-only check; on success HAVE_BOOST_FORMAT_HPP is defined.
+BOOST_DEFUN([Format],
+[BOOST_FIND_HEADER([boost/format.hpp])])
+
+
+# BOOST_FUNCTION()
+# ----------------
+# Look for Boost.Function
+# Header-only check; on success HAVE_BOOST_FUNCTION_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Function],
+[BOOST_FIND_HEADER([boost/function.hpp])])
+
+
+# BOOST_GRAPH([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Graph.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_GRAPH_LIBS, BOOST_GRAPH_LDFLAGS and
+# BOOST_GRAPH_LDPATH.
+BOOST_DEFUN([Graph],
+[BOOST_FIND_LIB([graph], [$1],
+                [boost/graph/adjacency_list.hpp], [boost::adjacency_list<> g;])
+])# BOOST_GRAPH
+
+
+# BOOST_IOSTREAMS([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for Boost.IOStreams.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_IOSTREAMS_LIBS, BOOST_IOSTREAMS_LDFLAGS and
+# BOOST_IOSTREAMS_LDPATH.
+BOOST_DEFUN([IOStreams],
+[BOOST_FIND_LIB([iostreams], [$1],
+                [boost/iostreams/device/file_descriptor.hpp],
+                [boost::iostreams::file_descriptor fd; fd.close();])
+])# BOOST_IOSTREAMS
+
+
+# BOOST_HASH()
+# ------------
+# Look for Boost.Functional/Hash
+# Header-only check; on success HAVE_BOOST_FUNCTIONAL_HASH_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Hash],
+[BOOST_FIND_HEADER([boost/functional/hash.hpp])])
+
+
+# BOOST_LAMBDA()
+# --------------
+# Look for Boost.Lambda
+# Header-only check; on success HAVE_BOOST_LAMBDA_LAMBDA_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Lambda],
+[BOOST_FIND_HEADER([boost/lambda/lambda.hpp])])
+
+
+# BOOST_LOG([PREFERRED-RT-OPT])
+# -----------------------------
+# Look for Boost.Log.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_LOG_LIBS, BOOST_LOG_LDFLAGS and
+# BOOST_LOG_LDPATH.
+BOOST_DEFUN([Log],
+[BOOST_FIND_LIB([log], [$1],
+    [boost/log/core/core.hpp],
+    [boost::log::attribute a; a.get_value();])
+])# BOOST_LOG
+
+
+# BOOST_LOG_SETUP([PREFERRED-RT-OPT])
+# -----------------------------------
+# Look for the Boost.Log setup library (libboost_log_setup).  For the
+# documentation of PREFERRED-RT-OPT, see the documentation of BOOST_FIND_LIB
+# above.  BOOST_LOG is required first.
+# On success substitutes BOOST_LOG_SETUP_LIBS, BOOST_LOG_SETUP_LDFLAGS and
+# BOOST_LOG_SETUP_LDPATH.
+BOOST_DEFUN([Log_Setup],
+[AC_REQUIRE([BOOST_LOG])dnl
+BOOST_FIND_LIB([log_setup], [$1],
+    [boost/log/utility/init/from_settings.hpp],
+    [boost::log::basic_settings<char> bs; bs.empty();])
+])# BOOST_LOG_SETUP
+
+
+# BOOST_MATH()
+# ------------
+# Look for Boost.Math
+# TODO: This library isn't header-only but it comes in multiple different
+# flavors that don't play well with BOOST_FIND_LIB (e.g, libboost_math_c99,
+# libboost_math_c99f, libboost_math_c99l, libboost_math_tr1,
+# libboost_math_tr1f, libboost_math_tr1l).  This macro must be fixed to do the
+# right thing anyway.
+# For now only the header is checked, so no link flags are produced; on
+# success HAVE_BOOST_MATH_SPECIAL_FUNCTIONS_HPP is defined.
+BOOST_DEFUN([Math],
+[BOOST_FIND_HEADER([boost/math/special_functions.hpp])])
+
+
+# BOOST_MULTIARRAY()
+# ------------------
+# Look for Boost.MultiArray
+# Header-only check; on success HAVE_BOOST_MULTI_ARRAY_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([MultiArray],
+[BOOST_FIND_HEADER([boost/multi_array.hpp])])
+
+
+# BOOST_NUMERIC_CONVERSION()
+# --------------------------
+# Look for Boost.NumericConversion (policy-based numeric conversion)
+# Header-only check; on success HAVE_BOOST_NUMERIC_CONVERSION_CONVERTER_HPP is
+# defined (the BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Numeric_Conversion],
+[BOOST_FIND_HEADER([boost/numeric/conversion/converter.hpp])
+])# BOOST_NUMERIC_CONVERSION
+
+
+# BOOST_OPTIONAL()
+# ----------------
+# Look for Boost.Optional
+# Header-only check; on success HAVE_BOOST_OPTIONAL_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Optional],
+[BOOST_FIND_HEADER([boost/optional.hpp])])
+
+
+# BOOST_PREPROCESSOR()
+# --------------------
+# Look for Boost.Preprocessor
+# Header-only check; on success HAVE_BOOST_PREPROCESSOR_REPEAT_HPP is defined
+# (the BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Preprocessor],
+[BOOST_FIND_HEADER([boost/preprocessor/repeat.hpp])])
+
+
+# BOOST_UNORDERED()
+# -----------------
+# Look for Boost.Unordered
+# Header-only check; on success HAVE_BOOST_UNORDERED_MAP_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Unordered],
+[BOOST_FIND_HEADER([boost/unordered_map.hpp])])
+
+
+# BOOST_UUID()
+# ------------
+# Look for Boost.Uuid
+# Header-only check; on success HAVE_BOOST_UUID_UUID_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Uuid],
+[BOOST_FIND_HEADER([boost/uuid/uuid.hpp])])
+
+
+# BOOST_PROGRAM_OPTIONS([PREFERRED-RT-OPT])
+# -----------------------------------------
+# Look for Boost.Program_options.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_PROGRAM_OPTIONS_LIBS,
+# BOOST_PROGRAM_OPTIONS_LDFLAGS and BOOST_PROGRAM_OPTIONS_LDPATH.
+BOOST_DEFUN([Program_Options],
+[BOOST_FIND_LIB([program_options], [$1],
+                [boost/program_options.hpp],
+                [boost::program_options::options_description d("test");])
+])# BOOST_PROGRAM_OPTIONS
+
+
+
+# _BOOST_PYTHON_CONFIG(VARIABLE, FLAG)
+# ------------------------------------
+# Save VARIABLE, and define it via `python-config --FLAG`.
+# Substitute BOOST_PYTHON_VARIABLE.
+# The previous value is kept in boost_python_save_VARIABLE; the caller is
+# responsible for restoring it.  Errors from python-config are discarded, so
+# BOOST_PYTHON_VARIABLE is simply empty when the command fails or is missing.
+m4_define([_BOOST_PYTHON_CONFIG],
+[AC_SUBST([BOOST_PYTHON_$1],
+          [`python-config --$2 2>/dev/null`])dnl
+boost_python_save_$1=$$1
+$1="$$1 $BOOST_PYTHON_$1"])
+
+
+# BOOST_PYTHON([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.Python.  For the documentation of PREFERRED-RT-OPT,
+# see the documentation of BOOST_FIND_LIB above.
+# CPPFLAGS, LDFLAGS and LIBS are temporarily extended with the output of
+# python-config (substituted as BOOST_PYTHON_CPPFLAGS, BOOST_PYTHON_LDFLAGS
+# and BOOST_PYTHON_LIBS) and restored once the library check is done.
+BOOST_DEFUN([Python],
+[_BOOST_PYTHON_CONFIG([CPPFLAGS], [includes])
+_BOOST_PYTHON_CONFIG([LDFLAGS],   [ldflags])
+_BOOST_PYTHON_CONFIG([LIBS],      [libs])
+m4_pattern_allow([^BOOST_PYTHON_MODULE$])dnl
+BOOST_FIND_LIB([python], [$1],
+               [boost/python.hpp],
+               [], [BOOST_PYTHON_MODULE(empty) {}])
+CPPFLAGS=$boost_python_save_CPPFLAGS
+LDFLAGS=$boost_python_save_LDFLAGS
+LIBS=$boost_python_save_LIBS
+])# BOOST_PYTHON
+
+
+# BOOST_REF()
+# -----------
+# Look for Boost.Ref
+# Header-only check; on success HAVE_BOOST_REF_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Ref],
+[BOOST_FIND_HEADER([boost/ref.hpp])])
+
+
+# BOOST_REGEX([PREFERRED-RT-OPT])
+# -------------------------------
+# Look for Boost.Regex.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_REGEX_LIBS, BOOST_REGEX_LDFLAGS and
+# BOOST_REGEX_LDPATH.
+BOOST_DEFUN([Regex],
+[BOOST_FIND_LIB([regex], [$1],
+                [boost/regex.hpp],
+                [boost::regex exp("*"); boost::regex_match("foo", exp);])
+])# BOOST_REGEX
+
+
+# BOOST_SERIALIZATION([PREFERRED-RT-OPT])
+# ---------------------------------------
+# Look for Boost.Serialization.  For the documentation of PREFERRED-RT-OPT, see
+# the documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_SERIALIZATION_LIBS, BOOST_SERIALIZATION_LDFLAGS
+# and BOOST_SERIALIZATION_LDPATH.
+BOOST_DEFUN([Serialization],
+[BOOST_FIND_LIB([serialization], [$1],
+                [boost/archive/text_oarchive.hpp],
+                [std::ostream* o = 0; // Cheap way to get an ostream...
+                boost::archive::text_oarchive t(*o);])
+])# BOOST_SERIALIZATION
+
+
+# BOOST_SIGNALS([PREFERRED-RT-OPT])
+# ---------------------------------
+# Look for Boost.Signals.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# On success substitutes BOOST_SIGNALS_LIBS, BOOST_SIGNALS_LDFLAGS and
+# BOOST_SIGNALS_LDPATH.
+BOOST_DEFUN([Signals],
+[BOOST_FIND_LIB([signals], [$1],
+                [boost/signal.hpp],
+                [boost::signal<void ()> s;])
+])# BOOST_SIGNALS
+
+
+# BOOST_SMART_PTR()
+# -----------------
+# Look for Boost.SmartPtr
+# Header-only check of both headers; on success HAVE_BOOST_SCOPED_PTR_HPP and
+# HAVE_BOOST_SHARED_PTR_HPP are defined.
+BOOST_DEFUN([Smart_Ptr],
+[BOOST_FIND_HEADER([boost/scoped_ptr.hpp])
+BOOST_FIND_HEADER([boost/shared_ptr.hpp])
+])
+
+
+# BOOST_STATICASSERT()
+# --------------------
+# Look for Boost.StaticAssert
+# Header-only check; on success HAVE_BOOST_STATIC_ASSERT_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([StaticAssert],
+[BOOST_FIND_HEADER([boost/static_assert.hpp])])
+
+
+# BOOST_STRING_ALGO()
+# -------------------
+# Look for Boost.StringAlgo
+# Header-only check; on success HAVE_BOOST_ALGORITHM_STRING_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([String_Algo],
+[BOOST_FIND_HEADER([boost/algorithm/string.hpp])
+])
+
+
+# BOOST_SYSTEM([PREFERRED-RT-OPT])
+# --------------------------------
+# Look for Boost.System.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.  This library was introduced in Boost
+# 1.35.0.
+# On success substitutes BOOST_SYSTEM_LIBS, BOOST_SYSTEM_LDFLAGS and
+# BOOST_SYSTEM_LDPATH.
+BOOST_DEFUN([System],
+[BOOST_FIND_LIB([system], [$1],
+                [boost/system/error_code.hpp],
+                [boost::system::error_code e; e.clear();])
+])# BOOST_SYSTEM
+
+
+# BOOST_TEST([PREFERRED-RT-OPT])
+# ------------------------------
+# Look for Boost.Test.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# The library searched for is unit_test_framework, so the substituted
+# variables are BOOST_UNIT_TEST_FRAMEWORK_LIBS, BOOST_UNIT_TEST_FRAMEWORK_LDFLAGS
+# and BOOST_UNIT_TEST_FRAMEWORK_LDPATH.
+BOOST_DEFUN([Test],
+[m4_pattern_allow([^BOOST_CHECK$])dnl
+BOOST_FIND_LIB([unit_test_framework], [$1],
+               [boost/test/unit_test.hpp], [BOOST_CHECK(2 == 2);],
+               [using boost::unit_test::test_suite;
+               test_suite* init_unit_test_suite(int argc, char ** argv)
+               { return NULL; }])
+])# BOOST_TEST
+
+
+# BOOST_THREADS([PREFERRED-RT-OPT])
+# ---------------------------------
+# Look for Boost.Thread.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# Although the macro is called BOOST_THREADS, the library searched for is
+# `thread', so the results are named BOOST_THREAD_LIBS, BOOST_THREAD_LDFLAGS
+# and BOOST_THREAD_LDPATH.  The pthread flag found by _BOOST_PTHREAD_FLAG is
+# appended to both BOOST_THREAD_LIBS and BOOST_CPPFLAGS; LIBS and CPPFLAGS
+# themselves are restored to their original values.
+# FIXME: Provide an alias "BOOST_THREAD".
+BOOST_DEFUN([Threads],
+[dnl Having the pthread flag is required at least on GCC3 where
+dnl boost/thread.hpp would complain if we try to compile without
+dnl -pthread on GNU/Linux.
+AC_REQUIRE([_BOOST_PTHREAD_FLAG])dnl
+boost_threads_save_LIBS=$LIBS
+boost_threads_save_CPPFLAGS=$CPPFLAGS
+LIBS="$LIBS $boost_cv_pthread_flag"
+# Yes, we *need* to put the -pthread thing in CPPFLAGS because with GCC3,
+# boost/thread.hpp will trigger a #error if -pthread isn't used:
+#   boost/config/requires_threads.hpp:47:5: #error "Compiler threading support
+#   is not turned on. Please set the correct command line options for
+#   threading: -pthread (Linux), -pthreads (Solaris) or -mthreads (Mingw32)"
+CPPFLAGS="$CPPFLAGS $boost_cv_pthread_flag"
+BOOST_FIND_LIB([thread], [$1],
+               [boost/thread.hpp], [boost::thread t; boost::mutex m;])
+BOOST_THREAD_LIBS="$BOOST_THREAD_LIBS $boost_cv_pthread_flag"
+BOOST_CPPFLAGS="$BOOST_CPPFLAGS $boost_cv_pthread_flag"
+LIBS=$boost_threads_save_LIBS
+CPPFLAGS=$boost_threads_save_CPPFLAGS
+])# BOOST_THREADS
+
+
+# BOOST_TOKENIZER()
+# -----------------
+# Look for Boost.Tokenizer
+# Header-only check; on success HAVE_BOOST_TOKENIZER_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Tokenizer],
+[BOOST_FIND_HEADER([boost/tokenizer.hpp])])
+
+
+# BOOST_TRIBOOL()
+# ---------------
+# Look for Boost.Tribool
+# Header-only check of both headers; on success HAVE_BOOST_LOGIC_TRIBOOL_FWD_HPP
+# and HAVE_BOOST_LOGIC_TRIBOOL_HPP are defined.
+BOOST_DEFUN([Tribool],
+[BOOST_FIND_HEADER([boost/logic/tribool_fwd.hpp])
+BOOST_FIND_HEADER([boost/logic/tribool.hpp])
+])
+
+
+# BOOST_TUPLE()
+# -------------
+# Look for Boost.Tuple
+# Header-only check; on success HAVE_BOOST_TUPLE_TUPLE_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Tuple],
+[BOOST_FIND_HEADER([boost/tuple/tuple.hpp])])
+
+
+# BOOST_TYPETRAITS()
+# ------------------
+# Look for Boost.TypeTraits
+# Header-only check; on success HAVE_BOOST_TYPE_TRAITS_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([TypeTraits],
+[BOOST_FIND_HEADER([boost/type_traits.hpp])])
+
+
+# BOOST_UTILITY()
+# ---------------
+# Look for Boost.Utility (noncopyable, result_of, base-from-member idiom,
+# etc.)
+# Header-only check; on success HAVE_BOOST_UTILITY_HPP is defined (the
+# BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Utility],
+[BOOST_FIND_HEADER([boost/utility.hpp])])
+
+
+# BOOST_VARIANT()
+# ---------------
+# Look for Boost.Variant.
+# Header-only check of both headers; on success
+# HAVE_BOOST_VARIANT_VARIANT_FWD_HPP and HAVE_BOOST_VARIANT_HPP are defined.
+BOOST_DEFUN([Variant],
+[BOOST_FIND_HEADER([boost/variant/variant_fwd.hpp])
+BOOST_FIND_HEADER([boost/variant.hpp])])
+
+
+# BOOST_WAVE([PREFERRED-RT-OPT])
+# ------------------------------
+# NOTE: If you intend to use Wave/Spirit with thread support, make sure you
+# call BOOST_THREADS first.
+# Look for Boost.Wave.  For the documentation of PREFERRED-RT-OPT, see the
+# documentation of BOOST_FIND_LIB above.
+# BOOST_FILESYSTEM and BOOST_DATE_TIME are required first.  Their link flags,
+# together with the Boost.System and Boost.Thread ones (empty when those
+# checks were not run), are added to LIBS and LDFLAGS for the duration of the
+# check and the original values are restored afterwards.
+# On success substitutes BOOST_WAVE_LIBS, BOOST_WAVE_LDFLAGS and
+# BOOST_WAVE_LDPATH.
+BOOST_DEFUN([Wave],
+[AC_REQUIRE([BOOST_FILESYSTEM])dnl
+AC_REQUIRE([BOOST_DATE_TIME])dnl
+boost_wave_save_LIBS=$LIBS
+boost_wave_save_LDFLAGS=$LDFLAGS
+m4_pattern_allow([^BOOST_((FILE)?SYSTEM|DATE_TIME|THREAD)_(LIBS|LDFLAGS)$])dnl
+LIBS="$LIBS $BOOST_SYSTEM_LIBS $BOOST_FILESYSTEM_LIBS $BOOST_DATE_TIME_LIBS\
+$BOOST_THREAD_LIBS"
+LDFLAGS="$LDFLAGS $BOOST_SYSTEM_LDFLAGS $BOOST_FILESYSTEM_LDFLAGS\
+$BOOST_DATE_TIME_LDFLAGS $BOOST_THREAD_LDFLAGS"
+BOOST_FIND_LIB([wave], [$1],
+                [boost/wave.hpp],
+                [boost::wave::token_id id; get_token_name(id);])
+LIBS=$boost_wave_save_LIBS
+LDFLAGS=$boost_wave_save_LDFLAGS
+])# BOOST_WAVE
+
+
+# BOOST_XPRESSIVE()
+# -----------------
+# Look for Boost.Xpressive (new since 1.36.0).
+# Header-only check; on success HAVE_BOOST_XPRESSIVE_XPRESSIVE_HPP is defined
+# (the BOOST_FIND_HEADER default action).
+BOOST_DEFUN([Xpressive],
+[BOOST_FIND_HEADER([boost/xpressive/xpressive.hpp])])
+
+
+# ----------------- #
+# Internal helpers. #
+# ----------------- #
+
+
+# _BOOST_PTHREAD_FLAG()
+# ---------------------
+# Internal helper for BOOST_THREADS.  Based on ACX_PTHREAD:
+# http://autoconf-archive.cryp.to/acx_pthread.html
+#
+# Links a small pthread program (as C++) with each candidate flag appended to
+# LIBS and caches the first flag that links without the flag being mentioned
+# in the compiler's diagnostics.  The result is stored in
+# boost_cv_pthread_flag; it stays empty both when no extra flag is needed and
+# when none of the candidates worked.
+# NOTE(review): the empty flag is always tried first, including on Solaris,
+# which looks at odds with the Solaris comment below about libc stubs --
+# confirm whether that ordering is intended.
+AC_DEFUN([_BOOST_PTHREAD_FLAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_LANG_PUSH([C++])dnl
+AC_CACHE_CHECK([for the flags needed to use pthreads], [boost_cv_pthread_flag],
+[ boost_cv_pthread_flag=
+  # The ordering *is* (sometimes) important.  Some notes on the
+  # individual items follow:
+  # (none): in case threads are in libc; should be tried before -Kthread and
+  #       other compiler flags to prevent continual compiler warnings
+  # -lpthreads: AIX (must check this before -lpthread)
+  # -Kthread: Sequent (threads in libc, but -Kthread needed for pthread.h)
+  # -kthread: FreeBSD kernel threads (preferred to -pthread since SMP-able)
+  # -llthread: LinuxThreads port on FreeBSD (also preferred to -pthread)
+  # -pthread: GNU Linux/GCC (kernel threads), BSD/GCC (userland threads)
+  # -pthreads: Solaris/GCC
+  # -mthreads: MinGW32/GCC, Lynx/GCC
+  # -mt: Sun Workshop C (may only link SunOS threads [-lthread], but it
+  #      doesn't hurt to check since this sometimes defines pthreads too;
+  #      also defines -D_REENTRANT)
+  #      ... -mt is also the pthreads flag for HP/aCC
+  # -lpthread: GNU Linux, etc.
+  # --thread-safe: KAI C++
+  case $host_os in #(
+    *solaris*)
+      # On Solaris (at least, for some versions), libc contains stubbed
+      # (non-functional) versions of the pthreads routines, so link-based
+      # tests will erroneously succeed.  (We need to link with -pthreads/-mt/
+      # -lpthread.)  (The stubs are missing pthread_cleanup_push, or rather
+      # a function called by this macro, so we could check for that, but
+      # who knows whether they'll stub that too in a future libc.)  So,
+      # we'll just look for -pthreads and -lpthread first:
+      boost_pthread_flags="-pthreads -lpthread -mt -pthread";; #(
+    *)
+      boost_pthread_flags="-lpthreads -Kthread -kthread -llthread -pthread \
+                           -pthreads -mthreads -lpthread --thread-safe -mt";;
+  esac
+  # Generate the test file.
+  AC_LANG_CONFTEST([AC_LANG_PROGRAM([#include <pthread.h>],
+    [pthread_t th; pthread_join(th, 0);
+    pthread_attr_init(0); pthread_cleanup_push(0, 0);
+    pthread_create(0,0,0,0); pthread_cleanup_pop(0);])])
+  for boost_pthread_flag in '' $boost_pthread_flags; do
+    boost_pthread_ok=false
+dnl Re-use the test file already generated.
+    boost_pthreads__save_LIBS=$LIBS
+    LIBS="$LIBS $boost_pthread_flag"
+    AC_LINK_IFELSE([],
+      [if grep ".*$boost_pthread_flag" conftest.err; then
+         echo "This flag seems to have triggered warnings" >&AS_MESSAGE_LOG_FD
+       else
+         boost_pthread_ok=:; boost_cv_pthread_flag=$boost_pthread_flag
+       fi])
+    LIBS=$boost_pthreads__save_LIBS
+    $boost_pthread_ok && break
+  done
+])
+AC_LANG_POP([C++])dnl
+])# _BOOST_PTHREAD_FLAG
+
+
+# _BOOST_gcc_test(MAJOR, MINOR)
+# -----------------------------
+# Internal helper for _BOOST_FIND_COMPILER_TAG.
+# Expands to one double-quoted shell word of the form
+# "PREPROCESSOR-CONDITION @ gccMAJORMINOR": the condition matches GCC
+# MAJOR.MINOR (excluding Intel's ICC) and the text after the `@' is the
+# toolset tag to use when the condition holds.
+m4_define([_BOOST_gcc_test],
+["defined __GNUC__ && __GNUC__ == $1 && __GNUC_MINOR__ == $2 && !defined __ICC @ gcc$1$2"])dnl
+
+
+# _BOOST_FIND_COMPILER_TAG()
+# --------------------------
+# Internal.  When Boost is installed without --layout=system, each library
+# filename will hold a suffix that encodes the compiler used during the
+# build.  The Boost build system seems to call this a `tag'.
+#
+# The result is cached in boost_cv_lib_tag.  For GCC it holds two
+# space-separated candidates (the versioned tag and a plain gcc tag); it is
+# left empty, with a warning, when no known compiler matched.  Nothing is
+# probed when Boost's headers were not found.
+# NOTE(review): the GCC list below stops at 4.6, so newer GCC releases
+# presumably end up with the empty tag -- confirm.
+AC_DEFUN([_BOOST_FIND_COMPILER_TAG],
+[AC_REQUIRE([AC_PROG_CXX])dnl
+AC_REQUIRE([AC_CANONICAL_HOST])dnl
+AC_CACHE_CHECK([for the toolset name used by Boost for $CXX], [boost_cv_lib_tag],
+[boost_cv_lib_tag=unknown
+if test x$boost_cv_inc_path != xno; then
+  AC_LANG_PUSH([C++])dnl
+  # The following tests are mostly inspired by boost/config/auto_link.hpp
+  # The list is sorted from the most recent/common to the oldest compiler (in
+  # order to increase the likelihood of finding the right compiler with the
+  # least number of compilation attempts).
+  # Beware that some tests are sensitive to the order (for instance, we must
+  # look for MinGW before looking for GCC3).
+  # I used one compilation test per compiler with a #error to recognize
+  # each compiler so that it works even when cross-compiling (let me know
+  # if you know a better approach).
+  # Known missing tags (known from Boost's tools/build/v2/tools/common.jam):
+  #   como, edg, kcc, bck, mp, sw, tru, xlc
+  # I'm not sure about my test for `il' (be careful: Intel's ICC pre-defines
+  # the same defines as GCC's).
+  for i in \
+    _BOOST_gcc_test(4, 6) \
+    _BOOST_gcc_test(4, 5) \
+    _BOOST_gcc_test(4, 4) \
+    _BOOST_gcc_test(4, 3) \
+    _BOOST_gcc_test(4, 2) \
+    _BOOST_gcc_test(4, 1) \
+    _BOOST_gcc_test(4, 0) \
+    "defined __GNUC__ && __GNUC__ == 3 && !defined __ICC \
+     && (defined WIN32 || defined WINNT || defined _WIN32 || defined __WIN32 \
+         || defined __WIN32__ || defined __WINNT || defined __WINNT__) @ mgw" \
+    _BOOST_gcc_test(3, 4) \
+    _BOOST_gcc_test(3, 3) \
+    "defined _MSC_VER && _MSC_VER >= 1500 @ vc90" \
+    "defined _MSC_VER && _MSC_VER == 1400 @ vc80" \
+    _BOOST_gcc_test(3, 2) \
+    "defined _MSC_VER && _MSC_VER == 1310 @ vc71" \
+    _BOOST_gcc_test(3, 1) \
+    _BOOST_gcc_test(3, 0) \
+    "defined __BORLANDC__ @ bcb" \
+    "defined __ICC && (defined __unix || defined __unix__) @ il" \
+    "defined __ICL @ iw" \
+    "defined _MSC_VER && _MSC_VER == 1300 @ vc7" \
+    _BOOST_gcc_test(2, 95) \
+    "defined __MWERKS__ && __MWERKS__ <= 0x32FF @ cw9" \
+    "defined _MSC_VER && _MSC_VER < 1300 && !defined UNDER_CE @ vc6" \
+    "defined _MSC_VER && _MSC_VER < 1300 && defined UNDER_CE @ evc4" \
+    "defined __MWERKS__ && __MWERKS__ <= 0x31FF @ cw8"
+  do
+    boost_tag_test=`expr "X$i" : 'X\([[^@]]*\) @ '`
+    boost_tag=`expr "X$i" : 'X[[^@]]* @ \(.*\)'`
+    AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if $boost_tag_test
+/* OK */
+#else
+# error $boost_tag_test
+#endif
+]])], [boost_cv_lib_tag=$boost_tag; break], [])
+  done
+AC_LANG_POP([C++])dnl
+  case $boost_cv_lib_tag in #(
+    # Some newer (>= 1.35?) versions of Boost seem to only use "gcc" as opposed
+    # to "gcc41" for instance.
+    *-gcc | *'-gcc ') :;; #(  Don't re-add -gcc: it's already in there.
+    gcc*)
+      boost_tag_x=
+      case $host_os in #(
+        darwin*)
+          if test $boost_major_version -ge 136; then
+            # The `x' added in r46793 of Boost.
+            boost_tag_x=x
+          fi;;
+      esac
+      # We can specify multiple tags in this variable because it's used by
+      # BOOST_FIND_LIB that does a `for tag in -$boost_cv_lib_tag' ...
+      boost_cv_lib_tag="$boost_tag_x$boost_cv_lib_tag -${boost_tag_x}gcc"
+      ;; #(
+    unknown)
+      AC_MSG_WARN([[could not figure out which toolset name to use for $CXX]])
+      boost_cv_lib_tag=
+      ;;
+  esac
+fi])dnl end of AC_CACHE_CHECK
+])# _BOOST_FIND_COMPILER_TAG
+
+
+# _BOOST_GUESS_WHETHER_TO_USE_MT()
+# --------------------------------
+# Compile a small test to try to guess whether we should favor MT (Multi
+# Thread) flavors of Boost.  Sets boost_guess_use_mt accordingly: to `:' when
+# the compiler pre-defines one of _REENTRANT, _MT or __MT__ and to `false'
+# otherwise, so the variable can be run directly as a shell command.
+#
+# The test is compiled as C++.  This macro is pulled in through AC_REQUIRE,
+# so it is expanded outside of any language selection made by its callers
+# and has to select the language itself.
+AC_DEFUN([_BOOST_GUESS_WHETHER_TO_USE_MT],
+[# Check whether we do better use `mt' even though we weren't asked to.
+AC_REQUIRE([AC_PROG_CXX])dnl
+AC_LANG_PUSH([C++])dnl
+AC_COMPILE_IFELSE([AC_LANG_PROGRAM([[
+#if defined _REENTRANT || defined _MT || defined __MT__
+/* use -mt */
+#else
+# error MT not needed
+#endif
+]])], [boost_guess_use_mt=:], [boost_guess_use_mt=false])
+AC_LANG_POP([C++])dnl
+])# _BOOST_GUESS_WHETHER_TO_USE_MT
+
+# _BOOST_AC_LINK_IFELSE(PROGRAM, [ACTION-IF-TRUE], [ACTION-IF-FALSE])
+# -------------------------------------------------------------------
+# Fork of _AC_LINK_IFELSE that preserves conftest.o across calls.  Fragile,
+# will break when Autoconf changes its internals.  Requires that you manually
+# rm -f conftest.$ac_objext in between two really different tests, otherwise
+# you will try to link a conftest.o left behind by a previous test.
+# Used to aggressively optimize BOOST_FIND_LIB (see the big comment in this
+# macro).
+#
+# The macro restores ac_objext from the shell variable boost_save_ac_objext,
+# which the caller must have set beforehand.  When PROGRAM is empty no new
+# source file is generated and the source file is not deleted afterwards.
+#
+# Don't use "break" in the actions, as it would short-circuit some code
+# this macro runs after the actions.
+m4_define([_BOOST_AC_LINK_IFELSE],
+[m4_ifvaln([$1], [AC_LANG_CONFTEST([$1])])dnl
+rm -f conftest$ac_exeext
+boost_save_ac_ext=$ac_ext
+boost_use_source=:
+# If we already have a .o, re-use it.  We change $ac_ext so that $ac_link
+# tries to link the existing object file instead of compiling from source.
+test -f conftest.$ac_objext && ac_ext=$ac_objext && boost_use_source=false &&
+  _AS_ECHO_LOG([re-using the existing conftest.$ac_objext])
+AS_IF([_AC_DO_STDERR($ac_link) && {
+	 test -z "$ac_[]_AC_LANG_ABBREV[]_werror_flag" ||
+	 test ! -s conftest.err
+       } && test -s conftest$ac_exeext && {
+	 test "$cross_compiling" = yes ||
+	 $as_executable_p conftest$ac_exeext
+dnl FIXME: use AS_TEST_X instead when 2.61 is widespread enough.
+       }],
+      [$2],
+      [if $boost_use_source; then
+         _AC_MSG_LOG_CONFTEST
+       fi
+       $3])
+ac_objext=$boost_save_ac_objext
+ac_ext=$boost_save_ac_ext
+dnl Delete also the IPA/IPO (Inter Procedural Analysis/Optimization)
+dnl information created by the PGI compiler (conftest_ipa8_conftest.oo),
+dnl as it would interfere with the next link command.
+rm -f core conftest.err conftest_ipa8_conftest.oo \
+      conftest$ac_exeext m4_ifval([$1], [conftest.$ac_ext])[]dnl
+])# _BOOST_AC_LINK_IFELSE
+
+# Local Variables:
+# mode: autoconf
+# End:
diff --git a/scripts/training/phrase-extract/extract-ghkm/Alignment.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/Alignment.cpp
index eb67fe8fd..fcd5e14e1 100644
--- a/scripts/training/phrase-extract/extract-ghkm/Alignment.cpp
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Alignment.cpp
@@ -1,21 +1,21 @@
 /***********************************************************************
-  Moses - factored phrase-based language decoder
-  Copyright (C) 2010 University of Edinburgh
-
-  This library is free software; you can redistribute it and/or
-  modify it under the terms of the GNU Lesser General Public
-  License as published by the Free Software Foundation; either
-  version 2.1 of the License, or (at your option) any later version.
-
-  This library is distributed in the hope that it will be useful,
-  but WITHOUT ANY WARRANTY; without even the implied warranty of
-  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-  Lesser General Public License for more details.
-
-  You should have received a copy of the GNU Lesser General Public
-  License along with this library; if not, write to the Free Software
-  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
- ***********************************************************************/
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
 
 #include "Alignment.h"
 
@@ -24,8 +24,10 @@
 #include <cassert>
 #include <cstdlib>
 
-Alignment
-readAlignment(const std::string & s)
+namespace Moses {
+namespace GHKM {
+
+Alignment ReadAlignment(const std::string &s)
 {
   Alignment a;
 
@@ -67,3 +69,6 @@ readAlignment(const std::string & s)
 
   return a;
 }
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Alignment.h b/scripts/training/phrase-extract/extract-ghkm/tools/Alignment.h
new file mode 100644
index 000000000..bc42191e1
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Alignment.h
@@ -0,0 +1,38 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_ALIGNMENT_H_
+#define EXTRACT_GHKM_ALIGNMENT_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+typedef std::vector<std::pair<int, int> > Alignment;  // (source, target) pairs
+
+Alignment ReadAlignment(const std::string &);  // defined in Alignment.cpp
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/AlignmentGraph.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/AlignmentGraph.cpp
new file mode 100644
index 000000000..c1d132648
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/AlignmentGraph.cpp
@@ -0,0 +1,382 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "AlignmentGraph.h"
+
+#include "ComposedRule.h"
+#include "Node.h"
+#include "Options.h"
+#include "ParseTree.h"
+#include "Subgraph.h"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <stack>
+
+namespace Moses {
+namespace GHKM {
+
+AlignmentGraph::AlignmentGraph(const ParseTree *t,
+                               const std::vector<std::string> &s,
+                               const Alignment &a)
+{
+  // Copy the parse tree t; CopyParseTree records the copies in m_targetNodes.
+  m_root = CopyParseTree(t);
+
+  // Create a node for each source word.
+  m_sourceNodes.reserve(s.size());
+  for (std::vector<std::string>::const_iterator p(s.begin());
+       p != s.end(); ++p) {
+    m_sourceNodes.push_back(new Node(*p, SOURCE));
+  }
+
+  // Connect source nodes to parse tree leaves according to the alignment a;
+  // at() throws std::out_of_range for an index outside either sequence.
+  std::vector<Node *> targetTreeLeaves;
+  GetTargetTreeLeaves(m_root, targetTreeLeaves);
+  for (Alignment::const_iterator p(a.begin()); p != a.end(); ++p) {
+    Node *src = m_sourceNodes.at(p->first);
+    Node *tgt = targetTreeLeaves.at(p->second);
+    src->AddParent(tgt);
+    tgt->AddChild(src);
+  }
+
+  // Attach unaligned source words (if any).
+  AttachUnalignedSourceWords();
+
+  // Populate node spans.
+  std::vector<Node *>::const_iterator p(m_sourceNodes.begin());
+  for (int i = 0; p != m_sourceNodes.end(); ++p, ++i) {
+    (*p)->PropagateIndex(i);
+  }
+
+  // Calculate complement spans.
+  CalcComplementSpans(m_root);
+}
+
+AlignmentGraph::~AlignmentGraph()
+{
+  // Delete the nodes created for the source words, then the tree copies.
+  typedef std::vector<Node *>::size_type Index;
+  for (Index i = 0; i < m_sourceNodes.size(); ++i) {
+    delete m_sourceNodes[i];
+  }
+  for (Index i = 0; i < m_targetNodes.size(); ++i) {
+    delete m_targetNodes[i];
+  }
+}
+
+Subgraph AlignmentGraph::ComputeMinimalFrontierGraphFragment(
+    Node *root,
+    const std::set<Node *> &frontierSet)
+{
+  // Stack-driven walk from root.  Each node reached either becomes a leaf of
+  // the fragment or is expanded, i.e. its own children are examined in turn.
+  std::stack<Node *> pending;
+  std::set<const Node *> fragmentLeaves;
+
+  if (root->IsSink()) {
+    fragmentLeaves.insert(root);
+  } else {
+    pending.push(root);
+  }
+
+  typedef std::vector<Node *>::const_iterator ChildIter;
+  while (!pending.empty()) {
+    Node *current = pending.top();
+    pending.pop();
+
+    const std::vector<Node *> &kids = current->GetChildren();
+    for (ChildIter it = kids.begin(); it != kids.end(); ++it) {
+      Node *kid = *it;
+
+      // Sinks are always leaves.  So are frontier nodes, except TARGET ones,
+      // which are expanded because their source words are still needed.
+      const bool isLeaf = kid->IsSink() ||
+          (frontierSet.find(kid) != frontierSet.end() &&
+           kid->GetType() != TARGET);
+      if (isLeaf) {
+        fragmentLeaves.insert(kid);
+      } else {
+        pending.push(kid);
+      }
+    }
+  }
+
+  return Subgraph(root, fragmentLeaves);
+}
+
+void AlignmentGraph::ExtractMinimalRules(const Options &options)
+{
+  // Gives each frontier node at most one minimal rule (a new Subgraph passed
+  // to Node::AddRule).  First determine which nodes are frontier nodes.
+  std::set<Node *> frontierSet;
+  ComputeFrontierSet(m_root, options, frontierSet);
+
+  // Form the minimal frontier graph fragment rooted at each frontier node and
+  // hand it straight to that node; no separate list of fragments is needed.
+  for (std::set<Node *>::iterator p(frontierSet.begin());
+       p != frontierSet.end(); ++p) {
+    Node *root = *p;
+    Subgraph fragment = ComputeMinimalFrontierGraphFragment(root, frontierSet);
+    assert(!fragment.IsTrivial());
+    // Can it form an SCFG rule?
+    // FIXME Does this exclude non-lexical unary rules?
+    if (root->GetType() == TREE && !root->GetSpan().empty()) {
+      root->AddRule(new Subgraph(fragment));
+    }
+  }
+}
+
+void AlignmentGraph::ExtractComposedRules(const Options &options)
+{
+  ExtractComposedRules(m_root, options);  // recursive overload, from the root
+}
+
+void AlignmentGraph::ExtractComposedRules(Node *node, const Options &options)
+{
+  // Adds the composed rules rooted at node, after handling all its children.
+  const std::vector<Node *> &children = node->GetChildren();
+  for (std::vector<Node *>::const_iterator p(children.begin());
+       p != children.end(); ++p) {
+    ExtractComposedRules(*p, options);
+  }
+
+  // If there is no minimal rule for this node then there are no composed
+  // rules.
+  const std::vector<const Subgraph*> &rules = node->GetRules();
+  assert(rules.size() <= 1);
+  if (rules.empty()) {
+    return;
+  }
+
+  // Construct an initial composition candidate from the minimal rule.
+  ComposedRule initial(*(rules[0]));
+  if (!initial.GetOpenAttachmentPoint()) {
+    // No composition possible.
+    return;
+  }
+
+  std::queue<ComposedRule> queue;
+  queue.push(initial);
+  while (!queue.empty()) {
+    ComposedRule cr = queue.front();
+    queue.pop();
+    const Node *attachmentPoint = cr.GetOpenAttachmentPoint();
+    assert(attachmentPoint);
+    assert(attachmentPoint != node);
+    // Create all possible rules by composing this node's minimal rule with the
+    // existing rules (both minimal and composed) rooted at the first open
+    // attachment point.
+    const std::vector<const Subgraph*> &subRules = attachmentPoint->GetRules();
+    for (std::vector<const Subgraph*>::const_iterator p = subRules.begin();
+         p != subRules.end(); ++p) {
+      assert((*p)->GetRoot()->GetType() == TREE);
+      // auto_ptr frees the candidate even if an operation below throws.
+      std::auto_ptr<ComposedRule> cr2(cr.AttemptComposition(**p, options));
+      if (cr2.get()) {
+        node->AddRule(new Subgraph(cr2->CreateSubgraph()));
+        if (cr2->GetOpenAttachmentPoint()) {
+          queue.push(*cr2);
+        }
+      }
+    }
+    // Done with this attachment point.  Advance to the next, if any.
+    cr.CloseAttachmentPoint();
+    if (cr.GetOpenAttachmentPoint()) {
+      queue.push(cr);
+    }
+  }
+}
+
+Node *AlignmentGraph::CopyParseTree(const ParseTree *root)
+{
+  // Parse tree leaves are copied as TARGET nodes, everything else as TREE.
+  const NodeType kind = root->IsLeaf() ? TARGET : TREE;
+  std::auto_ptr<Node> copy(new Node(root->GetLabel(), kind));
+  // Copy each subtree recursively and link it to the new node.
+  typedef std::vector<ParseTree *> Subtrees;
+  const Subtrees &subtrees = root->GetChildren();
+  std::vector<Node *> copies;
+  copies.reserve(subtrees.size());
+  for (Subtrees::const_iterator q = subtrees.begin();
+       q != subtrees.end(); ++q) {
+    copies.push_back(CopyParseTree(*q));
+    copies.back()->AddParent(copy.get());
+  }
+  copy->SetChildren(copies);
+
+  Node *result = copy.release();
+  m_targetNodes.push_back(result);  // recorded after all of its descendants
+  return result;
+}
+
+// Finds the set of frontier nodes.  The definition of a frontier node differs
+// from Galley et al's (2004) in the following ways:
+//
+// 1. A node with an empty span is not a frontier node (this excludes
+//    unaligned target subtrees).
+// 2. Target word nodes are not frontier nodes.
+// 3. Source word nodes are not frontier nodes.
+// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
+//    it has the same span as its parent.
+void AlignmentGraph::ComputeFrontierSet(Node *root,
+                                        const Options &options,
+                                        std::set<Node *> &frontierSet) const
+{
+  // Word nodes and unaligned target subtrees are skipped, descendants too.
+  if (root->GetType() != TREE || root->GetSpan().empty()) {
+    return;
+  }
+
+  if (!SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()))) {
+    // Unless unary rules are explicitly allowed, we use Chung et al's (2011)
+    // modified definition of a frontier node to eliminate the production of
+    // non-lexical unary rules: the node must not repeat its parent's span.
+    const std::vector<Node *> &parents = root->GetParents();
+    assert(parents.size() <= 1);
+    const bool differsFromParent =
+        parents.empty() || parents[0]->GetSpan() != root->GetSpan();
+    if (options.allowUnary || differsFromParent) {
+      frontierSet.insert(root);
+    }
+  }
+
+  const std::vector<Node *> &children = root->GetChildren();
+  for (std::vector<Node *>::size_type i = 0; i < children.size(); ++i) {
+    ComputeFrontierSet(children[i], options, frontierSet);
+  }
+}
+
+void AlignmentGraph::CalcComplementSpans(Node *root)
+{
+  // Complement span = union of the parents' complement spans and the spans
+  // of all other children of those parents.
+  Span complement;
+  std::set<Node *> parentsChildren;
+
+  typedef std::vector<Node *>::const_iterator NodeIter;
+  const std::vector<Node *> &parents = root->GetParents();
+  for (NodeIter it = parents.begin(); it != parents.end(); ++it) {
+    const Span &inherited = (*it)->GetComplementSpan();
+    complement.insert(inherited.begin(), inherited.end());
+    const std::vector<Node *> &kids = (*it)->GetChildren();
+    parentsChildren.insert(kids.begin(), kids.end());
+  }
+
+  for (std::set<Node *>::iterator it = parentsChildren.begin();
+       it != parentsChildren.end(); ++it) {
+    if (*it != root) {
+      const Span &siblingSpan = (*it)->GetSpan();
+      complement.insert(siblingSpan.begin(), siblingSpan.end());
+    }
+  }
+
+  root->SetComplementSpan(complement);
+  // Continue downwards; a node's value depends on its parents' values.
+  const std::vector<Node *> &children = root->GetChildren();
+  for (NodeIter it = children.begin(); it != children.end(); ++it) {
+    CalcComplementSpans(*it);
+  }
+}
+
+void AlignmentGraph::GetTargetTreeLeaves(Node *root,
+                                         std::vector<Node *> &leaves)
+{
+  // Appends every sink reachable from root to leaves, following child order.
+  if (root->IsSink()) {
+    leaves.push_back(root);
+    return;
+  }
+  const std::vector<Node *> &kids = root->GetChildren();
+  for (std::vector<Node *>::size_type i = 0; i < kids.size(); ++i) {
+    GetTargetTreeLeaves(kids[i], leaves);
+  }
+}
+
+void AlignmentGraph::AttachUnalignedSourceWords()
+{
+  // Pass 1: record the indices of the source words that have no parent.
+  std::vector<int> unalignedIndices;
+  std::vector<Node *>::const_iterator src(m_sourceNodes.begin());
+  for (int i = 0; src != m_sourceNodes.end(); ++src, ++i) {
+    if ((*src)->GetParents().empty()) {
+      unalignedIndices.push_back(i);
+    }
+  }
+
+  // Pass 2: link each one to its attachment point, lowest index first.
+  // NOTE(review): a word attached here has a parent afterwards, so it counts
+  // as a neighbour when later words are placed -- confirm this is intended.
+  for (std::vector<int>::const_iterator p = unalignedIndices.begin();
+       p != unalignedIndices.end(); ++p) {
+    Node *attachmentPoint = DetermineAttachmentPoint(*p);
+    attachmentPoint->AddChild(m_sourceNodes[*p]);
+    m_sourceNodes[*p]->AddParent(attachmentPoint);
+  }
+}
+
+Node *AlignmentGraph::DetermineAttachmentPoint(int index)
+{
+  // Picks the node to which the parentless source word at position index is
+  // attached, using the nearest words on each side that do have a parent.
+  const int numSourceWords = static_cast<int>(m_sourceNodes.size());
+
+  // Scan left for the nearest source word that has a parent.
+  int left = index - 1;
+  while (left >= 0 && m_sourceNodes[left]->GetParents().empty()) {
+    --left;
+  }
+
+  // Likewise to the right.
+  int right = index + 1;
+  while (right < numSourceWords &&
+         m_sourceNodes[right]->GetParents().empty()) {
+    ++right;
+  }
+  // Without such a neighbour on both sides, attach to the root.
+  if (left < 0 || right >= numSourceWords) {
+    return m_root;
+  }
+
+  // Gather the parents of the two neighbours.
+  const std::vector<Node *> &leftParents = m_sourceNodes[left]->GetParents();
+  const std::vector<Node *> &rightParents =
+      m_sourceNodes[right]->GetParents();
+  assert(!leftParents.empty());
+  assert(!rightParents.empty());
+  std::set<Node *> neighbourTargets(leftParents.begin(), leftParents.end());
+  neighbourTargets.insert(rightParents.begin(), rightParents.end());
+
+  // The attachment point is the lowest common ancestor of those nodes.  If the
+  // LCA is itself a target word, use its parent instead, so that no new word
+  // alignment is introduced.  This assumes that the parse tree uses
+  // preterminals for parts of speech.
+  Node *lca = Node::LowestCommonAncestor(neighbourTargets.begin(),
+                                         neighbourTargets.end());
+  if (lca->GetType() != TARGET) {
+    return lca;
+  }
+  assert(lca->GetParents().size() == 1);
+  return lca->GetParents()[0];
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/AlignmentGraph.h b/scripts/training/phrase-extract/extract-ghkm/tools/AlignmentGraph.h
new file mode 100644
index 000000000..94948758a
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/AlignmentGraph.h
@@ -0,0 +1,76 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_ALIGNMENT_GRAPH_H_
+#define EXTRACT_GHKM_ALIGNMENT_GRAPH_H_
+
+#include "Alignment.h"
+#include "Options.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+class ParseTree;
+class Subgraph;
+
+class AlignmentGraph  // graph over a copied parse tree and source word nodes
+{
+ public:
+  AlignmentGraph(const ParseTree *,  // target-side parse tree (copied)
+                 const std::vector<std::string> &,  // source words
+                 const Alignment &);  // word alignment
+
+  ~AlignmentGraph();  // deletes the source and target nodes
+
+  Node *GetRoot() { return m_root; }
+  const std::vector<Node *> &GetTargetNodes() { return m_targetNodes; }
+
+  void ExtractMinimalRules(const Options &);  // adds rules to frontier nodes
+  void ExtractComposedRules(const Options &);  // builds on the minimal rules
+
+ private:
+  // Disallow copying (declared but not defined).
+  AlignmentGraph(const AlignmentGraph &);
+  AlignmentGraph &operator=(const AlignmentGraph &);
+
+  Node *CopyParseTree(const ParseTree *);
+  void ComputeFrontierSet(Node *, const Options &, std::set<Node *> &) const;
+  void CalcComplementSpans(Node *);
+  void GetTargetTreeLeaves(Node *, std::vector<Node *> &);  // appends sinks
+  void AttachUnalignedSourceWords();
+  Node *DetermineAttachmentPoint(int);  // argument: source word index
+  Subgraph ComputeMinimalFrontierGraphFragment(Node *,
+                                               const std::set<Node *> &);
+  void ExtractComposedRules(Node *, const Options &);
+
+  Node *m_root;  // root of the copied parse tree
+  std::vector<Node *> m_sourceNodes;  // one node per source word
+  std::vector<Node *> m_targetNodes;  // every node of the copied parse tree
+};
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ComposedRule.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/ComposedRule.cpp
new file mode 100644
index 000000000..7a7fba106
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ComposedRule.cpp
@@ -0,0 +1,129 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "ComposedRule.h"
+
+#include "Node.h"
+#include "Options.h"
+#include "Subgraph.h"
+
+#include <set>
+#include <vector>
+#include <queue>
+
+namespace Moses {
+namespace GHKM {
+
+ComposedRule::ComposedRule(const Subgraph &baseRule)
+    : m_baseRule(baseRule)
+    , m_depth(baseRule.GetDepth())
+    , m_nodeCount(baseRule.GetNodeCount())  // initializers follow the
+    , m_size(baseRule.GetSize())            // member declaration order
+{
+  const std::set<const Node *> &leaves = baseRule.GetLeaves();
+  for (std::set<const Node *>::const_iterator p = leaves.begin();
+       p != leaves.end(); ++p) {
+    if ((*p)->GetType() == TREE) {  // TREE leaves are open attachment points
+      m_openAttachmentPoints.push(*p);
+    }
+  }
+}
+
+ComposedRule::ComposedRule(const ComposedRule &other, const Subgraph &rule,
+                           int depth)
+    : m_baseRule(other.m_baseRule)
+    , m_attachedRules(other.m_attachedRules)
+    , m_openAttachmentPoints(other.m_openAttachmentPoints)
+    , m_depth(depth)
+    , m_nodeCount(other.m_nodeCount+rule.GetNodeCount()-1)
+    , m_size(other.m_size+rule.GetSize())  // (kept in declaration order)
+{
+  m_attachedRules.push_back(&rule);  // rule fills the first open point ...
+  m_openAttachmentPoints.pop();      // ... which is therefore no longer open
+}
+
+const Node *ComposedRule::GetOpenAttachmentPoint()  // 0 if none is open
+{
+  return m_openAttachmentPoints.empty() ? 0 : m_openAttachmentPoints.front();
+}
+
+void ComposedRule::CloseAttachmentPoint()
+{
+  assert(!m_openAttachmentPoints.empty());
+  m_attachedRules.push_back(0);  // null entry: nothing attached at this point
+  m_openAttachmentPoints.pop();
+}
+
+ComposedRule *ComposedRule::AttemptComposition(const Subgraph &rule,
+                                               const Options &options) const
+{
+  // The smallest possible rule fragment should be rooted at a tree node.
+  // Note that this differs from the original GHKM definition.
+  assert(rule.GetRoot()->GetType() == TREE);
+
+  // Enforce the node-count limit.  (The -1 is presumably there because the
+  // attachment point is counted by both rules -- confirm.)
+  if (m_nodeCount + rule.GetNodeCount() - 1 > options.maxNodes) {
+    return 0;
+  }
+
+  // Enforce the size limit.
+  if (m_size + rule.GetSize() > options.maxRuleSize) {
+    return 0;
+  }
+
+  // Count the parent links between the rule's root and the base rule's root,
+  // then enforce the depth limit on the combined rule.
+  int stepsFromBase = 0;
+  for (const Node *n = rule.GetRoot(); n != m_baseRule.GetRoot();
+       n = n->GetParents()[0]) {
+    assert(n->GetParents().size() == 1);
+    ++stepsFromBase;
+  }
+  const int depth = std::max(m_depth, stepsFromBase + rule.GetDepth());
+  if (depth > options.maxRuleDepth) {
+    return 0;
+  }
+
+  return new ComposedRule(*this, rule, depth);
+}
+
+Subgraph ComposedRule::CreateSubgraph()
+{
+  // The k-th TREE leaf of the base rule (in set order) pairs with
+  // m_attachedRules[k]; a null or missing entry leaves that leaf in place.
+  std::set<const Node *> composedLeaves;
+  const std::set<const Node *> &baseLeaves = m_baseRule.GetLeaves();
+  std::vector<const Subgraph *>::size_type k = 0;
+  for (std::set<const Node *>::const_iterator p = baseLeaves.begin();
+       p != baseLeaves.end(); ++p) {
+    const bool paired = (*p)->GetType() == TREE && k < m_attachedRules.size();
+    const Subgraph *attached = paired ? m_attachedRules[k++] : 0;
+    if (attached) {
+      const std::set<const Node *> &sub = attached->GetLeaves();
+      composedLeaves.insert(sub.begin(), sub.end());
+    } else {
+      composedLeaves.insert(*p);
+    }
+  }
+  return Subgraph(m_baseRule.GetRoot(), composedLeaves);
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ComposedRule.h b/scripts/training/phrase-extract/extract-ghkm/tools/ComposedRule.h
new file mode 100644
index 000000000..34394c935
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ComposedRule.h
@@ -0,0 +1,70 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_COMPOSED_RULE_H_
+#define EXTRACT_GHKM_COMPOSED_RULE_H_
+
+#include "Subgraph.h"
+
+#include <vector>
+#include <queue>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+class Options;
+
+class ComposedRule
+{
+ public:
+  // Form a 'trivial' ComposedRule from a single existing rule.
+  ComposedRule(const Subgraph &baseRule);
+
+  // Returns the first open attachment point if any exist or 0 otherwise.
+  const Node *GetOpenAttachmentPoint();
+
+  // Close the first open attachment point without attaching a rule.
+  void CloseAttachmentPoint();
+
+  // Attempts to produce a new composed rule by attaching a given rule at the
+  // first open attachment point.  Returns 0 if the proposed rule violates the
+  // constraints set in the Options object; otherwise returns a new object
+  // that the caller must delete.
+  ComposedRule *AttemptComposition(const Subgraph &, const Options &) const;
+
+  // Constructs a Subgraph object corresponding to the composed rule.
+  Subgraph CreateSubgraph();
+
+ private:
+  ComposedRule(const ComposedRule &, const Subgraph &, int);
+
+  const Subgraph &m_baseRule;  // held by reference, not copied
+  std::vector<const Subgraph *> m_attachedRules;  // 0 entry = nothing attached
+  std::queue<const Node *> m_openAttachmentPoints;  // pending base TREE leaves
+  int m_depth;
+  int m_nodeCount;
+  int m_size;
+};
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Exception.h b/scripts/training/phrase-extract/extract-ghkm/tools/Exception.h
new file mode 100644
index 000000000..9928785f0
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Exception.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_EXCEPTION_H_
+#define EXTRACT_GHKM_EXCEPTION_H_
+
+#include <string>
+
+namespace Moses {
+namespace GHKM {
+
+class Exception  // simple message carrier; not derived from std::exception
+{
+ public:
+  Exception(const char *msg) : m_msg(msg) {}
+  Exception(const std::string &msg) : m_msg(msg) {}
+  const std::string &GetMsg() const { return m_msg; }  // constructor's text
+ private:
+  std::string m_msg;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ExtractGHKM.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/ExtractGHKM.cpp
new file mode 100644
index 000000000..dad326131
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ExtractGHKM.cpp
@@ -0,0 +1,476 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "ExtractGHKM.h"
+
+#include "Alignment.h"
+#include "AlignmentGraph.h"
+#include "Exception.h"
+#include "Node.h"
+#include "Options.h"
+#include "ParseTree.h"
+#include "ScfgRule.h"
+#include "ScfgRuleWriter.h"
+#include "Span.h"
+#include "XmlTreeParser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// Runs the tool: reads the target tree, source sentence and word alignment
+// files line by line in parallel, extracts rules for each sentence triple and
+// writes them via ScfgRuleWriter, then writes the optional glue grammar and
+// unknown word label files.  Returns 0; fatal problems exit through Error().
+int ExtractGHKM::Main(int argc, char *argv[])
+{
+  // Process command-line options.
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Open input files.
+  std::ifstream targetStream, sourceStream, alignmentStream;
+  OpenInputFileOrDie(options.targetFile, targetStream);
+  OpenInputFileOrDie(options.sourceFile, sourceStream);
+  OpenInputFileOrDie(options.alignmentFile, alignmentStream);
+
+  // Open output files.
+  std::ofstream extractStream;
+  std::ofstream invExtractStream;
+  std::ofstream glueGrammarStream;
+  std::ofstream unknownWordStream;
+  OpenOutputFileOrDie(options.extractFile, extractStream);
+  OpenOutputFileOrDie(options.extractFile + ".inv", invExtractStream);
+  if (!options.glueGrammarFile.empty()) {
+    OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
+  }
+  if (!options.unknownWordFile.empty()) {
+    OpenOutputFileOrDie(options.unknownWordFile, unknownWordStream);
+  }
+
+  // Target label sets for producing glue grammar.
+  std::set<std::string> labelSet;
+  std::set<std::string> topLabelSet;
+
+  // Word count statistics for producing unknown word labels.
+  std::map<std::string, int> wordCount;
+  std::map<std::string, std::string> wordLabel;
+
+  std::string targetLine;
+  std::string sourceLine;
+  std::string alignmentLine;
+  ScfgRuleWriter writer(extractStream, invExtractStream, options);
+  size_t lineNum = 0;
+  while (true) {
+    // Test each read itself rather than eof(): a final line that lacks a
+    // trailing newline is read successfully even though it sets eofbit.
+    const bool gotTarget = !std::getline(targetStream, targetLine).fail();
+    const bool gotSource = !std::getline(sourceStream, sourceLine).fail();
+    const bool gotAlign = !std::getline(alignmentStream, alignmentLine).fail();
+    if (!gotTarget && !gotSource && !gotAlign) {
+      break;
+    }
+    if (!gotTarget || !gotSource || !gotAlign) {
+      Error("Files must contain same number of lines");
+    }
+
+    ++lineNum;
+
+    // Parse target tree.
+    std::auto_ptr<ParseTree> t(ParseXmlTree(targetLine));
+    if (!t.get()) {
+      std::ostringstream s;
+      s << "Failed to parse XML tree at line " << lineNum;
+      Error(s.str());
+    }
+
+    // Read source tokens.
+    std::vector<std::string> sourceTokens(ReadTokens(sourceLine));
+
+    // Read word alignments.
+    Alignment alignment;
+    try {
+      alignment = ReadAlignment(alignmentLine);
+    } catch (const Exception &e) {
+      std::ostringstream s;
+      s << "Failed to read alignment at line " << lineNum << ": ";
+      s << e.GetMsg();
+      Error(s.str());
+    }
+
+    // Record tree labels for use in glue grammar.
+    if (!options.glueGrammarFile.empty()) {
+      // Record labels that cover the full sentence to topLabelSet.
+      ParseTree *p = t.get();
+      topLabelSet.insert(p->GetLabel());
+      while (p->GetChildren().size() == 1) {
+        p = p->GetChildren()[0];
+        if (p->IsLeaf()) {
+          break;
+        }
+        topLabelSet.insert(p->GetLabel());
+      }
+      // Record all labels to labelSet.
+      RecordTreeLabels(*t, labelSet);
+    }
+
+    // Record word counts.
+    if (!options.unknownWordFile.empty()) {
+      CollectWordLabelCounts(*t, wordCount, wordLabel);
+    }
+
+    // Form an alignment graph from the tree, source words and alignment.
+    AlignmentGraph graph(t.get(), sourceTokens, alignment);
+
+    // Extract minimal rules, adding each rule to its root node's rule set.
+    graph.ExtractMinimalRules(options);
+
+    // Extract composed rules.
+    if (!options.minimal) {
+      graph.ExtractComposedRules(options);
+    }
+
+    // Write the rules, subject to scope pruning.
+    const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
+    for (std::vector<Node *>::const_iterator p = targetNodes.begin();
+         p != targetNodes.end(); ++p) {
+      const std::vector<const Subgraph *> &rules = (*p)->GetRules();
+      for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
+           q != rules.end(); ++q) {
+        ScfgRule r(**q);
+        // TODO Can scope pruning be done earlier?
+        if (r.Scope() <= options.maxScope) {
+          writer.Write(r);
+        }
+      }
+    }
+  }
+
+  if (!options.glueGrammarFile.empty()) {
+    WriteGlueGrammar(labelSet, topLabelSet, glueGrammarStream);
+  }
+
+  if (!options.unknownWordFile.empty()) {
+    WriteUnknownWordLabel(wordCount, wordLabel, unknownWordStream);
+  }
+
+  return 0;
+}
+
+// Opens filename for reading on stream; if the stream is in a failed state
+// afterwards the problem is reported through Error().
+void ExtractGHKM::OpenInputFileOrDie(const std::string &filename,
+                                     std::ifstream &stream)
+{
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    Error("failed to open input file: " + filename);
+  }
+}
+
+// Opens filename for writing on stream; if the stream is in a failed state
+// afterwards the problem is reported through Error().
+void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename,
+                                      std::ofstream &stream)
+{
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    Error("failed to open output file: " + filename);
+  }
+}
+
+// Parses the command line into options.  Prints the usage text and exits if
+// --help is given or one of the positional arguments (TARGET SOURCE ALIGNMENT
+// EXTRACT) is missing; a parse error is reported through Error().
+void ExtractGHKM::ProcessOptions(int argc, char *argv[],
+                                 Options &options) const
+{
+  namespace po = boost::program_options;
+  namespace cls = boost::program_options::command_line_style;
+
+  // Construct the 'top' of the usage message (the part before the options).
+  std::ostringstream usageTop;
+  usageTop << "Usage: " << GetName()
+           << " [OPTION]... TARGET SOURCE ALIGNMENT EXTRACT\n\n"
+           << "SCFG rule extractor based on the GHKM algorithm described in\n"
+           << "Galley et al. (2004).\n\n"
+           << "Options";
+
+  // Construct the 'bottom' of the usage message.
+  std::ostringstream usageBottom;
+  usageBottom << "\nImplementation Notes:\n"
+              << "\nThe parse tree is assumed to contain part-of-speech preterminal nodes.\n"
+              << "\n"
+              << "For the composed rule constraints: rule depth is the maximum distance from the\nrule's root node to a sink node, not counting preterminal expansions or word\nalignments.  Rule size is the measure defined in DeNeefe et al (2007): the\nnumber of non-part-of-speech, non-leaf constituent labels in the target tree.\nNode count is the number of target tree nodes (excluding target words).\n"
+              << "\n"
+              << "Scope pruning (Hopkins and Langmead, 2010) is applied to both minimal and\ncomposed rules.\n"
+              << "\n"
+              << "Unaligned source words are attached to the tree using the following heuristic:\nif there are aligned source words to both the left and the right of an unaligned\nsource word then it is attached to the lowest common ancestor of its nearest\nsuch left and right neighbours.  Otherwise, it is attached to the root of the\nparse tree.\n"
+              << "\n"
+              << "Unless the --AllowUnary option is given, unary rules containing no lexical\nsource items are eliminated using the method described in Chung et al. (2011).\nThe parsing algorithm used in Moses is unable to handle such rules.\n"
+              << "\n"
+              << "References:\n"
+              << "Galley, M., Hopkins, M., Knight, K., and Marcu, D. (2004)\n"
+              << "\"What's in a Translation Rule?\", In Proceedings of HLT/NAACL 2004.\n"
+              << "\n"
+              << "DeNeefe, S., Knight, K., Wang, W., and Marcu, D. (2007)\n"
+              << "\"What Can Syntax-Based MT Learn from Phrase-Based MT?\", In Proceedings of\nEMNLP-CoNLL 2007.\n"
+              << "\n"
+              << "Hopkins, M. and Langmead, G. (2010)\n"
+              << "\"SCFG Decoding Without Binarization\", In Proceedings of EMNLP 2010.\n"
+              << "\n"
+              << "Chung, T. and Fang, L. and Gildea, D. (2011)\n"
+              << "\"Issues Concerning Decoding with Synchronous Context-free Grammar\", In\nProceedings of ACL/HLT 2011.";
+
+  // Declare the command line options that are visible to the user.
+  po::options_description visible(usageTop.str());
+  visible.add_options()
+    ("help", "print this help message and exit")
+    ("AllowUnary",
+        "allow fully non-lexical unary rules")
+    ("GlueGrammar",
+        po::value(&options.glueGrammarFile),
+        "write glue grammar to named file")
+    ("MaxNodes",
+        po::value(&options.maxNodes)->default_value(options.maxNodes),
+        "set maximum number of tree nodes for composed rules")
+    ("MaxRuleDepth",
+        po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth),
+        "set maximum depth for composed rules")
+    ("MaxRuleSize",
+        po::value(&options.maxRuleSize)->default_value(options.maxRuleSize),
+        "set maximum size for composed rules")
+    ("MaxScope",
+        po::value(&options.maxScope)->default_value(options.maxScope),
+        "set maximum allowed scope")
+    ("Minimal",
+        "extract minimal rules only")
+    ("UnknownWordLabel",
+        po::value(&options.unknownWordFile),
+        "write unknown word labels to named file")
+    ("UnpairedExtractFormat",
+        "do not pair non-terminals in extract files")
+  ;
+
+  // Declare the hidden command line options (used as positional options).
+  po::options_description hidden("Hidden options");
+  hidden.add_options()
+    ("TargetFile",
+        po::value(&options.targetFile),
+        "target file")
+    ("SourceFile",
+        po::value(&options.sourceFile),
+        "source file")
+    ("AlignmentFile",
+        po::value(&options.alignmentFile),
+        "alignment file")
+    ("ExtractFile",
+        po::value(&options.extractFile),
+        "extract file")
+  ;
+
+  // Compose the full set of command-line options.
+  po::options_description cmdLineOptions;
+  cmdLineOptions.add(visible).add(hidden);
+
+  // Register the positional options.
+  po::positional_options_description p;
+  p.add("TargetFile", 1);
+  p.add("SourceFile", 1);
+  p.add("AlignmentFile", 1);
+  p.add("ExtractFile", 1);
+
+  // Process the command-line.
+  po::variables_map vm;
+  const int optionStyle = cls::allow_long
+                        | cls::long_allow_adjacent
+                        | cls::long_allow_next;
+  try {
+    po::store(po::command_line_parser(argc, argv).style(optionStyle).
+              options(cmdLineOptions).positional(p).run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible << usageBottom.str();
+    Error(msg.str());
+  }
+
+  if (vm.count("help")) {
+    std::cout << visible << usageBottom.str() << std::endl;
+    std::exit(0);
+  }
+
+  // Check all positional options were given.
+  if (!vm.count("TargetFile") ||
+      !vm.count("SourceFile") ||
+      !vm.count("AlignmentFile") ||
+      !vm.count("ExtractFile")) {
+    std::cerr << visible << usageBottom.str() << std::endl;
+    std::exit(1);
+  }
+
+  // Process Boolean options.
+  if (vm.count("AllowUnary")) {
+    options.allowUnary = true;
+  }
+  if (vm.count("Minimal")) {
+    options.minimal = true;
+  }
+  if (vm.count("UnpairedExtractFormat")) {
+    options.unpairedExtractFormat = true;
+  }
+}
+
+void ExtractGHKM::Error(const std::string &msg) const
+{
+  std::cerr << m_name << ": " << msg << std::endl;  // "<tool name>: <msg>"
+  std::exit(1);                                     // does not return
+}
+
+// Splits s into tokens separated by runs of spaces and/or tabs.
+//
+// Leading and trailing whitespace is ignored.  An empty or all-whitespace
+// string yields an empty vector, so a blank input line is not treated as a
+// fatal error here.
+std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s)
+{
+  std::vector<std::string> tokens;
+  const std::string whitespace = " \t";
+
+  // Start of the first token, or npos if s contains no token at all.
+  std::string::size_type begin = s.find_first_not_of(whitespace);
+  while (begin != std::string::npos) {
+    // end is npos when the token runs to the end of s; substr() then takes
+    // everything from begin onwards.
+    const std::string::size_type end = s.find_first_of(whitespace, begin);
+    if (end == std::string::npos) {
+      tokens.push_back(s.substr(begin));
+      break;
+    }
+    tokens.push_back(s.substr(begin, end - begin));
+
+    // Skip the separator run; npos means only whitespace remained.
+    begin = s.find_first_not_of(whitespace, end);
+  }
+
+  return tokens;
+}
+
+// Writes the glue grammar to out: start/end rules for a fresh top label, one
+// top rule per label in topLabelSet and one glue rule per label in labelSet.
+void ExtractGHKM::WriteGlueGrammar(const std::set<std::string> &labelSet,
+                                   const std::set<std::string> &topLabelSet,
+                                   std::ostream &out)
+{
+  // Choose a top label that is not already a label: the shortest run of 'Q's
+  // absent from labelSet (the set is finite, so the loop terminates).
+  std::string topLabel = "Q";
+  while (labelSet.find(topLabel) != labelSet.end()) {
+    topLabel += 'Q';
+  }
+
+  // basic rules
+  out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1  ||| " << std::endl;
+  out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << std::endl;
+
+  // top rules
+  for (std::set<std::string>::const_iterator i = topLabelSet.begin();
+       i != topLabelSet.end(); ++i) {
+    out << "<s> [X][" << *i << "] </s> [X] ||| <s> [X][" << *i << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << std::endl;
+  }
+
+  // glue rules
+  for(std::set<std::string>::const_iterator i = labelSet.begin();
+      i != labelSet.end(); i++ ) {
+    out << "[X][" << topLabel << "] [X][" << *i << "] [X] ||| [X][" << topLabel << "] [X][" << *i << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << std::endl;
+  }
+  // glue rule for unknown word...
+  out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 |||  0-0 1-1 " << std::endl;
+}
+
+// Inserts the label of t and, recursively, of every non-leaf node below it
+// into labelSet.
+void ExtractGHKM::RecordTreeLabels(const ParseTree &t,
+                                   std::set<std::string> &labelSet)
+{
+  labelSet.insert(t.GetLabel());
+  const std::vector<ParseTree *> &kids = t.GetChildren();
+  for (size_t i = 0; i < kids.size(); ++i) {
+    if (!kids[i]->IsLeaf()) {  // leaf labels are not recorded
+      RecordTreeLabels(*kids[i], labelSet);
+    }
+  }
+}
+
+// For each leaf that root.GetLeaves() yields, increments the count of the
+// leaf's label in wordCount and stores its parent's label in wordLabel.
+void ExtractGHKM::CollectWordLabelCounts(
+    ParseTree &root,
+    std::map<std::string, int> &wordCount,
+    std::map<std::string, std::string> &wordLabel)
+{
+  std::vector<const ParseTree*> leaves;
+  root.GetLeaves(std::back_inserter(leaves));
+  for (size_t i = 0; i < leaves.size(); ++i) {
+    const std::string &word = leaves[i]->GetLabel();
+    const std::string &label = leaves[i]->GetParent()->GetLabel();
+    ++wordCount[word];
+    wordLabel[word] = label;  // a later occurrence overwrites an earlier one
+  }
+}
+
+// Writes one "<label> <ratio>" line per label, where ratio is that label's
+// share of the singleton words (count == 1); ratios <= 0.03 are not written.
+void ExtractGHKM::WriteUnknownWordLabel(
+    const std::map<std::string, int> &wordCount,
+    const std::map<std::string, std::string> &wordLabel,
+    std::ostream &out)
+{
+  typedef std::map<std::string, int>::const_iterator CountIter;
+  std::map<std::string, int> labelCount;
+  int total = 0;
+  for (CountIter w = wordCount.begin(); w != wordCount.end(); ++w) {
+    if (w->second == 1) {  // only singletons contribute
+      std::map<std::string, std::string>::const_iterator entry =
+        wordLabel.find(w->first);
+      assert(entry != wordLabel.end());
+      ++labelCount[entry->second];
+      ++total;
+    }
+  }
+  for (CountIter c = labelCount.begin(); c != labelCount.end(); ++c) {
+    const double ratio = static_cast<double>(c->second) / total;
+    if (ratio > 0.03) {
+      out << c->first << " " << ratio << std::endl;
+    }
+  }
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ExtractGHKM.h b/scripts/training/phrase-extract/extract-ghkm/tools/ExtractGHKM.h
new file mode 100644
index 000000000..4c06c2646
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ExtractGHKM.h
@@ -0,0 +1,67 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_EXTRACT_GHKM_H_
+#define EXTRACT_GHKM_EXTRACT_GHKM_H_
+
+#include <map>
+#include <ostream>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+struct Options;  // declared with the same class-key as its definition
+class ParseTree;
+
+// Command-line driver of the extract-ghkm tool.  Main() is the only
+// operation exposed to callers; everything else is an internal helper.
+class ExtractGHKM
+{
+ public:
+  ExtractGHKM() : m_name("extract-ghkm") {}
+  const std::string &GetName() const { return m_name; }
+  int Main(int argc, char *argv[]);
+ private:
+  void Error(const std::string &) const;
+  void OpenInputFileOrDie(const std::string &, std::ifstream &);
+  void OpenOutputFileOrDie(const std::string &, std::ofstream &);
+  void RecordTreeLabels(const ParseTree &, std::set<std::string> &);
+  void CollectWordLabelCounts(ParseTree &,
+                              std::map<std::string, int> &,
+                              std::map<std::string, std::string> &);
+  void WriteUnknownWordLabel(const std::map<std::string, int> &,
+                             const std::map<std::string, std::string> &,
+                             std::ostream &);
+  void WriteGlueGrammar(const std::set<std::string> &,
+                        const std::set<std::string> &,
+                        std::ostream &);
+  std::vector<std::string> ReadTokens(const std::string &);
+  void ProcessOptions(int, char *[], Options &) const;
+
+  std::string m_name;  // tool name, used as the prefix of error messages
+};
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Main.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/Main.cpp
new file mode 100644
index 000000000..faf3230a6
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Main.cpp
@@ -0,0 +1,26 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "ExtractGHKM.h"
+
+int main(int argc, char *argv[])
+{
+  // Hand control to the extractor; its return value is the exit status.
+  return Moses::GHKM::ExtractGHKM().Main(argc, argv);
+}
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Makefile.am b/scripts/training/phrase-extract/extract-ghkm/tools/Makefile.am
new file mode 100644
index 000000000..ee230a12c
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Makefile.am
@@ -0,0 +1,36 @@
+AM_CPPFLAGS = $(BOOST_CPPFLAGS) -I$(top_srcdir)/..
+AM_LDFLAGS = $(BOOST_PROGRAM_OPTIONS_LDFLAGS)
+LDADD = $(BOOST_PROGRAM_OPTIONS_LIBS)
+
+bin_PROGRAMS = extract-ghkm
+
+extract_ghkm_SOURCES = \
+  Alignment.cpp \
+  Alignment.h \
+  AlignmentGraph.cpp \
+  AlignmentGraph.h \
+  ComposedRule.cpp \
+  ComposedRule.h \
+  Exception.h \
+  ExtractGHKM.cpp \
+  ExtractGHKM.h \
+  Main.cpp \
+  Node.cpp \
+  Node.h \
+  ParseTree.cpp \
+  ParseTree.h \
+  ScfgRule.cpp \
+  ScfgRule.h \
+  ScfgRuleWriter.cpp \
+  ScfgRuleWriter.h \
+  Span.cpp \
+  Span.h \
+  Subgraph.cpp \
+  Subgraph.h \
+  XmlTreeParser.cpp \
+  XmlTreeParser.h \
+  ../../SyntaxTree.cpp \
+  ../../SyntaxTree.h \
+  ../../XmlException.h \
+  ../../XmlTree.cpp \
+  ../../XmlTree.h
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Node.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/Node.cpp
new file mode 100644
index 000000000..beb7470b8
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Node.cpp
@@ -0,0 +1,71 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "Node.h"
+
+#include "Subgraph.h"
+
+namespace Moses {
+namespace GHKM {
+
+Node::~Node()
+{
+  // The node deletes every Subgraph stored in its rule list.
+  for (size_t i = 0; i < m_rules.size(); ++i) {
+    delete m_rules[i];
+  }
+}
+
+bool Node::IsPreterminal() const
+{
+  // True for a TREE node whose single child is a TARGET node.
+  const bool singleChild = (m_type == TREE && m_children.size() == 1);
+  return singleChild && m_children.front()->m_type == TARGET;
+}
+
+// Adds index to this node's span, then recursively to all of its ancestors.
+void Node::PropagateIndex(int index)
+{
+  m_span.insert(index);
+  for (size_t i = 0; i < m_parents.size(); ++i) {
+    m_parents[i]->PropagateIndex(index);
+  }
+}
+
+// Returns the labels of the TARGET nodes at or below this node (depth first).
+std::vector<std::string> Node::GetTargetWords() const
+{
+  std::vector<std::string> words;
+  GetTargetWords(words);
+  return words;
+}
+
+void Node::GetTargetWords(std::vector<std::string> &targetWords) const
+{
+  if (m_type == TARGET) {
+    targetWords.push_back(m_label);
+    return;  // a TARGET node's own children are never visited
+  }
+  for (size_t i = 0; i < m_children.size(); ++i) {
+    m_children[i]->GetTargetWords(targetWords);
+  }
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Node.h b/scripts/training/phrase-extract/extract-ghkm/tools/Node.h
new file mode 100644
index 000000000..228fdc812
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Node.h
@@ -0,0 +1,182 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_NODE_H_
+#define EXTRACT_GHKM_NODE_H_
+
+#include "Span.h"
+
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Subgraph;
+
+enum NodeType { SOURCE, TARGET, TREE };
+
+class Node  // labelled, typed graph node linked to parent and child nodes
+{
+ public:
+  Node(const std::string &label, NodeType type)
+      : m_label(label)
+      , m_type(type)
+      , m_children()
+      , m_parents() {}
+
+  ~Node();  // deletes the rules added with AddRule() (see Node.cpp)
+
+  const std::string &GetLabel() const { return m_label; }
+  NodeType GetType() const { return m_type; }
+  const std::vector<Node*> &GetChildren() const { return m_children; }
+  const std::vector<Node*> &GetParents() const { return m_parents; }
+  const Span &GetSpan() const { return m_span; }
+  const Span &GetComplementSpan() const { return m_complementSpan; }
+  const std::vector<const Subgraph*> &GetRules() const { return m_rules; }
+
+  void SetChildren(const std::vector<Node*> &c) { m_children = c; }
+  void SetParents(const std::vector<Node*> &p) { m_parents = p; }
+  void SetSpan(const Span &s) { m_span = s; }
+  void SetComplementSpan(const Span &cs) { m_complementSpan = cs; }
+
+  void AddChild(Node *c) { m_children.push_back(c); }  // one direction only
+  void AddParent(Node *p) { m_parents.push_back(p); }  // one direction only
+  void AddRule(const Subgraph *s) { m_rules.push_back(s); }
+
+  bool IsSink() const { return m_children.empty(); }  // node has no children
+  bool IsPreterminal() const;
+
+  void PropagateIndex(int);  // inserts the index into the spans up the graph
+
+  std::vector<std::string> GetTargetWords() const;
+
+  // Gets the path from this node's parent to the root.  This node is
+  // required to be part of the original parse tree (i.e. not a source word,
+  // which can have multiple parents).
+  template<typename OutputIterator>
+  void GetTreeAncestors(OutputIterator result, bool includeSelf=false);
+
+  // Returns the lowest common ancestor given a sequence of nodes belonging to
+  // the target tree.
+  template<typename InputIterator>
+  static Node *LowestCommonAncestor(InputIterator first, InputIterator last);
+
+ private:
+  // Disallow copying
+  Node(const Node &);
+  Node &operator=(const Node &);
+
+  void GetTargetWords(std::vector<std::string> &) const;
+
+  std::string m_label;
+  NodeType m_type;
+  std::vector<Node*> m_children;
+  std::vector<Node*> m_parents;
+  Span m_span;
+  Span m_complementSpan;
+  std::vector<const Subgraph*> m_rules;  // freed by the destructor
+};
+
+template<typename OutputIterator>
+void Node::GetTreeAncestors(OutputIterator result, bool includeSelf)
+{
+  // Only parse tree nodes qualify: a SOURCE node may have several parents,
+  // whereas the walk below always follows the first parent.
+  assert(m_type != SOURCE);
+
+  if (includeSelf) {
+    *result++ = this;
+  }
+
+  // Climb parent by parent, emitting each one, until a parentless node.
+  for (const Node *n = this; !n->m_parents.empty(); n = n->m_parents[0]) {
+    *result++ = n->m_parents[0];
+  }
+}
+
+// Returns the deepest node that is an ancestor of (or equal to) every node in
+// [first, last), or 0 if the sequence is empty.
+template<typename InputIterator>
+Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last)
+{
+  if (first == last) {
+    return 0;
+  }
+
+  // Check for the case that the sequence contains only one distinct node.
+  // Also check that every node belongs to the target tree.
+  InputIterator p = first;
+  Node *lca = *p++;
+  assert(lca->m_type != SOURCE);
+  for (; p != last; ++p) {
+    assert((*p)->m_type != SOURCE);
+    if (*p != lca) {
+      lca = 0;
+    }
+  }
+  if (lca) {
+    return lca;
+  }
+
+  // Now construct an ancestor path for each node, from itself to the root.
+  size_t minPathLength = 0;
+  std::vector<std::vector<Node *> > paths;
+  for (p = first; p != last; ++p) {
+    paths.resize(paths.size()+1);
+    (*p)->GetTreeAncestors(std::back_inserter(paths.back()), true);
+    size_t pathLength = paths.back().size();
+    assert(pathLength > 0);
+    if (paths.size() == 1 || pathLength < minPathLength) {
+      minPathLength = pathLength;
+    }
+  }
+
+  // Search for the start of the longest common suffix by working forward from
+  // the earliest possible starting point to the root.
+  for (size_t i = 0; i < minPathLength; ++i) {
+    bool match = true;
+    for (size_t j = 0; j < paths.size(); ++j) {
+      size_t index = paths[j].size() - minPathLength + i;
+      assert(index < paths[j].size());
+      if (j == 0) {
+        lca = paths[j][index];
+        assert(lca);
+      } else if (lca != paths[j][index]) {
+        match = false;
+        break;
+      }
+    }
+    if (match) {
+      return lca;
+    }
+  }
+
+  // A lowest common ancestor should have been found.
+  assert(false);
+  return 0;
+}
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Options.h b/scripts/training/phrase-extract/extract-ghkm/tools/Options.h
new file mode 100644
index 000000000..a34a35744
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Options.h
@@ -0,0 +1,61 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_OPTIONS_H_
+#define EXTRACT_GHKM_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace GHKM {
+
+// Run-time settings of the tool; the constructor supplies the defaults.
+struct Options {
+  Options()
+      : allowUnary(false)
+      , maxNodes(15)
+      , maxRuleDepth(3)
+      , maxRuleSize(3)
+      , maxScope(3)
+      , minimal(false)
+      , unpairedExtractFormat(false) {}
+
+  // Positional options
+  std::string targetFile;
+  std::string sourceFile;
+  std::string alignmentFile;
+  std::string extractFile;
+
+  // All other options
+  bool allowUnary;              // defaults to false
+  std::string glueGrammarFile;  // empty unless set
+  int maxNodes;                 // defaults to 15
+  int maxRuleDepth;             // defaults to 3
+  int maxRuleSize;              // defaults to 3
+  int maxScope;                 // defaults to 3
+  bool minimal;                 // defaults to false
+  bool unpairedExtractFormat;   // defaults to false
+  std::string unknownWordFile;  // empty unless set
+};
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ParseTree.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/ParseTree.cpp
new file mode 100644
index 000000000..052b8dee1
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ParseTree.cpp
@@ -0,0 +1,54 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "ParseTree.h"
+
+namespace Moses {
+namespace GHKM {
+
+// Deletes every child pointer held in m_children (front to back), which in
+// turn destroys the subtrees below them.
+ParseTree::~ParseTree()
+{
+  typedef std::vector<ParseTree*>::size_type Index;
+  for (Index i = 0; i < m_children.size(); ++i) {
+    delete m_children[i];
+  }
+}
+
+// Replaces the child list with a copy of children.  The pointers held before
+// the call are discarded without being deleted.
+void ParseTree::SetChildren(const std::vector<ParseTree*> &children)
+{
+  // Copy first, then swap: this stays correct even when children refers to
+  // m_children itself.
+  std::vector<ParseTree*> replacement(children);
+  m_children.swap(replacement);
+}
+
+// Stores parent as this node's parent pointer.  The parent's own child list
+// is not modified.
+void ParseTree::SetParent(ParseTree *parent)
+{
+  this->m_parent = parent;
+}
+
+// Appends child to the end of the child list.  The child's parent pointer is
+// not updated here.
+void ParseTree::AddChild(ParseTree *child)
+{
+  m_children.insert(m_children.end(), child);
+}
+
+// Returns true if the node has no children.
+bool ParseTree::IsLeaf() const
+{
+  return m_children.size() == 0;
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ParseTree.h b/scripts/training/phrase-extract/extract-ghkm/tools/ParseTree.h
new file mode 100644
index 000000000..ec6fc147a
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ParseTree.h
@@ -0,0 +1,82 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_PARSE_TREE_H_
+#define EXTRACT_GHKM_PARSE_TREE_H_
+
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// A labelled tree node holding a list of child pointers and a parent pointer.
+// The destructor deletes the children, so a node owns the subtrees attached
+// to it.  Copy construction and assignment are disabled.
+class ParseTree
+{
+ public:
+  // Creates a node with the given label, no children and a null parent.
+  ParseTree(const std::string &label)
+    : m_label(label), m_children(), m_parent() {}
+
+  ~ParseTree();
+
+  // Read-only accessors.
+  const std::string &GetLabel() const {
+    return m_label;
+  }
+  const std::vector<ParseTree*> &GetChildren() const {
+    return m_children;
+  }
+  const ParseTree *GetParent() const {
+    return m_parent;
+  }
+
+  // Plain setters: neither one updates the other side of the parent/child
+  // relationship.
+  void SetParent(ParseTree *);
+  void SetChildren(const std::vector<ParseTree*> &);
+
+  // Appends a child pointer to the child list.
+  void AddChild(ParseTree *);
+
+  // True if the node has no children.
+  bool IsLeaf() const;
+
+  // Writes pointers to the leaves of the subtree rooted here to the given
+  // output iterator.
+  template<typename OutputIterator>
+  void GetLeaves(OutputIterator);
+
+ private:
+  // Disallow copying
+  ParseTree(const ParseTree &);
+  ParseTree &operator=(const ParseTree &);
+
+  std::string m_label;
+  std::vector<ParseTree*> m_children;
+  ParseTree *m_parent;
+};
+
+// Writes a pointer to every leaf of the subtree rooted at this node (the node
+// itself if it is a leaf) to result, in left-to-right order.
+//
+// The traversal uses an explicit stack so that one iterator is advanced for
+// the whole walk.  A recursive formulation that hands the iterator to each
+// child by value loses the position reached inside the child's subtree; that
+// is only harmless for stateless iterators such as std::back_insert_iterator
+// and makes later leaves overwrite earlier ones for e.g. a raw pointer or a
+// container iterator.
+template<typename OutputIterator>
+void ParseTree::GetLeaves(OutputIterator result)
+{
+  std::vector<ParseTree *> pending(1, this);
+  while (!pending.empty()) {
+    ParseTree *node = pending.back();
+    pending.pop_back();
+    if (node->IsLeaf()) {
+      *result++ = node;
+      continue;
+    }
+    // Push the children right-to-left so that the leftmost child is popped
+    // (and therefore visited) first.
+    const std::vector<ParseTree *> &children = node->m_children;
+    for (std::vector<ParseTree *>::const_reverse_iterator p = children.rbegin();
+         p != children.rend(); ++p) {
+      pending.push_back(*p);
+    }
+  }
+}
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRule.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRule.cpp
new file mode 100644
index 000000000..648fe6461
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRule.cpp
@@ -0,0 +1,144 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "ScfgRule.h"
+
+#include "Node.h"
+#include "Subgraph.h"
+
+#include <algorithm>
+#include <cassert>
+#include <map>
+#include <set>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// Builds an SCFG rule from a subgraph fragment.
+//
+// Source side: the fragment's leaves with a non-empty span are sorted with
+// PartitionOrderComp; each TREE node becomes the non-terminal "X" and every
+// other node (asserted to be a SOURCE node) becomes a terminal.
+// Target side: the nodes returned by Subgraph::GetTargetLeaves are emitted in
+// order, and every emitted symbol for a node with a non-empty span is aligned
+// to the source RHS positions recorded for that node.
+ScfgRule::ScfgRule(const Subgraph &fragment)
+    : m_sourceLHS("X", NonTerminal)
+    , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
+{
+  typedef std::vector<const Node *> NodeList;
+  typedef std::map<const Node *, std::vector<int> > SourceOrderMap;
+
+  // ---- Source RHS ----
+
+  // Only leaves with a non-empty span take part in the source side.
+  const std::set<const Node *> &fragmentLeaves = fragment.GetLeaves();
+  NodeList sinks;
+  sinks.reserve(fragmentLeaves.size());
+  for (std::set<const Node *>::const_iterator it = fragmentLeaves.begin();
+       it != fragmentLeaves.end(); ++it) {
+    if ((*it)->GetSpan().empty()) {
+      continue;
+    }
+    sinks.push_back(*it);
+  }
+  std::sort(sinks.begin(), sinks.end(), PartitionOrderComp);
+
+  // Maps a node to the source RHS positions it corresponds to; consulted
+  // further down when the alignment is built.
+  SourceOrderMap sourceOrder;
+
+  m_sourceRHS.reserve(sinks.size());
+  for (NodeList::size_type i = 0; i < sinks.size(); ++i) {
+    const Node &sinkNode = *sinks[i];
+    const int srcPos = static_cast<int>(i);
+    if (sinkNode.GetType() == TREE) {
+      m_sourceRHS.push_back(Symbol("X", NonTerminal));
+      sourceOrder[&sinkNode].push_back(srcPos);
+      continue;
+    }
+    assert(sinkNode.GetType() == SOURCE);
+    m_sourceRHS.push_back(Symbol(sinkNode.GetLabel(), Terminal));
+    // Record this position for every TARGET parent of the source node.
+    const std::vector<Node *> &parents = sinkNode.GetParents();
+    for (std::vector<Node *>::size_type j = 0; j < parents.size(); ++j) {
+      if (parents[j]->GetType() == TARGET) {
+        sourceOrder[parents[j]].push_back(srcPos);
+      }
+    }
+  }
+
+  // ---- Target RHS + alignment ----
+
+  NodeList targetLeaves;
+  fragment.GetTargetLeaves(targetLeaves);
+
+  m_alignment.reserve(targetLeaves.size());  // a size hint only
+  m_targetRHS.reserve(targetLeaves.size());
+
+  for (NodeList::const_iterator p = targetLeaves.begin();
+       p != targetLeaves.end(); ++p) {
+    const Node &leaf = **p;
+
+    if (leaf.GetSpan().empty()) {
+      // The node doesn't cover any source words, so it can only contribute
+      // terminals to the target RHS (never a non-terminal).
+      const std::vector<std::string> targetWords(leaf.GetTargetWords());
+      for (std::vector<std::string>::size_type k = 0;
+           k < targetWords.size(); ++k) {
+        m_targetRHS.push_back(Symbol(targetWords[k], Terminal));
+      }
+      continue;
+    }
+
+    if (leaf.GetType() == SOURCE) {
+      continue;  // source nodes contribute nothing to the target side
+    }
+
+    const bool isTree = (leaf.GetType() == TREE);
+    m_targetRHS.push_back(Symbol(leaf.GetLabel(),
+                                 isTree ? NonTerminal : Terminal));
+
+    // Align the symbol just appended with each recorded source position.
+    const int tgtPos = m_targetRHS.size()-1;
+    SourceOrderMap::iterator q(sourceOrder.find(&leaf));
+    assert(q != sourceOrder.end());
+    const std::vector<int> &positions = q->second;
+    for (std::vector<int>::size_type k = 0; k < positions.size(); ++k) {
+      m_alignment.push_back(std::make_pair(positions[k], tgtPos));
+    }
+  }
+}
+
+// Returns the scope of the rule's source RHS, counted as: one if the RHS
+// starts with a non-terminal, one for every pair of adjacent non-terminals,
+// and one if the RHS ends with a non-terminal.  An empty source RHS has
+// scope zero.
+int ScfgRule::Scope() const
+{
+  // Indexing element 0 of an empty vector is undefined behaviour, so handle
+  // the empty case explicitly.
+  if (m_sourceRHS.empty()) {
+    return 0;
+  }
+  int scope = 0;
+  bool predIsNonTerm = false;
+  if (m_sourceRHS[0].GetType() == NonTerminal) {
+    ++scope;
+    predIsNonTerm = true;
+  }
+  // The index uses the vector's own size_type to avoid a signed/unsigned
+  // comparison against size().
+  for (std::vector<Symbol>::size_type i = 1; i < m_sourceRHS.size(); ++i) {
+    bool isNonTerm = m_sourceRHS[i].GetType() == NonTerminal;
+    if (isNonTerm && predIsNonTerm) {
+      ++scope;
+    }
+    predIsNonTerm = isNonTerm;
+  }
+  if (predIsNonTerm) {
+    ++scope;
+  }
+  return scope;
+}
+
+// Strict ordering used to sort source RHS nodes: a precedes b if the first
+// element of a's span is smaller than the first element of b's span.  Both
+// spans are asserted to be non-empty.
+bool ScfgRule::PartitionOrderComp(const Node *a, const Node *b)
+{
+  const Span &aSpan = a->GetSpan();
+  const Span &bSpan = b->GetSpan();
+  assert(!aSpan.empty() && !bSpan.empty());
+  const int aFirst = *aSpan.begin();
+  const int bFirst = *bSpan.begin();
+  return aFirst < bFirst;
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRule.h b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRule.h
new file mode 100644
index 000000000..1ed534d9e
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRule.h
@@ -0,0 +1,76 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_SCFG_RULE_H_
+#define EXTRACT_GHKM_SCFG_RULE_H_
+
+#include "Alignment.h"
+
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+class Subgraph;
+
+// Distinguishes the two kinds of rule symbol.
+enum SymbolType {
+  Terminal,
+  NonTerminal
+};
+
+// A rule symbol: a string value together with its SymbolType.
+struct Symbol
+{
+ public:
+  // Creates a symbol with value v and type t.
+  Symbol(const std::string &v, SymbolType t)
+    : m_value(v)
+    , m_type(t)
+  {
+  }
+
+  const std::string &GetValue() const {
+    return m_value;
+  }
+  SymbolType GetType() const {
+    return m_type;
+  }
+
+ private:
+  std::string m_value;
+  SymbolType m_type;
+};
+
+// A synchronous CFG rule: a source and a target left-hand side, the two
+// right-hand sides, and an alignment between positions of the right-hand
+// sides.
+class ScfgRule
+{
+ public:
+  // Constructs the rule that corresponds to the given subgraph fragment.
+  ScfgRule(const Subgraph &fragment);
+
+  // Left-hand side symbols.
+  const Symbol &GetSourceLHS() const {
+    return m_sourceLHS;
+  }
+  const Symbol &GetTargetLHS() const {
+    return m_targetLHS;
+  }
+
+  // Right-hand side symbol sequences.
+  const std::vector<Symbol> &GetSourceRHS() const {
+    return m_sourceRHS;
+  }
+  const std::vector<Symbol> &GetTargetRHS() const {
+    return m_targetRHS;
+  }
+
+  // Alignment between source RHS and target RHS positions.
+  const Alignment &GetAlignment() const {
+    return m_alignment;
+  }
+
+  // Computes the scope of the source RHS.
+  int Scope() const;
+
+ private:
+  // Comparison function used to order source RHS nodes.
+  static bool PartitionOrderComp(const Node *, const Node *);
+
+  Symbol m_sourceLHS;
+  Symbol m_targetLHS;
+  std::vector<Symbol> m_sourceRHS;
+  std::vector<Symbol> m_targetRHS;
+  Alignment m_alignment;
+};
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRuleWriter.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRuleWriter.cpp
new file mode 100644
index 000000000..4be3f048d
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRuleWriter.cpp
@@ -0,0 +1,153 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "ScfgRuleWriter.h"
+
+#include "Alignment.h"
+#include "Options.h"
+#include "ScfgRule.h"
+
+#include <cassert>
+#include <ostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// Writes rule in the format selected by Options::unpairedExtractFormat:
+// the unpaired format when the flag is set, the standard format otherwise.
+void ScfgRuleWriter::Write(const ScfgRule &rule)
+{
+  if (!m_options.unpairedExtractFormat) {
+    WriteStandardFormat(rule);
+    return;
+  }
+  WriteUnpairedFormat(rule);
+}
+
+// Writes rule to the forward and inverse streams in the standard format.
+//
+// Each side is written as its space-separated RHS symbols followed by its
+// LHS.  On the source side a non-terminal is immediately followed by the
+// target non-terminal it is aligned with; on the target side a non-terminal
+// is immediately preceded by its aligned source non-terminal.  The line ends
+// with the alignment pairs and a count of 1; the inverse stream receives the
+// same information with the sides (and each alignment pair) swapped.
+void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule)
+{
+  typedef std::vector<Symbol> SymbolList;
+  typedef std::map<int, int> IndexMap;
+
+  const SymbolList &sourceRHS = rule.GetSourceRHS();
+  const SymbolList &targetRHS = rule.GetTargetRHS();
+  const Alignment &alignment = rule.GetAlignment();
+
+  // Position maps between aligned non-terminals, in both directions.
+  IndexMap ntSourceToTarget;
+  IndexMap ntTargetToSource;
+  for (Alignment::const_iterator p(alignment.begin());
+       p != alignment.end(); ++p) {
+    if (sourceRHS[p->first].GetType() != NonTerminal) {
+      continue;
+    }
+    assert(targetRHS[p->second].GetType() == NonTerminal);
+    ntSourceToTarget[p->first] = p->second;
+    ntTargetToSource[p->second] = p->first;
+  }
+
+  // Source side.
+  std::ostringstream sourceSS;
+  for (SymbolList::size_type i = 0; i < sourceRHS.size(); ++i) {
+    const Symbol &symbol = sourceRHS[i];
+    WriteSymbol(symbol, sourceSS);
+    if (symbol.GetType() == NonTerminal) {
+      const int targetIndex = ntSourceToTarget[static_cast<int>(i)];
+      WriteSymbol(targetRHS[targetIndex], sourceSS);
+    }
+    sourceSS << " ";
+  }
+  WriteSymbol(rule.GetSourceLHS(), sourceSS);
+
+  // Target side.
+  std::ostringstream targetSS;
+  for (SymbolList::size_type i = 0; i < targetRHS.size(); ++i) {
+    const Symbol &symbol = targetRHS[i];
+    if (symbol.GetType() == NonTerminal) {
+      const int sourceIndex = ntTargetToSource[static_cast<int>(i)];
+      WriteSymbol(sourceRHS[sourceIndex], targetSS);
+    }
+    WriteSymbol(symbol, targetSS);
+    targetSS << " ";
+  }
+  WriteSymbol(rule.GetTargetLHS(), targetSS);
+
+  // Emit the rule to the forward and inverse extract streams.
+  const std::string sourceSide = sourceSS.str();
+  const std::string targetSide = targetSS.str();
+  m_fwd << sourceSide << " ||| " << targetSide << " |||";
+  m_inv << targetSide << " ||| " << sourceSide << " |||";
+  for (Alignment::const_iterator p(alignment.begin());
+       p != alignment.end(); ++p) {
+    m_fwd << " " << p->first << "-" << p->second;
+    m_inv << " " << p->second << "-" << p->first;
+  }
+  m_fwd << " ||| 1" << std::endl;
+  m_inv << " ||| 1" << std::endl;
+}
+
+// Writes rule to the forward and inverse streams in the unpaired format:
+// each side is its space-separated RHS symbols followed by its LHS, with no
+// pairing of non-terminals across the two sides.  The line ends with the
+// alignment pairs and a count of 1; the inverse stream receives the same
+// information with the sides (and each alignment pair) swapped.
+void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule)
+{
+  const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
+  const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
+  const Alignment &alignment = rule.GetAlignment();
+
+  std::ostringstream sourceSS;
+  std::ostringstream targetSS;
+
+  // Write the source side of the rule to sourceSS.
+  for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
+       p != sourceRHS.end(); ++p) {
+    WriteSymbol(*p, sourceSS);
+    sourceSS << " ";
+  }
+  WriteSymbol(rule.GetSourceLHS(), sourceSS);
+
+  // Write the target side of the rule to targetSS.
+  for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
+       p != targetRHS.end(); ++p) {
+    WriteSymbol(*p, targetSS);
+    targetSS << " ";
+  }
+  WriteSymbol(rule.GetTargetLHS(), targetSS);
+
+  // Write the rule to the forward and inverse extract files.
+  m_fwd << sourceSS.str() << " ||| " << targetSS.str() << " |||";
+  m_inv << targetSS.str() << " ||| " << sourceSS.str() << " |||";
+  for (Alignment::const_iterator p(alignment.begin());
+       p != alignment.end(); ++p) {
+    m_fwd << " " << p->first << "-" << p->second;
+    m_inv << " " << p->second << "-" << p->first;
+  }
+  m_fwd << " ||| 1" << std::endl;
+  m_inv << " ||| 1" << std::endl;
+}
+
+// Writes symbol's value to out; a non-terminal is wrapped in square
+// brackets, a terminal is written as is.
+void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
+{
+  const bool bracketed = (symbol.GetType() == NonTerminal);
+  if (bracketed) {
+    out << "[";
+  }
+  out << symbol.GetValue();
+  if (bracketed) {
+    out << "]";
+  }
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRuleWriter.h b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRuleWriter.h
new file mode 100644
index 000000000..edea1e95c
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/ScfgRuleWriter.h
@@ -0,0 +1,60 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_RULE_WRITER_H_
+#define EXTRACT_GHKM_RULE_WRITER_H_
+
+#include <ostream>
+
+namespace Moses {
+namespace GHKM {
+
+// Forward declarations.  Options and Symbol are defined with the struct
+// class-key, so they are declared with it here as well; mixing class/struct
+// for the same type triggers mismatched-tag warnings on some compilers.
+struct Options;
+class ScfgRule;
+struct Symbol;
+
+// Writes ScfgRule objects to a forward and an inverse output stream.  The
+// writer keeps references to both streams and to the options, so all three
+// must outlive it.  Copying is disabled.
+class ScfgRuleWriter
+{
+ public:
+  // fwd receives the forward form of each rule, inv the inverse form;
+  // options selects the output format used by Write().
+  ScfgRuleWriter(std::ostream &fwd, std::ostream &inv, const Options &options)
+    : m_fwd(fwd), m_inv(inv), m_options(options)
+  {
+  }
+
+  // Writes a single rule to both streams.
+  void Write(const ScfgRule &);
+
+ private:
+  // Disallow copying
+  ScfgRuleWriter(const ScfgRuleWriter &);
+  ScfgRuleWriter &operator=(const ScfgRuleWriter &);
+
+  // Format-specific writers and the shared symbol formatter.
+  void WriteStandardFormat(const ScfgRule &);
+  void WriteUnpairedFormat(const ScfgRule &);
+  void WriteSymbol(const Symbol &, std::ostream &);
+
+  std::ostream &m_fwd;
+  std::ostream &m_inv;
+  const Options &m_options;
+};
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Span.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/Span.cpp
new file mode 100644
index 000000000..f0eccbdf2
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Span.cpp
@@ -0,0 +1,46 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "Span.h"
+
+namespace Moses {
+namespace GHKM {
+
+// Returns true if some element x of a satisfies b.first <= x <= b.second.
+//
+// Span is an ordered set, so the smallest element that is >= b.first can be
+// found with lower_bound in logarithmic time; the spans intersect exactly
+// when that element exists and does not exceed b.second.
+bool SpansIntersect(const Span &a, const ContiguousSpan &b)
+{
+  Span::const_iterator p = a.lower_bound(b.first);
+  return p != a.end() && *p <= b.second;
+}
+
+// Returns the smallest contiguous span covering s, i.e. the pair
+// (smallest element, largest element), or (-1,-1) if s is empty.
+ContiguousSpan Closure(const Span &s)
+{
+  if (s.empty()) {
+    return ContiguousSpan(-1, -1);
+  }
+  return ContiguousSpan(*s.begin(), *s.rbegin());
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Span.h b/scripts/training/phrase-extract/extract-ghkm/tools/Span.h
new file mode 100644
index 000000000..003d1ef84
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Span.h
@@ -0,0 +1,40 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_SPAN_H_
+#define EXTRACT_GHKM_SPAN_H_
+
+#include <map>
+#include <set>
+
+namespace Moses {
+namespace GHKM {
+
+// A span is an ordered set of integers, not necessarily contiguous.
+typedef std::set<int> Span;
+
+// A contiguous span, stored as its (first, second) bounds; both bounds are
+// treated as inclusive by SpansIntersect.
+typedef std::pair<int, int> ContiguousSpan;
+
+// Returns true if any element of a lies within the bounds of b.
+bool SpansIntersect(const Span &a, const ContiguousSpan &b);
+
+// Returns (smallest element, largest element) of s, or (-1,-1) if s is empty.
+ContiguousSpan Closure(const Span &s);
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Subgraph.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/Subgraph.cpp
new file mode 100644
index 000000000..e5aedbb16
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Subgraph.cpp
@@ -0,0 +1,105 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "Subgraph.h"
+
+#include "Node.h"
+
+namespace Moses {
+namespace GHKM {
+
+// Replaces the contents of result with the target-side frontier of the
+// subgraph, collected by a left-to-right walk from the root.
+void Subgraph::GetTargetLeaves(std::vector<const Node *> &result) const
+{
+  result.clear();
+  GetTargetLeaves(m_root, result);
+}
+
+// Recursive helper: appends root to result if it is a TARGET node or one of
+// the subgraph's leaves; otherwise descends into root's children in order.
+void Subgraph::GetTargetLeaves(const Node *root,
+                               std::vector<const Node *> &result) const
+{
+  const bool stopHere =
+      root->GetType() == TARGET || m_leaves.count(root) != 0;
+  if (stopHere) {
+    result.push_back(root);
+    return;
+  }
+  const std::vector<Node*> &children = root->GetChildren();
+  for (std::vector<Node*>::size_type i = 0; i < children.size(); ++i) {
+    GetTargetLeaves(children[i], result);
+  }
+}
+
+// Counts the TREE nodes of the subgraph at or below n.  A non-TREE node
+// counts zero and a trivial subgraph counts one.  Otherwise n counts one,
+// each child that is a leaf of the subgraph counts one if it is a TREE node,
+// and every other child is counted recursively.
+int Subgraph::CountNodes(const Node *n) const
+{
+  if (n->GetType() != TREE) {
+    return 0;
+  }
+  if (IsTrivial()) {
+    return 1;
+  }
+  int total = 1;
+  const std::vector<Node*> &children = n->GetChildren();
+  for (std::vector<Node*>::size_type i = 0; i < children.size(); ++i) {
+    const Node *child = children[i];
+    const bool isLeaf = m_leaves.count(child) != 0;
+    if (!isLeaf) {
+      total += CountNodes(child);
+    } else if (child->GetType() == TREE) {
+      total += 1;
+    }
+  }
+  return total;
+}
+
+// Computes the size of the subgraph at or below n.  Non-TREE nodes and
+// preterminals contribute zero and a trivial subgraph has size one.
+// Otherwise n contributes one plus the sizes of those children that are not
+// leaves of the subgraph.
+int Subgraph::CalcSize(const Node *n) const
+{
+  const bool contributes = n->GetType() == TREE && !n->IsPreterminal();
+  if (!contributes) {
+    return 0;
+  }
+  if (IsTrivial()) {
+    return 1;
+  }
+  int total = 1;
+  const std::vector<Node*> &children = n->GetChildren();
+  for (std::vector<Node*>::size_type i = 0; i < children.size(); ++i) {
+    if (m_leaves.count(children[i]) != 0) {
+      continue;
+    }
+    total += CalcSize(children[i]);
+  }
+  return total;
+}
+
+// Computes the depth of the subgraph at or below n.  Non-TREE nodes,
+// preterminals and subgraphs with an empty leaf set have depth zero.
+// Otherwise the depth is one more than the largest depth among the children
+// that are not leaves of the subgraph.
+int Subgraph::CalcDepth(const Node *n) const
+{
+  if (n->GetType() != TREE || n->IsPreterminal() || m_leaves.empty()) {
+    return 0;
+  }
+  int maxChildDepth = 0;
+  const std::vector<Node*> &children = n->GetChildren();
+  for (std::vector<Node *>::const_iterator p = children.begin();
+       p != children.end(); ++p) {
+    if (m_leaves.find(*p) == m_leaves.end()) {
+      // Compared by hand rather than with std::max: this file does not
+      // include <algorithm>, so std::max is not guaranteed to be declared.
+      const int childDepth = CalcDepth(*p);
+      if (childDepth > maxChildDepth) {
+        maxChildDepth = childDepth;
+      }
+    }
+  }
+  return maxChildDepth + 1;
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/Subgraph.h b/scripts/training/phrase-extract/extract-ghkm/tools/Subgraph.h
new file mode 100644
index 000000000..e84903502
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/Subgraph.h
@@ -0,0 +1,81 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_SUBGRAPH_H_
+#define EXTRACT_GHKM_SUBGRAPH_H_
+
+#include "Node.h"
+
+#include <set>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+
+// A fragment of the node graph, described by a root node and the set of
+// nodes at which the fragment stops (its leaves).  Depth, size and node
+// count are computed at construction time and cached.
+class Subgraph
+{
+ public:
+  // Creates a trivial subgraph consisting of root with an empty leaf set.
+  // Its depth is 0, its size is 1 for a TREE root and 0 otherwise, and its
+  // node count is 1.
+  Subgraph(const Node *root)
+    : m_root(root),
+      m_depth(0),
+      m_size(root->GetType() == TREE ? 1 : 0),
+      m_nodeCount(1)
+  {
+  }
+
+  // Creates a subgraph with the given root and leaf set.  The cached
+  // measures start at -1 and are then computed from the root.
+  Subgraph(const Node *root, const std::set<const Node *> &leaves)
+    : m_root(root),
+      m_leaves(leaves),
+      m_depth(-1),
+      m_size(-1),
+      m_nodeCount(-1)
+  {
+    m_depth = CalcDepth(m_root);
+    m_size = CalcSize(m_root);
+    m_nodeCount = CountNodes(m_root);
+  }
+
+  const Node *GetRoot() const {
+    return m_root;
+  }
+  const std::set<const Node *> &GetLeaves() const {
+    return m_leaves;
+  }
+  int GetDepth() const {
+    return m_depth;
+  }
+  int GetSize() const {
+    return m_size;
+  }
+  int GetNodeCount() const {
+    return m_nodeCount;
+  }
+
+  // A subgraph is trivial if its leaf set is empty.
+  bool IsTrivial() const {
+    return m_leaves.empty();
+  }
+
+  // Fills the vector with the target-side frontier of the subgraph.
+  void GetTargetLeaves(std::vector<const Node *> &) const;
+
+ private:
+  void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
+  int CalcDepth(const Node *) const;
+  int CalcSize(const Node *) const;
+  int CountNodes(const Node *) const;
+
+  const Node *m_root;
+  std::set<const Node *> m_leaves;
+  int m_depth;
+  int m_size;
+  int m_nodeCount;
+};
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/XmlTreeParser.cpp b/scripts/training/phrase-extract/extract-ghkm/tools/XmlTreeParser.cpp
new file mode 100644
index 000000000..ab4616918
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/XmlTreeParser.cpp
@@ -0,0 +1,88 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#include "XmlTreeParser.h"
+
+#include "ParseTree.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// Parses one subtree from the token range [p, end) and returns it, advancing
+// p past the tokens that were consumed.
+//
+// Returns a null pointer when there is nothing to parse: the range is
+// exhausted (possibly after skipping tokens that are empty once trimmed) or
+// the next token is a closing tag.  In the closing-tag case p is left on the
+// tag so that the caller can consume it.
+//
+// A token that is not an XML tag becomes a leaf labelled with the unescaped
+// token.  An opening tag becomes a node labelled with the tag's "label"
+// attribute whose children are parsed recursively up to the matching closing
+// tag; a self-closing tag becomes a node without children.
+std::auto_ptr<ParseTree> ParseXmlTree(
+    std::vector<std::string>::const_iterator &p,
+    const std::vector<std::string>::const_iterator &end)
+{
+  std::auto_ptr<ParseTree> t;
+
+  // Skip tokens that are empty after trimming.  The range check is repeated
+  // on every step so that trailing empty tokens never cause a read at end.
+  std::string s;
+  while (p != end) {
+    s = Trim(*p);
+    if (!s.empty()) {
+      break;
+    }
+    ++p;
+  }
+  if (p == end) {
+    return t;
+  }
+
+  if (!isXmlTag(s)) {
+    p++;
+    t.reset(new ParseTree(unescape(s)));
+    return t;
+  }
+
+  const std::string &tag = s;
+
+  if (tag[1] == '/') {
+    // Closing tag.  Don't advance p -- let caller handle it.
+    return t;
+  }
+
+  std::string label = ParseXmlTagAttribute(tag, "label");
+  t.reset(new ParseTree(label));
+
+  // The size check keeps tag.size()-2 from wrapping around on a
+  // one-character tag.
+  if (tag.size() >= 2 && tag[tag.size()-2] == '/') {
+    // Unary tag.
+    p++;
+    return t;
+  }
+
+  p++;
+  for (;;) {
+    // Keep the child owned by an auto_ptr until it has been attached, so
+    // that it is not leaked if AddChild throws.
+    std::auto_ptr<ParseTree> c(ParseXmlTree(p, end));
+    if (!c.get()) {
+      break;
+    }
+    c->SetParent(t.get());
+    t->AddChild(c.get());
+    c.release();
+  }
+  // Skip over the closing tag, unless the input ended without one.
+  if (p != end) {
+    p++;
+  }
+
+  return t;
+}
+
+// Tokenizes line with TokenizeXml and parses the resulting tokens into a
+// tree.  Returns whatever the iterator-based overload returns for the full
+// token range, which may be a null pointer.
+std::auto_ptr<ParseTree> ParseXmlTree(const std::string &line)
+{
+  const std::vector<std::string> xmlTokens = TokenizeXml(line);
+  std::vector<std::string>::const_iterator cursor = xmlTokens.begin();
+  const std::vector<std::string>::const_iterator last = xmlTokens.end();
+  return ParseXmlTree(cursor, last);
+}
+
+}  // namespace GHKM
+}  // namespace Moses
diff --git a/scripts/training/phrase-extract/extract-ghkm/tools/XmlTreeParser.h b/scripts/training/phrase-extract/extract-ghkm/tools/XmlTreeParser.h
new file mode 100644
index 000000000..16f4c3f8e
--- /dev/null
+++ b/scripts/training/phrase-extract/extract-ghkm/tools/XmlTreeParser.h
@@ -0,0 +1,37 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+ 
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+ 
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ Lesser General Public License for more details.
+ 
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_
+#define EXTRACT_GHKM_XML_TREE_PARSER_H_
+
+#include <memory>
+#include <string>
+
+namespace Moses {
+namespace GHKM {
+
+class ParseTree;
+// Tokenizes an XML tree string and returns the parsed tree (caller owns it).
+std::auto_ptr<ParseTree> ParseXmlTree(const std::string &);
+
+}  // namespace GHKM
+}  // namespace Moses
+
+#endif
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index b6bc7d2f0..18eb80eb1 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -30,7 +30,7 @@ my($_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_DIR, $_TEMP_DIR, $_
    $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
    @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
    $_DONT_ZIP,  $_MGIZA, $_MGIZA_CPUS,  $_HMM_ALIGN, $_CONFIG,
-   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS,
+   $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_EXTRACT_OPTIONS,$_SCORE_OPTIONS,
    $_PHRASE_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
    $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
    $_CONTINUE,$_MAX_LEXICAL_REORDERING,$_DO_STEPS,
@@ -99,6 +99,7 @@ $_HELP = 1
 		       'glue-grammar' => \$_GLUE_GRAMMAR,
 		       'glue-grammar-file=s' => \$_GLUE_GRAMMAR_FILE,
 		       'unknown-word-label-file=s' => \$_UNKNOWN_WORD_LABEL_FILE,
+		       'ghkm' => \$_GHKM,
 		       'extract-options=s' => \$_EXTRACT_OPTIONS,
 		       'score-options=s' => \$_SCORE_OPTIONS,
 		       'source-syntax' => \$_SOURCE_SYNTAX,
@@ -196,7 +197,13 @@ my $MKCLS = "$BINDIR/mkcls";
 
 # supporting scripts/binaries from this package
 my $PHRASE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract";
-my $RULE_EXTRACT = "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
+# Rule extractor binary.  The --ghkm option selects the GHKM extractor under
+# extract-ghkm/tools; without it the standard extract-rules binary is used.
+# Only whether the option was given matters, hence defined() rather than a
+# truth test.
+my $RULE_EXTRACT = defined($_GHKM)
+  ? "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-ghkm/tools/extract-ghkm"
+  : "$SCRIPTS_ROOTDIR/training/phrase-extract/extract-rules";
 my $LEXICAL_REO_SCORER = "$SCRIPTS_ROOTDIR/training/lexical-reordering/score";
 my $MEMSCORE = "$SCRIPTS_ROOTDIR/training/memscore/memscore";
 my $EPPEX = "$SCRIPTS_ROOTDIR/training/eppex/eppex";
@@ -1304,8 +1311,10 @@ sub extract_phrase {
         $cmd = "$RULE_EXTRACT $alignment_file_e $alignment_file_f $alignment_file_a $extract_file";
         $cmd .= " --GlueGrammar $___GLUE_GRAMMAR_FILE" if $_GLUE_GRAMMAR;
         $cmd .= " --UnknownWordLabel $_UNKNOWN_WORD_LABEL_FILE" if $_TARGET_SYNTAX && defined($_UNKNOWN_WORD_LABEL_FILE);
-        $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
-        $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
+        if (!defined($_GHKM)) {
+          $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
+          $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
+        }
         $cmd .= " ".$_EXTRACT_OPTIONS if defined($_EXTRACT_OPTIONS);
     }
     else
author	Phil Williams <philip.williams@mac.com>	2011-11-21 20:21:04 +0400
committer	Phil Williams <philip.williams@mac.com>	2011-11-21 20:21:04 +0400
commit	ee0a6dbd5cea7d2a72bbfb0d081a9232d1f60a8b (patch)
tree	a8a57bd866061d0292cfb2f978fa91190dff9f87 /scripts
parent	463f2e232e9e1907ccd4ef69045e06ff77ad5f3f (diff)