Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorHieu Hoang <fishandfrolick@gmail.com>2012-05-31 20:24:06 +0400
committerHieu Hoang <fishandfrolick@gmail.com>2012-05-31 20:24:06 +0400
commit4eef94b1217a82eb979242dd3e06d8a4b6255e6e (patch)
treebab044ee77c9bf699954ea789078f4c885d66723 /phrase-extract
parentc3d4615fc012a5ad5b33a2b587df31064e0ec82b (diff)
move c++ code out of /script/ to /
Diffstat (limited to 'phrase-extract')
-rw-r--r--phrase-extract/AlignmentPhrase.cpp42
-rw-r--r--phrase-extract/AlignmentPhrase.h72
-rw-r--r--phrase-extract/ExtractedRule.cpp41
-rw-r--r--phrase-extract/ExtractedRule.h74
-rw-r--r--phrase-extract/Hole.h111
-rw-r--r--phrase-extract/HoleCollection.cpp62
-rw-r--r--phrase-extract/HoleCollection.h97
-rw-r--r--phrase-extract/InputFileStream.cpp61
-rw-r--r--phrase-extract/InputFileStream.h48
-rw-r--r--phrase-extract/Jamfile32
-rw-r--r--phrase-extract/OutputFileStream.cpp79
-rw-r--r--phrase-extract/OutputFileStream.h50
-rw-r--r--phrase-extract/PhraseAlignment.cpp239
-rw-r--r--phrase-extract/PhraseAlignment.h54
-rw-r--r--phrase-extract/RuleExist.h59
-rw-r--r--phrase-extract/RuleExtractionOptions.h88
-rw-r--r--phrase-extract/SafeGetline.h35
-rw-r--r--phrase-extract/SentenceAlignment.cpp91
-rw-r--r--phrase-extract/SentenceAlignment.h46
-rw-r--r--phrase-extract/SentenceAlignmentWithSyntax.cpp70
-rw-r--r--phrase-extract/SentenceAlignmentWithSyntax.h65
-rw-r--r--phrase-extract/SyntaxTree.cpp180
-rw-r--r--phrase-extract/SyntaxTree.h121
-rw-r--r--phrase-extract/XmlException.h41
-rw-r--r--phrase-extract/XmlTree.cpp392
-rw-r--r--phrase-extract/XmlTree.h36
-rw-r--r--phrase-extract/consolidate-direct.cpp138
-rw-r--r--phrase-extract/consolidate-direct.vcxproj98
-rw-r--r--phrase-extract/consolidate-reverse.cpp245
-rw-r--r--phrase-extract/consolidate.cpp326
-rw-r--r--phrase-extract/consolidate.vcxproj100
-rw-r--r--phrase-extract/extract-ghkm/Alignment.cpp74
-rw-r--r--phrase-extract/extract-ghkm/Alignment.h38
-rw-r--r--phrase-extract/extract-ghkm/AlignmentGraph.cpp386
-rw-r--r--phrase-extract/extract-ghkm/AlignmentGraph.h76
-rw-r--r--phrase-extract/extract-ghkm/ComposedRule.cpp129
-rw-r--r--phrase-extract/extract-ghkm/ComposedRule.h70
-rw-r--r--phrase-extract/extract-ghkm/Exception.h42
-rw-r--r--phrase-extract/extract-ghkm/ExtractGHKM.cpp492
-rw-r--r--phrase-extract/extract-ghkm/ExtractGHKM.h71
-rw-r--r--phrase-extract/extract-ghkm/Jamfile1
-rw-r--r--phrase-extract/extract-ghkm/Main.cpp26
-rw-r--r--phrase-extract/extract-ghkm/Node.cpp71
-rw-r--r--phrase-extract/extract-ghkm/Node.h184
-rw-r--r--phrase-extract/extract-ghkm/Options.h67
-rw-r--r--phrase-extract/extract-ghkm/ParseTree.cpp54
-rw-r--r--phrase-extract/extract-ghkm/ParseTree.h85
-rw-r--r--phrase-extract/extract-ghkm/ScfgRule.cpp145
-rw-r--r--phrase-extract/extract-ghkm/ScfgRule.h78
-rw-r--r--phrase-extract/extract-ghkm/ScfgRuleWriter.cpp164
-rw-r--r--phrase-extract/extract-ghkm/ScfgRuleWriter.h60
-rw-r--r--phrase-extract/extract-ghkm/Span.cpp46
-rw-r--r--phrase-extract/extract-ghkm/Span.h40
-rw-r--r--phrase-extract/extract-ghkm/Subgraph.cpp121
-rw-r--r--phrase-extract/extract-ghkm/Subgraph.h87
-rw-r--r--phrase-extract/extract-ghkm/XmlTreeParser.cpp89
-rw-r--r--phrase-extract/extract-ghkm/XmlTreeParser.h59
-rw-r--r--phrase-extract/extract-lex.cpp222
-rw-r--r--phrase-extract/extract-lex.h117
-rw-r--r--phrase-extract/extract-lex.vcxproj89
-rw-r--r--phrase-extract/extract-rules.cpp1100
-rw-r--r--phrase-extract/extract-rules.vcxproj111
-rw-r--r--phrase-extract/extract.cpp699
-rw-r--r--phrase-extract/extract.vcxproj103
-rw-r--r--phrase-extract/extract.xcodeproj/project.pbxproj788
-rw-r--r--phrase-extract/gzfilebuf.h85
-rw-r--r--phrase-extract/hierarchical.h33
-rw-r--r--phrase-extract/pcfg-common/Jamfile1
-rw-r--r--phrase-extract/pcfg-common/exception.h41
-rw-r--r--phrase-extract/pcfg-common/numbered_set.h109
-rw-r--r--phrase-extract/pcfg-common/pcfg.cc106
-rw-r--r--phrase-extract/pcfg-common/pcfg.h61
-rw-r--r--phrase-extract/pcfg-common/pcfg_tree.h77
-rw-r--r--phrase-extract/pcfg-common/syntax_tree.h91
-rw-r--r--phrase-extract/pcfg-common/tool.cc80
-rw-r--r--phrase-extract/pcfg-common/tool.h91
-rw-r--r--phrase-extract/pcfg-common/typedef.h37
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.cc88
-rw-r--r--phrase-extract/pcfg-common/xml_tree_parser.h56
-rw-r--r--phrase-extract/pcfg-common/xml_tree_writer.h133
-rw-r--r--phrase-extract/pcfg-extract/Jamfile1
-rw-r--r--phrase-extract/pcfg-extract/main.cc25
-rw-r--r--phrase-extract/pcfg-extract/options.h36
-rw-r--r--phrase-extract/pcfg-extract/pcfg_extract.cc131
-rw-r--r--phrase-extract/pcfg-extract/pcfg_extract.h42
-rw-r--r--phrase-extract/pcfg-extract/rule_collection.cc58
-rw-r--r--phrase-extract/pcfg-extract/rule_collection.h59
-rw-r--r--phrase-extract/pcfg-extract/rule_extractor.cc51
-rw-r--r--phrase-extract/pcfg-extract/rule_extractor.h45
-rw-r--r--phrase-extract/pcfg-score/Jamfile1
-rw-r--r--phrase-extract/pcfg-score/main.cc25
-rw-r--r--phrase-extract/pcfg-score/options.h36
-rw-r--r--phrase-extract/pcfg-score/pcfg_score.cc152
-rw-r--r--phrase-extract/pcfg-score/pcfg_score.h42
-rw-r--r--phrase-extract/pcfg-score/tree_scorer.cc68
-rw-r--r--phrase-extract/pcfg-score/tree_scorer.h47
-rw-r--r--phrase-extract/phrase-extract.sln56
-rw-r--r--phrase-extract/relax-parse.cpp277
-rw-r--r--phrase-extract/relax-parse.h46
-rw-r--r--phrase-extract/score.cpp761
-rw-r--r--phrase-extract/score.h65
-rw-r--r--phrase-extract/score.vcxproj106
-rw-r--r--phrase-extract/statistics.cpp340
-rw-r--r--phrase-extract/tables-core.cpp124
-rw-r--r--phrase-extract/tables-core.h66
105 files changed, 12926 insertions, 0 deletions
diff --git a/phrase-extract/AlignmentPhrase.cpp b/phrase-extract/AlignmentPhrase.cpp
new file mode 100644
index 000000000..2d4439567
--- /dev/null
+++ b/phrase-extract/AlignmentPhrase.cpp
@@ -0,0 +1,42 @@
+// $Id$
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include <algorithm>
+#include <iostream>
+#include "AlignmentPhrase.h"
+
+using namespace std;
+
+// Adds the alignment point `align` to this element.
+// m_elements is a std::set, so adding a value that is already present
+// leaves the element unchanged.
+void AlignmentElement::Merge(size_t align)
+{
+ m_elements.insert(align);
+}
+
+// Merges per-position alignment points into this phrase: every value in
+// source[i] is added to the alignment element stored at position i.
+// NOTE(review): m_elements is indexed without a bounds check, so this
+// presumably expects source.size() <= GetSize() -- confirm against callers.
+void AlignmentPhrase::Merge(const std::vector< std::vector<size_t> > &source)
+{
+  for (size_t position = 0; position < source.size(); ++position) {
+    const vector<size_t> &points = source[position];
+    AlignmentElement &target = m_elements[position];
+
+    vector<size_t>::const_iterator point;
+    for (point = points.begin(); point != points.end(); ++point) {
+      target.Merge(*point);
+    }
+  }
+}
diff --git a/phrase-extract/AlignmentPhrase.h b/phrase-extract/AlignmentPhrase.h
new file mode 100644
index 000000000..f77b44f36
--- /dev/null
+++ b/phrase-extract/AlignmentPhrase.h
@@ -0,0 +1,72 @@
+// $Id$
+/***********************************************************************
+Moses - factored phrase-based language decoder
+Copyright (C) 2006 University of Edinburgh
+
+This library is free software; you can redistribute it and/or
+modify it under the terms of the GNU Lesser General Public
+License as published by the Free Software Foundation; either
+version 2.1 of the License, or (at your option) any later version.
+
+This library is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+Lesser General Public License for more details.
+
+You should have received a copy of the GNU Lesser General Public
+License along with this library; if not, write to the Free Software
+Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include <vector>
+#include <set>
+
+class WordsRange;
+
+
+// Set of alignment points (stored as size_t values) for one position.
+// Iteration via begin()/end() is read-only and follows std::set ordering.
+class AlignmentElement
+{
+protected:
+ // The alignment points; a std::set, so each value is held at most once.
+ std::set<size_t> m_elements;
+public:
+ typedef std::set<size_t>::iterator iterator;
+ typedef std::set<size_t>::const_iterator const_iterator;
+ // First alignment point (const access only).
+ const_iterator begin() const {
+ return m_elements.begin();
+ }
+ // One-past-the-last alignment point.
+ const_iterator end() const {
+ return m_elements.end();
+ }
+
+ // Creates an element with no alignment points.
+ AlignmentElement()
+ {}
+
+ // Number of distinct alignment points held.
+ size_t GetSize() const {
+ return m_elements.size();
+ }
+
+ // Adds one alignment point (defined in AlignmentPhrase.cpp).
+ void Merge(size_t align);
+};
+
+// Fixed-length sequence of AlignmentElement objects, one per position.
+class AlignmentPhrase
+{
+protected:
+ // One alignment element per position; sized once in the constructor.
+ std::vector<AlignmentElement> m_elements;
+public:
+ // Creates a phrase with `size` empty alignment elements.
+ AlignmentPhrase(size_t size)
+ :m_elements(size)
+ {}
+ // NOTE(review): only declared here; AlignmentPhrase.cpp in this commit
+ // defines just the vector overload below -- confirm whether this is used.
+ void Merge(const AlignmentPhrase &newAlignment, const WordsRange &newAlignmentRange);
+ // Adds every value in source[i] to the element at position i.
+ void Merge(const std::vector< std::vector<size_t> > &source);
+ // Number of positions in the phrase.
+ size_t GetSize() const {
+ return m_elements.size();
+ }
+ // Read-only access to the element at `pos`; no bounds check is performed.
+ const AlignmentElement &GetElement(size_t pos) const {
+ return m_elements[pos];
+ }
+};
+
+
+
diff --git a/phrase-extract/ExtractedRule.cpp b/phrase-extract/ExtractedRule.cpp
new file mode 100644
index 000000000..c566e842a
--- /dev/null
+++ b/phrase-extract/ExtractedRule.cpp
@@ -0,0 +1,41 @@
+//
+// ExtractedRule.cpp
+// extract
+//
+// Created by Hieu Hoang on 13/09/2011.
+// Copyright 2011 __MyCompanyName__. All rights reserved.
+//
+
+#include "ExtractedRule.h"
+
+using namespace std;
+
+// Writes the non-terminal span lengths to an arbitrary output stream.
+// The text is first formatted into a temporary string stream by the
+// std::ostringstream overload and then copied to `out`.
+void ExtractedRule::OutputNTLengths(std::ostream &out) const
+{
+  ostringstream outString;
+  OutputNTLengths(outString);
+  // Insert the buffered text, not the stream object itself: streaming the
+  // ostringstream prints a pointer value on pre-C++11 compilers and does not
+  // compile at all from C++11 onwards.
+  out << outString.str();
+}
+
+// Appends one "<sourcePos>=<sourceLength>,<targetLength> " item (note the
+// trailing space) to outString for every entry of m_ntLengths, visiting the
+// entries in the map's key order.
+void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const
+{
+  typedef std::map<size_t, std::pair<size_t, size_t> > NTLengthMap;
+
+  for (NTLengthMap::const_iterator entry = m_ntLengths.begin();
+       entry != m_ntLengths.end(); ++entry) {
+    outString << entry->first << "="
+              << entry->second.first << ","
+              << entry->second.second << " ";
+  }
+}
+
+// Prints a rule as
+//   source ||| target ||| alignment ||| alignmentInv ||| <NT lengths>
+// and returns the stream so that insertions can be chained.
+std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj)
+{
+  out << obj.source << " ||| ";
+  out << obj.target << " ||| ";
+  out << obj.alignment << " ||| ";
+  out << obj.alignmentInv << " ||| ";
+
+  obj.OutputNTLengths(out);
+  return out;
+}
+
diff --git a/phrase-extract/ExtractedRule.h b/phrase-extract/ExtractedRule.h
new file mode 100644
index 000000000..be6e30836
--- /dev/null
+++ b/phrase-extract/ExtractedRule.h
@@ -0,0 +1,74 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef EXTRACTEDRULE_H_INCLUDED_
+#define EXTRACTEDRULE_H_INCLUDED_
+
+#include <string>
+#include <iostream>
+#include <sstream>
+#include <map>
+
+// sentence-level collection of rules
+class ExtractedRule
+{
+ // Prints source, target, alignment, alignmentInv and the NT lengths,
+ // separated by " ||| ".
+ friend std::ostream& operator<<(std::ostream &, const ExtractedRule &);
+
+public:
+ // Textual fields; all start out empty and are filled in by the caller.
+ std::string source;
+ std::string target;
+ std::string alignment;
+ std::string alignmentInv;
+ std::string orientation;
+ std::string orientationForward;
+ // Span boundaries passed to the constructor
+ // (presumably T = target side, S = source side -- confirm against callers).
+ int startT;
+ int endT;
+ int startS;
+ int endS;
+ // Initialised to 0 by the constructor.
+ float count;
+ // Initialised to 0.0 by the constructor.
+ double pcfgScore;
+
+ // source position -> (source length, target length); see SetSpanLength().
+ std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
+
+ // Stores the four span boundaries; every other field is left empty or zero.
+ ExtractedRule(int sT, int eT, int sS, int eS)
+ : source()
+ , target()
+ , alignment()
+ , alignmentInv()
+ , orientation()
+ , orientationForward()
+ , startT(sT)
+ , endT(eT)
+ , startS(sS)
+ , endS(eS)
+ , count(0)
+ , pcfgScore(0.0)
+ {}
+
+ // Records (or overwrites) the span lengths stored for `sourcePos`.
+ void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength)
+ {
+ m_ntLengths[sourcePos] = std::pair<size_t, size_t>(sourceLength, targetLength);
+ }
+
+ // Write the contents of m_ntLengths to the given stream
+ // (both overloads are defined in ExtractedRule.cpp).
+ void OutputNTLengths(std::ostream &out) const;
+ void OutputNTLengths(std::ostringstream &out) const;
+};
+
+#endif
diff --git a/phrase-extract/Hole.h b/phrase-extract/Hole.h
new file mode 100644
index 000000000..31e928f6e
--- /dev/null
+++ b/phrase-extract/Hole.h
@@ -0,0 +1,111 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef HOLE_H_INCLUDED_
+#define HOLE_H_INCLUDED_
+
+#include <cassert>
+#include <list>
+#include <string>
+#include <vector>
+
+// A gap described by a start/end span, a position and a label for each of
+// two directions. The four-argument constructor stores the source span at
+// index 0 and the target span at index 1, so `direction` is 0 for source
+// and 1 for target throughout.
+class Hole
+{
+protected:
+ // Each vector always has exactly two entries: [0] = source, [1] = target.
+ std::vector<int> m_start, m_end, m_pos;
+ std::vector<std::string> m_label;
+
+public:
+ // Creates a hole whose spans and positions are all 0 and labels are empty.
+ Hole()
+ : m_start(2)
+ , m_end(2)
+ , m_pos(2)
+ , m_label(2)
+ {}
+
+ // Member-wise copy.
+ Hole(const Hole &copy)
+ : m_start(copy.m_start)
+ , m_end(copy.m_end)
+ , m_pos(copy.m_pos)
+ , m_label(copy.m_label)
+ {}
+
+ // Creates a hole covering [startS, endS] on the source side and
+ // [startT, endT] on the target side; positions and labels stay default.
+ Hole(int startS, int endS, int startT, int endT)
+ : m_start(2)
+ , m_end(2)
+ , m_pos(2)
+ , m_label(2) {
+ m_start[0] = startS;
+ m_end[0] = endS;
+ m_start[1] = startT;
+ m_end[1] = endT;
+ }
+
+ // Start of the span in the given direction (no bounds check on direction).
+ int GetStart(size_t direction) const {
+ return m_start[direction];
+ }
+
+ // End of the span in the given direction.
+ int GetEnd(size_t direction) const {
+ return m_end[direction];
+ }
+
+ // Span length; the +1 shows that the end index is inclusive.
+ int GetSize(size_t direction) const {
+ return m_end[direction] - m_start[direction] + 1;
+ }
+
+ // Stores a position value for the given direction.
+ void SetPos(int pos, size_t direction) {
+ m_pos[direction] = pos;
+ }
+
+ // Position value previously stored with SetPos() (0 if never set).
+ int GetPos(size_t direction) const {
+ return m_pos[direction];
+ }
+
+ // Stores a label for the given direction.
+ void SetLabel(const std::string &label, size_t direction) {
+ m_label[direction] = label;
+ }
+
+ // Label previously stored with SetLabel() (empty if never set).
+ const std::string &GetLabel(size_t direction) const {
+ return m_label[direction];
+ }
+
+ // True when the two spans share at least one index in `direction`,
+ // i.e. neither span lies entirely before the other.
+ bool Overlap(const Hole &otherHole, size_t direction) const {
+ return ! ( otherHole.GetEnd(direction) < GetStart(direction) ||
+ otherHole.GetStart(direction) > GetEnd(direction) );
+ }
+
+ // True when the other span ends immediately before this one starts, or
+ // starts immediately after this one ends.
+ bool Neighbor(const Hole &otherHole, size_t direction) const {
+ return ( otherHole.GetEnd(direction)+1 == GetStart(direction) ||
+ otherHole.GetStart(direction) == GetEnd(direction)+1 );
+ }
+};
+
+typedef std::list<Hole> HoleList;
+
+// Comparison functor that orders holes by their source-side (direction 0)
+// start position; used to sort vectors of Hole pointers.
+class HoleSourceOrderer
+{
+public:
+  bool operator()(const Hole* holeA, const Hole* holeB) const {
+    // Two *different* holes must never start at the same source position.
+    // Comparing a hole with itself is permitted, though: a sorting algorithm
+    // may evaluate comp(x, x), which must simply yield false.
+    assert(holeA == holeB || holeA->GetStart(0) != holeB->GetStart(0));
+    return holeA->GetStart(0) < holeB->GetStart(0);
+  }
+};
+
+#endif
diff --git a/phrase-extract/HoleCollection.cpp b/phrase-extract/HoleCollection.cpp
new file mode 100644
index 000000000..4cffab7fd
--- /dev/null
+++ b/phrase-extract/HoleCollection.cpp
@@ -0,0 +1,62 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "HoleCollection.h"
+
+#include <algorithm>
+
+// Fills m_sortedSourceHoles with pointers to the holes stored in m_holes and
+// sorts them by source start position. Must only be called while
+// m_sortedSourceHoles is still empty.
+void HoleCollection::SortSourceHoles()
+{
+  assert(m_sortedSourceHoles.size() == 0);
+
+  // Collect the address of every stored hole.
+  for (HoleList::iterator hole = m_holes.begin(); hole != m_holes.end(); ++hole) {
+    m_sortedSourceHoles.push_back(&*hole);
+  }
+
+  // Order the pointers by the holes' source-side start.
+  std::sort(m_sortedSourceHoles.begin(), m_sortedSourceHoles.end(), HoleSourceOrderer());
+}
+
+// Adds a hole to the collection and updates the cached scope.
+// Note the parameter order: the target span comes first here, whereas the
+// Hole constructor takes the source span first.
+void HoleCollection::Add(int startT, int endT, int startS, int endS)
+{
+  const Hole newHole(startS, endS, startT, endT);
+
+  // Scope() is evaluated against the collection as it was before this hole.
+  m_scope = Scope(newHole);
+
+  m_holes.push_back(newHole);
+  m_sourceHoleStartPoints.insert(startS);
+  m_sourceHoleEndPoints.insert(endS);
+}
+
+// Returns the scope the collection would have after adding proposedHole:
+// the current scope plus one for each end of the hole that coincides with a
+// source phrase boundary or directly touches an existing hole.
+int HoleCollection::Scope(const Hole &proposedHole) const
+{
+  const int first = proposedHole.GetStart(0);
+  const int last = proposedHole.GetEnd(0);
+
+  // Adding hole would introduce choice point at start of hole.
+  const bool choiceAtStart =
+    first == m_sourcePhraseStart || m_sourceHoleEndPoints.count(first - 1) != 0;
+
+  // Adding hole would introduce choice point at end of hole.
+  const bool choiceAtEnd =
+    last == m_sourcePhraseEnd || m_sourceHoleStartPoints.count(last + 1) != 0;
+
+  int result = m_scope;
+  if (choiceAtStart) {
+    ++result;
+  }
+  if (choiceAtEnd) {
+    ++result;
+  }
+  return result;
+}
diff --git a/phrase-extract/HoleCollection.h b/phrase-extract/HoleCollection.h
new file mode 100644
index 000000000..355e825fb
--- /dev/null
+++ b/phrase-extract/HoleCollection.h
@@ -0,0 +1,97 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef HOLECOLLECTION_H_INCLUDED_
+#define HOLECOLLECTION_H_INCLUDED_
+
+#include <set>
+#include <vector>
+
+#include "Hole.h"
+
+// The holes placed inside one source phrase span, together with the
+// bookkeeping needed to compute the "scope" of the resulting rule.
+class HoleCollection
+{
+protected:
+ // The holes, in insertion order.
+ HoleList m_holes;
+ // Pointers into m_holes ordered by source start; filled by SortSourceHoles().
+ std::vector<Hole*> m_sortedSourceHoles;
+ // Source-side start and end positions of every hole added so far.
+ std::set<int> m_sourceHoleStartPoints;
+ std::set<int> m_sourceHoleEndPoints;
+ // Scope after the most recent Add(); starts at 0.
+ int m_scope;
+ // Boundaries of the enclosing source phrase, as passed to the constructor.
+ int m_sourcePhraseStart;
+ int m_sourcePhraseEnd;
+
+public:
+ // Creates an empty collection for the given source phrase span.
+ HoleCollection(int sourcePhraseStart, int sourcePhraseEnd)
+ : m_scope(0)
+ , m_sourcePhraseStart(sourcePhraseStart)
+ , m_sourcePhraseEnd(sourcePhraseEnd)
+ {}
+
+ // Copies everything except m_sortedSourceHoles, which starts out empty.
+ HoleCollection(const HoleCollection &copy)
+ : m_holes(copy.m_holes)
+ , m_sourceHoleStartPoints(copy.m_sourceHoleStartPoints)
+ , m_sourceHoleEndPoints(copy.m_sourceHoleEndPoints)
+ , m_scope(copy.m_scope)
+ , m_sourcePhraseStart(copy.m_sourcePhraseStart)
+ , m_sourcePhraseEnd(copy.m_sourcePhraseEnd)
+ {} // don't copy sorted source holes. messes up sorting fn
+
+ // Read-only access to the stored holes.
+ const HoleList &GetHoles() const {
+ return m_holes;
+ }
+
+ // Mutable access to the stored holes.
+ HoleList &GetHoles() {
+ return m_holes;
+ }
+
+ // Sorted hole pointers; empty until SortSourceHoles() has been called.
+ std::vector<Hole*> &GetSortedSourceHoles() {
+ return m_sortedSourceHoles;
+ }
+
+ // Adds a hole (target span first, then source span) and updates the scope.
+ void Add(int startT, int endT, int startS, int endS);
+
+ // True if sourceHole overlaps any stored hole on the source side.
+ bool OverlapSource(const Hole &sourceHole) const {
+ HoleList::const_iterator iter;
+ for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
+ const Hole &currHole = *iter;
+ if (currHole.Overlap(sourceHole, 0))
+ return true;
+ }
+ return false;
+ }
+
+ // True if sourceHole is directly adjacent to any stored hole on the
+ // source side.
+ bool ConsecSource(const Hole &sourceHole) const {
+ HoleList::const_iterator iter;
+ for (iter = m_holes.begin(); iter != m_holes.end(); ++iter) {
+ const Hole &currHole = *iter;
+ if (currHole.Neighbor(sourceHole, 0))
+ return true;
+ }
+ return false;
+ }
+
+ // Determine the scope that would result from adding the given hole.
+ int Scope(const Hole &proposedHole) const;
+
+ // Populates and sorts m_sortedSourceHoles; asserts that it is still empty.
+ void SortSourceHoles();
+
+};
+
+#endif
diff --git a/phrase-extract/InputFileStream.cpp b/phrase-extract/InputFileStream.cpp
new file mode 100644
index 000000000..d111903e6
--- /dev/null
+++ b/phrase-extract/InputFileStream.cpp
@@ -0,0 +1,61 @@
+// $Id: InputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "InputFileStream.h"
+#include "gzfilebuf.h"
+#include <iostream>
+
+using namespace std;
+
+namespace Moses
+{
+// Opens filePath for reading. Paths longer than three characters that end
+// in ".gz" are read through a gzfilebuf; everything else through a plain
+// std::filebuf. If the plain file cannot be opened, an error is printed to
+// stderr and the process exits with status 1.
+InputFileStream::InputFileStream(const std::string &filePath)
+  : std::istream(NULL)
+  , m_streambuf(NULL)
+{
+  const std::string gzSuffix(".gz");
+  const bool isGzipped =
+    filePath.size() > gzSuffix.size() &&
+    filePath.compare(filePath.size() - gzSuffix.size(), gzSuffix.size(), gzSuffix) == 0;
+
+  if (isGzipped) {
+    m_streambuf = new gzfilebuf(filePath.c_str());
+  } else {
+    std::filebuf *plainBuf = new std::filebuf();
+    // open() returns a null pointer when the file cannot be opened.
+    if (plainBuf->open(filePath.c_str(), std::ios::in) == NULL) {
+      cerr << "Can't read " << filePath.c_str() << endl;
+      exit(1);
+    }
+    m_streambuf = plainBuf;
+  }
+
+  // Attach the chosen buffer to the std::istream base.
+  this->init(m_streambuf);
+}
+
+// Frees the stream buffer allocated by the constructor.
+InputFileStream::~InputFileStream()
+{
+ delete m_streambuf;
+ m_streambuf = NULL;
+}
+
+// Currently a no-op: the stream buffer is only released by the destructor.
+void InputFileStream::Close()
+{
+}
+
+
+}
+
diff --git a/phrase-extract/InputFileStream.h b/phrase-extract/InputFileStream.h
new file mode 100644
index 000000000..e2a31bc82
--- /dev/null
+++ b/phrase-extract/InputFileStream.h
@@ -0,0 +1,48 @@
+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#ifndef moses_InputFileStream_h
+#define moses_InputFileStream_h
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+
+namespace Moses
+{
+
+/** Used in place of std::istream, can read zipped files if it ends in .gz
+ *
+ * The constructor picks the stream buffer from the file name; the buffer is
+ * owned by this object and deleted in the destructor.
+ */
+class InputFileStream : public std::istream
+{
+protected:
+ // Buffer allocated by the constructor and deleted by the destructor.
+ std::streambuf *m_streambuf;
+public:
+
+ // Opens filePath for reading; exits the process if a plain file can't be opened.
+ InputFileStream(const std::string &filePath);
+ ~InputFileStream();
+
+ // Has an empty body in InputFileStream.cpp.
+ void Close();
+};
+
+}
+
+#endif
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
new file mode 100644
index 000000000..d834674b8
--- /dev/null
+++ b/phrase-extract/Jamfile
@@ -0,0 +1,32 @@
+obj InputFileStream.o : InputFileStream.cpp : <include>. ;
+alias InputFileStream : InputFileStream.o ..//z ;
+
+obj tables-core.o : tables-core.cpp : <include>. ;
+obj AlignmentPhrase.o : AlignmentPhrase.cpp : <include>. ;
+obj SentenceAlignment.o : SentenceAlignment.cpp : <include>. ;
+obj SyntaxTree.o : SyntaxTree.cpp : <include>. ;
+obj XmlTree.o : XmlTree.cpp : <include>. ;
+
+alias filestreams : InputFileStream.cpp OutputFileStream.cpp : : : <include>. ;
+alias trees : SyntaxTree.cpp tables-core.o XmlTree.o : : : <include>. ;
+
+exe extract : tables-core.o SentenceAlignment.o extract.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+
+exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o SentenceAlignmentWithSyntax.cpp HoleCollection.cpp extract-rules.cpp ExtractedRule.cpp OutputFileStream.cpp InputFileStream ../moses/src//ThreadPool ..//boost_iostreams ;
+
+exe extract-lex : extract-lex.cpp InputFileStream ;
+
+exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+
+exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+
+exe consolidate-direct : consolidate-direct.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+
+exe consolidate-reverse : consolidate-reverse.cpp tables-core.o InputFileStream ;
+
+exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp ;
+
+exe statistics : tables-core.o AlignmentPhrase.o statistics.cpp InputFileStream ;
+
+alias programs : extract extract-rules extract-lex score consolidate consolidate-direct consolidate-reverse relax-parse statistics ;
+
diff --git a/phrase-extract/OutputFileStream.cpp b/phrase-extract/OutputFileStream.cpp
new file mode 100644
index 000000000..2cad33bb9
--- /dev/null
+++ b/phrase-extract/OutputFileStream.cpp
@@ -0,0 +1,79 @@
+// $Id: OutputFileStream.cpp 2780 2010-01-29 17:11:17Z bojar $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <boost/iostreams/filter/gzip.hpp>
+#include "OutputFileStream.h"
+#include "gzfilebuf.h"
+
+using namespace std;
+
+namespace Moses
+{
+// Creates a stream with no file attached; call Open() before writing.
+OutputFileStream::OutputFileStream()
+ :boost::iostreams::filtering_ostream()
+ ,m_outFile(NULL)
+{
+}
+
+// Creates the stream and immediately opens filePath for writing.
+// NOTE(review): the bool result of Open() is discarded here, so a failed
+// open is not reported to the caller of this constructor.
+OutputFileStream::OutputFileStream(const std::string &filePath)
+ : m_outFile(NULL)
+{
+ Open(filePath);
+}
+
+// Flushes and closes the underlying file, if one is open, via Close().
+OutputFileStream::~OutputFileStream()
+{
+ Close();
+}
+
+// Opens filePath for binary writing and attaches it to this filtering
+// stream. When the path is longer than three characters and ends in ".gz",
+// a gzip compressor is pushed in front of the file so that the output is
+// compressed.
+// Returns true on success. On failure returns false and leaves the stream
+// with no file attached (m_outFile == NULL).
+bool OutputFileStream::Open(const std::string &filePath)
+{
+  m_outFile = new ofstream(filePath.c_str(), ios_base::out | ios_base::binary);
+  if (m_outFile->fail()) {
+    // Nothing has been pushed onto the filter chain yet. Release the stream
+    // and reset the pointer; otherwise Close() (and so the destructor) would
+    // see a non-NULL m_outFile and pop() an empty chain.
+    delete m_outFile;
+    m_outFile = NULL;
+    return false;
+  }
+
+  if (filePath.size() > 3 && filePath.substr(filePath.size() - 3, 3) == ".gz") {
+    this->push(boost::iostreams::gzip_compressor());
+  }
+  this->push(*m_outFile);
+
+  return true;
+}
+
+// Flushes pending output, detaches the file from the filter chain, then
+// closes and frees it. Does nothing when no file is attached, so it is safe
+// to call more than once.
+void OutputFileStream::Close()
+{
+  if (m_outFile != NULL) {
+    this->flush();
+    this->pop(); // file
+
+    m_outFile->close();
+    delete m_outFile;
+    m_outFile = NULL;
+  }
+}
+
+
+}
+
diff --git a/phrase-extract/OutputFileStream.h b/phrase-extract/OutputFileStream.h
new file mode 100644
index 000000000..f52e36d76
--- /dev/null
+++ b/phrase-extract/OutputFileStream.h
@@ -0,0 +1,50 @@
+// $Id: InputFileStream.h 2939 2010-02-24 11:15:44Z jfouet $
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <cstdlib>
+#include <fstream>
+#include <string>
+#include <iostream>
+#include <boost/iostreams/filtering_stream.hpp>
+
+namespace Moses
+{
+
+/** Used in place of std::ostream; writes gzip-compressed output if the
+ * file name ends in .gz
+ */
+class OutputFileStream : public boost::iostreams::filtering_ostream
+{
+protected:
+ // Underlying file; allocated by Open(), released by Close(). NULL when
+ // no file is attached.
+ std::ofstream *m_outFile;
+public:
+ // Creates a stream with no file attached.
+ OutputFileStream();
+
+ // Creates the stream and calls Open(filePath).
+ OutputFileStream(const std::string &filePath);
+ // Calls Close().
+ virtual ~OutputFileStream();
+
+ // Opens filePath for writing; returns false if the file can't be opened.
+ bool Open(const std::string &filePath);
+ // Flushes and closes the attached file, if any.
+ void Close();
+};
+
+}
+
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
new file mode 100644
index 000000000..ceb74f04c
--- /dev/null
+++ b/phrase-extract/PhraseAlignment.cpp
@@ -0,0 +1,239 @@
+/*
+ * PhraseAlignment.cpp
+ * extract
+ *
+ * Created by Hieu Hoang on 28/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+
+#include <sstream>
+#include "PhraseAlignment.h"
+#include "SafeGetline.h"
+#include "tables-core.h"
+#include "score.h"
+
+#include <cstdlib>
+
+using namespace std;
+
+extern Vocabulary vcbT;
+extern Vocabulary vcbS;
+
+extern bool hierarchicalFlag;
+
+//! Convert a string to a value of type T using stream extraction.
+//! Used for reading floats, ints etc. from files.
+//! If extraction fails before anything is stored (e.g. on an empty string)
+//! a value-initialised T is returned instead of an indeterminate one.
+template<typename T>
+inline T Scan(const std::string &input)
+{
+  std::stringstream stream(input);
+  // Value-initialise: operator>> leaves the target untouched when the stream
+  // is already exhausted, which used to return an uninitialised variable.
+  T ret = T();
+  stream >> ret;
+  return ret;
+}
+
+
+//! Vector overload: converts every string in `input` with Scan<T>() and
+//! stores the results in `output`, which is resized to input.size().
+template<typename T>
+inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
+{
+  const size_t numItems = input.size();
+  output.resize(numItems);
+  size_t idx = 0;
+  while (idx < numItems) {
+    const std::string &text = input[idx];
+    output[idx] = Scan<T>( text );
+    ++idx;
+  }
+}
+
+
+//! Split `str` on any character in `delimiters` and append the non-empty
+//! tokens to `output` (existing elements of `output` are kept).
+inline void Tokenize(std::vector<std::string> &output
+                     , const std::string& str
+                     , const std::string& delimiters = " \t")
+{
+  // Start of the first token: first character that is not a delimiter.
+  std::string::size_type tokenStart = str.find_first_not_of(delimiters, 0);
+
+  while (tokenStart != std::string::npos) {
+    // The token runs up to the next delimiter (or to the end of the string,
+    // in which case tokenEnd is npos and substr() takes the remainder).
+    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenStart);
+    output.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
+    // Skip the delimiter run to reach the start of the next token.
+    tokenStart = str.find_first_not_of(delimiters, tokenEnd);
+  }
+}
+
+// Typed overload: tokenizes `input` on `delimiters` and converts each token
+// to T via Scan<T>(); `output` is resized to the number of tokens.
+template<typename T>
+inline void Tokenize( std::vector<T> &output
+                      , const std::string &input
+                      , const std::string& delimiters = " \t")
+{
+  std::vector<std::string> pieces;
+  Tokenize(pieces, input, delimiters);
+  Scan<T>(output, pieces);
+}
+
+// Read in one phrase pair line and store it in this (empty) object.
+// The line is split with tokenize(); "|||" tokens separate the fields:
+//   1 source phrase, 2 target phrase, 3 alignment points ("s-t"),
+//   4 count, 5 non-terminal lengths, 6 target syntax PCFG score.
+// Malformed or out-of-range alignment points are reported on stderr and
+// skipped. If the line ends after the alignment field, count is set to 1.0.
+// A line with fewer than 3 or more than 6 fields is reported as faulty.
+void PhraseAlignment::create( char line[], int lineID )
+{
+  assert(phraseS.empty());
+  assert(phraseT.empty());
+
+  //cerr << "processing " << line;
+  vector< string > token = tokenize( line );
+  int item = 1;
+  for (size_t j=0; j<token.size(); j++) {
+    if (token[j] == "|||") {
+      item++;
+    } else if (item == 1) { // source phrase
+      phraseS.push_back( vcbS.storeIfNew( token[j] ) );
+    } else if (item == 2) { // target phrase
+      phraseT.push_back( vcbT.storeIfNew( token[j] ) );
+    } else if (item == 3) { // alignment
+      int s = -1, t = -1;
+      // Both numbers must be parsed; a partial match would otherwise leave
+      // s or t unset and be used as an index below.
+      if (sscanf(token[j].c_str(), "%d-%d", &s, &t) != 2) {
+        cerr << "WARNING: phrase pair " << lineID
+             << " has malformed alignment point \"" << token[j] << "\"\n";
+      } else if ((size_t)t >= phraseT.size() || (size_t)s >= phraseS.size()) {
+        // negative values wrap to huge unsigned numbers and end up here too
+        cerr << "WARNING: phrase pair " << lineID
+             << " has alignment point (" << s << ", " << t
+             << ") out of bounds (" << phraseS.size() << ", " << phraseT.size() << ")\n";
+      } else {
+        // first alignment point? -> initialize
+        createAlignVec(phraseS.size(), phraseT.size());
+
+        // The alignment vectors can be shorter than the phrases (one slot
+        // fewer per side when hierarchicalFlag is set), so re-check against
+        // their actual size before indexing.
+        if ((size_t)t >= alignedToT.size() || (size_t)s >= alignedToS.size()) {
+          cerr << "WARNING: phrase pair " << lineID
+               << " has alignment point (" << s << ", " << t
+               << ") out of bounds (" << alignedToS.size() << ", " << alignedToT.size() << ")\n";
+        } else {
+          // add alignment point
+          alignedToT[t].insert( s );
+          alignedToS[s].insert( t );
+        }
+      }
+    } else if (item == 4) { // count
+      sscanf(token[j].c_str(), "%f", &count);
+    } else if (item == 5) { // non-term lengths
+      addNTLength(token[j]);
+    } else if (item == 6) { // target syntax PCFG score
+      float pcfgScore = std::atof(token[j].c_str());
+      pcfgSum = pcfgScore * count;
+    }
+  }
+
+  // always need alignment vectors, even if there were no alignment points
+  createAlignVec(phraseS.size(), phraseT.size());
+
+  if (item == 3) {
+    count = 1.0;
+  }
+  if (item < 3 || item > 6) {
+    cerr << "ERROR: faulty line " << lineID << ": " << line << endl;
+  }
+}
+
+// Parse one non-terminal length token of the form "sourcePos=lenA,lenB" and
+// record the pair (lenA, lenB) in m_ntLengths under key sourcePos.
+// The expected shape is enforced with assert() only.
+void PhraseAlignment::addNTLength(const std::string &tok)
+{
+  // split "pos=a,b" into the position and the length list
+  vector< string > keyAndValue;
+  Tokenize(keyAndValue, tok, "=");
+  assert(keyAndValue.size() == 2);
+
+  const size_t sourcePos = Scan<size_t>(keyAndValue[0]);
+  assert(sourcePos < phraseS.size());
+
+  // split "a,b" into the two lengths
+  vector< size_t > lengths;
+  Tokenize<size_t>(lengths, keyAndValue[1], ",");
+  assert(lengths.size() == 2);
+
+  std::pair<size_t, size_t> &entry = m_ntLengths[sourcePos];
+  entry = std::pair<size_t, size_t>(lengths[0], lengths[1]);
+}
+
+// Allocate the alignment vectors if they are still empty. Alignment info is
+// always needed, even if blank.
+// When hierarchicalFlag is set each vector gets one slot fewer than the
+// number of symbols on that side. A size of 0 yields an empty vector instead
+// of wrapping around to a huge unsigned value.
+void PhraseAlignment::createAlignVec(size_t sourceSize, size_t targetSize)
+{
+  const size_t skipped = hierarchicalFlag ? 1 : 0;
+
+  if (alignedToT.size() == 0) {
+    size_t numTgtSymbols = (targetSize > skipped) ? targetSize - skipped : 0;
+    alignedToT.resize(numTgtSymbols);
+  }
+
+  if (alignedToS.size() == 0) {
+    size_t numSrcSymbols = (sourceSize > skipped) ? sourceSize - skipped : 0;
+    alignedToS.resize(numSrcSymbols);
+  }
+}
+
+// Empty all containers so the object can be filled again with create().
+// m_ntLengths is cleared as well: create() only adds or overwrites entries,
+// so stale non-terminal lengths would otherwise survive into the next phrase
+// pair. count and pcfgSum are left untouched.
+void PhraseAlignment::clear()
+{
+  phraseS.clear();
+  phraseT.clear();
+  alignedToT.clear();
+  alignedToS.clear();
+  m_ntLengths.clear();
+}
+
+// Returns true when both phrase pairs have the same target phrase, the same
+// source phrase and identical word alignments in both directions.
+bool PhraseAlignment::equals( const PhraseAlignment& other )
+{
+  if (this == &other) {
+    return true;
+  }
+  const bool differs = other.GetTarget() != GetTarget()
+                       || other.GetSource() != GetSource()
+                       || other.alignedToT != alignedToT
+                       || other.alignedToS != alignedToS;
+  return !differs;
+}
+
+// Check whether two phrase pairs "match": same target and source phrase and,
+// when hierarchicalFlag is set, the same alignment for every non-terminal on
+// the target side. Each non-terminal must have exactly one alignment point
+// in both objects for the pair to match.
+bool PhraseAlignment::match( const PhraseAlignment& other )
+{
+  if (this == &other) return true;
+  if (other.GetTarget() != GetTarget()) return false;
+  if (other.GetSource() != GetSource()) return false;
+  if (!hierarchicalFlag) return true;
+
+  assert(phraseT.size() == alignedToT.size() + 1);
+  assert(alignedToT.size() == other.alignedToT.size());
+
+  // visit every target symbol except the last one
+  const size_t numSymbols = phraseT.size()-1;
+  for (size_t pos = 0; pos < numSymbols; ++pos) {
+    if (!isNonTerminal( vcbT.getWord( phraseT[pos] ) )) {
+      continue;
+    }
+    const std::set<size_t> &mine = alignedToT[pos];
+    const std::set<size_t> &theirs = other.alignedToT[pos];
+    if (mine.size() != 1) return false;
+    if (theirs.size() != 1) return false;
+    if (*(mine.begin()) != *(theirs.begin())) return false;
+  }
+  return true;
+}
+
+// Three-way comparison used by operator<.
+// Orders by target phrase, then source phrase. When hierarchicalFlag is set,
+// the alignment of each target-side non-terminal is compared as well: the
+// first non-terminal whose alignment is not exactly one identical point in
+// both objects decides the result (never 0 in that case).
+// Returns -1, 0 or +1.
+int PhraseAlignment::Compare(const PhraseAlignment &other) const
+{
+  if (this == &other) // comparing with itself
+    return 0;
+
+  if (GetTarget() != other.GetTarget())
+    return ( GetTarget() < other.GetTarget() ) ? -1 : +1;
+
+  if (GetSource() != other.GetSource())
+    return ( GetSource() < other.GetSource() ) ? -1 : +1;
+
+  if (!hierarchicalFlag)
+    return 0;
+
+  // loop over all target symbols except the last one; written as i+1 < size
+  // so that an empty phrase does not wrap around
+  for(size_t i=0; i+1<phraseT.size(); i++) {
+    if (!isNonTerminal( vcbT.getWord( phraseT[i] ) ))
+      continue;
+
+    const std::set<size_t> &thisSet = alignedToT[i];
+    const std::set<size_t> &otherSet = other.alignedToT[i];
+
+    // An unaligned non-terminal has no first element to dereference.
+    // Treat it as "different", ordering the empty side first.
+    if (thisSet.empty() || otherSet.empty())
+      return (thisSet.empty() && !otherSet.empty()) ? -1 : +1;
+
+    size_t thisAlign = *(thisSet.begin());
+    size_t otherAlign = *(otherSet.begin());
+
+    if (thisSet.size() != 1 ||
+        otherSet.size() != 1 ||
+        thisAlign != otherAlign) {
+      return (thisAlign < otherAlign) ? -1 : +1;
+    }
+  }
+  return 0;
+}
+
+
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
new file mode 100644
index 000000000..8bd83503d
--- /dev/null
+++ b/phrase-extract/PhraseAlignment.h
@@ -0,0 +1,54 @@
+#pragma once
+/*
+ * PhraseAlignment.h
+ * extract
+ *
+ * Created by Hieu Hoang on 28/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include "tables-core.h"
+
+#include <vector>
+#include <set>
+#include <map>
+
+// Data structure for a single phrase pair: source and target phrase, word
+// alignment in both directions, count, PCFG score sum and the lengths
+// recorded for non-terminals.
+class PhraseAlignment
+{
+protected:
+  PHRASE phraseS; // source phrase
+  PHRASE phraseT; // target phrase
+
+  // keyed by source position; value is the pair of lengths read by addNTLength()
+  std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
+
+  void createAlignVec(size_t sourceSize, size_t targetSize);
+  void addNTLength(const std::string &tok);
+public:
+  float pcfgSum;
+  float count;
+  std::vector< std::set<size_t> > alignedToT; // per target position: aligned source positions
+  std::vector< std::set<size_t> > alignedToS; // per source position: aligned target positions
+
+  // create() does not set count/pcfgSum for every input line, so give them
+  // defined values up front instead of leaving them indeterminate.
+  PhraseAlignment()
+    : pcfgSum(0.0f)
+    , count(0.0f)
+  {}
+
+  // fill this object from one line of input; lineID is used in diagnostics
+  void create( char*, int );
+  void clear();
+  bool equals( const PhraseAlignment& );
+  bool match( const PhraseAlignment& );
+
+  int Compare(const PhraseAlignment &compare) const;
+  inline bool operator<(const PhraseAlignment &compare) const
+  {
+    return Compare(compare) < 0;
+  }
+
+  const PHRASE &GetSource() const {
+    return phraseS;
+  }
+  const PHRASE &GetTarget() const {
+    return phraseT;
+  }
+
+  const std::map<size_t, std::pair<size_t, size_t> > &GetNTLengths() const
+  { return m_ntLengths; }
+
+};
diff --git a/phrase-extract/RuleExist.h b/phrase-extract/RuleExist.h
new file mode 100644
index 000000000..cf7fae3cd
--- /dev/null
+++ b/phrase-extract/RuleExist.h
@@ -0,0 +1,59 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef RULEEXIST_H_INCLUDED_
+#define RULEEXIST_H_INCLUDED_
+
+#include <vector>
+
+#include "Hole.h"
+
+// Repository of extracted phrase pairs which are potential holes in larger
+// phrase pairs.
+class RuleExist
+{
+protected:
+  // Two-level table of hole lists. The first index is a start position, the
+  // second is (end - start).
+  // NOTE(review): the original comment says "indexed by source pos. and
+  // source length", but Add()/GetSourceHoles() index with startT/endT --
+  // confirm which side is meant.
+  std::vector< std::vector<HoleList> > m_phraseExist;
+
+public:
+  // Builds a triangular table: row `pos` holds (size - pos) empty hole lists.
+  RuleExist(size_t size)
+    :m_phraseExist(size) {
+    size_t rowLength = size;
+    for (std::vector< std::vector<HoleList> >::iterator row = m_phraseExist.begin();
+         row != m_phraseExist.end(); ++row, --rowLength) {
+      row->resize(rowLength);
+    }
+  }
+
+  // Records a hole for the span [startT, endT] / [startS, endS].
+  void Add(int startT, int endT, int startS, int endS) {
+    HoleList &cell = m_phraseExist[startT][endT - startT];
+    cell.push_back(Hole(startS, endS, startT, endT));
+  }
+
+  // Returns the hole list stored for the span [startT, endT].
+  const HoleList &GetSourceHoles(int startT, int endT) const {
+    return m_phraseExist[startT][endT - startT];
+  }
+
+};
+
+#endif
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
new file mode 100644
index 000000000..272af2c76
--- /dev/null
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -0,0 +1,88 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef RULEEXTRACTIONOPTIONS_H_INCLUDED_
+#define RULEEXTRACTIONOPTIONS_H_INCLUDED_
+
+// Options for rule extraction. Every field is set to its default by the
+// constructor; the default is repeated next to each declaration.
+struct RuleExtractionOptions {
+public:
+  int maxSpan;                // default 10
+  int minHoleSource;          // default 2
+  int minHoleTarget;          // default 1
+  int minWords;               // default 1
+  int maxSymbolsTarget;       // default 999
+  int maxSymbolsSource;       // default 5
+  int maxNonTerm;             // default 2
+  int maxScope;               // default 999
+  bool onlyDirectFlag;        // default false
+  bool glueGrammarFlag;       // default false
+  bool unknownWordLabelFlag;  // default false
+  bool onlyOutputSpanInfo;    // default false
+  bool noFileLimit;           // default false
+  bool properConditioning;    // default false
+  bool nonTermFirstWord;      // default true
+  bool nonTermConsecTarget;   // default true
+  bool nonTermConsecSource;   // default false
+  bool requireAlignedWord;    // default true
+  bool sourceSyntax;          // default false
+  bool targetSyntax;          // default false
+  bool duplicateRules;        // default true
+  bool fractionalCounting;    // default true
+  bool pcfgScore;             // default false
+  bool outputNTLengths;       // default false
+  bool gzOutput;              // default false
+  bool unpairedExtractFormat; // default false
+  bool conditionOnTargetLhs;  // default false
+
+  RuleExtractionOptions() :
+    maxSpan(10),
+    minHoleSource(2),
+    minHoleTarget(1),
+    minWords(1),
+    maxSymbolsTarget(999),
+    maxSymbolsSource(5),
+    maxNonTerm(2),
+    maxScope(999),
+    onlyDirectFlag(false),
+    glueGrammarFlag(false),
+    unknownWordLabelFlag(false),
+    onlyOutputSpanInfo(false),
+    noFileLimit(false),
+    properConditioning(false),
+    nonTermFirstWord(true),
+    nonTermConsecTarget(true),
+    nonTermConsecSource(false),
+    requireAlignedWord(true),
+    sourceSyntax(false),
+    targetSyntax(false),
+    duplicateRules(true),
+    fractionalCounting(true),
+    pcfgScore(false),
+    outputNTLengths(false),
+    gzOutput(false),
+    unpairedExtractFormat(false),
+    conditionOnTargetLhs(false)
+  {}
+};
+
+#endif
diff --git a/phrase-extract/SafeGetline.h b/phrase-extract/SafeGetline.h
new file mode 100644
index 000000000..0e03b8468
--- /dev/null
+++ b/phrase-extract/SafeGetline.h
@@ -0,0 +1,35 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef SAFE_GETLINE_INCLUDED_
+#define SAFE_GETLINE_INCLUDED_
+
+// SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE)
+// Reads one line from stream _IS into the buffer _LINE (capacity _SIZE),
+// stopping at _DELIM. A failbit that is set without badbit or eofbit is
+// cleared. If exactly _SIZE-1 characters were extracted the line is taken to
+// be too long: an error naming _FILE is printed and the program exits with
+// status 1.
+// Notes for callers:
+//  - cerr, endl and exit are used unqualified, so they must be visible at
+//    the point of use;
+//  - the arguments are not parenthesised and _IS/_SIZE are expanded several
+//    times, so pass plain names rather than expressions with side effects.
+#define SAFE_GETLINE(_IS, _LINE, _SIZE, _DELIM, _FILE) { \
+ _IS.getline(_LINE, _SIZE, _DELIM); \
+ if(_IS.fail() && !_IS.bad() && !_IS.eof()) _IS.clear(); \
+ if (_IS.gcount() == _SIZE-1) { \
+ cerr << "Line too long! Buffer overflow. Delete lines >=" \
+ << _SIZE << " chars or raise LINE_MAX_LENGTH in " << _FILE \
+ << endl; \
+ exit(1); \
+ } \
+ }
+
+#endif
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
new file mode 100644
index 000000000..8b513cfb4
--- /dev/null
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "SentenceAlignment.h"
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "tables-core.h"
+
+// Out-of-line definition of the virtual destructor; nothing to release here.
+SentenceAlignment::~SentenceAlignment() {}
+
+// Tokenizes targetString with tokenize() and stores the words in `target`.
+// The second (sentence ID) parameter is unused here. Always returns true.
+bool SentenceAlignment::processTargetSentence(const char * targetString, int)
+{
+ target = tokenize(targetString);
+ return true;
+}
+
+// Tokenizes sourceString with tokenize() and stores the words in `source`.
+// The second (sentence ID) parameter is unused here. Always returns true.
+bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
+{
+ source = tokenize(sourceString);
+ return true;
+}
+
+// Fill this object from one sentence pair and its word alignment.
+//   targetString / sourceString: the sentences, handed to
+//     processTargetSentence() / processSourceSentence()
+//   alignmentString: whitespace-separated alignment points "s-t"
+//   sentenceID: stored in the object and used in diagnostics
+// Returns false (after printing a message to stderr) if a sentence cannot be
+// processed, if either side is empty, or if an alignment point is malformed
+// or out of bounds. Returns true otherwise.
+bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
+{
+  using namespace std;
+  this->sentenceID = sentenceID;
+
+  // process sentence strings and store in target and source members.
+  if (!processTargetSentence(targetString, sentenceID)) {
+    return false;
+  }
+  if (!processSourceSentence(sourceString, sentenceID)) {
+    return false;
+  }
+
+  // check if sentences are empty
+  if (target.size() == 0 || source.size() == 0) {
+    cerr << "no target (" << target.size() << ") or source (" << source.size() << ") words in sentence " << sentenceID << endl;
+    cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
+    return false;
+  }
+
+  // prepare data structures for alignments
+  for(size_t i=0; i<source.size(); i++) {
+    alignedCountS.push_back( 0 );
+  }
+  for(size_t i=0; i<target.size(); i++) {
+    vector< int > dummy;
+    alignedToT.push_back( dummy );
+  }
+
+  // reading in alignments
+  vector<string> alignmentSequence = tokenize( alignmentString );
+  for(size_t i=0; i<alignmentSequence.size(); i++) {
+    int s = -1, t = -1;
+    // Both numbers must be converted. Testing the result for "non-zero"
+    // would accept a partial match such as "3" and leave t unset.
+    if (sscanf(alignmentSequence[i].c_str(), "%d-%d", &s, &t) != 2) {
+      cerr << "WARNING: " << alignmentSequence[i] << " is a bad alignment point in sentence " << sentenceID << endl;
+      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
+      return false;
+    }
+    // negative values wrap to huge unsigned numbers and are rejected here too
+    if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
+      cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
+      cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
+      return false;
+    }
+    alignedToT[t].push_back( s );
+    alignedCountS[s]++;
+  }
+  return true;
+}
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
new file mode 100644
index 000000000..df3987198
--- /dev/null
+++ b/phrase-extract/SentenceAlignment.h
@@ -0,0 +1,46 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef SENTENCE_ALIGNMENT_H_INCLUDED_
+#define SENTENCE_ALIGNMENT_H_INCLUDED_
+
+#include <string>
+#include <vector>
+
+// One sentence pair together with its word alignment.
+// The members are filled by create().
+class SentenceAlignment
+{
+public:
+ // tokenized target and source sentence
+ std::vector<std::string> target;
+ std::vector<std::string> source;
+ // per source position: number of alignment points touching it
+ std::vector<int> alignedCountS;
+ // per target position: the source positions aligned to it
+ std::vector<std::vector<int> > alignedToT;
+ // set by create(); not initialised before that
+ int sentenceID;
+
+ virtual ~SentenceAlignment();
+
+ // Store the tokenized target sentence; the int is the sentence ID.
+ // Virtual so that derived classes can pre-process the string.
+ virtual bool processTargetSentence(const char *, int);
+
+ // Same as above for the source sentence.
+ virtual bool processSourceSentence(const char *, int);
+
+ // Fill all members from the raw strings; returns false on bad input.
+ bool create(char targetString[], char sourceString[],
+ char alignmentString[], int sentenceID);
+};
+
+#endif
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
new file mode 100644
index 000000000..06dc3919f
--- /dev/null
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -0,0 +1,70 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "SentenceAlignmentWithSyntax.h"
+
+#include <map>
+#include <set>
+#include <string>
+
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+using namespace std;
+
+// Target side: when m_options.targetSyntax is set, the XML markup is parsed
+// into targetTree (and the target label collections) and stripped before the
+// sentence is tokenized; otherwise the base class implementation is used.
+// Returns false if the XML cannot be processed.
+bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID)
+{
+  if (m_options.targetSyntax) {
+    string stripped(targetString);
+    try {
+      ProcessAndStripXMLTags(stripped, targetTree,
+                             m_targetLabelCollection,
+                             m_targetTopLabelCollection);
+    } catch (const XmlException & e) {
+      std::cerr << "WARNING: failed to process target sentence at line "
+                << sentenceID << ": " << e.getMsg() << std::endl;
+      return false;
+    }
+    target = tokenize(stripped.c_str());
+    return true;
+  }
+
+  // plain text input: no markup to handle
+  return SentenceAlignment::processTargetSentence(targetString, sentenceID);
+}
+
+// Source side: when m_options.sourceSyntax is set, the XML markup is parsed
+// into sourceTree (and the source label collections) and stripped before the
+// sentence is tokenized; otherwise the base class implementation is used.
+// Returns false if the XML cannot be processed.
+bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID)
+{
+  if (m_options.sourceSyntax) {
+    string stripped(sourceString);
+    try {
+      ProcessAndStripXMLTags(stripped, sourceTree,
+                             m_sourceLabelCollection ,
+                             m_sourceTopLabelCollection);
+    } catch (const XmlException & e) {
+      std::cerr << "WARNING: failed to process source sentence at line "
+                << sentenceID << ": " << e.getMsg() << std::endl;
+      return false;
+    }
+    source = tokenize(stripped.c_str());
+    return true;
+  }
+
+  // plain text input: no markup to handle
+  return SentenceAlignment::processSourceSentence(sourceString, sentenceID);
+}
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
new file mode 100644
index 000000000..a2c164655
--- /dev/null
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -0,0 +1,65 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#ifndef SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
+#define SENTENCEALIGNMENTWITHSYNTAX_H_INCLUDED_
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+
+#include "RuleExtractionOptions.h"
+#include "SentenceAlignment.h"
+#include "SyntaxTree.h"
+
+// SentenceAlignment that can additionally read syntax markup on the target
+// and/or source side (see processTargetSentence / processSourceSentence).
+// The label collections and the options are held by reference, so the
+// objects passed to the constructor must outlive this instance.
+class SentenceAlignmentWithSyntax : public SentenceAlignment
+{
+public:
+ // trees filled while stripping the XML tags from each side
+ SyntaxTree targetTree;
+ SyntaxTree sourceTree;
+ // label collections handed to ProcessAndStripXMLTags() for each side
+ std::set<std::string> & m_targetLabelCollection;
+ std::set<std::string> & m_sourceLabelCollection;
+ std::map<std::string, int> & m_targetTopLabelCollection;
+ std::map<std::string, int> & m_sourceTopLabelCollection;
+ // targetSyntax / sourceSyntax decide whether a side is parsed as XML
+ const RuleExtractionOptions & m_options;
+
+ SentenceAlignmentWithSyntax(std::set<std::string> & tgtLabelColl,
+ std::set<std::string> & srcLabelColl,
+ std::map<std::string,int> & tgtTopLabelColl,
+ std::map<std::string,int> & srcTopLabelColl,
+ const RuleExtractionOptions & options)
+ : m_targetLabelCollection(tgtLabelColl)
+ , m_sourceLabelCollection(srcLabelColl)
+ , m_targetTopLabelCollection(tgtTopLabelColl)
+ , m_sourceTopLabelCollection(srcTopLabelColl)
+ , m_options(options)
+ {}
+
+ virtual ~SentenceAlignmentWithSyntax() {}
+
+ // overrides SentenceAlignment::processTargetSentence
+ bool
+ processTargetSentence(const char *, int);
+
+ // overrides SentenceAlignment::processSourceSentence
+ bool
+ processSourceSentence(const char *, int);
+};
+
+#endif
diff --git a/phrase-extract/SyntaxTree.cpp b/phrase-extract/SyntaxTree.cpp
new file mode 100644
index 000000000..f2783ffd2
--- /dev/null
+++ b/phrase-extract/SyntaxTree.cpp
@@ -0,0 +1,180 @@
+// $Id: SyntaxTree.cpp 1960 2008-12-15 12:52:38Z phkoehn $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+
+#include "SyntaxTree.h"
+
+#include <cassert>
+#include <iostream>
+
+// The tree owns its nodes: Clear() deletes every node in m_nodes.
+SyntaxTree::~SyntaxTree()
+{
+ Clear();
+}
+
+// Reset the tree to the empty state: forget the root, delete every node
+// held in m_nodes and empty both the node list and the span index.
+void SyntaxTree::Clear()
+{
+  m_top = 0;
+
+  // the nodes were allocated in AddNode(); release them in insertion order
+  const size_t numNodes = m_nodes.size();
+  size_t idx = 0;
+  while (idx < numNodes) {
+    delete m_nodes[idx];
+    ++idx;
+  }
+
+  m_nodes.clear();
+  m_index.clear();
+}
+
+// Create a node labelled `label` for the span [startPos, endPos], register
+// it in the node list and in the span index, and return it. The tree keeps
+// ownership of the node (it is deleted in Clear()).
+SyntaxNode *SyntaxTree::AddNode( int startPos, int endPos, std::string label )
+{
+  SyntaxNode* const created = new SyntaxNode( startPos, endPos, label );
+  m_nodes.push_back( created );
+
+  // nodes that cover the same span share one bucket, in insertion order
+  std::vector< SyntaxNode* > &bucket = m_index[ startPos ][ endPos ];
+  bucket.push_back( created );
+
+  return created;
+}
+
+// For every span of length >= 2 that has a node, compute the split points
+// that divide it into its largest covering child spans, scanning left to
+// right. Each result entry starts with the span's start position, followed
+// by the position just after each child.
+// A span is skipped when some position inside it has no child node starting
+// there; previously that case made the scan loop forever.
+ParentNodes SyntaxTree::Parse()
+{
+  ParentNodes parents;
+
+  int size = m_index.size();
+
+  // looping through all spans of size >= 2
+  for( int length=2; length<=size; length++ ) {
+    for( int startPos = 0; startPos <= size-length; startPos++ ) {
+      if (!HasNode( startPos, startPos+length-1 )) {
+        continue;
+      }
+      // processing one (parent) span
+      SplitPoints splitPoints;
+      splitPoints.push_back( startPos );
+
+      // `first` keeps the first child from being the whole parent span
+      int first = 1;
+      int covered = 0;
+      bool complete = true;
+      while( covered < length ) {
+        const int coveredBefore = covered;
+        // find largest covering subspan (child)
+        // starting at last covered position
+        for( int midPos=length-first; midPos>covered; midPos-- ) {
+          if( HasNode( startPos+covered, startPos+midPos-1 ) ) {
+            covered = midPos;
+            splitPoints.push_back( startPos+covered );
+            first = 0;
+          }
+        }
+        if (covered == coveredBefore) {
+          // no child starts here: the span cannot be decomposed, give up on
+          // it instead of spinning
+          complete = false;
+          break;
+        }
+      }
+      if (complete) {
+        parents.push_back( splitPoints );
+      }
+    }
+  }
+  return parents;
+}
+
+// True if at least one node covers exactly the span [startPos, endPos].
+bool SyntaxTree::HasNode( int startPos, int endPos ) const
+{
+  const std::vector< SyntaxNode* > &found = GetNodes( startPos, endPos );
+  return !found.empty();
+}
+
+// Look up the nodes that cover exactly [startPos, endPos] in the two-level
+// span index. Returns a reference to the shared empty vector m_emptyNode
+// when there is no entry for the span.
+const std::vector< SyntaxNode* >& SyntaxTree::GetNodes( int startPos, int endPos ) const
+{
+  SyntaxTreeIndexIterator byStart = m_index.find( startPos );
+  if (byStart != m_index.end()) {
+    SyntaxTreeIndexIterator2 byEnd = byStart->second.find( endPos );
+    if (byEnd != byStart->second.end()) {
+      return byEnd->second;
+    }
+  }
+  return m_emptyNode;
+}
+
+// Render the tree into a string using operator<< (for printing/debugging).
+std::string SyntaxTree::ToString() const
+{
+  std::stringstream buffer;
+  const SyntaxTree &self = *this;
+  buffer << self;
+  return buffer.str();
+}
+
+// Set the parent/child links of all nodes and record the root in m_top.
+// Nodes are visited by ascending start position, then descending end
+// position, then in reverse insertion order within one span, so the first
+// node visited is taken as the root and each later node's parent has
+// already been visited.
+void SyntaxTree::ConnectNodes()
+{
+  typedef SyntaxTreeIndex2::const_reverse_iterator InnerIterator;
+
+  SyntaxNode *prev = 0;
+  // Iterate over all start indices from lowest to highest.
+  for (SyntaxTreeIndexIterator p = m_index.begin(); p != m_index.end(); ++p) {
+    const SyntaxTreeIndex2 &inner = p->second;
+    // Iterate over all end indices from highest to lowest.
+    for (InnerIterator q = inner.rbegin(); q != inner.rend(); ++q) {
+      const std::vector<SyntaxNode*> &nodes = q->second;
+      // Iterate over all nodes that cover the same span in order of tree
+      // depth, top-most first.
+      for (std::vector<SyntaxNode*>::const_reverse_iterator r = nodes.rbegin();
+           r != nodes.rend(); ++r) {
+        SyntaxNode *node = *r;
+        if (!prev) {
+          // node is the root.
+          m_top = node;
+          node->SetParent(0);
+        } else if (prev->GetStart() == node->GetStart()) {
+          // prev is the parent of node.
+          assert(prev->GetEnd() >= node->GetEnd());
+          node->SetParent(prev);
+          prev->AddChild(node);
+        } else {
+          // prev is a descendant of node's parent. The lowest common
+          // ancestor of prev and node will be node's parent.
+          SyntaxNode *ancestor = prev->GetParent();
+          // Stop at the top of the tree: the null check has to happen
+          // before GetEnd() is called, not after the loop.
+          while (ancestor && ancestor->GetEnd() < node->GetEnd()) {
+            ancestor = ancestor->GetParent();
+          }
+          assert(ancestor);
+          node->SetParent(ancestor);
+          if (ancestor) {
+            ancestor->AddChild(node);
+          }
+        }
+        prev = node;
+      }
+    }
+  }
+}
+
+// Print the tree as a chart, one row per span length starting at length 1.
+// Each row is indented by one space per unit of span length. A span with a
+// node shows the first node's label, padded with '#' or cut to 7 characters;
+// a span without a node shows "-------".
+std::ostream& operator<<(std::ostream& os, const SyntaxTree& t)
+{
+  const size_t numStarts = t.m_index.size();
+  for (size_t spanLen = 1; spanLen <= numStarts; ++spanLen) {
+    // row indentation
+    for (size_t pad = 0; pad < spanLen; ++pad) {
+      os << " ";
+    }
+    // one cell per start position that still fits a span of this length
+    for (size_t begin = 0; begin + spanLen <= numStarts; ++begin) {
+      const size_t last = begin + (spanLen - 1);
+      if (!t.HasNode( begin, last )) {
+        os << "------- ";
+        continue;
+      }
+      std::string label = t.GetNodes( begin, last )[0]->GetLabel() + "#######";
+      os << label.substr(0,7) << " ";
+    }
+    os << std::endl;
+  }
+  return os;
+}
diff --git a/phrase-extract/SyntaxTree.h b/phrase-extract/SyntaxTree.h
new file mode 100644
index 000000000..17c106b49
--- /dev/null
+++ b/phrase-extract/SyntaxTree.h
@@ -0,0 +1,121 @@
+// $Id: SyntaxTree.h 1960 2008-12-15 12:52:38Z phkoehn $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+
+#pragma once
+#include <string>
+#include <vector>
+#include <map>
+#include <sstream>
+
+/**
+ * One labelled node of a syntax tree: a start and end position, a label, a
+ * PCFG score, and links to its parent and children.
+ *
+ * Parent and child links are plain pointers; this class never deletes them.
+ */
+class SyntaxNode
+{
+protected:
+  int m_start, m_end;
+  std::string m_label;
+  std::vector< SyntaxNode* > m_children;
+  SyntaxNode* m_parent;
+  float m_pcfgScore;
+public:
+  /**
+   * \param startPos start position of the span
+   * \param endPos end position of the span
+   * \param label node label
+   *
+   * The node starts without a parent, without children and with a PCFG score
+   * of 0.
+   */
+  SyntaxNode( int startPos, int endPos, const std::string &label )
+    :m_start(startPos)
+    ,m_end(endPos)
+    ,m_label(label)
+    ,m_parent(0)
+    ,m_pcfgScore(0.0f)
+  {}
+  int GetStart() const {
+    return m_start;
+  }
+  int GetEnd() const {
+    return m_end;
+  }
+  // Returned by reference so that reading the label does not copy it; the
+  // reference stays valid for as long as the node exists.
+  const std::string &GetLabel() const {
+    return m_label;
+  }
+  float GetPcfgScore() const {
+    return m_pcfgScore;
+  }
+  void SetPcfgScore(float score) {
+    m_pcfgScore = score;
+  }
+  // Null until SetParent() is called with a non-null node.
+  SyntaxNode *GetParent() {
+    return m_parent;
+  }
+  // Const overload so the parent can be read through a const node.
+  const SyntaxNode *GetParent() const {
+    return m_parent;
+  }
+  void SetParent(SyntaxNode *parent) {
+    m_parent = parent;
+  }
+  // Appends child to the child list; does not touch the child's parent link.
+  void AddChild(SyntaxNode* child) {
+    m_children.push_back(child);
+  }
+  const std::vector< SyntaxNode* > &GetChildren() const {
+    return m_children;
+  }
+};
+
+
+// Return type building blocks of SyntaxTree::Parse(), which is defined
+// elsewhere.  NOTE(review): the meaning of the stored ints is not visible
+// here - confirm against Parse().
+typedef std::vector< int > SplitPoints;
+typedef std::vector< SplitPoints > ParentNodes;
+
+// Container for the SyntaxNode objects of one sentence, indexed by span.
+class SyntaxTree
+{
+protected:
+ // All nodes, as returned by GetAllNodes().
+ std::vector< SyntaxNode* > m_nodes;
+ // Root node; assigned by ConnectNodes().
+ SyntaxNode* m_top;
+
+ // Two-level index as iterated by ConnectNodes(): the outer map is keyed by
+ // start position, the inner map by end position, and the vector holds the
+ // nodes that cover exactly that span.
+ typedef std::map< int, std::vector< SyntaxNode* > > SyntaxTreeIndex2;
+ typedef SyntaxTreeIndex2::const_iterator SyntaxTreeIndexIterator2;
+ typedef std::map< int, SyntaxTreeIndex2 > SyntaxTreeIndex;
+ typedef SyntaxTreeIndex::const_iterator SyntaxTreeIndexIterator;
+ SyntaxTreeIndex m_index;
+ // NOTE(review): presumably what GetNodes() returns for a span without
+ // nodes - confirm in SyntaxTree.cpp.
+ std::vector< SyntaxNode* > m_emptyNode;
+
+ // The stream operator reads m_index directly.
+ friend std::ostream& operator<<(std::ostream&, const SyntaxTree&);
+
+public:
+ SyntaxTree() {
+ m_top = 0; // m_top doesn't get set unless ConnectNodes is called.
+ }
+ // Defined elsewhere.  NOTE(review): if it deletes the nodes in m_nodes,
+ // copying a SyntaxTree would delete them twice - confirm, since no copy
+ // constructor or assignment operator is declared here.
+ ~SyntaxTree();
+
+ SyntaxNode *AddNode( int startPos, int endPos, std::string label );
+
+ // Null until ConnectNodes() has been called on a non-empty tree.
+ SyntaxNode *GetTop() {
+ return m_top;
+ }
+
+ ParentNodes Parse();
+ bool HasNode( int startPos, int endPos ) const;
+ const std::vector< SyntaxNode* >& GetNodes( int startPos, int endPos ) const;
+ const std::vector< SyntaxNode* >& GetAllNodes() {
+ return m_nodes;
+ };
+ // Number of distinct start positions in the index.  NOTE(review): this
+ // equals the word count only if every word position starts a node.
+ size_t GetNumWords() const {
+ return m_index.size();
+ }
+ // Sets parent/child links between the indexed nodes and assigns m_top.
+ void ConnectNodes();
+ void Clear();
+ std::string ToString() const;
+};
+
+std::ostream& operator<<(std::ostream&, const SyntaxTree&);
+
diff --git a/phrase-extract/XmlException.h b/phrase-extract/XmlException.h
new file mode 100644
index 000000000..15ee91f20
--- /dev/null
+++ b/phrase-extract/XmlException.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2010 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+#pragma once
+#ifndef XMLEXCEPTION_H_INCLUDED_
+#define XMLEXCEPTION_H_INCLUDED_
+
+#include <string>
+
+/**
+ * Exception type thrown when XML markup cannot be processed.
+ * Carries a human-readable message; it does not derive from std::exception,
+ * so it has to be caught by its own type.
+ */
+class XmlException
+{
+public:
+  // explicit: a plain string must not silently convert into an exception.
+  explicit XmlException(const std::string & msg)
+    : m_msg(msg)
+  {}
+
+  /** \return the message passed to the constructor */
+  const std::string &
+  getMsg() const {
+    return m_msg;
+  }
+
+private:
+  std::string m_msg;
+};
+
+#endif
diff --git a/phrase-extract/XmlTree.cpp b/phrase-extract/XmlTree.cpp
new file mode 100644
index 000000000..29c0d94aa
--- /dev/null
+++ b/phrase-extract/XmlTree.cpp
@@ -0,0 +1,392 @@
+// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <cassert>
+#include <vector>
+#include <string>
+#include <set>
+#include <iostream>
+#include <cstdlib>
+#include <sstream>
+#include "SyntaxTree.h"
+#include "XmlException.h"
+
+using namespace std;
+
+
+/**
+ * Split str into the maximal runs of characters that are not in delimiters.
+ * Leading, trailing and repeated delimiters never produce empty tokens.
+ *
+ * \param str text to split
+ * \param delimiters set of separator characters
+ * \return the tokens in order of appearance
+ */
+inline std::vector<std::string> Tokenize(const std::string& str,
+    const std::string& delimiters = " \t")
+{
+  std::vector<std::string> result;
+  std::string::size_type tokenBegin = str.find_first_not_of(delimiters);
+  while (tokenBegin != std::string::npos) {
+    const std::string::size_type tokenEnd =
+      str.find_first_of(delimiters, tokenBegin);
+    if (tokenEnd == std::string::npos) {
+      // Last token runs to the end of the string.
+      result.push_back(str.substr(tokenBegin));
+      break;
+    }
+    result.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
+    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
+  }
+  return result;
+}
+
+/**
+ * Remove every leading and trailing character that occurs in dropChars.
+ *
+ * \return the trimmed copy; empty when str consists only of dropChars
+ */
+std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r")
+{
+  const std::string::size_type last = str.find_last_not_of(dropChars);
+  if (last == std::string::npos) {
+    // Nothing but droppable characters (or an empty string).
+    return std::string();
+  }
+  const std::string::size_type first = str.find_first_not_of(dropChars);
+  return str.substr(first, last - first + 1);
+}
+
+/**
+ * Extract the value of attributeName="..." from the text of an XML tag.
+ *
+ * \param tag tag text (or just its attribute part) to search
+ * \param attributeName attribute to look for
+ * \return the raw text between the quotes, with backslash-escaped quotes kept
+ *         as written; "" when the attribute is absent, its value is empty, or
+ *         the closing quote is missing (the last case also writes a message
+ *         to stderr)
+ */
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName)
+{
+  /*TODO deal with unescaping \"*/
+  const std::string tagOpen = attributeName + "=\"";
+  const std::string blanks = " \t\n\r";
+
+  // Accept a match only at the start of the text or after whitespace, so
+  // that looking for "label" does not pick up the tail of e.g. "xlabel".
+  std::string::size_type contentsStart = tag.find(tagOpen);
+  while (contentsStart != std::string::npos && contentsStart != 0 &&
+         blanks.find(tag[contentsStart - 1]) == std::string::npos) {
+    contentsStart = tag.find(tagOpen, contentsStart + 1);
+  }
+  if (contentsStart == std::string::npos) return "";
+  contentsStart += tagOpen.size();
+
+  // Search from the first value character itself: starting one further would
+  // skip the closing quote of an empty value and run on into the next
+  // attribute.
+  std::string::size_type contentsEnd = tag.find('"', contentsStart);
+  if (contentsEnd == std::string::npos) {
+    std::cerr << "Malformed XML attribute: "<< tag;
+    return "";
+  }
+  // A quote preceded by a backslash is escaped; move on to the next quote
+  // if there is one.
+  std::string::size_type possibleEnd;
+  while (contentsEnd > contentsStart && tag[contentsEnd - 1] == '\\' &&
+         (possibleEnd = tag.find('"', contentsEnd + 1)) != std::string::npos) {
+    contentsEnd = possibleEnd;
+  }
+  return tag.substr(contentsStart, contentsEnd - contentsStart);
+}
+
+/**
+ * Strip the enclosing angle brackets from an XML tag token.
+ *
+ * \param str xml token to be stripped
+ * \return str without its first and last character when it starts with "<"
+ *         and ends with ">"; otherwise str unchanged
+ */
+std::string TrimXml(const std::string& str)
+{
+  const std::string::size_type len = str.size();
+  // Anything shorter than two characters cannot be a bracketed token.
+  const bool bracketed = len >= 2 && str[0] == '<' && str[len - 1] == '>';
+  return bracketed ? str.substr(1, len - 2) : str;
+}
+
+/**
+ * Tell whether a token is an XML tag, judged only by a leading "<".
+ *
+ * \param tag token to be checked
+ * \return true when the first character of tag is "<"; false for an empty
+ *         token
+ */
+bool isXmlTag(const std::string& tag)
+{
+  return !tag.empty() && tag[0] == '<';
+}
+
+/**
+ * Unescape XML special characters.
+ *
+ * Replaces these entities by the character they stand for:
+ *   &lt; <    &gt; >    &amp; &    &apos; '    &quot; "
+ *   &#91; [   &#93; ]   &bra; [    &ket; ]     &bar; |
+ * Other numeric character references (like &#xf6;) are not supported.
+ *
+ * \param str text that may contain entities
+ * \return str with every entity replaced
+ * \throws XmlException if an '&' starts an entity that is not listed above
+ *         or that has no terminating ';'
+ */
+std::string unescape(const std::string& str)
+{
+  std::string s;
+  s.reserve(str.size());
+  std::string::size_type start = 0;
+  std::string::size_type n;
+  while ((n = str.find('&', start)) != std::string::npos) {
+    // Copy the literal text in front of the entity.
+    s.append(str, start, n - start);
+    const std::string::size_type end = str.find(';', n);
+    if (end == std::string::npos) {
+      // Without this check a known name at the end of the text (e.g.
+      // "x &amp") would wrap start around to 0 and loop forever.
+      throw XmlException("unterminated XML escape sequence: " + str.substr(n));
+    }
+    const std::string name = str.substr(n + 1, end - n - 1);
+    if (name == "lt") {
+      s += '<';
+    } else if (name == "gt") {
+      s += '>';
+    } else if (name == "#91" || name == "bra") {
+      s += '[';
+    } else if (name == "#93" || name == "ket") {
+      s += ']';
+    } else if (name == "bar") {
+      s += '|';
+    } else if (name == "amp") {
+      s += '&';
+    } else if (name == "apos") {
+      s += '\'';
+    } else if (name == "quot") {
+      s += '"';
+    } else {
+      throw XmlException("unsupported XML escape sequence: &" + name + ";");
+    }
+    start = end + 1;
+  }
+  // Remainder after the last entity (empty when the text ends with one).
+  s.append(str, start, std::string::npos);
+  return s;
+}
+
+/**
+ * Split up the input character string into tokens made up of
+ * either XML tags or text.
+ * example: this <b> is a </b> test .
+ * => (this ), (<b>), ( is a ), (</b>), ( test .)
+ *
+ * If a "<" has no matching ">", an error is written to stderr and only the
+ * tokens completed before that point are returned.
+ *
+ * \param str input string
+ * \return the tokens in order of appearance
+ */
+std::vector<std::string> TokenizeXml(const std::string& str)
+{
+  std::vector<std::string> tokens;
+  std::string::size_type cursor = 0; // first character not yet tokenized
+
+  // walk through the string (loop over cursor)
+  while (cursor != str.size()) {
+    const std::string::size_type open = str.find('<', cursor);
+    if (open == std::string::npos) {
+      // no more tags: the rest is one text token
+      tokens.push_back(str.substr(cursor));
+      break;
+    }
+
+    // sanity check: there has to be a closing ">"
+    const std::string::size_type close = str.find('>', open);
+    if (close == std::string::npos) {
+      std::cerr << "ERROR: malformed XML: " << str << std::endl;
+      return tokens;
+    }
+
+    // text in front of the tag, if there is any
+    if (open != cursor) {
+      tokens.push_back(str.substr(cursor, open - cursor));
+    }
+
+    // the tag itself, brackets included
+    tokens.push_back(str.substr(open, close - open + 1));
+    cursor = close + 1;
+  }
+  return tokens;
+}
+
+/**
+ * Process a sentence with xml annotation: record every tagged span as a node
+ * of a syntax tree and strip the markup from the sentence.
+ *
+ * A line without any "<" is left untouched and reported as success.
+ * Otherwise each closed or unary tag yields one tree node. Its span runs from
+ * the word position at which the tag was opened to the word position at which
+ * it was closed, unless the tag has a span="i" or span="i-j" attribute, which
+ * takes precedence. The node's label comes from the "label" attribute and its
+ * PCFG score from the "pcfg" attribute (0 when absent).
+ *
+ * \param line in: sentence, out: sentence without the xml (only rewritten
+ *             when the function returns true)
+ * \param tree receives one node per completed tag
+ * \param labelCollection receives the label of every completed tag
+ * \param topLabelCollection for every node spanning the whole sentence, the
+ *             count stored under its label is incremented
+ * \return false, after writing an error to stderr, for an empty tag, a tag
+ *         that is both closing and unary, a closing tag without a matching
+ *         opening tag, a malformed span attribute, a span covering no word,
+ *         or tags left open at the end; true otherwise
+ *
+ * Text between tags is passed through unescape(), which throws XmlException
+ * for escape sequences it does not support.
+ *
+ * NOTE(review): on a false return, tree and labelCollection may already hold
+ * entries for the tags completed before the error.
+ * NOTE(review): when TokenizeXml() stops at an unclosed "<" it returns only
+ * the tokens completed so far; this function then appears to continue with
+ * them and can still return true - confirm whether that is intended.
+ */
+/*NOTE(review): the TODO below speaks of XML options and linking, which this
+ function does not produce; presumably left over from the code it was derived
+ from - confirm and remove.
+*/
+/*TODO: we'd only have to return a vector of XML options if we dropped linking. 2-d vector
+ is so we can link things up afterwards. We can't create TranslationOptions as we
+ parse because we don't have the completed source parsed until after this function
+ removes all the markup from it (CreateFromString in Sentence::Read).
+*/
+bool ProcessAndStripXMLTags(string &line, SyntaxTree &tree, set< string > &labelCollection, map< string, int > &topLabelCollection )
+{
+ //parse XML markup in translation line
+
+ // no xml tag? we're done.
+ if (line.find_first_of('<') == string::npos) {
+ return true;
+ }
+
+ // break up input into a vector of xml tags and text
+ // example: (this), (<b>), (is a), (</b>), (test .)
+ vector<string> xmlTokens = TokenizeXml(line);
+
+ // we need to store opened tags, until they are closed
+ // tags are stored as triples (tagname, startpos, contents)
+ typedef pair< string, pair< size_t, string > > OpenedTag;
+ vector< OpenedTag > tagStack; // stack that contains active opened tags
+
+ string cleanLine; // return string (text without xml)
+ size_t wordPos = 0; // position in sentence (in terms of number of words)
+ bool isLinked = false; // NOTE(review): never read in this function
+
+ // loop through the tokens
+ for (size_t xmlTokenPos = 0 ; xmlTokenPos < xmlTokens.size() ; xmlTokenPos++) {
+ // not a xml tag, but regular text (may contain many words)
+ if(!isXmlTag(xmlTokens[xmlTokenPos])) {
+ // add a space at boundary, if necessary
+ if (cleanLine.size()>0 &&
+ cleanLine[cleanLine.size() - 1] != ' ' &&
+ xmlTokens[xmlTokenPos][0] != ' ') {
+ cleanLine += " ";
+ }
+ cleanLine += unescape(xmlTokens[xmlTokenPos]); // add to output
+ // the whole cleaned text is re-tokenized to get the running word count
+ wordPos = Tokenize(cleanLine).size(); // count all the words
+ }
+
+ // process xml tag
+ else {
+ // *** get essential information about tag ***
+
+ // strip extra boundary spaces and "<" and ">"
+ string tag = Trim(TrimXml(xmlTokens[xmlTokenPos]));
+ // cerr << "XML TAG IS: " << tag << std::endl;
+
+ if (tag.size() == 0) {
+ cerr << "ERROR: empty tag name: " << line << endl;
+ return false;
+ }
+
+ // check if unary (e.g., "<wall/>")
+ bool isUnary = ( tag[tag.size() - 1] == '/' );
+
+ // check if closing tag (e.g. "</a>", not "<a>"); everything else,
+ // unary tags included, counts as opening
+ bool isClosed = ( tag[0] == '/' );
+ bool isOpen = !isClosed;
+
+ if (isClosed && isUnary) {
+ cerr << "ERROR: can't have both closed and unary tag <" << tag << ">: " << line << endl;
+ return false;
+ }
+
+ if (isClosed)
+ tag = tag.substr(1); // remove "/" at the beginning
+ if (isUnary)
+ tag = tag.substr(0,tag.size()-1); // remove "/" at the end
+
+ // find the tag name and contents: the name ends at the first space,
+ // everything after it is the attribute text
+ string::size_type endOfName = tag.find_first_of(' ');
+ string tagName = tag;
+ string tagContent = "";
+ if (endOfName != string::npos) {
+ tagName = tag.substr(0,endOfName);
+ tagContent = tag.substr(endOfName+1);
+ }
+
+ // *** process new tag ***
+
+ if (isOpen || isUnary) {
+ // put the tag on the tag stack
+ OpenedTag openedTag = make_pair( tagName, make_pair( wordPos, tagContent ) );
+ tagStack.push_back( openedTag );
+ // cerr << "XML TAG " << tagName << " (" << tagContent << ") added to stack, now size " << tagStack.size() << endl;
+ }
+
+ // *** process completed tag ***
+
+ // a unary tag was pushed just above and is popped again right here
+ if (isClosed || isUnary) {
+ // pop last opened tag from stack;
+ if (tagStack.size() == 0) {
+ cerr << "ERROR: tag " << tagName << " closed, but not opened" << ":" << line << endl;
+ return false;
+ }
+ OpenedTag openedTag = tagStack.back();
+ tagStack.pop_back();
+
+ // tag names have to match
+ if (openedTag.first != tagName) {
+ cerr << "ERROR: tag " << openedTag.first << " closed by tag " << tagName << ": " << line << endl;
+ return false;
+ }
+
+ // assemble remaining information about tag
+ // (this tagContent shadows the outer one: the attributes are taken
+ // from the opening tag, not from the closing one)
+ size_t startPos = openedTag.second.first;
+ string tagContent = openedTag.second.second;
+ size_t endPos = wordPos;
+
+ // span attribute overwrites position
+ string span = ParseXmlTagAttribute(tagContent,"span");
+ if (! span.empty()) {
+ vector<string> ij = Tokenize(span, "-");
+ if (ij.size() != 1 && ij.size() != 2) {
+ cerr << "ERROR: span attribute must be of the form \"i-j\" or \"i\": " << line << endl;
+ return false;
+ }
+ // endPos is exclusive here, hence the + 1
+ startPos = atoi(ij[0].c_str());
+ if (ij.size() == 1) endPos = startPos + 1;
+ else endPos = atoi(ij[1].c_str()) + 1;
+ }
+
+ // cerr << "XML TAG " << tagName << " (" << tagContent << ") spanning " << startPos << " to " << (endPos-1) << " complete, commence processing" << endl;
+
+ if (startPos >= endPos) {
+ cerr << "ERROR: tag " << tagName << " must span at least one word (" << startPos << "-" << endPos << "): " << line << endl;
+ return false;
+ }
+
+ // the label is recorded even when the attribute is absent (empty string)
+ string label = ParseXmlTagAttribute(tagContent,"label");
+ labelCollection.insert( label );
+
+ string pcfgString = ParseXmlTagAttribute(tagContent,"pcfg");
+ float pcfgScore = pcfgString == "" ? 0.0f
+ : std::atof(pcfgString.c_str());
+
+ // report what we have processed so far (disabled debug output)
+ if (0) {
+ cerr << "XML TAG NAME IS: '" << tagName << "'" << endl;
+ cerr << "XML TAG LABEL IS: '" << label << "'" << endl;
+ cerr << "XML SPAN IS: " << startPos << "-" << (endPos-1) << endl;
+ }
+ // the tree is given the last covered word, i.e. endPos-1
+ SyntaxNode *node = tree.AddNode( startPos, endPos-1, label );
+ node->SetPcfgScore(pcfgScore);
+ }
+ }
+ }
+ // we are done. check if there are tags that are still open
+ if (tagStack.size() > 0) {
+ cerr << "ERROR: some opened tags were never closed: " << line << endl;
+ return false;
+ }
+
+ // collect top labels: the nodes that span the whole sentence
+ // NOTE(review): wordPos-1 wraps around when wordPos is 0 - confirm that
+ // GetNodes() copes with the resulting end position.
+ const vector< SyntaxNode* >& topNodes = tree.GetNodes( 0, wordPos-1 );
+ for( vector< SyntaxNode* >::const_iterator node = topNodes.begin(); node != topNodes.end(); node++ ) {
+ SyntaxNode *n = *node;
+ const string &label = n->GetLabel();
+ if (topLabelCollection.find( label ) == topLabelCollection.end())
+ topLabelCollection[ label ] = 0;
+ topLabelCollection[ label ]++;
+ }
+
+ // return de-xml'ed sentence in line
+ line = cleanLine;
+ return true;
+}
diff --git a/phrase-extract/XmlTree.h b/phrase-extract/XmlTree.h
new file mode 100644
index 000000000..7e6bbecea
--- /dev/null
+++ b/phrase-extract/XmlTree.h
@@ -0,0 +1,36 @@
+// $Id: XmlOption.cpp 1960 2008-12-15 12:52:38Z phkoehn $
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2006 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+#include <string>
+#include <vector>
+#include <set>
+#include <map>
+#include "SyntaxTree.h"
+
+// Value of attributeName="..." inside tag; "" when the attribute is absent.
+std::string ParseXmlTagAttribute(const std::string& tag,const std::string& attributeName);
+// Copy of str without leading and trailing characters from dropChars.
+std::string Trim(const std::string& str, const std::string dropChars = " \t\n\r");
+// str without its enclosing "<" and ">"; unchanged when it has none.
+std::string TrimXml(const std::string& str);
+// True when tag starts with "<".
+bool isXmlTag(const std::string& tag);
+// Splits str into alternating text and "<...>" tag tokens.
+std::vector<std::string> TokenizeXml(const std::string& str);
+// Strips the XML markup from line and adds one node per tag to tree;
+// returns false when the markup is rejected.
+bool ProcessAndStripXMLTags(std::string &line, SyntaxTree &tree, std::set< std::string > &labelCollection, std::map< std::string, int > &topLabelCollection );
+// Replaces XML entities such as &lt; and &amp; by their characters; throws
+// XmlException for unsupported entities.
+std::string unescape(const std::string &str);
diff --git a/phrase-extract/consolidate-direct.cpp b/phrase-extract/consolidate-direct.cpp
new file mode 100644
index 000000000..e7e68e977
--- /dev/null
+++ b/phrase-extract/consolidate-direct.cpp
@@ -0,0 +1,138 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <string.h>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <iostream>
+#include <cstdlib>
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+
+#include "SafeGetline.h"
+
+#define LINE_MAX_LENGTH 10000
+
+using namespace std;
+
+char line[LINE_MAX_LENGTH];
+
+
+// Splits the global buffer `line` at every " ||| " separator and returns the
+// pieces in order.  The text after the last separator (or the whole line when
+// there is none) is always returned as the final piece.
+vector< string > splitLine()
+{
+  vector< string > fields;
+  int fieldStart = 0;
+  int pos = 0;
+  while (line[pos] != '\0') {
+    // strncmp stops at the terminating NUL, so this never reads past it.
+    if (strncmp(line + pos, " ||| ", 5) != 0) {
+      ++pos;
+      continue;
+    }
+    // Separators may share their boundary space (" ||| ||| "); the field
+    // between them is empty.
+    if (fieldStart > pos) fieldStart = pos; // empty item
+    fields.push_back( string( line + fieldStart, pos - fieldStart ) );
+    fieldStart = pos + 5;
+    // Resume at the separator's closing space, which may open the next one.
+    pos += 4;
+  }
+  fields.push_back( string( line + fieldStart, pos - fieldStart ) );
+
+  return fields;
+}
+
+// Reads the next line of fileP into the global buffer `line` and stores its
+// " ||| "-separated fields in item.  Returns false, leaving item untouched,
+// when the stream is at end-of-file before or after the read.
+bool getLine( istream &fileP, vector< string > &item )
+{
+  if (!fileP.eof()) {
+    SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (!fileP.eof()) {
+      item = splitLine();
+      return true;
+    }
+  }
+  return false;
+}
+
+
+/**
+ * Rewrites an extract file into phrase-table lines.
+ *
+ * argv[1] is the input file; argv[2] is the output file, or "-" for standard
+ * output.  Every input line must hold at least five " ||| "-separated
+ * fields; they are written back with a constant " 2.718" appended to the
+ * third field and "0" placed in front of the fifth.
+ *
+ * \return 0 on success, 1 when an input line has too few fields; problems
+ *         with the arguments or with opening a file end the program through
+ *         exit(1)
+ */
+int main(int argc, char* argv[])
+{
+  cerr << "Starting..." << endl;
+
+  // Both file names are read unconditionally below.
+  if (argc < 3) {
+    cerr << "ERROR: syntax: consolidate-direct <extract-file> <output-file|->" << endl;
+    exit(1);
+  }
+
+  char* &fileNameDirect = argv[1];
+  Moses::InputFileStream fileDirect(fileNameDirect);
+
+  if (fileDirect.fail()) {
+    cerr << "ERROR: could not open extract file " << fileNameDirect << endl;
+    exit(1);
+  }
+  istream &fileDirectP = fileDirect;
+
+  char* &fileNameConsolidated = argv[2];
+  ostream *fileConsolidated;
+
+  if (strcmp(fileNameConsolidated, "-") == 0) {
+    fileConsolidated = &cout;
+  }
+  else {
+    Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
+    bool success = outputFile->Open(fileNameConsolidated);
+    if (!success) {
+      cerr << "ERROR: could not open file phrase table file "
+           << fileNameConsolidated << endl;
+      exit(1);
+    }
+    fileConsolidated = outputFile;
+  }
+
+  int status = 0;
+  int i=0;
+  while(true) {
+    // progress markers on stderr
+    i++;
+    if (i%1000 == 0) cerr << "." << flush;
+    if (i%10000 == 0) cerr << ":" << flush;
+    if (i%100000 == 0) cerr << "!" << flush;
+
+    vector< string > itemDirect;
+    if (! getLine(fileDirectP, itemDirect ))
+      break;
+
+    // Fields 0..4 are indexed below; stop on a short line instead of reading
+    // past the end of the vector.  Leaving through the common exit path
+    // still flushes and closes the output.
+    if (itemDirect.size() < 5) {
+      cerr << "ERROR: line " << i << " of " << fileNameDirect << " has "
+           << itemDirect.size() << " fields, expected 5" << endl;
+      status = 1;
+      break;
+    }
+
+    (*fileConsolidated) << itemDirect[0] << " ||| " << itemDirect[1] << " ||| ";
+
+    // output alignment and probabilities
+    (*fileConsolidated) << itemDirect[2] // prob direct
+                        << " 2.718" // phrase count feature
+                        << " ||| " << itemDirect[3]; // alignment
+
+    // counts
+    (*fileConsolidated) << "||| 0 " << itemDirect[4]; // indirect
+    (*fileConsolidated) << endl;
+
+  }
+
+  fileConsolidated->flush();
+  if (fileConsolidated != &cout) {
+    delete fileConsolidated;
+  }
+
+  cerr << "Finished" << endl;
+  return status;
+}
+
diff --git a/phrase-extract/consolidate-direct.vcxproj b/phrase-extract/consolidate-direct.vcxproj
new file mode 100644
index 000000000..a84d2cce6
--- /dev/null
+++ b/phrase-extract/consolidate-direct.vcxproj
@@ -0,0 +1,98 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{33775109-60CF-4C1C-A869-5450B3DD88B3}</ProjectGuid>
+ <RootNamespace>consolidatedirect</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+  <!-- consolidate-direct.cpp uses Moses::OutputFileStream as well as
+       Moses::InputFileStream, so both implementation files are compiled
+       into this project. -->
+  <ItemGroup>
+    <ClCompile Include="consolidate-direct.cpp" />
+    <ClCompile Include="InputFileStream.cpp" />
+    <ClCompile Include="OutputFileStream.cpp" />
+  </ItemGroup>
+  <ItemGroup>
+    <ClInclude Include="InputFileStream.h" />
+    <ClInclude Include="OutputFileStream.h" />
+    <ClInclude Include="SafeGetline.h" />
+  </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/phrase-extract/consolidate-reverse.cpp b/phrase-extract/consolidate-reverse.cpp
new file mode 100644
index 000000000..c86d870c8
--- /dev/null
+++ b/phrase-extract/consolidate-reverse.cpp
@@ -0,0 +1,245 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <sstream>
+#include <cstdlib>
+#include <cstring>
+
+#include "tables-core.h"
+#include "SafeGetline.h"
+#include "InputFileStream.h"
+
+#define LINE_MAX_LENGTH 10000
+
+using namespace std;
+
+bool hierarchicalFlag = false;  // set by --Hierarchical
+bool onlyDirectFlag = false;  // set by --OnlyDirect
+bool phraseCountFlag = true;  // cleared by --NoPhraseCount
+bool logProbFlag = false;  // set by --LogProb
+char line[LINE_MAX_LENGTH];  // shared input buffer: passed to SAFE_GETLINE in getLine(), parsed by splitLine()
+
+void processFiles( char*, char*, char* );  // direct table, indirect table, consolidated output
+bool getLine( istream &fileP, vector< string > &item );  // returns false once fileP reports eof
+string reverseAlignment(const string &alignments);  // swaps the two sides of each "x-y" point
+vector< string > splitLine();  // splits the global line buffer at " ||| "
+
+/** Split str at any character contained in delimiters and append every
+ *  non-empty token to output.  Existing contents of output are kept.
+ */
+inline void Tokenize(std::vector<std::string> &output
+                     , const std::string& str
+                     , const std::string& delimiters = " \t")
+{
+  typedef std::string::size_type Index;
+  const Index none = std::string::npos;
+
+  // tokenBegin: first character of the current token.
+  // tokenEnd:   first delimiter after it (npos if the token runs to the end).
+  Index tokenBegin = str.find_first_not_of(delimiters, 0);
+  Index tokenEnd = str.find_first_of(delimiters, tokenBegin);
+
+  while (!(tokenEnd == none && tokenBegin == none)) {
+    output.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
+    // Step over the run of delimiters to the start of the next token.
+    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
+    tokenEnd = str.find_first_of(delimiters, tokenBegin);
+  }
+}
+
+/**
+ * Entry point: parses the command line and hands the three file names to
+ * processFiles().
+ *
+ * argv[1..3] are the direct table, the indirect table and the output file.
+ * Any further argument must be one of --Hierarchical, --OnlyDirect,
+ * --NoPhraseCount or --LogProb.  Exits with status 1 when fewer than three
+ * file names are given or an unknown option is seen.
+ */
+int main(int argc, char* argv[])
+{
+  cerr << "Consolidate v2.0 written by Philipp Koehn\n"
+       << "consolidating direct and indirect rule tables\n";
+
+  if (argc < 4) {
+    // the usage text lists every switch accepted by the loop below
+    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--NoPhraseCount] [--LogProb]\n";
+    exit(1);
+  }
+  char* &fileNameDirect = argv[1];
+  char* &fileNameIndirect = argv[2];
+  char* &fileNameConsolidated = argv[3];
+
+  for(int i=4; i<argc; i++) {
+    if (strcmp(argv[i],"--Hierarchical") == 0) {
+      hierarchicalFlag = true;
+      cerr << "processing hierarchical rules\n";
+    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
+      onlyDirectFlag = true;
+      cerr << "only including direct translation scores p(e|f)\n";
+    } else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
+      phraseCountFlag = false;
+      cerr << "not including the phrase count feature\n";
+    } else if (strcmp(argv[i],"--LogProb") == 0) {
+      logProbFlag = true;
+      cerr << "using log-probabilities\n";
+    } else {
+      cerr << "ERROR: unknown option " << argv[i] << endl;
+      exit(1);
+    }
+  }
+
+  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated );
+  return 0;
+}
+
+/**
+ * Reads the direct and the indirect table in lockstep (one line from each
+ * per iteration) and writes one consolidated line per pair.
+ *
+ * Every input line is split at " ||| ".  Fields 0 and 1 must be identical
+ * in both tables; field 2 is copied as the score, field 3 of the direct
+ * table is the alignment (written reversed) and field 4 holds the counts.
+ *
+ * Exits with status 1 if a file cannot be opened, if the two tables
+ * disagree on the phrase pair, or if a line has too few fields.
+ */
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated )
+{
+  // open input files
+  Moses::InputFileStream fileDirect(fileNameDirect);
+  Moses::InputFileStream fileIndirect(fileNameIndirect);
+
+  if (fileDirect.fail()) {
+    cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
+    exit(1);
+  }
+  istream &fileDirectP = fileDirect;
+
+  if (fileIndirect.fail()) {
+    cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
+    exit(1);
+  }
+  istream &fileIndirectP = fileIndirect;
+
+  // open output file: consolidated phrase table
+  ofstream fileConsolidated;
+  fileConsolidated.open(fileNameConsolidated);
+  if (fileConsolidated.fail()) {
+    cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
+    exit(1);
+  }
+
+  // loop through all extracted phrase translations
+  int i=0;
+  while(true) {
+    i++;
+    if (i%100000 == 0) cerr << "." << flush;
+
+    vector< string > itemDirect, itemIndirect;
+    if (! getLine(fileIndirectP,itemIndirect) ||
+        ! getLine(fileDirectP, itemDirect ))
+      break;
+
+    // direct: target source alignment probabilities
+    // indirect: source target probabilities
+
+    // fields 0..4 are indexed below; refuse short lines instead of reading
+    // past the end of the vectors
+    if (itemDirect.size() < 5 || itemIndirect.size() < 5) {
+      cerr << "ERROR: expected at least 5 fields in line " << i << endl;
+      exit(1);
+    }
+
+    // consistency checks
+    if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
+      cerr << "ERROR: target phrase does not match in line " << i << ": '"
+           << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
+      exit(1);
+    }
+
+    if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
+      cerr << "ERROR: source phrase does not match in line " << i << ": '"
+           << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
+      exit(1);
+    }
+
+    // output hierarchical phrase pair (with separated labels)
+    fileConsolidated << itemDirect[1] << " ||| " << itemDirect[0];
+
+    // probs
+    fileConsolidated << " ||| ";
+    if (!onlyDirectFlag) {
+      fileConsolidated << itemDirect[2]; // prob indirect
+    }
+    fileConsolidated << " " << itemIndirect[2]; // prob direct
+    if (phraseCountFlag) {
+      fileConsolidated << " " << (logProbFlag ? 1 : 2.718); // phrase count feature
+    }
+
+    // alignment
+    fileConsolidated << " ||| " << reverseAlignment(itemDirect[3]);
+
+    // counts, for debugging
+    vector<string> directCounts = tokenize(itemDirect[4].c_str());
+    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+    // element 0 of both vectors is written unconditionally
+    if (directCounts.empty() || indirectCounts.empty()) {
+      cerr << "ERROR: missing counts in line " << i << endl;
+      exit(1);
+    }
+    fileConsolidated << "||| " << directCounts[0] << " " << indirectCounts[0];
+    // output rule count if present in either file
+    if (indirectCounts.size() > 1) {
+      fileConsolidated << " " << indirectCounts[1];
+    } else if (directCounts.size() > 1) {
+      fileConsolidated << " " << directCounts[1];
+    }
+
+    fileConsolidated << endl;
+  }
+  fileDirect.Close();
+  fileIndirect.Close();
+  fileConsolidated.close();
+}
+
+/**
+ * Read one line from fileP (via the SAFE_GETLINE macro, which is handed the
+ * global line buffer) and store its " ||| "-separated fields in item.
+ *
+ * Returns false, leaving item untouched, when the stream reports eof either
+ * before or after the read; returns true otherwise.
+ * NOTE(review): a final line without a trailing newline presumably sets eof
+ * during the read and is therefore dropped -- confirm against SAFE_GETLINE.
+ */
+bool getLine( istream &fileP, vector< string > &item )
+{
+  if (!fileP.eof()) {
+    SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (!fileP.eof()) {
+      item = splitLine();
+      return true;
+    }
+  }
+  return false;
+}
+
+/**
+ * Split the global line buffer at every " ||| " separator.
+ *
+ * Returns the fields in order of appearance; there is always one more field
+ * than separators found, and fields may be empty.
+ */
+vector< string > splitLine()
+{
+  vector< string > item;
+  int start=0;
+  int i=0;
+  for(; line[i] != '\0'; i++) {
+    // the && chain stops at the first mismatch, so the look-ahead never
+    // reads beyond the terminating '\0'
+    if (line[i] == ' ' &&
+        line[i+1] == '|' &&
+        line[i+2] == '|' &&
+        line[i+3] == '|' &&
+        line[i+4] == ' ') {
+      if (start > i) start = i; // empty item
+      item.push_back( string( line+start, i-start ) );
+      start = i+5;
+      // together with the loop's i++ this resumes at the separator's closing
+      // space, which may itself open the next separator
+      i += 3;
+    }
+  }
+  item.push_back( string( line+start, i-start ) );
+
+  return item;
+}
+
+/**
+ * Swap the two sides of every alignment point in alignments.
+ *
+ * The input is broken into points with tokenize() (from tables-core.h);
+ * each point is split at '-' and written back as "second-first" followed by
+ * one space, e.g. a point "3-5" is emitted as "5-3 ".
+ *
+ * Exits with status 1 if a point does not consist of exactly two parts.
+ */
+string reverseAlignment(const string &alignments)
+{
+  stringstream ret("");
+
+  vector<string> alignToks = tokenize(alignments.c_str());
+
+  for (size_t i = 0; i < alignToks.size(); ++i) {
+    const string &alignPair = alignToks[i];
+    vector<string> alignPoints;
+    Tokenize(alignPoints, alignPair, "-");
+
+    // checked explicitly instead of with assert(): assert is compiled out
+    // when NDEBUG is defined, and alignPoints[1] would then be read out of
+    // bounds for a malformed point
+    if (alignPoints.size() != 2) {
+      cerr << "ERROR: malformed alignment point '" << alignPair << "'" << endl;
+      exit(1);
+    }
+
+    ret << alignPoints[1] << "-" << alignPoints[0] << " ";
+  }
+
+  return ret.str();
+}
+
+
diff --git a/phrase-extract/consolidate.cpp b/phrase-extract/consolidate.cpp
new file mode 100644
index 000000000..17aa11846
--- /dev/null
+++ b/phrase-extract/consolidate.cpp
@@ -0,0 +1,326 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <cstdlib>
+#include <cstring>
+
+#include "tables-core.h"
+#include "SafeGetline.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+
+#define LINE_MAX_LENGTH 10000
+
+using namespace std;
+
+bool hierarchicalFlag = false;  // set by --Hierarchical
+bool onlyDirectFlag = false;  // set by --OnlyDirect
+bool phraseCountFlag = true;  // cleared by --NoPhraseCount
+bool lowCountFlag = false;  // set by --LowCountFeature
+bool goodTuringFlag = false;  // set by --GoodTuring <count-of-counts file>
+bool kneserNeyFlag = false;  // set by --KneserNey <count-of-counts file>
+bool logProbFlag = false;  // set by --LogProb
+bool outputNTLengths = false;  // set by --OutputNTLengths
+/** Returns log(a) when logProbFlag is set, otherwise a unchanged. */
+inline float maybeLogProb( float a )
+{
+  if (logProbFlag) {
+    return log(a);
+  }
+  return a;
+}
+
+char line[LINE_MAX_LENGTH];  // shared input buffer: passed to SAFE_GETLINE, parsed by splitLine() and atof()
+void processFiles( char*, char*, char*, char* );  // direct table, indirect table, consolidated output, count-of-counts file
+void loadCountOfCounts( char* );  // argument: name of the count-of-counts file
+bool getLine( istream &fileP, vector< string > &item );  // returns false once fileP reports eof
+vector< string > splitLine();  // splits the global line buffer at " ||| "
+
+/**
+ * Entry point: parses the command line and hands the file names to
+ * processFiles().
+ *
+ * argv[1..3] are the direct table, the indirect table and the output file.
+ * Further arguments select features; --GoodTuring and --KneserNey each take
+ * the name of a count-of-counts file as the following argument.
+ * Exits with status 1 on too few arguments, an unknown option, or a
+ * discounting option without its file name.
+ */
+int main(int argc, char* argv[])
+{
+  cerr << "Consolidate v2.0 written by Philipp Koehn\n"
+       << "consolidating direct and indirect rule tables\n";
+
+  if (argc < 4) {
+    // the usage text lists every switch accepted by the loop below
+    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--NoPhraseCount] [--GoodTuring count-of-counts] [--KneserNey count-of-counts] [--LowCountFeature] [--LogProb] [--OutputNTLengths] \n";
+    exit(1);
+  }
+  char* &fileNameDirect = argv[1];
+  char* &fileNameIndirect = argv[2];
+  char* &fileNameConsolidated = argv[3];
+  // stays NULL unless a discounting option supplies a file name; it is passed
+  // to processFiles() in every case, so it must not be left uninitialized
+  char* fileNameCountOfCounts = NULL;
+
+  for(int i=4; i<argc; i++) {
+    if (strcmp(argv[i],"--Hierarchical") == 0) {
+      hierarchicalFlag = true;
+      cerr << "processing hierarchical rules\n";
+    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
+      onlyDirectFlag = true;
+      cerr << "only including direct translation scores p(e|f)\n";
+    } else if (strcmp(argv[i],"--NoPhraseCount") == 0) {
+      phraseCountFlag = false;
+      cerr << "not including the phrase count feature\n";
+    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
+      goodTuringFlag = true;
+      if (i+1==argc) {
+        cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
+        exit(1);
+      }
+      fileNameCountOfCounts = argv[++i];
+      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+    } else if (strcmp(argv[i],"--KneserNey") == 0) {
+      kneserNeyFlag = true;
+      if (i+1==argc) {
+        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
+        exit(1);
+      }
+      fileNameCountOfCounts = argv[++i];
+      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+    } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
+      lowCountFlag = true;
+      cerr << "including the low count feature\n";
+    } else if (strcmp(argv[i],"--LogProb") == 0) {
+      logProbFlag = true;
+      cerr << "using log-probabilities\n";
+    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
+      outputNTLengths = true;
+    } else {
+      cerr << "ERROR: unknown option " << argv[i] << endl;
+      exit(1);
+    }
+  }
+
+  processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts );
+  return 0;
+}
+
+vector< float > countOfCounts;       // values read from the file, starting at index 1; index 0 is a 0.0 placeholder
+vector< float > goodTuringDiscount;  // discount factor per count; filled only when goodTuringFlag is set
+float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
+
+/**
+ * Load the count-of-counts file and derive the discounting parameters.
+ *
+ * The first line read is stored in totalCount, every further line is
+ * appended to countOfCounts.  With goodTuringFlag set the Good Turing
+ * discounts are computed; with kneserNeyFlag set the three Kneser Ney
+ * discounts are computed.
+ *
+ * Exits with status 1 if the file cannot be opened, or if Kneser Ney
+ * discounting is requested and the file provides fewer than four counts.
+ */
+void loadCountOfCounts( char* fileNameCountOfCounts )
+{
+  Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
+  if (fileCountOfCounts.fail()) {
+    cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
+    exit(1);
+  }
+  istream &fileP = fileCountOfCounts;
+
+  countOfCounts.push_back(0.0);
+  while(1) {
+    if (fileP.eof()) break;
+    SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (fileP.eof()) break;
+    if (totalCount < 0)
+      totalCount = atof(line); // total number of distinct phrase pairs
+    else
+      countOfCounts.push_back( atof(line) );
+  }
+  fileCountOfCounts.Close();
+
+  // compute Good Turing discounts
+  if (goodTuringFlag) {
+    goodTuringDiscount.push_back(0.01); // floor value
+    for( size_t i=1; i<countOfCounts.size()-1; i++ ) {
+      goodTuringDiscount.push_back(((float)i+1)/(float)i*((countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)));
+      // keep the discounts at most 1 and non-decreasing in i
+      if (goodTuringDiscount[i]>1)
+        goodTuringDiscount[i] = 1;
+      if (goodTuringDiscount[i]<goodTuringDiscount[i-1])
+        goodTuringDiscount[i] = goodTuringDiscount[i-1];
+    }
+  }
+
+  // compute Kneser Ney co-efficients [Chen&Goodman, 1998]
+  // Only done on request: the formulas read countOfCounts[1..4], which a
+  // shorter file does not provide, and the results are used only when
+  // kneserNeyFlag is set.
+  if (kneserNeyFlag) {
+    if (countOfCounts.size() < 5) {
+      cerr << "ERROR: count of counts file " << fileNameCountOfCounts
+           << " needs at least 4 counts for Kneser Ney discounting" << endl;
+      exit(1);
+    }
+    float Y = countOfCounts[1] / (countOfCounts[1] + 2*countOfCounts[2]);
+    kneserNey_D1 = 1 - 2*Y * countOfCounts[2] / countOfCounts[1];
+    kneserNey_D2 = 2 - 3*Y * countOfCounts[3] / countOfCounts[2];
+    kneserNey_D3 = 3 - 4*Y * countOfCounts[4] / countOfCounts[3];
+    // sanity constraints
+    if (kneserNey_D1 > 0.9) kneserNey_D1 = 0.9;
+    if (kneserNey_D2 > 1.9) kneserNey_D2 = 1.9;
+    if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
+  }
+}
+
+/**
+ * Reads the direct and the indirect table in lockstep (one line from each
+ * per iteration), recomputes the translation probabilities from the counts
+ * and writes one consolidated line per pair.
+ *
+ * Every input line is split at " ||| ".  Fields 0 and 1 must be identical
+ * in both tables, field 2 is copied as a score, field 3 of the direct table
+ * is the alignment, field 4 holds the counts and field 5 of the direct
+ * table is appended when outputNTLengths is set.  When goodTuringFlag or
+ * kneserNeyFlag is set, fileNameCountOfCounts is loaded first and the joint
+ * count is discounted accordingly.
+ *
+ * Exits with status 1 if a file cannot be opened, if the two tables
+ * disagree on the phrase pair, or if a line lacks a field or count that is
+ * needed.
+ */
+void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
+{
+  if (goodTuringFlag || kneserNeyFlag)
+    loadCountOfCounts( fileNameCountOfCounts );
+
+  // open input files
+  Moses::InputFileStream fileDirect(fileNameDirect);
+  Moses::InputFileStream fileIndirect(fileNameIndirect);
+
+  if (fileDirect.fail()) {
+    cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
+    exit(1);
+  }
+  istream &fileDirectP = fileDirect;
+
+  if (fileIndirect.fail()) {
+    cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
+    exit(1);
+  }
+  istream &fileIndirectP = fileIndirect;
+
+  // open output file: consolidated phrase table
+  Moses::OutputFileStream fileConsolidated;
+  bool success = fileConsolidated.Open(fileNameConsolidated);
+  if (!success) {
+    cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
+    exit(1);
+  }
+
+  // number of entries that are indexed below; depends on the active options
+  const size_t minDirectFields = outputNTLengths ? 6 : 5;
+  const size_t minIndirectFields = 5;
+  const size_t minDirectCounts = kneserNeyFlag ? 3 : 1;
+  const size_t minIndirectCounts = kneserNeyFlag ? 3 : 2;
+
+  // loop through all extracted phrase translations
+  int i=0;
+  while(true) {
+    i++;
+    if (i%100000 == 0) cerr << "." << flush;
+
+    vector< string > itemDirect, itemIndirect;
+    if (! getLine(fileIndirectP,itemIndirect) ||
+        ! getLine(fileDirectP, itemDirect ))
+      break;
+
+    // direct: target source alignment probabilities
+    // indirect: source target probabilities
+
+    // refuse short lines instead of reading past the end of the vectors
+    if (itemDirect.size() < minDirectFields || itemIndirect.size() < minIndirectFields) {
+      cerr << "ERROR: too few fields in line " << i << endl;
+      exit(1);
+    }
+
+    // consistency checks
+    if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
+      cerr << "ERROR: target phrase does not match in line " << i << ": '"
+           << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
+      exit(1);
+    }
+
+    if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
+      cerr << "ERROR: source phrase does not match in line " << i << ": '"
+           << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
+      exit(1);
+    }
+
+    // output hierarchical phrase pair (with separated labels)
+    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
+
+    // SCORES ...
+    fileConsolidated << " |||";
+    vector<string> directCounts = tokenize(itemDirect[4].c_str());
+    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
+    if (directCounts.size() < minDirectCounts || indirectCounts.size() < minIndirectCounts) {
+      cerr << "ERROR: too few counts in line " << i << endl;
+      exit(1);
+    }
+    float countF = atof(directCounts[0].c_str());
+    float countE = atof(indirectCounts[0].c_str());
+    float countEF = atof(indirectCounts[1].c_str());
+    // third count of each table; read and used only for Kneser Ney
+    float n1_F = 0, n1_E = 0;
+    if (kneserNeyFlag) {
+      n1_F = atof(directCounts[2].c_str());
+      n1_E = atof(indirectCounts[2].c_str());
+    }
+
+    // Good Turing discounting
+    float adjustedCountEF = countEF;
+    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
+      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
+    float adjustedCountEF_indirect = adjustedCountEF;
+
+    // Kneser Ney discounting [Foster et al, 2006]
+    if (kneserNeyFlag) {
+      // D1 for counts below 2, D2 for counts below 3, D3 otherwise; the
+      // "else" keeps the second test from replacing D1 with D2
+      float D = kneserNey_D3;
+      if (countEF < 2) D = kneserNey_D1;
+      else if (countEF < 3) D = kneserNey_D2;
+      if (D > countEF) D = countEF - 0.01; // sanity constraint
+
+      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
+      float alpha_F = D * n1_F / countF; // available mass
+      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;
+
+      // for indirect
+      float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
+      float alpha_E = D * n1_E / countE; // available mass
+      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
+    }
+
+    // prob indirect
+    if (!onlyDirectFlag) {
+      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
+      fileConsolidated << " " << itemIndirect[2];
+    }
+
+    // prob direct
+    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
+    fileConsolidated << " " << itemDirect[2];
+
+    // phrase count feature
+    if (phraseCountFlag) {
+      fileConsolidated << " " << maybeLogProb(2.718);
+    }
+
+    // low count feature
+    if (lowCountFlag) {
+      fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
+    }
+
+    // alignment
+    fileConsolidated << " ||| " << itemDirect[3];
+
+    // counts, for debugging
+    fileConsolidated << "||| " << countE << " " << countF; // << " " << countEF;
+
+    if (outputNTLengths) {
+      fileConsolidated << " ||| " << itemDirect[5];
+    }
+
+    fileConsolidated << endl;
+  }
+  fileDirect.Close();
+  fileIndirect.Close();
+  fileConsolidated.Close();
+}
+
+
+/**
+ * Read one line from fileP (via the SAFE_GETLINE macro, which is handed the
+ * global line buffer) and store its " ||| "-separated fields in item.
+ *
+ * Returns true when a line was read and split.  Returns false, leaving item
+ * untouched, when the stream reports eof before or after the read.
+ * NOTE(review): a final line without a trailing newline presumably sets eof
+ * during the read and is therefore dropped -- confirm against SAFE_GETLINE.
+ */
+bool getLine( istream &fileP, vector< string > &item )
+{
+  bool gotLine = false;
+  if (!fileP.eof()) {
+    SAFE_GETLINE((fileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    gotLine = !fileP.eof();
+    if (gotLine) {
+      item = splitLine();
+    }
+  }
+  return gotLine;
+}
+
+/**
+ * Split the global line buffer at every " ||| " separator.
+ *
+ * Returns the fields in order of appearance; there is always one more field
+ * than separators found, and fields may be empty.
+ */
+vector< string > splitLine()
+{
+  vector< string > item;
+  int start=0;
+  int i=0;
+  for(; line[i] != '\0'; i++) {
+    // the && chain stops at the first mismatch, so the look-ahead never
+    // reads beyond the terminating '\0'
+    if (line[i] == ' ' &&
+        line[i+1] == '|' &&
+        line[i+2] == '|' &&
+        line[i+3] == '|' &&
+        line[i+4] == ' ') {
+      if (start > i) start = i; // empty item
+      item.push_back( string( line+start, i-start ) );
+      start = i+5;
+      // together with the loop's i++ this resumes at the separator's closing
+      // space, which may itself open the next separator
+      i += 3;
+    }
+  }
+  item.push_back( string( line+start, i-start ) );
+
+  return item;
+}
diff --git a/phrase-extract/consolidate.vcxproj b/phrase-extract/consolidate.vcxproj
new file mode 100644
index 000000000..1e77a90f0
--- /dev/null
+++ b/phrase-extract/consolidate.vcxproj
@@ -0,0 +1,100 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{45410EFE-65C0-4078-82E4-D636258F9225}</ProjectGuid>
+ <RootNamespace>consolidate</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="consolidate.cpp" />
+ <ClCompile Include="InputFileStream.cpp" />
+ <ClCompile Include="tables-core.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="InputFileStream.h" />
+ <ClInclude Include="SafeGetline.h" />
+ <ClInclude Include="tables-core.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/extract-ghkm/Alignment.cpp
new file mode 100644
index 000000000..fcd5e14e1
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Alignment.cpp
@@ -0,0 +1,74 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "Alignment.h"
+
+#include "Exception.h"
+
+#include <cassert>
+#include <cstdlib>
+
+namespace Moses {
+namespace GHKM {
+
+// Parses a word alignment written as space-separated "src-tgt" index pairs
+// (e.g. "0-0 1-2") and returns the pairs in input order.
+//
+// A string containing no digit yields an empty alignment.  Characters that
+// follow the last pair and contain no digit (such as a trailing space) are
+// ignored.  Throws Exception if a pair lacks the '-' separator or the target
+// index.
+Alignment ReadAlignment(const std::string &s)
+{
+  Alignment a;
+
+  const std::string digits = "0123456789";
+
+  std::string::size_type begin = s.find_first_of(digits);
+  if (begin == std::string::npos) {
+    // Empty word alignments are allowed
+    return a;
+  }
+
+  while (true) {
+    // Source index: everything from begin up to the '-'.
+    std::string::size_type end = s.find("-", begin);
+    if (end == std::string::npos) {
+      throw Exception("Alignment separator '-' missing");
+    }
+    int src = std::atoi(s.substr(begin, end-begin).c_str());
+
+    // Target index: from the first digit after the '-' up to the next space
+    // (or the end of the string).
+    begin = s.find_first_of(digits, end);
+    if (begin == std::string::npos) {
+      throw Exception("Target index missing");
+    }
+
+    end = s.find(" ", begin);
+    int tgt;
+    if (end == std::string::npos) {
+      tgt = std::atoi(s.substr(begin).c_str());
+    } else {
+      tgt = std::atoi(s.substr(begin, end-begin).c_str());
+    }
+
+    a.push_back(std::make_pair(src, tgt));
+
+    if (end == std::string::npos) {
+      break;
+    }
+    begin = s.find_first_of(digits, end);
+    if (begin == std::string::npos) {
+      // Nothing but non-digit characters is left, so there is no further
+      // pair; looping again would wrongly report a missing '-'.
+      break;
+    }
+  }
+
+  return a;
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/extract-ghkm/Alignment.h
new file mode 100644
index 000000000..bc42191e1
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Alignment.h
@@ -0,0 +1,38 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_ALIGNMENT_H_
+#define EXTRACT_GHKM_ALIGNMENT_H_
+
+#include <string>
+#include <utility>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+typedef std::vector<std::pair<int, int> > Alignment;  // one (source index, target index) pair per alignment point
+
+Alignment ReadAlignment(const std::string &);  // parses "src-tgt" pairs; throws Exception on a missing '-' or target index
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.cpp b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
new file mode 100644
index 000000000..6bd32a13b
--- /dev/null
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.cpp
@@ -0,0 +1,386 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "AlignmentGraph.h"
+
+#include "ComposedRule.h"
+#include "Node.h"
+#include "Options.h"
+#include "ParseTree.h"
+#include "Subgraph.h"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <stack>
+
+namespace Moses {
+namespace GHKM {
+
+// Builds the alignment graph for one sentence pair.
+//
+//   t  target-side parse tree; its nodes are copied into the graph
+//   s  source-side tokens; one SOURCE node is created per token
+//   a  word alignment as (source index, target leaf index) pairs
+//
+// Throws std::out_of_range (from std::vector::at) if an alignment point
+// refers to a source word or target leaf that does not exist, i.e. if the
+// alignment does not match the sentence pair.
+AlignmentGraph::AlignmentGraph(const ParseTree *t,
+                               const std::vector<std::string> &s,
+                               const Alignment &a)
+{
+  // Copy the parse tree nodes and add them to m_targetNodes.
+  m_root = CopyParseTree(t);
+
+  // Create a node for each source word.
+  m_sourceNodes.reserve(s.size());
+  for (std::vector<std::string>::const_iterator p(s.begin());
+       p != s.end(); ++p) {
+    m_sourceNodes.push_back(new Node(*p, SOURCE));
+  }
+
+  // Connect source nodes to parse tree leaves according to the given word
+  // alignment.
+  std::vector<Node *> targetTreeLeaves;
+  GetTargetTreeLeaves(m_root, targetTreeLeaves);
+  try {
+    for (Alignment::const_iterator p(a.begin()); p != a.end(); ++p) {
+      // The indices come straight from the alignment file, so use the
+      // bounds-checked at() instead of operator[]: a bad index becomes an
+      // exception rather than an out-of-bounds read.
+      Node *src = m_sourceNodes.at(p->first);
+      Node *tgt = targetTreeLeaves.at(p->second);
+      src->AddParent(tgt);
+      tgt->AddChild(src);
+    }
+  } catch (...) {
+    // The destructor does not run when a constructor throws, so release the
+    // nodes allocated above before propagating the error.
+    for (std::vector<Node *>::iterator q(m_sourceNodes.begin());
+         q != m_sourceNodes.end(); ++q) {
+      delete *q;
+    }
+    for (std::vector<Node *>::iterator q(m_targetNodes.begin());
+         q != m_targetNodes.end(); ++q) {
+      delete *q;
+    }
+    throw;
+  }
+
+  // Attach unaligned source words (if any).
+  AttachUnalignedSourceWords();
+
+  // Populate node spans.
+  std::vector<Node *>::const_iterator p(m_sourceNodes.begin());
+  for (int i = 0; p != m_sourceNodes.end(); ++p, ++i) {
+    (*p)->PropagateIndex(i);
+  }
+
+  // Calculate complement spans.
+  CalcComplementSpans(m_root);
+}
+
+// Deletes every node held by the graph: first the source word nodes, then
+// the nodes copied from the target parse tree.
+AlignmentGraph::~AlignmentGraph()
+{
+  for (size_t i = 0; i < m_sourceNodes.size(); ++i) {
+    delete m_sourceNodes[i];
+  }
+  for (size_t i = 0; i < m_targetNodes.size(); ++i) {
+    delete m_targetNodes[i];
+  }
+}
+
+// Computes the minimal frontier graph fragment rooted at `root`.
+//
+// Starting from root, nodes are expanded downwards.  Expansion stops at sink
+// nodes and at frontier-set members that are not TARGET nodes; the nodes at
+// which it stops become the leaves of the returned Subgraph.
+Subgraph AlignmentGraph::ComputeMinimalFrontierGraphFragment(
+    Node *root,
+    const std::set<Node *> &frontierSet)
+{
+  std::set<const Node *> fragmentLeaves;
+  std::stack<Node *> pending;
+
+  // A sink root cannot be expanded: it is the fragment's only leaf.
+  if (root->IsSink()) {
+    fragmentLeaves.insert(root);
+  } else {
+    pending.push(root);
+  }
+
+  while (!pending.empty()) {
+    Node *current = pending.top();
+    pending.pop();
+
+    const std::vector<Node *> &kids = current->GetChildren();
+    for (size_t i = 0; i < kids.size(); ++i) {
+      Node *kid = kids[i];
+      // Keep expanding through non-sink nodes that are either outside the
+      // frontier set or are TARGET nodes (those still need their source
+      // word).  Everything else terminates the fragment.
+      const bool expand =
+          !kid->IsSink()
+          && (frontierSet.find(kid) == frontierSet.end()
+              || kid->GetType() == TARGET);
+      if (expand) {
+        pending.push(kid);
+      } else {
+        fragmentLeaves.insert(kid);
+      }
+    }
+  }
+
+  return Subgraph(root, fragmentLeaves);
+}
+
+// Extracts the minimal rule for each frontier node.
+//
+// The frontier set is computed from the graph root under the given options;
+// for every frontier node the minimal frontier graph fragment is formed and,
+// if the node is a TREE node with a non-empty span, a heap-allocated copy of
+// the fragment is handed to the node via AddRule().
+void AlignmentGraph::ExtractMinimalRules(const Options &options)
+{
+  // Determine which nodes are frontier nodes.
+  std::set<Node *> frontierSet;
+  ComputeFrontierSet(m_root, options, frontierSet);
+
+  // Form the minimal frontier graph fragment rooted at each frontier node.
+  // (The fragments are consumed immediately, so no intermediate container is
+  // needed.)
+  for (std::set<Node *>::iterator p(frontierSet.begin());
+       p != frontierSet.end(); ++p) {
+    Node *root = *p;
+    Subgraph fragment = ComputeMinimalFrontierGraphFragment(root, frontierSet);
+    assert(!fragment.IsTrivial());
+    // Can it form an SCFG rule?
+    // FIXME Does this exclude non-lexical unary rules?
+    if (root->GetType() == TREE && !root->GetSpan().empty()) {
+      root->AddRule(new Subgraph(fragment));
+    }
+  }
+}
+
+// Extracts composed rules for the whole graph, starting from the root.
+void AlignmentGraph::ExtractComposedRules(const Options &options)
+{
+  ExtractComposedRules(m_root, options);
+}
+
+// Extracts the composed rules rooted at `node`, after first doing the same
+// for all of its children.  Each new rule is added to node's rule set; the
+// limits applied to compositions are those checked by
+// ComposedRule::AttemptComposition().
+void AlignmentGraph::ExtractComposedRules(Node *node, const Options &options)
+{
+  // Children first: compositions at this node build on the rules (minimal
+  // and composed) already stored on the nodes below it.
+  const std::vector<Node *> &kids = node->GetChildren();
+  for (size_t i = 0; i < kids.size(); ++i) {
+    ExtractComposedRules(kids[i], options);
+  }
+
+  // Without a minimal rule for this node there is nothing to compose.
+  const std::vector<const Subgraph*> &minimalRules = node->GetRules();
+  assert(minimalRules.size() <= 1);
+  if (minimalRules.empty()) {
+    return;
+  }
+
+  // The minimal rule is the initial composition candidate.
+  ComposedRule seed(*(minimalRules[0]));
+  if (!seed.GetOpenAttachmentPoint()) {
+    // No composition possible.
+    return;
+  }
+
+  std::queue<ComposedRule> candidates;
+  candidates.push(seed);
+  while (!candidates.empty()) {
+    ComposedRule candidate = candidates.front();
+    candidates.pop();
+    const Node *attachmentPoint = candidate.GetOpenAttachmentPoint();
+    assert(attachmentPoint);
+    assert(attachmentPoint != node);
+    // Try to compose the candidate with every rule (minimal or composed)
+    // rooted at its first open attachment point.
+    const std::vector<const Subgraph*> &attachable =
+        attachmentPoint->GetRules();
+    typedef std::vector<const Subgraph*>::const_iterator RuleIter;
+    for (RuleIter r = attachable.begin(); r != attachable.end(); ++r) {
+      assert((*r)->GetRoot()->GetType() == TREE);
+      ComposedRule *composed = candidate.AttemptComposition(**r, options);
+      if (!composed) {
+        continue;
+      }
+      node->AddRule(new Subgraph(composed->CreateSubgraph()));
+      // A composition that still has open attachment points is itself a
+      // candidate for further composition.
+      if (composed->GetOpenAttachmentPoint()) {
+        candidates.push(*composed);
+      }
+      delete composed;
+    }
+    // Done with this attachment point. Advance to the next, if any.
+    candidate.CloseAttachmentPoint();
+    if (candidate.GetOpenAttachmentPoint()) {
+      candidates.push(candidate);
+    }
+  }
+}
+
+// Recursively copies the parse (sub)tree rooted at `root` into graph nodes.
+//
+// A leaf becomes a TARGET node; any other node becomes a TREE node and also
+// receives the parse tree node's PCFG score.  Every node created is appended
+// to m_targetNodes (children before their parent).  Returns the copy of
+// `root`.
+Node *AlignmentGraph::CopyParseTree(const ParseTree *root)
+{
+  const bool isLeaf = root->IsLeaf();
+
+  // Hold the new node in an auto_ptr until it has been fully wired up.
+  std::auto_ptr<Node> copy(new Node(root->GetLabel(), isLeaf ? TARGET : TREE));
+  if (!isLeaf) {
+    copy->SetPcfgScore(root->GetPcfgScore());
+  }
+
+  const std::vector<ParseTree *> &subtrees = root->GetChildren();
+  std::vector<Node *> copiedChildren;
+  copiedChildren.reserve(subtrees.size());
+  for (size_t i = 0; i < subtrees.size(); ++i) {
+    Node *childCopy = CopyParseTree(subtrees[i]);
+    childCopy->AddParent(copy.get());
+    copiedChildren.push_back(childCopy);
+  }
+  copy->SetChildren(copiedChildren);
+
+  Node *result = copy.release();
+  m_targetNodes.push_back(result);
+  return result;
+}
+
+// Collects the frontier nodes of the subtree rooted at `root` into
+// frontierSet.  The definition of a frontier node used here differs from that
+// of Galley et al. (2004) in the following ways:
+//
+// 1. A node with an empty span is not a frontier node (this excludes
+//    unaligned target subtrees).
+// 2. Target word nodes are not frontier nodes.
+// 3. Source word nodes are not frontier nodes.
+// 4. Unless the --AllowUnary option is used, a node is not a frontier node if
+//    it has the same span as its parent.
+void AlignmentGraph::ComputeFrontierSet(Node *root,
+                                        const Options &options,
+                                        std::set<Node *> &frontierSet) const
+{
+  // Word nodes and unaligned target subtrees are never frontier nodes, and
+  // the search does not descend below them.
+  const bool isTreeNode = (root->GetType() == TREE);
+  if (!isTreeNode || root->GetSpan().empty()) {
+    return;
+  }
+
+  const bool overlapsComplement =
+      SpansIntersect(root->GetComplementSpan(), Closure(root->GetSpan()));
+  if (!overlapsComplement) {
+    // Unless unary rules are explicitly allowed, we use Chung et al's (2011)
+    // modified definition of a frontier node to eliminate the production of
+    // non-lexical unary rules.
+    const std::vector<Node *> &parents = root->GetParents();
+    assert(parents.size() <= 1);
+    const bool accept = options.allowUnary
+                        || parents.empty()
+                        || parents[0]->GetSpan() != root->GetSpan();
+    if (accept) {
+      frontierSet.insert(root);
+    }
+  }
+
+  const std::vector<Node *> &kids = root->GetChildren();
+  for (size_t i = 0; i < kids.size(); ++i) {
+    ComputeFrontierSet(kids[i], options, frontierSet);
+  }
+}
+
+// Sets the complement span of `root` and then, recursively, of every node
+// below it.  A node's complement span is the union of its parents' complement
+// spans and the spans of its siblings (the other children of its parents).
+void AlignmentGraph::CalcComplementSpans(Node *root)
+{
+  Span complement;
+  std::set<Node *> siblings;
+
+  // Inherit the parents' complement spans and gather all of their children.
+  const std::vector<Node *> &parents = root->GetParents();
+  for (size_t i = 0; i < parents.size(); ++i) {
+    Node *parent = parents[i];
+    const Span &inherited = parent->GetComplementSpan();
+    complement.insert(inherited.begin(), inherited.end());
+    const std::vector<Node *> &parentsChildren = parent->GetChildren();
+    siblings.insert(parentsChildren.begin(), parentsChildren.end());
+  }
+
+  // Add the span of every sibling; the node itself is in the gathered set
+  // too and has to be left out.
+  for (std::set<Node *>::iterator s(siblings.begin());
+       s != siblings.end(); ++s) {
+    if (*s != root) {
+      const Span &siblingSpan = (*s)->GetSpan();
+      complement.insert(siblingSpan.begin(), siblingSpan.end());
+    }
+  }
+
+  root->SetComplementSpan(complement);
+
+  // Parents are processed before children, so each child sees its parents'
+  // final complement spans.
+  const std::vector<Node *> &kids = root->GetChildren();
+  for (size_t i = 0; i < kids.size(); ++i) {
+    CalcComplementSpans(kids[i]);
+  }
+}
+
+// Appends the sink nodes of the subtree rooted at `root` to `leaves`, in
+// left-to-right (depth-first, child order) sequence.
+void AlignmentGraph::GetTargetTreeLeaves(Node *root,
+                                         std::vector<Node *> &leaves)
+{
+  if (root->IsSink()) {
+    leaves.push_back(root);
+    return;
+  }
+
+  const std::vector<Node *> &kids = root->GetChildren();
+  for (size_t i = 0; i < kids.size(); ++i) {
+    GetTargetTreeLeaves(kids[i], leaves);
+  }
+}
+
+// Attaches every source word that has no parent (i.e. no alignment) to the
+// node chosen by DetermineAttachmentPoint().
+void AlignmentGraph::AttachUnalignedSourceWords()
+{
+  // First pass: record the indices of the unaligned source words.  This is
+  // done before any attachment because attaching gives a word a parent.
+  std::vector<int> unalignedIndices;
+  for (size_t i = 0; i < m_sourceNodes.size(); ++i) {
+    if (m_sourceNodes[i]->GetParents().empty()) {
+      unalignedIndices.push_back(static_cast<int>(i));
+    }
+  }
+
+  // Second pass: attach them in ascending index order.  Note that a word
+  // attached earlier in this loop already has a parent when the attachment
+  // point of a later word is determined.
+  for (size_t k = 0; k < unalignedIndices.size(); ++k) {
+    const int wordIndex = unalignedIndices[k];
+    Node *attachTo = DetermineAttachmentPoint(wordIndex);
+    Node *word = m_sourceNodes[wordIndex];
+    attachTo->AddChild(word);
+    word->AddParent(attachTo);
+  }
+}
+
+// Chooses the tree node to which the unaligned source word at position
+// `index` is attached.
+//
+// If the word has no source word with a parent on its left or none on its
+// right, the root is returned.  Otherwise the result is the lowest common
+// ancestor of the parents of the nearest such left and right neighbours (or
+// that ancestor's parent if the ancestor is itself a target word).
+Node *AlignmentGraph::DetermineAttachmentPoint(int index)
+{
+  const size_t wordCount = m_sourceNodes.size();
+
+  // Find the nearest neighbour to the left that has a parent, if any.
+  int left = index - 1;
+  while (left >= 0 && m_sourceNodes[left]->GetParents().empty()) {
+    --left;
+  }
+  // None to the left, so attach to the root.
+  if (left == -1) {
+    return m_root;
+  }
+
+  // Find the nearest neighbour to the right that has a parent, if any.
+  size_t right = index;
+  ++right;
+  while (right < wordCount && m_sourceNodes[right]->GetParents().empty()) {
+    ++right;
+  }
+  // None to the right, so attach to the root.
+  if (right == wordCount) {
+    return m_root;
+  }
+
+  // Gather the nodes that the two neighbours are attached to.
+  const std::vector<Node *> &leftParents = m_sourceNodes[left]->GetParents();
+  assert(!leftParents.empty());
+  const std::vector<Node *> &rightParents = m_sourceNodes[right]->GetParents();
+  assert(!rightParents.empty());
+  std::set<Node *> targetSet(leftParents.begin(), leftParents.end());
+  targetSet.insert(rightParents.begin(), rightParents.end());
+
+  // The attachment point is the lowest common ancestor of those nodes,
+  // unless the LCA is itself a target word, in which case the LCA's parent
+  // is used instead.  This avoids introducing new word alignments.  It
+  // assumes that the parse tree uses preterminals for parts of speech.
+  Node *lca = Node::LowestCommonAncestor(targetSet.begin(), targetSet.end());
+  if (lca->GetType() != TARGET) {
+    return lca;
+  }
+  assert(lca->GetParents().size() == 1);
+  return lca->GetParents()[0];
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/AlignmentGraph.h b/phrase-extract/extract-ghkm/AlignmentGraph.h
new file mode 100644
index 000000000..94948758a
--- /dev/null
+++ b/phrase-extract/extract-ghkm/AlignmentGraph.h
@@ -0,0 +1,76 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_ALIGNMENT_GRAPH_H_
+#define EXTRACT_GHKM_ALIGNMENT_GRAPH_H_
+
+#include "Alignment.h"
+#include "Options.h"
+
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+class ParseTree;
+class Subgraph;
+
+// Graph built from a target-side parse tree, the source words and the word
+// alignment between them.  GHKM rules are extracted from the graph and stored
+// on its nodes.  The graph deletes its source and target nodes on
+// destruction and cannot be copied.
+class AlignmentGraph
+{
+ public:
+  // Builds the graph from the target parse tree, the source tokens and the
+  // word alignment (in that order).
+  AlignmentGraph(const ParseTree *,
+                 const std::vector<std::string> &,
+                 const Alignment &);
+
+  // Deletes all source and target nodes.
+  ~AlignmentGraph();
+
+  // Root of the copied target tree.
+  Node *GetRoot() {
+    return m_root;
+  }
+
+  // All nodes copied from the target parse tree.
+  const std::vector<Node *> &GetTargetNodes() {
+    return m_targetNodes;
+  }
+
+  // Rule extraction; the extracted rules are added to the nodes' rule sets.
+  void ExtractMinimalRules(const Options &);
+  void ExtractComposedRules(const Options &);
+
+ private:
+  // Disallow copying
+  AlignmentGraph(const AlignmentGraph &);
+  AlignmentGraph &operator=(const AlignmentGraph &);
+
+  // Construction helpers.
+  Node *CopyParseTree(const ParseTree *);
+  void ComputeFrontierSet(Node *, const Options &, std::set<Node *> &) const;
+  void CalcComplementSpans(Node *);
+  void GetTargetTreeLeaves(Node *, std::vector<Node *> &);
+  void AttachUnalignedSourceWords();
+  Node *DetermineAttachmentPoint(int);
+
+  // Extraction helpers.
+  Subgraph ComputeMinimalFrontierGraphFragment(Node *,
+                                               const std::set<Node *> &);
+  void ExtractComposedRules(Node *, const Options &);
+
+  Node *m_root;                       // root of the copied target tree
+  std::vector<Node *> m_sourceNodes;  // one node per source word
+  std::vector<Node *> m_targetNodes;  // every node of the copied target tree
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/ComposedRule.cpp b/phrase-extract/extract-ghkm/ComposedRule.cpp
new file mode 100644
index 000000000..8bf3cfc72
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ComposedRule.cpp
@@ -0,0 +1,129 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ComposedRule.h"
+
+#include "Node.h"
+#include "Options.h"
+#include "Subgraph.h"
+
+#include <set>
+#include <vector>
+#include <queue>
+
+namespace Moses {
+namespace GHKM {
+
+// Forms a 'trivial' composed rule from a single existing rule.  Depth, size
+// and node count are taken from the base rule, and each of its TREE leaves
+// is queued (in leaf-set order) as an open attachment point.
+ComposedRule::ComposedRule(const Subgraph &baseRule)
+    : m_baseRule(baseRule)
+    , m_depth(baseRule.GetDepth())
+    , m_size(baseRule.GetSize())
+    , m_nodeCount(baseRule.GetNodeCount())
+{
+  typedef std::set<const Node *> LeafSet;
+  const LeafSet &leaves = baseRule.GetLeaves();
+  for (LeafSet::const_iterator leaf = leaves.begin();
+       leaf != leaves.end(); ++leaf) {
+    if ((*leaf)->GetType() != TREE) {
+      continue;
+    }
+    m_openAttachmentPoints.push(*leaf);
+  }
+}
+
+// Forms the composed rule that results from attaching `rule` at the first
+// open attachment point of `other`.  `depth` is the depth of the new rule as
+// computed by the caller.  The node count is reduced by one -- presumably
+// because the attached rule's root is the attachment point itself and is
+// already counted (TODO confirm).
+ComposedRule::ComposedRule(const ComposedRule &other, const Subgraph &rule,
+                           int depth)
+    : m_baseRule(other.m_baseRule)
+    , m_attachedRules(other.m_attachedRules)
+    , m_openAttachmentPoints(other.m_openAttachmentPoints)
+    , m_depth(depth)
+    , m_size(other.m_size + rule.GetSize())
+    , m_nodeCount(other.m_nodeCount + rule.GetNodeCount() - 1)
+{
+  // The first open attachment point is now filled by `rule`.
+  m_openAttachmentPoints.pop();
+  m_attachedRules.push_back(&rule);
+}
+
+// Returns the first open attachment point, or 0 if none remain.
+const Node *ComposedRule::GetOpenAttachmentPoint()
+{
+  if (m_openAttachmentPoints.empty()) {
+    return 0;
+  }
+  return m_openAttachmentPoints.front();
+}
+
+// Closes the first open attachment point without attaching a rule to it.
+// A null entry is recorded in m_attachedRules so that attachment points and
+// attached rules stay aligned by position.
+void ComposedRule::CloseAttachmentPoint()
+{
+  assert(!m_openAttachmentPoints.empty());
+  m_openAttachmentPoints.pop();
+  m_attachedRules.push_back(0);
+}
+
+// Tries to attach `rule` at the first open attachment point.
+//
+// Returns a newly allocated ComposedRule (to be deleted by the caller), or 0
+// if the result would exceed options.maxNodes, options.maxRuleSize or
+// options.maxRuleDepth.
+ComposedRule *ComposedRule::AttemptComposition(const Subgraph &rule,
+                                               const Options &options) const
+{
+  // The smallest possible rule fragment should be rooted at a tree node.
+  // Note that this differs from the original GHKM definition.
+  assert(rule.GetRoot()->GetType() == TREE);
+
+  // Reject the proposed rule if its node count or its size is over the limit.
+  if (m_nodeCount+rule.GetNodeCount()-1 > options.maxNodes
+      || m_size+rule.GetSize() > options.maxRuleSize) {
+    return 0;
+  }
+
+  // The attachment point's depth is the number of parent links between the
+  // root of `rule` and the root of the base rule.
+  int attachmentPointDepth = 0;
+  for (const Node *n = rule.GetRoot(); n != m_baseRule.GetRoot();
+       n = n->GetParents()[0]) {
+    assert(n->GetParents().size() == 1);
+    ++attachmentPointDepth;
+  }
+
+  // Reject the proposed rule if it would be too deep.
+  const int newDepth =
+      std::max(m_depth, attachmentPointDepth+rule.GetDepth());
+  if (newDepth > options.maxRuleDepth) {
+    return 0;
+  }
+
+  return new ComposedRule(*this, rule, newDepth);
+}
+
+// Constructs the Subgraph corresponding to this composed rule.
+//
+// It has the base rule's root.  Its leaves are the base rule's leaves, except
+// that a TREE leaf to which a rule has been attached is replaced by the
+// leaves of that attached rule.  The i-th TREE leaf (in leaf-set order) is
+// matched with the i-th entry of m_attachedRules; a null entry or a missing
+// entry means nothing is attached there.
+Subgraph ComposedRule::CreateSubgraph()
+{
+  typedef std::set<const Node *> LeafSet;
+
+  LeafSet leaves;
+  const LeafSet &baseLeaves = m_baseRule.GetLeaves();
+  size_t nextAttachment = 0;
+  for (LeafSet::const_iterator it = baseLeaves.begin();
+       it != baseLeaves.end(); ++it) {
+    const Node *baseLeaf = *it;
+
+    // Look up the rule attached at this leaf, if it is an attachment point.
+    const Subgraph *attached = 0;
+    if (baseLeaf->GetType() == TREE
+        && nextAttachment < m_attachedRules.size()) {
+      attached = m_attachedRules[nextAttachment];
+      ++nextAttachment;
+    }
+
+    if (attached) {
+      leaves.insert(attached->GetLeaves().begin(),
+                    attached->GetLeaves().end());
+    } else {
+      leaves.insert(baseLeaf);
+    }
+  }
+  return Subgraph(m_baseRule.GetRoot(), leaves);
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/ComposedRule.h b/phrase-extract/extract-ghkm/ComposedRule.h
new file mode 100644
index 000000000..65ce9ac70
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ComposedRule.h
@@ -0,0 +1,70 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_COMPOSED_RULE_H_
+#define EXTRACT_GHKM_COMPOSED_RULE_H_
+
+#include "Subgraph.h"
+
+#include <vector>
+#include <queue>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+struct Options;
+
+// A rule under composition: a base rule together with the rules attached so
+// far at its TREE leaves (the attachment points), which are processed in
+// order.  The base rule and the attached rules are referenced, not copied.
+class ComposedRule
+{
+ public:
+  // Form a 'trivial' ComposedRule from a single existing rule.
+  ComposedRule(const Subgraph &baseRule);
+
+  // Returns the first open attachment point if any exist or 0 otherwise.
+  const Node *GetOpenAttachmentPoint();
+
+  // Close the first open attachment point without attaching a rule.
+  void CloseAttachmentPoint();
+
+  // Attempts to produce a new composed rule by attaching a given rule at the
+  // first open attachment point.  This will fail if the proposed rule
+  // violates the constraints set in the Options object, in which case the
+  // function returns 0.  On success the result is allocated with new.
+  ComposedRule *AttemptComposition(const Subgraph &, const Options &) const;
+
+  // Constructs a Subgraph object corresponding to the composed rule.
+  Subgraph CreateSubgraph();
+
+ private:
+  // Used by AttemptComposition: copy of an existing composed rule with one
+  // more rule attached and the given depth.
+  ComposedRule(const ComposedRule &, const Subgraph &, int);
+
+  const Subgraph &m_baseRule;                        // rule being extended
+  std::vector<const Subgraph *> m_attachedRules;     // 0 = closed, unattached
+  std::queue<const Node *> m_openAttachmentPoints;   // remaining TREE leaves
+  int m_depth;
+  int m_size;
+  int m_nodeCount;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/Exception.h b/phrase-extract/extract-ghkm/Exception.h
new file mode 100644
index 000000000..9928785f0
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Exception.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_EXCEPTION_H_
+#define EXTRACT_GHKM_EXCEPTION_H_
+
+#include <string>
+
+namespace Moses {
+namespace GHKM {
+
+class Exception
+{
+ public:
+ Exception(const char *msg) : m_msg(msg) {}
+ Exception(const std::string &msg) : m_msg(msg) {}
+ const std::string &GetMsg() const { return m_msg; }
+ private:
+ std::string m_msg;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
new file mode 100644
index 000000000..dae876116
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -0,0 +1,492 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ExtractGHKM.h"
+
+#include "Alignment.h"
+#include "AlignmentGraph.h"
+#include "Exception.h"
+#include "InputFileStream.h"
+#include "Node.h"
+#include "OutputFileStream.h"
+#include "Options.h"
+#include "ParseTree.h"
+#include "ScfgRule.h"
+#include "ScfgRuleWriter.h"
+#include "Span.h"
+#include "XmlTreeParser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// Entry point of the GHKM rule extractor.
+//
+// Processes the command-line options, then reads the target parse trees,
+// source sentences and word alignments in parallel, one line each per
+// sentence pair.  For every sentence pair an AlignmentGraph is built, minimal
+// rules (and, unless options.minimal is set, composed rules) are extracted,
+// and the rules whose scope does not exceed options.maxScope are written to
+// the forward and inverse extract files.  Lines with an empty target tree or
+// without alignment points are skipped with a message on stderr.  If
+// requested, a glue grammar and unknown word labels are written after all
+// input has been processed.  Returns 0.
+int ExtractGHKM::Main(int argc, char *argv[])
+{
+  // Process command-line options.
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Open input files.
+  InputFileStream targetStream(options.targetFile);
+  InputFileStream sourceStream(options.sourceFile);
+  InputFileStream alignmentStream(options.alignmentFile);
+
+  // Open output files.
+  OutputFileStream fwdExtractStream;
+  OutputFileStream invExtractStream;
+  std::ofstream glueGrammarStream;
+  std::ofstream unknownWordStream;
+  std::string fwdFileName = options.extractFile;
+  std::string invFileName = options.extractFile + std::string(".inv");
+  if (options.gzOutput) {
+    fwdFileName += ".gz";
+    invFileName += ".gz";
+  }
+  OpenOutputFileOrDie(fwdFileName, fwdExtractStream);
+  OpenOutputFileOrDie(invFileName, invExtractStream);
+  if (!options.glueGrammarFile.empty()) {
+    OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
+  }
+  if (!options.unknownWordFile.empty()) {
+    OpenOutputFileOrDie(options.unknownWordFile, unknownWordStream);
+  }
+
+  // Target label sets for producing glue grammar.
+  std::set<std::string> labelSet;
+  std::map<std::string, int> topLabelSet;
+
+  // Word count statistics for producing unknown word labels.
+  std::map<std::string, int> wordCount;
+  std::map<std::string, std::string> wordLabel;
+
+  std::string targetLine;
+  std::string sourceLine;
+  std::string alignmentLine;
+  XmlTreeParser xmlTreeParser(labelSet, topLabelSet);
+  ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options);
+  size_t lineNum = 0;
+  while (true) {
+    // Test whether each read succeeded rather than testing eof(): getline()
+    // also sets eofbit when it reads a final line that has no trailing
+    // newline, so an eof() test would drop that line (or report a spurious
+    // line count mismatch if only some of the files end without a newline).
+    const bool haveTarget = !std::getline(targetStream, targetLine).fail();
+    const bool haveSource = !std::getline(sourceStream, sourceLine).fail();
+    const bool haveAlignment =
+        !std::getline(alignmentStream, alignmentLine).fail();
+
+    if (!haveTarget && !haveSource && !haveAlignment) {
+      break;
+    }
+
+    if (!haveTarget || !haveSource || !haveAlignment) {
+      Error("Files must contain same number of lines");
+    }
+
+    ++lineNum;
+
+    // Parse target tree.
+    if (targetLine.size() == 0) {
+      std::cerr << "skipping line " << lineNum << " with empty target tree\n";
+      continue;
+    }
+    std::auto_ptr<ParseTree> t;
+    try {
+      t = xmlTreeParser.Parse(targetLine);
+      assert(t.get());
+    } catch (const Exception &e) {
+      std::ostringstream s;
+      s << "Failed to parse XML tree at line " << lineNum;
+      if (!e.GetMsg().empty()) {
+        s << ": " << e.GetMsg();
+      }
+      Error(s.str());
+    }
+
+    // Read source tokens.
+    std::vector<std::string> sourceTokens(ReadTokens(sourceLine));
+
+    // Read word alignments.
+    Alignment alignment;
+    try {
+      alignment = ReadAlignment(alignmentLine);
+    } catch (const Exception &e) {
+      std::ostringstream s;
+      s << "Failed to read alignment at line " << lineNum << ": ";
+      s << e.GetMsg();
+      Error(s.str());
+    }
+    if (alignment.size() == 0) {
+      std::cerr << "skipping line " << lineNum << " without alignment points\n";
+      continue;
+    }
+
+    // Record word counts.
+    if (!options.unknownWordFile.empty()) {
+      CollectWordLabelCounts(*t, wordCount, wordLabel);
+    }
+
+    // Form an alignment graph from the target tree, source words, and
+    // alignment.
+    AlignmentGraph graph(t.get(), sourceTokens, alignment);
+
+    // Extract minimal rules, adding each rule to its root node's rule set.
+    graph.ExtractMinimalRules(options);
+
+    // Extract composed rules.
+    if (!options.minimal) {
+      graph.ExtractComposedRules(options);
+    }
+
+    // Write the rules, subject to scope pruning.
+    const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
+    for (std::vector<Node *>::const_iterator p = targetNodes.begin();
+         p != targetNodes.end(); ++p) {
+      const std::vector<const Subgraph *> &rules = (*p)->GetRules();
+      for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
+           q != rules.end(); ++q) {
+        ScfgRule r(**q);
+        // TODO Can scope pruning be done earlier?
+        if (r.Scope() <= options.maxScope) {
+          writer.Write(r);
+        }
+      }
+    }
+  }
+
+  if (!options.glueGrammarFile.empty()) {
+    WriteGlueGrammar(labelSet, topLabelSet, glueGrammarStream);
+  }
+
+  if (!options.unknownWordFile.empty()) {
+    WriteUnknownWordLabel(wordCount, wordLabel, unknownWordStream);
+  }
+
+  return 0;
+}
+
+// Opens `filename` for reading on `stream`; reports the failure through
+// Error() if the stream is not usable afterwards.
+void ExtractGHKM::OpenInputFileOrDie(const std::string &filename,
+                                     std::ifstream &stream)
+{
+  stream.open(filename.c_str());
+  if (stream) {
+    return;
+  }
+  std::ostringstream msg;
+  msg << "failed to open input file: " << filename;
+  Error(msg.str());
+}
+
+// Opens `filename` for writing on `stream`; reports the failure through
+// Error() if the stream is not usable afterwards.
+void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename,
+                                      std::ofstream &stream)
+{
+  stream.open(filename.c_str());
+  if (stream) {
+    return;
+  }
+  std::ostringstream msg;
+  msg << "failed to open output file: " << filename;
+  Error(msg.str());
+}
+
+// Opens `filename` on an OutputFileStream; reports the failure through
+// Error() if Open() returns false.
+void ExtractGHKM::OpenOutputFileOrDie(const std::string &filename,
+                                      OutputFileStream &stream)
+{
+  const bool opened = stream.Open(filename);
+  if (opened) {
+    return;
+  }
+  std::ostringstream msg;
+  msg << "failed to open output file: " << filename;
+  Error(msg.str());
+}
+
+void ExtractGHKM::ProcessOptions(int argc, char *argv[],
+ Options &options) const
+{
+ namespace po = boost::program_options;
+ namespace cls = boost::program_options::command_line_style;
+
+ // Construct the 'top' of the usage message: the bit that comes before the
+ // options list.
+ std::ostringstream usageTop;
+ usageTop << "Usage: " << GetName()
+ << " [OPTION]... TARGET SOURCE ALIGNMENT EXTRACT\n\n"
+ << "SCFG rule extractor based on the GHKM algorithm described in\n"
+ << "Galley et al. (2004).\n\n"
+ << "Options";
+
+ // Construct the 'bottom' of the usage message.
+ std::ostringstream usageBottom;
+ usageBottom << "\nImplementation Notes:\n"
+ << "\nThe parse tree is assumed to contain part-of-speech preterminal nodes.\n"
+ << "\n"
+ << "For the composed rule constraints: rule depth is the maximum distance from the\nrule's root node to a sink node, not counting preterminal expansions or word\nalignments. Rule size is the measure defined in DeNeefe et al (2007): the\nnumber of non-part-of-speech, non-leaf constituent labels in the target tree.\nNode count is the number of target tree nodes (excluding target words).\n"
+ << "\n"
+ << "Scope pruning (Hopkins and Langmead, 2010) is applied to both minimal and\ncomposed rules.\n"
+ << "\n"
+ << "Unaligned source words are attached to the tree using the following heuristic:\nif there are aligned source words to both the left and the right of an unaligned\nsource word then it is attached to the lowest common ancestor of its nearest\nsuch left and right neighbours. Otherwise, it is attached to the root of the\nparse tree.\n"
+ << "\n"
+ << "Unless the --AllowUnary option is given, unary rules containing no lexical\nsource items are eliminated using the method described in Chung et al. (2011).\nThe parsing algorithm used in Moses is unable to handle such rules.\n"
+ << "\n"
+ << "References:\n"
+ << "Galley, M., Hopkins, M., Knight, K., and Marcu, D. (2004)\n"
+ << "\"What's in a Translation Rule?\", In Proceedings of HLT/NAACL 2004.\n"
+ << "\n"
+ << "DeNeefe, S., Knight, K., Wang, W., and Marcu, D. (2007)\n"
+ << "\"What Can Syntax-Based MT Learn from Phrase-Based MT?\", In Proceedings of\nEMNLP-CoNLL 2007.\n"
+ << "\n"
+ << "Hopkins, M. and Langmead, G. (2010)\n"
+ << "\"SCFG Decoding Without Binarization\", In Proceedings of EMNLP 2010.\n"
+ << "\n"
+ << "Chung, T. and Fang, L. and Gildea, D. (2011)\n"
+ << "\"Issues Concerning Decoding with Synchronous Context-free Grammar\", In\nProceedings of ACL/HLT 2011.";
+
+ // Declare the command line options that are visible to the user.
+ po::options_description visible(usageTop.str());
+ visible.add_options()
+ //("help", "print this help message and exit")
+ ("AllowUnary",
+ "allow fully non-lexical unary rules")
+ ("ConditionOnTargetLHS",
+ "write target LHS instead of \"X\" as source LHS")
+ ("GlueGrammar",
+ po::value(&options.glueGrammarFile),
+ "write glue grammar to named file")
+ ("GZOutput",
+ "write gzipped extract files")
+ ("MaxNodes",
+ po::value(&options.maxNodes)->default_value(options.maxNodes),
+ "set maximum number of tree nodes for composed rules")
+ ("MaxRuleDepth",
+ po::value(&options.maxRuleDepth)->default_value(options.maxRuleDepth),
+ "set maximum depth for composed rules")
+ ("MaxRuleSize",
+ po::value(&options.maxRuleSize)->default_value(options.maxRuleSize),
+ "set maximum size for composed rules")
+ ("MaxScope",
+ po::value(&options.maxScope)->default_value(options.maxScope),
+ "set maximum allowed scope")
+ ("Minimal",
+ "extract minimal rules only")
+ ("PCFG",
+ "include score based on PCFG scores in target corpus")
+ ("UnknownWordLabel",
+ po::value(&options.unknownWordFile),
+ "write unknown word labels to named file")
+ ("UnpairedExtractFormat",
+ "do not pair non-terminals in extract files")
+ ;
+
+ // Declare the command line options that are hidden from the user
+ // (these are used as positional options).
+ po::options_description hidden("Hidden options");
+ hidden.add_options()
+ ("TargetFile",
+ po::value(&options.targetFile),
+ "target file")
+ ("SourceFile",
+ po::value(&options.sourceFile),
+ "source file")
+ ("AlignmentFile",
+ po::value(&options.alignmentFile),
+ "alignment file")
+ ("ExtractFile",
+ po::value(&options.extractFile),
+ "extract file")
+ ;
+
+ // Compose the full set of command-line options.
+ po::options_description cmdLineOptions;
+ cmdLineOptions.add(visible).add(hidden);
+
+ // Register the positional options.
+ po::positional_options_description p;
+ p.add("TargetFile", 1);
+ p.add("SourceFile", 1);
+ p.add("AlignmentFile", 1);
+ p.add("ExtractFile", 1);
+
+ // Process the command-line.
+ po::variables_map vm;
+ const int optionStyle = cls::allow_long
+ | cls::long_allow_adjacent
+ | cls::long_allow_next;
+ try {
+ po::store(po::command_line_parser(argc, argv).style(optionStyle).
+ options(cmdLineOptions).positional(p).run(), vm);
+ po::notify(vm);
+ } catch (const std::exception &e) {
+ std::ostringstream msg;
+ msg << e.what() << "\n\n" << visible << usageBottom.str();
+ Error(msg.str());
+ }
+
+ if (vm.count("help")) {
+ std::cout << visible << usageBottom.str() << std::endl;
+ std::exit(0);
+ }
+
+ // Check all positional options were given.
+ if (!vm.count("TargetFile") ||
+ !vm.count("SourceFile") ||
+ !vm.count("AlignmentFile") ||
+ !vm.count("ExtractFile")) {
+ std::ostringstream msg;
+ std::cerr << visible << usageBottom.str() << std::endl;
+ std::exit(1);
+ }
+
+ // Process Boolean options.
+ if (vm.count("AllowUnary")) {
+ options.allowUnary = true;
+ }
+ if (vm.count("ConditionOnTargetLHS")) {
+ options.conditionOnTargetLhs = true;
+ }
+ if (vm.count("GZOutput")) {
+ options.gzOutput = true;
+ }
+ if (vm.count("Minimal")) {
+ options.minimal = true;
+ }
+ if (vm.count("PCFG")) {
+ options.pcfg = true;
+ }
+ if (vm.count("UnpairedExtractFormat")) {
+ options.unpairedExtractFormat = true;
+ }
+}
+
+// Reports a fatal error and terminates the program.  The message is written
+// to standard error as "<tool name>: <msg>" and the process then exits with
+// status 1, so this function never returns to its caller.
+void ExtractGHKM::Error(const std::string &msg) const
+{
+  const std::string &toolName = GetName();
+  std::cerr << toolName;
+  std::cerr << ": ";
+  std::cerr << msg;
+  std::cerr << std::endl;
+  std::exit(1);
+}
+
+// Splits a line into tokens separated by runs of spaces and/or tabs.
+// Leading and trailing whitespace is ignored.
+//
+// An empty or all-whitespace line yields an empty vector.  (Previously this
+// case was guarded only by an assert, so builds with NDEBUG would go on to
+// call substr(npos), which throws std::out_of_range.)
+std::vector<std::string> ExtractGHKM::ReadTokens(const std::string &s)
+{
+  std::vector<std::string> tokens;
+
+  const std::string whitespace = " \t";
+
+  // 'begin' always indexes the first character of the next token, or is npos
+  // once no further token exists.
+  std::string::size_type begin = s.find_first_not_of(whitespace);
+  while (begin != std::string::npos) {
+    const std::string::size_type end = s.find_first_of(whitespace, begin);
+    if (end == std::string::npos) {
+      // The final token runs to the end of the line.
+      tokens.push_back(s.substr(begin));
+      break;
+    }
+    tokens.push_back(s.substr(begin, end-begin));
+    begin = s.find_first_not_of(whitespace, end);
+  }
+
+  return tokens;
+}
+
+// Writes a glue grammar to 'out'.
+//
+// labelSet     labels for which a glue rule is written; also consulted when
+//              choosing a top label that does not clash with them.
+// topLabelSet  labels for which a top rule is written (only the keys of the
+//              map are used here).
+// out          destination stream; one rule per line.
+void ExtractGHKM::WriteGlueGrammar(
+    const std::set<std::string> &labelSet,
+    const std::map<std::string, int> &topLabelSet,
+    std::ostream &out)
+{
+  typedef std::map<std::string, int>::const_iterator TopLabelIter;
+  typedef std::set<std::string>::const_iterator LabelIter;
+
+  // Choose the shortest prefix of "QQQQQQ" that is not already a label.  If
+  // every prefix is taken, the full six-character string is used regardless.
+  const std::string candidate = "QQQQQQ";
+  std::string topLabel = candidate;
+  for (std::string::size_type len = 1; len <= candidate.length(); ++len) {
+    const std::string prefix = candidate.substr(0, len);
+    if (labelSet.count(prefix) == 0) {
+      topLabel = prefix;
+      break;
+    }
+  }
+
+  // Basic rules.
+  out << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << std::endl;
+  out << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel
+      << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << std::endl;
+
+  // Top rules: one per entry of topLabelSet.
+  for (TopLabelIter top = topLabelSet.begin(); top != topLabelSet.end();
+       ++top) {
+    const std::string &label = top->first;
+    out << "<s> [X][" << label << "] </s> [X] ||| <s> [X][" << label
+        << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << std::endl;
+  }
+
+  // Glue rules: one per entry of labelSet.
+  for (LabelIter it = labelSet.begin(); it != labelSet.end(); ++it) {
+    const std::string &label = *it;
+    out << "[X][" << topLabel << "] [X][" << label << "] [X] ||| [X]["
+        << topLabel << "] [X][" << label << "] [" << topLabel
+        << "] ||| 2.718 ||| 0-0 1-1" << std::endl;
+  }
+
+  // Glue rule for unknown words.
+  out << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << std::endl;
+}
+
+// Visits every leaf of the parse tree rooted at 'root'.  For each leaf, the
+// leaf's label is treated as a word: its entry in wordCount is incremented
+// and its entry in wordLabel is set to the label of the leaf's parent (a
+// later occurrence of the same word overwrites an earlier one).
+void ExtractGHKM::CollectWordLabelCounts(
+    ParseTree &root,
+    std::map<std::string, int> &wordCount,
+    std::map<std::string, std::string> &wordLabel)
+{
+  typedef std::vector<const ParseTree*> LeafList;
+
+  LeafList leaves;
+  root.GetLeaves(std::back_inserter(leaves));
+
+  for (LeafList::size_type i = 0; i < leaves.size(); ++i) {
+    const ParseTree *leaf = leaves[i];
+    const std::string &word = leaf->GetLabel();
+    const std::string &parentLabel = leaf->GetParent()->GetLabel();
+    wordCount[word] += 1;
+    wordLabel[word] = parentLabel;
+  }
+}
+
+// Writes label statistics for singleton words to 'out'.
+//
+// Only words whose count in wordCount is exactly 1 are considered.  Each such
+// word contributes one observation of its label (looked up in wordLabel).
+// For every label, the fraction of singleton observations carrying that label
+// is computed, and labels whose fraction exceeds 0.03 are written as
+// "<label> <fraction>", one per line.
+void ExtractGHKM::WriteUnknownWordLabel(
+    const std::map<std::string, int> &wordCount,
+    const std::map<std::string, std::string> &wordLabel,
+    std::ostream &out)
+{
+  typedef std::map<std::string, int> CountMap;
+  typedef std::map<std::string, std::string> LabelMap;
+
+  CountMap labelCount;
+  int total = 0;
+
+  for (CountMap::const_iterator word = wordCount.begin();
+       word != wordCount.end(); ++word) {
+    if (word->second != 1) {
+      continue;  // not a singleton
+    }
+    LabelMap::const_iterator entry = wordLabel.find(word->first);
+    assert(entry != wordLabel.end());
+    ++labelCount[entry->second];
+    ++total;
+  }
+
+  // labelCount is empty whenever total is zero, so the division below is
+  // never evaluated with a zero denominator.
+  for (CountMap::const_iterator label = labelCount.begin();
+       label != labelCount.end(); ++label) {
+    const double ratio =
+        static_cast<double>(label->second) / static_cast<double>(total);
+    if (ratio > 0.03) {
+      out << label->first << " " << ratio << std::endl;
+    }
+  }
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.h b/phrase-extract/extract-ghkm/ExtractGHKM.h
new file mode 100644
index 000000000..f16abc5f3
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.h
@@ -0,0 +1,71 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_EXTRACT_GHKM_H_
+#define EXTRACT_GHKM_EXTRACT_GHKM_H_
+
+#include <map>
+#include <ostream>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+
+class OutputFileStream;
+
+namespace GHKM {
+
+struct Options;
+class ParseTree;
+
+// Driver class for the extract-ghkm command-line tool.  main() constructs one
+// instance and returns the result of Main() as the process exit status.
+class ExtractGHKM
+{
+ public:
+ // Sets the tool name to "extract-ghkm".
+ ExtractGHKM() : m_name("extract-ghkm") {}
+ // Returns the tool name; Error() uses it as the prefix of its messages.
+ const std::string &GetName() const { return m_name; }
+ // Runs the tool with the program's command-line arguments.
+ int Main(int argc, char *argv[]);
+ private:
+ // Writes "<name>: <message>" to standard error and exits with status 1.
+ void Error(const std::string &) const;
+ // NOTE(review): the three Open*OrDie helpers are defined outside this view;
+ // presumably they open the named file on the given stream and call Error()
+ // on failure -- confirm against the definitions.
+ void OpenInputFileOrDie(const std::string &, std::ifstream &);
+ void OpenOutputFileOrDie(const std::string &, std::ofstream &);
+ void OpenOutputFileOrDie(const std::string &, OutputFileStream &);
+ // NOTE(review): defined outside this view; presumably adds the labels found
+ // in the tree to the set -- confirm.
+ void RecordTreeLabels(const ParseTree &, std::set<std::string> &);
+ // For each leaf of the tree: increments the count of the leaf's label and
+ // records its parent's label as the word's label.
+ void CollectWordLabelCounts(ParseTree &,
+ std::map<std::string, int> &,
+ std::map<std::string, std::string> &);
+ // Writes, for labels of words seen exactly once, the label and its relative
+ // frequency among such words (only when that frequency exceeds 0.03).
+ void WriteUnknownWordLabel(
+ const std::map<std::string, int> &,
+ const std::map<std::string, std::string> &,
+ std::ostream &);
+ // Writes basic, top and glue rules to the stream, using a top label chosen
+ // so as not to clash with the labels in the set (first argument).
+ void WriteGlueGrammar(const std::set<std::string> &,
+ const std::map<std::string, int> &,
+ std::ostream &);
+ // Splits a line into space/tab separated tokens.
+ std::vector<std::string> ReadTokens(const std::string &);
+
+ // Parses argc/argv into the Options object; on a parse error or missing
+ // positional arguments it prints the usage message and exits.
+ void ProcessOptions(int, char *[], Options &) const;
+
+ std::string m_name;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/Jamfile b/phrase-extract/extract-ghkm/Jamfile
new file mode 100644
index 000000000..1a81c5f87
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Jamfile
@@ -0,0 +1 @@
+exe extract-ghkm : [ glob *.cpp ] ..//filestreams ..//trees ../..//boost_iostreams ../..//boost_program_options ../..//z ;
diff --git a/phrase-extract/extract-ghkm/Main.cpp b/phrase-extract/extract-ghkm/Main.cpp
new file mode 100644
index 000000000..faf3230a6
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Main.cpp
@@ -0,0 +1,26 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ExtractGHKM.h"
+
+// Program entry point: hands the command-line arguments to an ExtractGHKM
+// instance and uses its result as the process exit status.
+int main(int argc, char *argv[])
+{
+  Moses::GHKM::ExtractGHKM extractor;
+  const int status = extractor.Main(argc, argv);
+  return status;
+}
diff --git a/phrase-extract/extract-ghkm/Node.cpp b/phrase-extract/extract-ghkm/Node.cpp
new file mode 100644
index 000000000..beb7470b8
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Node.cpp
@@ -0,0 +1,71 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "Node.h"
+
+#include "Subgraph.h"
+
+namespace Moses {
+namespace GHKM {
+
+// Deletes every Subgraph registered through AddRule(); the node owns its
+// rules.  Child and parent nodes are not deleted here.
+Node::~Node()
+{
+  typedef std::vector<const Subgraph*> RuleList;
+
+  RuleList::const_iterator rule = m_rules.begin();
+  const RuleList::const_iterator end = m_rules.end();
+  while (rule != end) {
+    delete *rule;
+    ++rule;
+  }
+}
+
+// Returns true if this is a TREE node whose only child is a TARGET node.
+bool Node::IsPreterminal() const
+{
+  if (m_type != TREE) {
+    return false;
+  }
+  if (m_children.size() != 1) {
+    return false;
+  }
+  return m_children[0]->m_type == TARGET;
+}
+
+// Adds 'index' to this node's span and then, recursively, to the span of
+// every node reachable through parent links.
+void Node::PropagateIndex(int index)
+{
+  m_span.insert(index);
+  for (std::vector<Node *>::size_type i = 0; i < m_parents.size(); ++i) {
+    Node *parent = m_parents[i];
+    parent->PropagateIndex(index);
+  }
+}
+
+// Returns the labels of all TARGET nodes at or below this node, in the order
+// in which the children are stored.
+std::vector<std::string> Node::GetTargetWords() const
+{
+  std::vector<std::string> words;
+  this->GetTargetWords(words);
+  return words;
+}
+
+// Recursive helper: appends to 'targetWords' the label of this node if it is
+// a TARGET node, otherwise the target words found under each child in turn.
+void Node::GetTargetWords(std::vector<std::string> &targetWords) const
+{
+  if (m_type == TARGET) {
+    targetWords.push_back(m_label);
+    return;
+  }
+  for (std::vector<Node *>::size_type i = 0; i < m_children.size(); ++i) {
+    const Node *child = m_children[i];
+    child->GetTargetWords(targetWords);
+  }
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/Node.h b/phrase-extract/extract-ghkm/Node.h
new file mode 100644
index 000000000..775473362
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Node.h
@@ -0,0 +1,184 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_NODE_H_
+#define EXTRACT_GHKM_NODE_H_
+
+#include "Span.h"
+
+#include <cassert>
+#include <iterator>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Subgraph;
+
+// Kind of a graph node: SOURCE for source words (which may have several
+// parents), TARGET for target words, TREE for the remaining nodes of the
+// target parse tree.
+enum NodeType { SOURCE, TARGET, TREE };
+
+// A node of the graph used for rule extraction.  A node has a label, a type
+// (see NodeType), links to children and parents, a PCFG score, a span and a
+// complement span, and a list of rules (Subgraph objects) that it owns: the
+// destructor deletes them.  Child and parent pointers are stored as given and
+// are not deleted by this class.  Copying is disallowed.
+class Node
+{
+ public:
+ // Creates a node with no children, parents or rules and a PCFG score of 0.
+ Node(const std::string &label, NodeType type)
+ : m_label(label)
+ , m_type(type)
+ , m_pcfgScore(0.0f) {}
+
+ // Deletes the rules added with AddRule().
+ ~Node();
+
+ const std::string &GetLabel() const { return m_label; }
+ NodeType GetType() const { return m_type; }
+ const std::vector<Node*> &GetChildren() const { return m_children; }
+ const std::vector<Node*> &GetParents() const { return m_parents; }
+ float GetPcfgScore() const { return m_pcfgScore; }
+ const Span &GetSpan() const { return m_span; }
+ const Span &GetComplementSpan() const { return m_complementSpan; }
+ const std::vector<const Subgraph*> &GetRules() const { return m_rules; }
+
+ void SetChildren(const std::vector<Node*> &c) { m_children = c; }
+ void SetParents(const std::vector<Node*> &p) { m_parents = p; }
+ void SetPcfgScore(float s) { m_pcfgScore = s; }
+ void SetSpan(const Span &s) { m_span = s; }
+ void SetComplementSpan(const Span &cs) { m_complementSpan = cs; }
+
+ void AddChild(Node *c) { m_children.push_back(c); }
+ void AddParent(Node *p) { m_parents.push_back(p); }
+ // Takes ownership of the rule: it is deleted in the destructor.
+ void AddRule(const Subgraph *s) { m_rules.push_back(s); }
+
+ // A sink is a node without children.
+ bool IsSink() const { return m_children.empty(); }
+ // True for a TREE node whose single child is a TARGET node.
+ bool IsPreterminal() const;
+
+ // Inserts the index into this node's span and into the span of every
+ // ancestor reachable through parent links.
+ void PropagateIndex(int);
+
+ // Returns the labels of the TARGET nodes at or below this node.
+ std::vector<std::string> GetTargetWords() const;
+
+ // Gets the path from this node's parent to the root. This node is
+ // required to be part of the original parse tree (i.e. not a source word,
+ // which can have multiple parents).
+ template<typename OutputIterator>
+ void GetTreeAncestors(OutputIterator result, bool includeSelf=false);
+
+ // Returns the lowest common ancestor given a sequence of nodes belonging to
+ // the target tree.
+ template<typename InputIterator>
+ static Node *LowestCommonAncestor(InputIterator first, InputIterator last);
+
+ private:
+ // Disallow copying
+ Node(const Node &);
+ Node &operator=(const Node &);
+
+ // Recursive helper for the public GetTargetWords(): appends to the vector.
+ void GetTargetWords(std::vector<std::string> &) const;
+
+ std::string m_label;
+ NodeType m_type;
+ std::vector<Node*> m_children;
+ std::vector<Node*> m_parents;
+ float m_pcfgScore;
+ Span m_span;
+ Span m_complementSpan;
+ std::vector<const Subgraph*> m_rules; // owned; deleted in ~Node()
+};
+
+// Writes to 'result' the chain of ancestors of this node, nearest first,
+// ending at the node that has no parent.  When includeSelf is true the node
+// itself is written first.  Only the first parent of each node is followed,
+// so the node must belong to the parse tree (type TARGET or TREE).
+template<typename OutputIterator>
+void Node::GetTreeAncestors(OutputIterator result, bool includeSelf)
+{
+  // This function assumes the node is part of the parse tree.
+  assert(m_type == TARGET || m_type == TREE);
+
+  if (includeSelf) {
+    *result++ = this;
+  }
+
+  // Climb via the first parent link until a parentless node is reached.
+  Node *current = this;
+  while (!current->m_parents.empty()) {
+    current = current->m_parents[0];
+    *result++ = current;
+  }
+}
+
+// Returns the lowest common ancestor of the nodes in [first, last), all of
+// which must belong to the target tree (i.e. none may be a SOURCE node).
+//
+// Returns 0 for an empty sequence.  If the sequence contains only one
+// distinct node, that node itself is returned.  Otherwise the ancestor paths
+// of all nodes (each including the node itself) are compared from the
+// shallowest possible position towards the root, and the first node common to
+// all paths is returned.
+template<typename InputIterator>
+Node *Node::LowestCommonAncestor(InputIterator first, InputIterator last)
+{
+  // Check for an empty sequence.
+  if (first == last) {
+    return 0;
+  }
+
+  // Check for the case that the sequence contains only one distinct node.
+  // Also check that every node (including the first) belongs to the target
+  // tree.
+  InputIterator p = first;
+  Node *lca = *p++;
+  assert(lca->m_type != SOURCE);
+  for (; p != last; ++p) {
+    Node *node = *p;
+    assert(node->m_type != SOURCE);
+    if (node != lca) {
+      lca = 0;
+    }
+  }
+  if (lca) {
+    return lca;
+  }
+
+  // Now construct an ancestor path for each node, from itself to the root.
+  size_t minPathLength = 0;
+  std::vector<std::vector<Node *> > paths;
+  for (p = first; p != last; ++p) {
+    paths.resize(paths.size()+1);
+    (*p)->GetTreeAncestors(std::back_inserter(paths.back()), true);
+    size_t pathLength = paths.back().size();
+    assert(pathLength > 0);
+    if (paths.size() == 1 || pathLength < minPathLength) {
+      minPathLength = pathLength;
+    }
+  }
+
+  // Search for the start of the longest common suffix by working forward from
+  // the earliest possible starting point to the root.
+  for (size_t i = 0; i < minPathLength; ++i) {
+    bool match = true;
+    for (size_t j = 0; j < paths.size(); ++j) {
+      // Every path has at least minPathLength elements, so the subtraction
+      // cannot wrap around; only the upper bound needs checking.
+      size_t index = paths[j].size() - minPathLength + i;
+      assert(index < paths[j].size());
+      if (j == 0) {
+        lca = paths[j][index];
+        assert(lca);
+      } else if (lca != paths[j][index]) {
+        match = false;
+        break;
+      }
+    }
+    if (match) {
+      return lca;
+    }
+  }
+
+  // A lowest common ancestor should have been found.
+  assert(false);
+  return 0;
+}
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h
new file mode 100644
index 000000000..362fc95d2
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Options.h
@@ -0,0 +1,67 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_OPTIONS_H_
+#define EXTRACT_GHKM_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace GHKM {
+
+// Command-line settings for extract-ghkm.  The constructor supplies the
+// defaults; the string members start out empty.
+struct Options {
+ public:
+ Options()
+ : allowUnary(false)
+ , conditionOnTargetLhs(false)
+ , gzOutput(false)
+ , maxNodes(15)
+ , maxRuleDepth(3)
+ , maxRuleSize(3)
+ , maxScope(3)
+ , minimal(false)
+ , pcfg(false)
+ , unpairedExtractFormat(false) {}
+
+ // Positional options
+ std::string targetFile;
+ std::string sourceFile;
+ std::string alignmentFile;
+ std::string extractFile;
+
+ // All other options
+ bool allowUnary; // allow fully non-lexical unary rules
+ bool conditionOnTargetLhs; // write target LHS instead of "X" as source LHS
+ std::string glueGrammarFile; // file to write the glue grammar to
+ bool gzOutput; // write gzipped extract files
+ int maxNodes; // maximum number of tree nodes for composed rules
+ int maxRuleDepth; // maximum depth for composed rules
+ int maxRuleSize; // maximum size for composed rules
+ int maxScope; // maximum allowed scope
+ bool minimal; // extract minimal rules only
+ bool pcfg; // include score based on PCFG scores in target corpus
+ bool unpairedExtractFormat; // do not pair non-terminals in extract files
+ std::string unknownWordFile; // file to write unknown word labels to
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/ParseTree.cpp b/phrase-extract/extract-ghkm/ParseTree.cpp
new file mode 100644
index 000000000..052b8dee1
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ParseTree.cpp
@@ -0,0 +1,54 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ParseTree.h"
+
+namespace Moses {
+namespace GHKM {
+
+// Deletes all children (and so, recursively, the whole subtree): a ParseTree
+// owns its child nodes.
+ParseTree::~ParseTree()
+{
+  for (std::vector<ParseTree*>::size_type i = 0; i < m_children.size(); ++i) {
+    ParseTree *child = m_children[i];
+    delete child;
+  }
+}
+
+// Replaces the list of children with a copy of 'children'.  The previous
+// children are not deleted and the parent pointers of the new children are
+// not touched.
+void ParseTree::SetChildren(const std::vector<ParseTree*> &children)
+{
+  this->m_children = children;
+}
+
+// Sets the parent pointer.  The parent's list of children is not modified.
+void ParseTree::SetParent(ParseTree *parent)
+{
+  this->m_parent = parent;
+}
+
+// Appends 'child' to the list of children.  The child's parent pointer is
+// not modified.
+void ParseTree::AddChild(ParseTree *child)
+{
+  m_children.insert(m_children.end(), child);
+}
+
+// A leaf is a node without children.
+bool ParseTree::IsLeaf() const
+{
+  return m_children.size() == 0;
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/ParseTree.h b/phrase-extract/extract-ghkm/ParseTree.h
new file mode 100644
index 000000000..273e2e04e
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ParseTree.h
@@ -0,0 +1,85 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_PARSE_TREE_H_
+#define EXTRACT_GHKM_PARSE_TREE_H_
+
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// A node of a parse tree.  Each node has a label, a list of children, a
+// pointer to its parent and a PCFG score.  The destructor deletes the
+// children, so a node owns its subtree.  Copying is disallowed.
+class ParseTree
+{
+ public:
+ // Creates a node with no parent, no children and a PCFG score of 0.
+ ParseTree(const std::string &label)
+ : m_label(label)
+ , m_parent(0)
+ , m_pcfgScore(0.0) {}
+
+ // Deletes all children.
+ ~ParseTree();
+
+ const std::string &GetLabel() const { return m_label; }
+ const std::vector<ParseTree*> &GetChildren() const { return m_children; }
+ const ParseTree *GetParent() const { return m_parent; }
+ float GetPcfgScore() const { return m_pcfgScore; }
+
+ // The setters and AddChild() update only the member they name: SetParent()
+ // does not add this node to the parent's children, and SetChildren() /
+ // AddChild() do not set the children's parent pointers.
+ void SetParent(ParseTree *);
+ void SetChildren(const std::vector<ParseTree*> &);
+ void SetPcfgScore(float score) { m_pcfgScore = score; }
+
+ void AddChild(ParseTree *);
+
+ // True if the node has no children.
+ bool IsLeaf() const;
+
+ // Writes pointers to the leaves of this subtree to the output iterator,
+ // visiting children in stored order.
+ template<typename OutputIterator>
+ void GetLeaves(OutputIterator);
+
+ private:
+ // Disallow copying
+ ParseTree(const ParseTree &);
+ ParseTree &operator=(const ParseTree &);
+
+ std::string m_label;
+ std::vector<ParseTree*> m_children;
+ ParseTree *m_parent;
+ float m_pcfgScore; // log probability
+};
+
+// Writes a pointer to every leaf of the subtree rooted at this node to
+// 'result', visiting children in stored order.  The iterator is passed by
+// value to the recursive calls.
+template<typename OutputIterator>
+void ParseTree::GetLeaves(OutputIterator result)
+{
+  if (IsLeaf()) {
+    *result++ = this;
+    return;
+  }
+  for (std::vector<ParseTree *>::const_iterator it = m_children.begin();
+       it != m_children.end(); ++it) {
+    ParseTree *child = *it;
+    child->GetLeaves(result);
+  }
+}
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/ScfgRule.cpp b/phrase-extract/extract-ghkm/ScfgRule.cpp
new file mode 100644
index 000000000..5dc70052c
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ScfgRule.cpp
@@ -0,0 +1,145 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ScfgRule.h"
+
+#include "Node.h"
+#include "Subgraph.h"
+
+#include <algorithm>
+
+namespace Moses {
+namespace GHKM {
+
+// Builds an SCFG rule from a subgraph fragment.
+//
+// The source LHS is always the non-terminal "X"; the target LHS is the label
+// of the fragment's root.  The source RHS is built from the fragment's leaves
+// that have a non-empty span, ordered by the first element of their span.
+// The target RHS is built from the fragment's target leaves, and the
+// alignment pairs each target RHS position with the source RHS positions
+// recorded for the same node.
+ScfgRule::ScfgRule(const Subgraph &fragment)
+ : m_sourceLHS("X", NonTerminal)
+ , m_targetLHS(fragment.GetRoot()->GetLabel(), NonTerminal)
+ , m_pcfgScore(fragment.GetPcfgScore())
+{
+ // Source RHS
+
+ const std::set<const Node *> &leaves = fragment.GetLeaves();
+
+ // Keep only the leaves that cover at least one source index.
+ std::vector<const Node *> sourceRHSNodes;
+ sourceRHSNodes.reserve(leaves.size());
+ for (std::set<const Node *>::const_iterator p(leaves.begin());
+ p != leaves.end(); ++p) {
+ const Node &leaf = **p;
+ if (!leaf.GetSpan().empty()) {
+ sourceRHSNodes.push_back(&leaf);
+ }
+ }
+
+ // Order the nodes by the first index of their span.
+ std::sort(sourceRHSNodes.begin(), sourceRHSNodes.end(), PartitionOrderComp);
+
+ // Build a mapping from target nodes to source-order indices, so that we
+ // can construct the Alignment object later.
+ std::map<const Node *, std::vector<int> > sourceOrder;
+
+ m_sourceRHS.reserve(sourceRHSNodes.size());
+ int srcIndex = 0;
+ for (std::vector<const Node *>::const_iterator p(sourceRHSNodes.begin());
+ p != sourceRHSNodes.end(); ++p, ++srcIndex) {
+ const Node &sinkNode = **p;
+ if (sinkNode.GetType() == TREE) {
+ // A tree node becomes the non-terminal "X" on the source side.
+ m_sourceRHS.push_back(Symbol("X", NonTerminal));
+ sourceOrder[&sinkNode].push_back(srcIndex);
+ } else {
+ // Anything else must be a source word, which becomes a terminal.
+ assert(sinkNode.GetType() == SOURCE);
+ m_sourceRHS.push_back(Symbol(sinkNode.GetLabel(), Terminal));
+ // Add all aligned target words to the sourceOrder map
+ const std::vector<Node *> &parents(sinkNode.GetParents());
+ for (std::vector<Node *>::const_iterator q(parents.begin());
+ q != parents.end(); ++q) {
+ if ((*q)->GetType() == TARGET) {
+ sourceOrder[*q].push_back(srcIndex);
+ }
+ }
+ }
+ }
+
+ // Target RHS + alignment
+
+ std::vector<const Node *> targetLeaves;
+ fragment.GetTargetLeaves(targetLeaves);
+
+ m_alignment.reserve(targetLeaves.size()); // might be too much but that's OK
+ m_targetRHS.reserve(targetLeaves.size());
+
+ for (std::vector<const Node *>::const_iterator p(targetLeaves.begin());
+ p != targetLeaves.end(); ++p) {
+ const Node &leaf = **p;
+ if (leaf.GetSpan().empty()) {
+ // The node doesn't cover any source words, so we can only add
+ // terminals to the target RHS (not a non-terminal).
+ std::vector<std::string> targetWords(leaf.GetTargetWords());
+ for (std::vector<std::string>::const_iterator q(targetWords.begin());
+ q != targetWords.end(); ++q) {
+ m_targetRHS.push_back(Symbol(*q, Terminal));
+ }
+ } else if (leaf.GetType() == SOURCE) {
+ // Do nothing
+ } else {
+ // A TREE leaf is a non-terminal; the remaining case is a terminal.
+ SymbolType type = (leaf.GetType() == TREE) ? NonTerminal : Terminal;
+ m_targetRHS.push_back(Symbol(leaf.GetLabel(), type));
+
+ // Pair this target position with every source position recorded for
+ // the same node while building the source RHS.
+ int tgtIndex = m_targetRHS.size()-1;
+ std::map<const Node *, std::vector<int> >::iterator q(sourceOrder.find(&leaf));
+ assert(q != sourceOrder.end());
+ std::vector<int> &sourceNodes = q->second;
+ for (std::vector<int>::iterator r(sourceNodes.begin());
+ r != sourceNodes.end(); ++r) {
+ int srcIndex = *r;
+ m_alignment.push_back(std::make_pair(srcIndex, tgtIndex));
+ }
+ }
+ }
+}
+
+// Computes the scope of the rule from its source RHS: one point for a
+// non-terminal in first position, one for a non-terminal in last position,
+// and one for every pair of adjacent non-terminals.
+//
+// Returns 0 for an empty source RHS.  (The first element used to be read
+// unconditionally, which is undefined behaviour on an empty vector.)
+int ScfgRule::Scope() const
+{
+  if (m_sourceRHS.empty()) {
+    return 0;
+  }
+
+  int scope = 0;
+
+  // Non-terminal at the left edge.
+  bool predIsNonTerm = false;
+  if (m_sourceRHS[0].GetType() == NonTerminal) {
+    ++scope;
+    predIsNonTerm = true;
+  }
+
+  // Adjacent pairs of non-terminals.
+  for (size_t i = 1; i < m_sourceRHS.size(); ++i) {
+    bool isNonTerm = m_sourceRHS[i].GetType() == NonTerminal;
+    if (isNonTerm && predIsNonTerm) {
+      ++scope;
+    }
+    predIsNonTerm = isNonTerm;
+  }
+
+  // Non-terminal at the right edge (predIsNonTerm now describes the last
+  // symbol).
+  if (predIsNonTerm) {
+    ++scope;
+  }
+  return scope;
+}
+
+// Ordering predicate for std::sort: 'a' comes before 'b' if the first element
+// of a's span is smaller than the first element of b's span.  Both spans must
+// be non-empty.
+bool ScfgRule::PartitionOrderComp(const Node *a, const Node *b)
+{
+  const Span &lhsSpan = a->GetSpan();
+  const Span &rhsSpan = b->GetSpan();
+  assert(!aSpanIsEmpty(lhsSpan, rhsSpan));
+  return *lhsSpan.begin() < *rhsSpan.begin();
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/ScfgRule.h b/phrase-extract/extract-ghkm/ScfgRule.h
new file mode 100644
index 000000000..2405d8fa3
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ScfgRule.h
@@ -0,0 +1,78 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_SCFG_RULE_H_
+#define EXTRACT_GHKM_SCFG_RULE_H_
+
+#include "Alignment.h"
+
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+class Subgraph;
+
+enum SymbolType { Terminal, NonTerminal };
+
+// A grammar symbol: a string value tagged as Terminal or NonTerminal.
+struct Symbol
+{
+ public:
+  // Stores a copy of the value together with its type tag.
+  Symbol(const std::string &v, SymbolType t)
+    : m_value(v)
+    , m_type(t)
+  {
+  }
+
+  // Type tag supplied at construction.
+  SymbolType GetType() const { return m_type; }
+
+  // Symbol text supplied at construction.
+  const std::string &GetValue() const { return m_value; }
+
+ private:
+  std::string m_value;
+  SymbolType m_type;
+};
+
+// A synchronous rule with a left-hand-side symbol and a right-hand-side
+// symbol sequence for each of the source and target sides, an alignment
+// between right-hand-side positions, and a PCFG score.  Instances are built
+// from a Subgraph.
+class ScfgRule
+{
+ public:
+ // Builds the rule from the given graph fragment.
+ ScfgRule(const Subgraph &fragment);
+
+ // Read-only accessors for the rule's components.
+ const Symbol &GetSourceLHS() const { return m_sourceLHS; }
+ const Symbol &GetTargetLHS() const { return m_targetLHS; }
+ const std::vector<Symbol> &GetSourceRHS() const { return m_sourceRHS; }
+ const std::vector<Symbol> &GetTargetRHS() const { return m_targetRHS; }
+ const Alignment &GetAlignment() const { return m_alignment; }
+ float GetPcfgScore() const { return m_pcfgScore; }
+
+ // Counts non-terminals at either edge of the source RHS plus adjacent
+ // non-terminal pairs within it.
+ int Scope() const;
+
+ private:
+ // Orders two nodes by the first element of their spans.
+ static bool PartitionOrderComp(const Node *, const Node *);
+
+ Symbol m_sourceLHS;
+ Symbol m_targetLHS;
+ std::vector<Symbol> m_sourceRHS;
+ std::vector<Symbol> m_targetRHS;
+ // Pairs of (source RHS index, target RHS index).
+ Alignment m_alignment;
+ float m_pcfgScore;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
new file mode 100644
index 000000000..cd993d6e8
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -0,0 +1,164 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "ScfgRuleWriter.h"
+
+#include "Alignment.h"
+#include "Options.h"
+#include "ScfgRule.h"
+
+#include <cassert>
+#include <cmath>
+#include <ostream>
+#include <map>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// Writes one rule to both extract streams.
+//
+// Forward line:  source ||| target ||| s-t alignment ||| 1 ||| ||| [pcfg]
+// Inverse line:  target ||| source ||| t-s alignment ||| 1
+//
+// The PCFG column holds exp(rule score) and is written only when the pcfg
+// option is set.
+void ScfgRuleWriter::Write(const ScfgRule &rule)
+{
+  std::ostringstream srcBuf;
+  std::ostringstream tgtBuf;
+
+  // Format both sides of the rule in the layout selected by the options.
+  if (!m_options.unpairedExtractFormat) {
+    WriteStandardFormat(rule, srcBuf, tgtBuf);
+  } else {
+    WriteUnpairedFormat(rule, srcBuf, tgtBuf);
+  }
+
+  const std::string sourceSide = srcBuf.str();
+  const std::string targetSide = tgtBuf.str();
+
+  // Rule sides go to the forward and inverse extract files.
+  m_fwd << sourceSide << " ||| " << targetSide << " |||";
+  m_inv << targetSide << " ||| " << sourceSide << " |||";
+
+  // Alignment points; the inverse file gets each pair flipped.
+  const Alignment &links = rule.GetAlignment();
+  Alignment::const_iterator link = links.begin();
+  while (link != links.end()) {
+    m_fwd << " " << link->first << "-" << link->second;
+    m_inv << " " << link->second << "-" << link->first;
+    ++link;
+  }
+
+  // Forward file: a count of 1 followed by an empty NT length column.
+  // TODO Add option to write NT length?
+  m_fwd << " ||| 1 ||| |||";
+  if (m_options.pcfg) {
+    m_fwd << " " << std::exp(rule.GetPcfgScore());
+  }
+  m_fwd << std::endl;
+
+  // Inverse file: a count of 1.
+  m_inv << " ||| 1" << std::endl;
+}
+
+// Formats the rule in the standard layout, in which every non-terminal is
+// written together with its aligned counterpart from the other side:
+//   source side: [src][tgt] for each non-terminal, then the LHS
+//   target side: [src][tgt] for each non-terminal, then the target LHS
+// The LHS appended to the source side is the target LHS when the
+// conditionOnTargetLhs option is set, otherwise the source LHS.
+void ScfgRuleWriter::WriteStandardFormat(const ScfgRule &rule,
+                                         std::ostream &sourceSS,
+                                         std::ostream &targetSS)
+{
+  typedef std::vector<Symbol>::size_type Index;
+
+  const std::vector<Symbol> &srcSyms = rule.GetSourceRHS();
+  const std::vector<Symbol> &tgtSyms = rule.GetTargetRHS();
+  const Alignment &links = rule.GetAlignment();
+
+  // Record, in both directions, which position each non-terminal is
+  // aligned to.  Alignment points whose source symbol is a terminal are
+  // ignored.
+  std::map<int, int> srcToTgt;
+  std::map<int, int> tgtToSrc;
+  for (Alignment::const_iterator a = links.begin(); a != links.end(); ++a) {
+    if (srcSyms[a->first].GetType() != NonTerminal) {
+      continue;
+    }
+    assert(tgtSyms[a->second].GetType() == NonTerminal);
+    srcToTgt[a->first] = a->second;
+    tgtToSrc[a->second] = a->first;
+  }
+
+  // Source side: each symbol, with the paired target non-terminal appended
+  // directly after a source non-terminal.
+  for (Index i = 0; i < srcSyms.size(); ++i) {
+    const Symbol &sym = srcSyms[i];
+    WriteSymbol(sym, sourceSS);
+    if (sym.GetType() == NonTerminal) {
+      // operator[] yields 0 for a position with no recorded alignment.
+      const int tgtPos = srcToTgt[static_cast<int>(i)];
+      WriteSymbol(tgtSyms[tgtPos], sourceSS);
+    }
+    sourceSS << " ";
+  }
+  const Symbol &sourceSideLhs = m_options.conditionOnTargetLhs
+                                ? rule.GetTargetLHS()
+                                : rule.GetSourceLHS();
+  WriteSymbol(sourceSideLhs, sourceSS);
+
+  // Target side: the paired source non-terminal is written directly before
+  // each target non-terminal.
+  for (Index j = 0; j < tgtSyms.size(); ++j) {
+    const Symbol &sym = tgtSyms[j];
+    if (sym.GetType() == NonTerminal) {
+      const int srcPos = tgtToSrc[static_cast<int>(j)];
+      WriteSymbol(srcSyms[srcPos], targetSS);
+    }
+    WriteSymbol(sym, targetSS);
+    targetSS << " ";
+  }
+  WriteSymbol(rule.GetTargetLHS(), targetSS);
+}
+
+// Formats the rule in the unpaired layout: each side lists only its own
+// symbols, separated by spaces, followed by a left-hand-side symbol.  The
+// LHS appended to the source side is the target LHS when the
+// conditionOnTargetLhs option is set, otherwise the source LHS; the target
+// side always ends with the target LHS.
+void ScfgRuleWriter::WriteUnpairedFormat(const ScfgRule &rule,
+                                         std::ostream &sourceSS,
+                                         std::ostream &targetSS)
+{
+  const std::vector<Symbol> &sourceRHS = rule.GetSourceRHS();
+  const std::vector<Symbol> &targetRHS = rule.GetTargetRHS();
+
+  // Write the source side of the rule to sourceSS.
+  for (std::vector<Symbol>::const_iterator p(sourceRHS.begin());
+       p != sourceRHS.end(); ++p) {
+    WriteSymbol(*p, sourceSS);
+    sourceSS << " ";
+  }
+  if (m_options.conditionOnTargetLhs) {
+    WriteSymbol(rule.GetTargetLHS(), sourceSS);
+  } else {
+    WriteSymbol(rule.GetSourceLHS(), sourceSS);
+  }
+
+  // Write the target side of the rule to targetSS.
+  for (std::vector<Symbol>::const_iterator p(targetRHS.begin());
+       p != targetRHS.end(); ++p) {
+    WriteSymbol(*p, targetSS);
+    targetSS << " ";
+  }
+  WriteSymbol(rule.GetTargetLHS(), targetSS);
+}
+
+// Writes a symbol's value to out, wrapping it in square brackets when the
+// symbol is a non-terminal.
+void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
+{
+  const bool bracketed = (symbol.GetType() == NonTerminal);
+  if (bracketed) {
+    out << "[";
+  }
+  out << symbol.GetValue();
+  if (bracketed) {
+    out << "]";
+  }
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
new file mode 100644
index 000000000..b92a432a1
--- /dev/null
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -0,0 +1,60 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_RULE_WRITER_H_
+#define EXTRACT_GHKM_RULE_WRITER_H_
+
+#include <ostream>
+
+namespace Moses {
+namespace GHKM {
+
+struct Options;
+class ScfgRule;
+struct Symbol;
+
+// Writes ScfgRule objects to a forward and an inverse output stream.  The
+// streams and the options are held by reference, so they must outlive the
+// writer.
+class ScfgRuleWriter
+{
+ public:
+ // fwd receives "source ||| target" lines, inv receives "target ||| source"
+ // lines; options selects the output format.
+ ScfgRuleWriter(std::ostream &fwd, std::ostream &inv, const Options &options)
+ : m_fwd(fwd)
+ , m_inv(inv)
+ , m_options(options) {}
+
+ // Writes one line for the rule to each of the two streams.
+ void Write(const ScfgRule &);
+
+ private:
+ // Disallow copying
+ ScfgRuleWriter(const ScfgRuleWriter &);
+ ScfgRuleWriter &operator=(const ScfgRuleWriter &);
+
+ // Format the source and target sides of a rule into the two given streams.
+ void WriteStandardFormat(const ScfgRule &, std::ostream &, std::ostream &);
+ void WriteUnpairedFormat(const ScfgRule &, std::ostream &, std::ostream &);
+ // Writes a single symbol, bracketed if it is a non-terminal.
+ void WriteSymbol(const Symbol &, std::ostream &);
+
+ std::ostream &m_fwd;
+ std::ostream &m_inv;
+ const Options &m_options;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/Span.cpp b/phrase-extract/extract-ghkm/Span.cpp
new file mode 100644
index 000000000..f0eccbdf2
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Span.cpp
@@ -0,0 +1,46 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "Span.h"
+
+namespace Moses {
+namespace GHKM {
+
+// Returns true if any element of a lies within the closed interval
+// [b.first, b.second].
+bool SpansIntersect(const Span &a, const ContiguousSpan &b)
+{
+  // Span is an ordered set, so the smallest element that is >= b.first can
+  // be located directly instead of scanning every element.  If that element
+  // is also <= b.second the two spans intersect; otherwise every remaining
+  // element is larger still and none can fall inside the interval.
+  Span::const_iterator p = a.lower_bound(b.first);
+  return p != a.end() && *p <= b.second;
+}
+
+// Returns the (smallest element, largest element) pair of s, or (-1,-1)
+// when s is empty.
+ContiguousSpan Closure(const Span &s)
+{
+  if (s.empty()) {
+    return ContiguousSpan(-1, -1);
+  }
+  const int lowest = *s.begin();
+  const int highest = *s.rbegin();
+  return ContiguousSpan(lowest, highest);
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/Span.h b/phrase-extract/extract-ghkm/Span.h
new file mode 100644
index 000000000..003d1ef84
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Span.h
@@ -0,0 +1,40 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_SPAN_H_
+#define EXTRACT_GHKM_SPAN_H_
+
+#include <map>
+#include <set>
+
+namespace Moses {
+namespace GHKM {
+
+typedef std::set<int> Span;
+typedef std::pair<int, int> ContiguousSpan;
+
+bool SpansIntersect(const Span &, const ContiguousSpan &);
+
+ContiguousSpan Closure(const Span &);
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/Subgraph.cpp b/phrase-extract/extract-ghkm/Subgraph.cpp
new file mode 100644
index 000000000..e048f2c55
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Subgraph.cpp
@@ -0,0 +1,121 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "Subgraph.h"
+
+#include "Node.h"
+
+namespace Moses {
+namespace GHKM {
+
+// Replaces the contents of result with the leaves found by walking the
+// subgraph from its root (see the two-argument overload).
+void Subgraph::GetTargetLeaves(std::vector<const Node *> &result) const
+{
+  // Discard whatever the caller passed in before collecting.
+  result.erase(result.begin(), result.end());
+  GetTargetLeaves(m_root, result);
+}
+
+// Appends to result, in child order, every node reachable from root that is
+// either of type TARGET or a member of this subgraph's leaf set.  The walk
+// does not descend below such a node.
+void Subgraph::GetTargetLeaves(const Node *root,
+                               std::vector<const Node *> &result) const
+{
+  const bool stopHere =
+      root->GetType() == TARGET || m_leaves.count(root) != 0;
+  if (stopHere) {
+    result.push_back(root);
+    return;
+  }
+
+  const std::vector<Node*> &kids = root->GetChildren();
+  for (std::vector<Node*>::size_type i = 0; i < kids.size(); ++i) {
+    GetTargetLeaves(kids[i], result);
+  }
+}
+
+// Counts the TREE nodes of the subgraph at or below n.  Nodes that are not
+// of type TREE contribute nothing; a subgraph without leaves counts as a
+// single node.  Leaf-set members are counted (when of type TREE) but not
+// descended into.
+int Subgraph::CountNodes(const Node *n) const
+{
+  if (n->GetType() != TREE) {
+    return 0;
+  }
+  if (IsTrivial()) {
+    return 1;
+  }
+
+  int total = 1;  // n itself
+  const std::vector<Node*> &kids = n->GetChildren();
+  for (std::vector<Node*>::size_type i = 0; i < kids.size(); ++i) {
+    const Node *kid = kids[i];
+    const bool isLeaf = m_leaves.count(kid) != 0;
+    if (!isLeaf) {
+      total += CountNodes(kid);
+    } else if (kid->GetType() == TREE) {
+      total += 1;
+    }
+  }
+  return total;
+}
+
+// Counts the TREE nodes at or below n that are not preterminals, without
+// counting or descending into members of the leaf set.  A subgraph without
+// leaves has size 1 when n qualifies.
+int Subgraph::CalcSize(const Node *n) const
+{
+  const bool counted = n->GetType() == TREE && !n->IsPreterminal();
+  if (!counted) {
+    return 0;
+  }
+  if (IsTrivial()) {
+    return 1;
+  }
+
+  int total = 1;  // n itself
+  const std::vector<Node*> &kids = n->GetChildren();
+  for (std::vector<Node*>::size_type i = 0; i < kids.size(); ++i) {
+    if (m_leaves.count(kids[i]) != 0) {
+      continue;  // leaves of the subgraph are not part of its size
+    }
+    total += CalcSize(kids[i]);
+  }
+  return total;
+}
+
+// Returns the depth of the subgraph below n: 0 for a node that is not of
+// type TREE, for a preterminal, or when the subgraph has no leaves;
+// otherwise one more than the greatest depth among the children that are
+// not in the leaf set.
+int Subgraph::CalcDepth(const Node *n) const
+{
+  const bool hasDepth =
+      n->GetType() == TREE && !n->IsPreterminal() && !m_leaves.empty();
+  if (!hasDepth) {
+    return 0;
+  }
+
+  int deepest = 0;
+  const std::vector<Node*> &kids = n->GetChildren();
+  for (std::vector<Node*>::size_type i = 0; i < kids.size(); ++i) {
+    if (m_leaves.count(kids[i]) != 0) {
+      continue;  // do not descend into leaves of the subgraph
+    }
+    const int kidDepth = CalcDepth(kids[i]);
+    if (kidDepth > deepest) {
+      deepest = kidDepth;
+    }
+  }
+  return deepest + 1;
+}
+
+// Returns the root's PCFG score minus the PCFG score of every leaf of type
+// TREE.  Yields 0 when the root is not a TREE node or the subgraph has no
+// leaves.
+float Subgraph::CalcPcfgScore() const
+{
+  const bool scorable = m_root->GetType() == TREE && !m_leaves.empty();
+  if (!scorable) {
+    return 0.0f;
+  }
+
+  float total = m_root->GetPcfgScore();
+  std::set<const Node *>::const_iterator it = m_leaves.begin();
+  while (it != m_leaves.end()) {
+    if ((*it)->GetType() == TREE) {
+      total -= (*it)->GetPcfgScore();
+    }
+    ++it;
+  }
+  return total;
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/Subgraph.h b/phrase-extract/extract-ghkm/Subgraph.h
new file mode 100644
index 000000000..ede1233e9
--- /dev/null
+++ b/phrase-extract/extract-ghkm/Subgraph.h
@@ -0,0 +1,87 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_SUBGRAPH_H_
+#define EXTRACT_GHKM_SUBGRAPH_H_
+
+#include "Node.h"
+
+#include <set>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class Node;
+
+// A fragment of a node graph identified by a root node and a set of leaf
+// nodes.  Depth, size, node count and PCFG score are computed once at
+// construction and cached.  Nodes are referenced by pointer and are not
+// owned by the Subgraph.
+class Subgraph
+{
+ public:
+ // Constructs a subgraph with an empty leaf set.  Depth is 0, size is 1 if
+ // the root is a TREE node (0 otherwise), the node count is 1 and the PCFG
+ // score is 0.
+ Subgraph(const Node *root)
+ : m_root(root)
+ , m_depth(0)
+ , m_size(root->GetType() == TREE ? 1 : 0)
+ , m_nodeCount(1)
+ , m_pcfgScore(0.0f) {}
+
+ // Constructs a subgraph with the given leaf set.  The cached values are
+ // given placeholders in the initialiser list and then computed in the
+ // body, once m_root and m_leaves (which the Calc* functions read) are set.
+ Subgraph(const Node *root, const std::set<const Node *> &leaves)
+ : m_root(root)
+ , m_leaves(leaves)
+ , m_depth(-1)
+ , m_size(-1)
+ , m_nodeCount(-1)
+ , m_pcfgScore(0.0f)
+ {
+ m_depth = CalcDepth(m_root);
+ m_size = CalcSize(m_root);
+ m_nodeCount = CountNodes(m_root);
+ m_pcfgScore = CalcPcfgScore();
+ }
+
+ // Accessors for the root, the leaf set and the cached values.
+ const Node *GetRoot() const { return m_root; }
+ const std::set<const Node *> &GetLeaves() const { return m_leaves; }
+ int GetDepth() const { return m_depth; }
+ int GetSize() const { return m_size; }
+ int GetNodeCount() const { return m_nodeCount; }
+ float GetPcfgScore() const { return m_pcfgScore; }
+
+ // True when the subgraph has no leaves.
+ bool IsTrivial() const { return m_leaves.empty(); }
+
+ // Clears the vector and fills it with the nodes at which a walk from the
+ // root stops: TARGET nodes and members of the leaf set.
+ void GetTargetLeaves(std::vector<const Node *> &) const;
+
+ private:
+ // Recursive helper for the public GetTargetLeaves; appends to the vector.
+ void GetTargetLeaves(const Node *, std::vector<const Node *> &) const;
+ // Helpers that compute the cached values; each takes the node to start at.
+ int CalcDepth(const Node *) const;
+ int CalcSize(const Node *) const;
+ float CalcPcfgScore() const;
+ int CountNodes(const Node *) const;
+
+ const Node *m_root;
+ std::set<const Node *> m_leaves;
+ int m_depth;
+ int m_size;
+ int m_nodeCount;
+ float m_pcfgScore;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.cpp b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
new file mode 100644
index 000000000..cc961dc0c
--- /dev/null
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.cpp
@@ -0,0 +1,89 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "XmlTreeParser.h"
+
+#include "ParseTree.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+// Binds the parser to the caller's label collections.  Both are stored by
+// reference and are passed on to the XML tag processing in Parse().
+XmlTreeParser::XmlTreeParser(std::set<std::string> &labelSet,
+                             std::map<std::string, int> &topLabelSet)
+  : m_labelSet(labelSet),
+    m_topLabelSet(topLabelSet)
+{
+  // Nothing else to initialise.
+}
+
+// Parses one line of XML-annotated text and returns the corresponding
+// ParseTree.  Throws Exception if the XML tags cannot be processed: with an
+// empty message when processing reports failure, or with the XmlException's
+// message when one is raised.
+std::auto_ptr<ParseTree> XmlTreeParser::Parse(const std::string &line)
+{
+  // Work on a private copy: tag processing modifies the line in place.
+  m_line = line;
+  m_tree.Clear();
+
+  try {
+    const bool ok =
+        ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet);
+    if (!ok) {
+      throw Exception("");
+    }
+  } catch (const XmlException &xmlError) {
+    // Translate into this module's exception type.
+    throw Exception(xmlError.getMsg());
+  }
+
+  m_tree.ConnectNodes();
+  SyntaxNode *top = m_tree.GetTop();
+  assert(top);
+
+  // m_line now holds the text left after tag processing; split it into words.
+  m_words = tokenize(m_line.c_str());
+  return ConvertTree(*top, m_words);
+}
+
+// Recursively converts a SyntaxNode tree to a Moses::GHKM::ParseTree.
+//
+// Every SyntaxNode becomes a ParseTree node carrying the same label and PCFG
+// score.  A SyntaxNode without children additionally gets a single child
+// holding the word at its start position; such a node must cover exactly
+// one word, otherwise an Exception is thrown.
+std::auto_ptr<ParseTree> XmlTreeParser::ConvertTree(
+    const SyntaxNode &tree,
+    const std::vector<std::string> &words)
+{
+  std::auto_ptr<ParseTree> node(new ParseTree(tree.GetLabel()));
+  node->SetPcfgScore(tree.GetPcfgScore());
+
+  const std::vector<SyntaxNode*> &kids = tree.GetChildren();
+
+  // Internal node: convert each child subtree and attach it.
+  if (!kids.empty()) {
+    for (std::vector<SyntaxNode*>::size_type i = 0; i < kids.size(); ++i) {
+      assert(kids[i]);
+      std::auto_ptr<ParseTree> sub = ConvertTree(*kids[i], words);
+      sub->SetParent(node.get());
+      // Ownership of the subtree passes to the parent node.
+      node->AddChild(sub.release());
+    }
+    return node;
+  }
+
+  // Childless node: attach the word it covers.
+  if (tree.GetStart() != tree.GetEnd()) {
+    std::ostringstream msg;
+    msg << "leaf node covers multiple words (" << tree.GetStart()
+        << "-" << tree.GetEnd() << "): this is currently unsupported";
+    throw Exception(msg.str());
+  }
+  std::auto_ptr<ParseTree> word(new ParseTree(words[tree.GetStart()]));
+  word->SetParent(node.get());
+  node->AddChild(word.release());
+  return node;
+}
+
+} // namespace GHKM
+} // namespace Moses
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h
new file mode 100644
index 000000000..664ab11a3
--- /dev/null
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.h
@@ -0,0 +1,59 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef EXTRACT_GHKM_XML_TREE_PARSER_H_
+#define EXTRACT_GHKM_XML_TREE_PARSER_H_
+
+#include "Exception.h"
+
+#include "SyntaxTree.h"
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace GHKM {
+
+class ParseTree;
+
+// Parses a string in Moses' XML parse tree format and returns a ParseTree
+// object.
+class XmlTreeParser {
+ public:
+ // The two label collections are held by reference and handed to the XML
+ // tag processing on every call to Parse().
+ XmlTreeParser(std::set<std::string> &, std::map<std::string, int> &);
+ // Parses one line; throws Exception when the XML cannot be processed.
+ std::auto_ptr<ParseTree> Parse(const std::string &);
+ private:
+ // Recursively converts a SyntaxNode subtree, taking leaf words from the
+ // given word vector.
+ std::auto_ptr<ParseTree> ConvertTree(const SyntaxNode &,
+ const std::vector<std::string> &);
+
+ std::set<std::string> &m_labelSet;
+ std::map<std::string, int> &m_topLabelSet;
+ // Per-call working state, overwritten by each Parse().
+ std::string m_line;
+ SyntaxTree m_tree;
+ std::vector<std::string> m_words;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/extract-lex.cpp b/phrase-extract/extract-lex.cpp
new file mode 100644
index 000000000..9b03a6da0
--- /dev/null
+++ b/phrase-extract/extract-lex.cpp
@@ -0,0 +1,222 @@
+#include <iostream>
+#include <fstream>
+#include <cassert>
+#include <vector>
+#include "extract-lex.h"
+#include "InputFileStream.h"
+
+using namespace std;
+
+float COUNT_INCR = 1;
+
+// Configures a stream to print floating-point values in fixed notation
+// with seven digits after the decimal point.
+void fix(std::ostream& stream)
+{
+  stream.precision(7);
+  stream.setf(std::ios::fixed);
+}
+
+// Entry point.  Reads a target file, a source file and an alignment file in
+// parallel, one line at a time, accumulates lexical counts and writes the
+// two lexical tables.
+//
+// Arguments: <target> <source> <alignment> <lex-s2t-out> <lex-t2s-out>
+// Returns 0 on success and 1 on a usage or I/O error.
+int main(int argc, char* argv[])
+{
+  cerr << "Starting...\n";
+
+  // Validate explicitly: an assert is compiled out under NDEBUG, after
+  // which argv would be read out of bounds.
+  if (argc != 6) {
+    cerr << "ERROR: expected 5 arguments\n"
+         << "Usage: " << (argc > 0 ? argv[0] : "extract-lex")
+         << " <target> <source> <alignment> <lex-s2t-out> <lex-t2s-out>\n";
+    return 1;
+  }
+  const char *filePathTarget = argv[1];
+  const char *filePathSource = argv[2];
+  const char *filePathAlign = argv[3];
+  const char *filePathLexS2T = argv[4];
+  const char *filePathLexT2S = argv[5];
+
+  Moses::InputFileStream streamTarget(filePathTarget);
+  Moses::InputFileStream streamSource(filePathSource);
+  Moses::InputFileStream streamAlign(filePathAlign);
+
+  ofstream streamLexS2T;
+  ofstream streamLexT2S;
+  streamLexS2T.open(filePathLexS2T);
+  streamLexT2S.open(filePathLexT2S);
+
+  // Fail early rather than silently discarding all output.
+  if (!streamLexS2T.is_open()) {
+    cerr << "ERROR: could not open " << filePathLexS2T << " for writing" << endl;
+    return 1;
+  }
+  if (!streamLexT2S.is_open()) {
+    cerr << "ERROR: could not open " << filePathLexT2S << " for writing" << endl;
+    return 1;
+  }
+
+  fix(streamLexS2T);
+  fix(streamLexT2S);
+
+  ExtractLex extractSingleton;
+
+  size_t lineCount = 0;
+  string lineTarget, lineSource, lineAlign;
+  while (getline(streamTarget, lineTarget))
+  {
+    // Progress indicator.
+    if (lineCount % 10000 == 0)
+      cerr << lineCount << " ";
+
+    // The three inputs must stay in step with each other.
+    if (!getline(streamSource, lineSource)) {
+      cerr << "\nERROR: source file ended before target file at line " << lineCount << endl;
+      return 1;
+    }
+    if (!getline(streamAlign, lineAlign)) {
+      cerr << "\nERROR: alignment file ended before target file at line " << lineCount << endl;
+      return 1;
+    }
+
+    vector<string> toksTarget, toksSource, toksAlign;
+    Tokenize(toksTarget, lineTarget);
+    Tokenize(toksSource, lineSource);
+    Tokenize(toksAlign, lineAlign);
+
+    /*
+    cerr << endl
+         << toksTarget.size() << " " << lineTarget << endl
+         << toksSource.size() << " " << lineSource << endl
+         << toksAlign.size() << " " << lineAlign << endl;
+    */
+
+    extractSingleton.Process(toksTarget, toksSource, toksAlign, lineCount);
+
+    ++lineCount;
+  }
+
+  extractSingleton.Output(streamLexS2T, streamLexT2S);
+
+  streamTarget.Close();
+  streamSource.Close();
+  streamAlign.Close();
+  streamLexS2T.close();
+  streamLexT2S.close();
+
+  cerr << "\nFinished\n";
+  return 0;
+}
+
+// Returns a pointer to the vocabulary's stored copy of word, inserting the
+// word first if it is not already present.
+const std::string *Vocab::GetOrAdd(const std::string &word)
+{
+  return &(*m_coll.insert(word).first);
+}
+
+// Processes one sentence pair.  Each alignment token has the form
+// "<source index>-<target index>"; for every valid token the aligned word
+// pair is counted.  Words left without any alignment are then counted
+// against NULL by ProcessUnaligned.  Malformed or out-of-range alignment
+// tokens are reported on stderr and skipped.
+void ExtractLex::Process(vector<string> &toksTarget, vector<string> &toksSource, vector<string> &toksAlign, size_t lineCount)
+{
+  // Per-sentence flags recording which positions took part in an alignment.
+  std::vector<bool> sourceAligned(toksSource.size(), false);
+  std::vector<bool> targetAligned(toksTarget.size(), false);
+
+  vector<string>::const_iterator iterAlign;
+  for (iterAlign = toksAlign.begin(); iterAlign != toksAlign.end(); ++iterAlign)
+  {
+    const string &alignTok = *iterAlign;
+
+    vector<size_t> alignPos;
+    Tokenize(alignPos, alignTok, "-");
+
+    // Checked at run time rather than by assert: an assert disappears under
+    // NDEBUG and alignPos[1] would then be read out of bounds.
+    if (alignPos.size() != 2)
+    {
+      cerr << "ERROR: malformed alignment point " << alignTok << " at line " << lineCount << endl;
+      continue;
+    }
+
+    if (alignPos[0] >= toksSource.size())
+    {
+      cerr << "ERROR: alignment over source length. Alignment " << alignPos[0] << " at line " << lineCount << endl;
+      continue;
+    }
+    if (alignPos[1] >= toksTarget.size())
+    {
+      cerr << "ERROR: alignment over target length. Alignment " << alignPos[1] << " at line " << lineCount << endl;
+      continue;
+    }
+
+    sourceAligned[ alignPos[0] ] = true;
+    targetAligned[ alignPos[1] ] = true;
+
+    const string &tmpSource = toksSource[ alignPos[0] ];
+    const string &tmpTarget = toksTarget[ alignPos[1] ];
+
+    // Intern both words so the count tables can key on stable pointers.
+    const string *source = m_vocab.GetOrAdd(tmpSource);
+    const string *target = m_vocab.GetOrAdd(tmpTarget);
+
+    Process(target, source);
+  }
+
+  ProcessUnaligned(toksTarget, toksSource, sourceAligned, targetAligned);
+}
+
+// Counts one co-occurrence of a (target, source) word pair in both
+// directions: the source word's total and its count for this target word,
+// and the target word's total and its count for this source word.
+void ExtractLex::Process(const std::string *target, const std::string *source)
+{
+  // Look up (creating if needed) both top-level entries first.
+  WordCount &bySource = m_collS2T[source];
+  WordCount &byTarget = m_collT2S[target];
+
+  // Source-to-target direction.
+  bySource.AddCount(COUNT_INCR);
+  // Target-to-source direction.
+  byTarget.AddCount(COUNT_INCR);
+
+  Process(bySource, target);
+  Process(byTarget, source);
+}
+
+// Adds COUNT_INCR to the nested count that wcIn keeps for the word out,
+// creating the nested entry if it does not exist yet.
+void ExtractLex::Process(WordCount &wcIn, const std::string *out)
+{
+  WordCount &nested = wcIn.GetColl()[out];
+  nested.AddCount(COUNT_INCR);
+}
+
+// Counts every word that took part in no alignment as aligned to the
+// special word "NULL" on the other side.
+void ExtractLex::ProcessUnaligned(vector<string> &toksTarget, vector<string> &toksSource
+                                  , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned)
+{
+  const string *nullWord = m_vocab.GetOrAdd("NULL");
+
+  // Unaligned source words: pair with NULL on the target side.
+  for (size_t i = 0; i != m_sourceAligned.size(); ++i)
+  {
+    if (m_sourceAligned[i])
+      continue;
+    const string *sourceWord = m_vocab.GetOrAdd(toksSource[i]);
+    Process(nullWord, sourceWord);
+  }
+
+  // Unaligned target words: pair with NULL on the source side.
+  for (size_t j = 0; j != m_targetAligned.size(); ++j)
+  {
+    if (m_targetAligned[j])
+      continue;
+    const string *targetWord = m_vocab.GetOrAdd(toksTarget[j]);
+    Process(targetWord, nullWord);
+  }
+}
+
+// Writes the source-to-target table to the first stream and then the
+// target-to-source table to the second.
+void ExtractLex::Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S)
+{
+  // source -> target
+  Output(m_collS2T, streamLexS2T);
+  // target -> source
+  Output(m_collT2S, streamLexT2S);
+}
+
+// Writes one line per (outer word, nested word) pair in the form
+//   <nested word> <outer word> <nested count / outer count>
+void ExtractLex::Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream)
+{
+  typedef std::map<const std::string*, WordCount> Coll;
+
+  for (Coll::const_iterator outer = coll.begin(); outer != coll.end(); ++outer)
+  {
+    const string &inStr = *outer->first;
+    const WordCount &inWC = outer->second;
+    const Coll &nested = inWC.GetColl();
+
+    for (Coll::const_iterator inner = nested.begin(); inner != nested.end(); ++inner)
+    {
+      const string &outStr = *inner->first;
+      const WordCount &outWC = inner->second;
+
+      // Relative frequency of the nested word given the outer word.
+      float prob = outWC.GetCount() / inWC.GetCount();
+      outStream << outStr << " " << inStr << " " << prob << endl;
+    }
+  }
+}
+
+// Prints a WordCount as its count in parentheses.
+std::ostream& operator<<(std::ostream &out, const WordCount &obj)
+{
+  return out << "(" << obj.GetCount() << ")";
+}
+
+// Increases the stored count by incr.
+void WordCount::AddCount(float incr)
+{
+  m_count = m_count + incr;
+}
+
+
diff --git a/phrase-extract/extract-lex.h b/phrase-extract/extract-lex.h
new file mode 100644
index 000000000..e2225ecbc
--- /dev/null
+++ b/phrase-extract/extract-lex.h
@@ -0,0 +1,117 @@
+#pragma once
+
+#include <map>
+#include <set>
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+
+//! Convert a string to a value of type T. Used for reading floats, ints etc. from files.
+//! When nothing can be extracted (e.g. empty input) the result is a value-initialized T
+//! rather than an indeterminate value.
+template<typename T>
+inline T Scan(const std::string &input)
+{
+  std::stringstream stream(input);
+  T ret = T();  // defined fallback if the extraction below stores nothing
+  stream >> ret;
+  return ret;
+}
+
+
+//! Vector overload of Scan: output is resized to input.size() and element k
+//! becomes Scan<T>(input[k]).
+template<typename T>
+inline void Scan(std::vector<T> &output, const std::vector< std::string > &input)
+{
+  const size_t count = input.size();
+  output.resize(count);
+  for (size_t idx = 0; idx != count; ++idx) {
+    output[idx] = Scan<T>(input[idx]);
+  }
+}
+
+
+//! Splits str at any character contained in delimiters and appends the non-empty
+//! pieces to output. Runs of delimiters, including leading and trailing ones,
+//! never produce empty tokens. Existing contents of output are kept.
+inline void Tokenize(std::vector<std::string> &output
+                     , const std::string& str
+                     , const std::string& delimiters = " \t")
+{
+  // start of the first token, or npos when str holds nothing but delimiters
+  std::string::size_type tokenBegin = str.find_first_not_of(delimiters, 0);
+
+  while (tokenBegin != std::string::npos) {
+    // one past the end of the current token (npos if it runs to the end of str)
+    const std::string::size_type tokenEnd = str.find_first_of(delimiters, tokenBegin);
+    output.push_back(str.substr(tokenBegin, tokenEnd - tokenBegin));
+    // hop over the delimiter run that follows; yields npos once str is exhausted
+    tokenBegin = str.find_first_not_of(delimiters, tokenEnd);
+  }
+}
+
+// Typed overload of Tokenize: splits input at delimiters, then converts every
+// token to T with Scan. output is resized to the number of tokens found.
+template<typename T>
+inline void Tokenize( std::vector<T> &output
+                      , const std::string &input
+                      , const std::string& delimiters = " \t")
+{
+  std::vector<std::string> pieces;
+  Tokenize(pieces, input, delimiters);
+  Scan<T>(output, pieces);
+}
+/** A float count together with a nested map from word pointers to further WordCount entries. */
+class WordCount
+{
+ friend std::ostream& operator<<(std::ostream&, const WordCount&);
+public:
+ float m_count; // incremented by AddCount()
+ // nested entries; the map is keyed by string pointer, so keys are ordered by address
+ std::map<const std::string*, WordCount> m_coll;
+ // starts with a zero count and an empty nested map
+ WordCount()
+ :m_count(0)
+ {}
+
+ //WordCount(const WordCount &copy);
+ // starts with the given count; not explicit, so a float converts implicitly to WordCount
+ WordCount(float count)
+ :m_count(count)
+ {}
+ // adds incr to m_count
+ void AddCount(float incr);
+ // mutable and read-only access to the nested map
+ std::map<const std::string*, WordCount> &GetColl()
+ { return m_coll; }
+ const std::map<const std::string*, WordCount> &GetColl() const
+ { return m_coll; }
+ // current count (the const on the by-value return type has no effect)
+ const float GetCount() const
+ { return m_count; }
+
+};
+/** Word store backed by a std::set of strings; GetOrAdd hands out pointers to words. */
+class Vocab
+{
+ std::set<std::string> m_coll;
+public:
+ const std::string *GetOrAdd(const std::string &word); // defined outside this header; presumably returns a pointer to the copy held in m_coll, inserting it if absent -- TODO confirm
+};
+/** Holds a Vocab and two nested word -> WordCount maps (m_collS2T, m_collT2S) and writes the maps out with Output(). */
+class ExtractLex
+{
+ Vocab m_vocab;
+ std::map<const std::string*, WordCount> m_collS2T, m_collT2S;
+ // internal helpers; ProcessUnaligned pairs every unaligned word with the "NULL" vocabulary entry
+ void Process(const std::string *target, const std::string *source);
+ void Process(WordCount &wcIn, const std::string *out);
+ void ProcessUnaligned(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource
+ , const std::vector<bool> &m_sourceAligned, const std::vector<bool> &m_targetAligned); // note: despite the m_ prefix these two are parameters, not members
+ // writes one collection: one line per (outer, inner) pair with the ratio of their counts
+ void Output(const std::map<const std::string*, WordCount> &coll, std::ofstream &outStream);
+
+public:
+ void Process(std::vector<std::string> &toksTarget, std::vector<std::string> &toksSource, std::vector<std::string> &toksAlign, size_t lineCount);
+ void Output(std::ofstream &streamLexS2T, std::ofstream &streamLexT2S); // writes m_collS2T and m_collT2S to the respective stream
+
+};
+
diff --git a/phrase-extract/extract-lex.vcxproj b/phrase-extract/extract-lex.vcxproj
new file mode 100644
index 000000000..a291d51aa
--- /dev/null
+++ b/phrase-extract/extract-lex.vcxproj
@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <ItemGroup>
+ <ClCompile Include="extract-lex.cpp" />
+ <ClCompile Include="InputFileStream.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="extract-lex.h" />
+ <ClInclude Include="InputFileStream.h" />
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{808BF985-CA18-4E55-8AAC-70E04DC25117}</ProjectGuid>
+ <Keyword>Win32Proj</Keyword>
+ <RootNamespace>extractlex</RootNamespace>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>true</UseDebugLibraries>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <UseDebugLibraries>false</UseDebugLibraries>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <LinkIncremental>true</LinkIncremental>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <LinkIncremental>false</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <WarningLevel>Level3</WarningLevel>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <Optimization>MaxSpeed</Optimization>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ </ClCompile>
+ <Link>
+ <SubSystem>Console</SubSystem>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <OptimizeReferences>true</OptimizeReferences>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
new file mode 100644
index 000000000..997038224
--- /dev/null
+++ b/phrase-extract/extract-rules.cpp
@@ -0,0 +1,1100 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <algorithm>
+#include <assert.h>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <iostream>
+#include <list>
+#include <map>
+#include <set>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#ifdef WIN32
+// Include Visual Leak Detector
+//#include <vld.h>
+#endif
+
+#include "ExtractedRule.h"
+#include "Hole.h"
+#include "HoleCollection.h"
+#include "RuleExist.h"
+#include "SafeGetline.h"
+#include "SentenceAlignmentWithSyntax.h"
+#include "SyntaxTree.h"
+#include "tables-core.h"
+#include "XmlTree.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+#include "../../../moses/src/ThreadPool.h"
+#include "../../../moses/src/OutputCollector.h"
+
+#define LINE_MAX_LENGTH 500000
+
+using namespace std;
+
+typedef vector< int > LabelIndex;
+typedef map< int, int > WordIndex;
+/** Rule extraction job for one sentence pair: Run() extracts, consolidates and writes the rules of m_sentence. */
+class ExtractTask : public Moses::Task {
+private:
+ size_t m_id; // main() passes the zero-based sentence index here
+ SentenceAlignmentWithSyntax *m_sentence; // owned: deleted in the destructor
+ RuleExtractionOptions &m_options; // reference to the caller's options object, which must outlive the task
+ Moses::OutputCollector* m_extractCollector;
+ Moses::OutputCollector* m_extractCollectorInv;
+ // the task takes ownership of sentence; options and both collectors are not released by it
+public:
+ ExtractTask(size_t id, SentenceAlignmentWithSyntax *sentence, RuleExtractionOptions &options, Moses::OutputCollector* extractCollector, Moses::OutputCollector* extractCollectorInv):
+ m_id(id),
+ m_sentence(sentence),
+ m_options(options),
+ m_extractCollector(extractCollector),
+ m_extractCollectorInv(extractCollectorInv) {}
+ ~ExtractTask() { delete m_sentence; } // releases the sentence handed to the constructor
+ void Run();
+
+private:
+vector< ExtractedRule > m_extractedRules; // cleared at the end of Run()
+
+// main functions
+void extractRules();
+void addRuleToCollection(ExtractedRule &rule);
+void consolidateRules();
+void writeRulesToFile();
+
+// subs
+void addRule( int, int, int, int, RuleExist &ruleExist);
+void addHieroRule( int startT, int endT, int startS, int endS
+ , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS); // NOTE: the definition names the parameter initStartT, not initStartF
+void printHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, LabelIndex &labelIndex);
+string printTargetHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
+string printSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , HoleCollection &holeColl, const LabelIndex &labelIndex);
+void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
+ , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
+void printHieroAlignment( int startT, int endT, int startS, int endS
+ , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
+void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
+// formats an int as its decimal string via a stringstream
+inline string IntToString( int i )
+{
+ stringstream out;
+ out << i;
+ return out.str();
+}
+};
+
+// helpers for glue grammar and unknown word label statistics; main() calls them as noted on each line
+void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence ); // per created sentence pair when options.unknownWordLabelFlag is set
+void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection); // once after the sentence loop when options.glueGrammarFlag is set
+void writeUnknownWordLabel(const string &); // once after the sentence loop when options.unknownWordLabelFlag is set
+
+
+// Returns the value that follows option argv[i] and advances i onto it.
+// Prints an error and exits with status 1 when the option is the last argument,
+// so the option parser never hands the terminating null of argv to atoi().
+static const char *requireOptionValue(int argc, char* argv[], int &i)
+{
+  if (i + 1 >= argc) {
+    cerr << "extract error: option " << argv[i] << " requires a value" << endl;
+    exit(1);
+  }
+  return argv[++i];
+}
+
+/**
+ * Command line driver of extract-rules.
+ *
+ * Positional arguments: target corpus, source corpus, alignment file and the
+ * base name of the extract file; options follow from argv[5] on. After parsing,
+ * the three input files are read line by line in parallel; each sentence pair
+ * that SentenceAlignmentWithSyntax::create() accepts becomes an ExtractTask,
+ * which is run directly or submitted to the thread pool (WITH_THREADS builds
+ * with more than one thread). Glue grammar and unknown word label files are
+ * written at the end when requested.
+ *
+ * Exits with status 1 on usage errors, invalid option values and missing
+ * option arguments.
+ */
+int main(int argc, char* argv[])
+{
+  cerr << "extract-rules, written by Philipp Koehn\n"
+       << "rule extraction from an aligned parallel corpus\n";
+
+  RuleExtractionOptions options;
+#ifdef WITH_THREADS
+  int thread_count = 1;
+#endif
+  if (argc < 5) {
+    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
+#ifdef WITH_THREADS
+         << " --threads NUM |"
+#endif
+         << " --GlueGrammar FILE"
+         << " | --UnknownWordLabel FILE"
+         << " | --OnlyDirect"
+         << " | --OutputNTLengths"
+         << " | --MaxSpan[" << options.maxSpan << "]"
+         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
+         << " | --MinHoleSource[" << options.minHoleSource << "]"
+         << " | --MinWords[" << options.minWords << "]"
+         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
+         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
+         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
+         << " | --MaxScope[" << options.maxScope << "]"
+         << " | --SourceSyntax | --TargetSyntax"
+         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
+         << " | --UnpairedExtractFormat"
+         << " | --ConditionOnTargetLHS ]\n";
+    exit(1);
+  }
+  char* &fileNameT = argv[1];
+  char* &fileNameS = argv[2];
+  char* &fileNameA = argv[3];
+  string fileNameGlueGrammar;
+  string fileNameUnknownWordLabel;
+  string fileNameExtract = string(argv[4]);
+
+  int optionInd = 5;
+
+  for(int i=optionInd; i<argc; i++) {
+    // maximum span length
+    if (strcmp(argv[i],"--MaxSpan") == 0) {
+      options.maxSpan = atoi(requireOptionValue(argc, argv, i));
+      if (options.maxSpan < 1) {
+        cerr << "extract error: --maxSpan should be at least 1" << endl;
+        exit(1);
+      }
+    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
+      options.minHoleTarget = atoi(requireOptionValue(argc, argv, i));
+      if (options.minHoleTarget < 1) {
+        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
+        exit(1);
+      }
+    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
+      options.minHoleSource = atoi(requireOptionValue(argc, argv, i));
+      if (options.minHoleSource < 1) {
+        cerr << "extract error: --minHoleSource should be at least 1" << endl;
+        exit(1);
+      }
+    }
+    // maximum number of symbols on the target side of a hierarchical phrase
+    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
+      options.maxSymbolsTarget = atoi(requireOptionValue(argc, argv, i));
+      if (options.maxSymbolsTarget < 1) {
+        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
+        exit(1);
+      }
+    }
+    // maximum number of symbols on the source side of a hierarchical phrase
+    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
+      options.maxSymbolsSource = atoi(requireOptionValue(argc, argv, i));
+      if (options.maxSymbolsSource < 1) {
+        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
+        exit(1);
+      }
+    }
+    // minimum number of words in hierarchical phrase
+    else if (strcmp(argv[i],"--MinWords") == 0) {
+      options.minWords = atoi(requireOptionValue(argc, argv, i));
+      if (options.minWords < 0) {
+        cerr << "extract error: --MinWords should be at least 0" << endl;
+        exit(1);
+      }
+    }
+    // maximum number of non-terminals
+    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
+      options.maxNonTerm = atoi(requireOptionValue(argc, argv, i));
+      if (options.maxNonTerm < 1) {
+        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
+        exit(1);
+      }
+    }
+    // maximum scope (see Hopkins and Langmead (2010))
+    else if (strcmp(argv[i],"--MaxScope") == 0) {
+      options.maxScope = atoi(requireOptionValue(argc, argv, i));
+      if (options.maxScope < 0) {
+        cerr << "extract error: --MaxScope should be at least 0" << endl;
+        exit(1);
+      }
+    }
+    else if (strcmp(argv[i], "--GZOutput") == 0) {
+      options.gzOutput = true;
+    }
+    // syntax and non-terminal placement switches
+    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
+      options.targetSyntax = true;
+    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
+      options.sourceSyntax = true;
+    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
+      options.requireAlignedWord = false;
+    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
+      options.nonTermConsecTarget = false;
+    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
+      options.nonTermConsecSource = true;
+    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
+      options.nonTermFirstWord = false;
+    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
+      options.onlyOutputSpanInfo = true;
+    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
+      options.onlyDirectFlag = true;
+    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
+      options.glueGrammarFlag = true;
+      if (++i >= argc) {
+        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
+        exit(1);  // a usage error must not report success
+      }
+      fileNameGlueGrammar = string(argv[i]);
+      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
+    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
+      options.unknownWordLabelFlag = true;
+      if (++i >= argc) {
+        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
+        exit(1);  // a usage error must not report success
+      }
+      fileNameUnknownWordLabel = string(argv[i]);
+      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
+    }
+    // TODO: this should be a useful option
+    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
+    //  zipFiles = true;
+    //}
+    // if a source phrase is paired with two target phrases, then count(t|s) = 0.5
+    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
+      options.fractionalCounting = false;
+    } else if (strcmp(argv[i],"--PCFG") == 0) {
+      options.pcfgScore = true;
+    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
+      options.outputNTLengths = true;
+    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
+      options.unpairedExtractFormat = true;
+    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
+      options.conditionOnTargetLhs = true;
+#ifdef WITH_THREADS
+    } else if (strcmp(argv[i],"-threads") == 0 ||
+               strcmp(argv[i],"--threads") == 0 ||
+               strcmp(argv[i],"--Threads") == 0) {
+      thread_count = atoi(requireOptionValue(argc, argv, i));
+#endif
+    } else {
+      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
+      exit(1);
+    }
+  }
+
+  cerr << "extracting hierarchical rules" << endl;
+
+  // open input files
+  Moses::InputFileStream tFile(fileNameT);
+  Moses::InputFileStream sFile(fileNameS);
+  Moses::InputFileStream aFile(fileNameA);
+
+  istream *tFileP = &tFile;
+  istream *sFileP = &sFile;
+  istream *aFileP = &aFile;
+
+  // open output files
+  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
+  Moses::OutputFileStream extractFile;
+  Moses::OutputFileStream extractFileInv;
+  extractFile.Open((fileNameExtract + (options.gzOutput?".gz":"")).c_str());
+  if (!options.onlyDirectFlag)
+    extractFileInv.Open(fileNameExtractInv.c_str());
+
+  // output into file
+  // NOTE(review): both collectors are never deleted; they live until the process exits
+  Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
+  Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);
+
+  // stats on labels for glue grammar and unknown word label probabilities
+  set< string > targetLabelCollection, sourceLabelCollection;
+  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;
+
+#ifdef WITH_THREADS
+  // set up thread pool
+  Moses::ThreadPool pool(thread_count);
+  pool.SetQueueLimit(1000);
+#endif
+
+  // loop through all sentence pairs
+  size_t i=0;
+  while(true) {
+    i++;
+    if (i%1000 == 0) cerr << "." << flush;
+    if (i%10000 == 0) cerr << ":" << flush;
+    if (i%100000 == 0) cerr << "!" << flush;
+    char targetString[LINE_MAX_LENGTH];
+    char sourceString[LINE_MAX_LENGTH];
+    char alignmentString[LINE_MAX_LENGTH];
+    SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
+    // the end of the target file ends the loop; the other two files are not checked
+    if (tFileP->eof()) break;
+    SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
+    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+
+    SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
+    (targetLabelCollection, sourceLabelCollection,
+     targetTopLabelCollection, sourceTopLabelCollection, options);
+    //az: output src, tgt, and alignment line
+    if (options.onlyOutputSpanInfo) {
+      cout << "LOG: SRC: " << sourceString << endl;
+      cout << "LOG: TGT: " << targetString << endl;
+      cout << "LOG: ALT: " << alignmentString << endl;
+      cout << "LOG: PHRASES_BEGIN:" << endl;
+    }
+
+    // NOTE(review): when create() returns false, sentence is not deleted here
+    if (sentence->create(targetString, sourceString, alignmentString, i)) {
+      if (options.unknownWordLabelFlag) {
+        collectWordLabelCounts(*sentence);
+      }
+      // the task takes ownership of sentence and deletes it in its destructor
+      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
+#ifdef WITH_THREADS
+      if (thread_count == 1) {
+        task->Run();
+        delete task;
+      }
+      else {
+        pool.Submit(task);
+      }
+#else
+      task->Run();
+      delete task;
+#endif
+    }
+    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+  }
+
+#ifdef WITH_THREADS
+  // wait for all threads to finish
+  pool.Stop(true);
+#endif
+
+  tFile.Close();
+  sFile.Close();
+  aFile.Close();
+  // the extract files are not closed explicitly in span-info-only mode
+  if (!options.onlyOutputSpanInfo) {
+    extractFile.Close();
+    if (!options.onlyDirectFlag) extractFileInv.Close();
+  }
+
+  if (options.glueGrammarFlag)
+    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);
+
+  if (options.unknownWordLabelFlag)
+    writeUnknownWordLabel(fileNameUnknownWordLabel);
+}
+/** Handles the task's sentence pair: extractRules(), consolidateRules(), writeRulesToFile(), then clears m_extractedRules. */
+void ExtractTask::Run() {
+ extractRules();
+ consolidateRules();
+ writeRulesToFile();
+ m_extractedRules.clear();
+}
+
+/**
+ * Enumerates all phrase pairs of m_sentence that are consistent with the word
+ * alignment and within the span limits. Each pair is recorded in a RuleExist
+ * repository, emitted as a fully lexical rule via addRule() when it fits the
+ * symbol limits, and used as the starting point for addHieroRule().
+ *
+ * Target spans are visited by increasing length, so shorter spans are already
+ * in the repository when a longer one is processed.
+ */
+void ExtractTask::extractRules()
+{
+  int countT = m_sentence->target.size();
+  int countS = m_sentence->source.size();
+
+  // phrase repository for creating hiero phrases
+  RuleExist ruleExist(countT);
+
+  // check alignments for target phrase startT...endT
+  for(int lengthT=1;
+      lengthT <= m_options.maxSpan && lengthT <= countT;
+      lengthT++) {
+    for(int startT=0; startT < countT-(lengthT-1); startT++) {
+
+      // that's nice to have
+      int endT = startT + lengthT - 1;
+
+      // if there is target side syntax, there has to be a node
+      if (m_options.targetSyntax && !m_sentence->targetTree.HasNode(startT,endT))
+        continue;
+
+      // find aligned source words
+      // first: find minimum and maximum source word
+      // countS is larger than every source index, so it is a safe "nothing
+      // seen yet" start value for sentences of any length
+      int minS = countS;
+      int maxS = -1;
+      // per source word: number of alignment points that leave the target span
+      vector< int > usedS = m_sentence->alignedCountS;
+      for(int ti=startT; ti<=endT; ti++) {
+        for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
+          int si = m_sentence->alignedToT[ti][i];
+          if (si<minS) {
+            minS = si;
+          }
+          if (si>maxS) {
+            maxS = si;
+          }
+          usedS[ si ]--;
+        }
+      }
+
+      // unaligned phrases are not allowed
+      if( maxS == -1 )
+        continue;
+
+      // source phrase has to be within limits
+      if( maxS-minS >= m_options.maxSpan )
+        continue;
+
+      // check if source words are aligned to out of bound target words
+      bool out_of_bounds = false;
+      for(int si=minS; si<=maxS && !out_of_bounds; si++)
+        if (usedS[si]>0) {
+          out_of_bounds = true;
+        }
+
+      // if out of bound, you gotta go
+      if (out_of_bounds)
+        continue;
+
+      // done with all the checks, lets go over all consistent phrase pairs
+      // start point of source phrase may retreat over unaligned
+      for(int startS=minS;
+          (startS>=0 &&
+           startS>maxS - m_options.maxSpan && // within length limit
+           (startS==minS || m_sentence->alignedCountS[startS]==0)); // unaligned
+          startS--) {
+        // end point of source phrase may advance over unaligned
+        for(int endS=maxS;
+            (endS<countS && endS<startS + m_options.maxSpan && // within length limit
+             (endS==maxS || m_sentence->alignedCountS[endS]==0)); // unaligned
+            endS++) {
+          // if there is source side syntax, there has to be a node
+          if (m_options.sourceSyntax && !m_sentence->sourceTree.HasNode(startS,endS))
+            continue;
+
+          // TODO: loop over all source and target syntax labels
+
+          // if within length limits, add as fully-lexical phrase pair
+          if (endT-startT < m_options.maxSymbolsTarget && endS-startS < m_options.maxSymbolsSource) {
+            addRule(startT,endT,startS,endS, ruleExist);
+          }
+
+          // take note that this is a valid phrase alignment
+          ruleExist.Add(startT, endT, startS, endS);
+
+          // extract hierarchical rules
+
+          // are rules not allowed to start non-terminals?
+          int initStartT = m_options.nonTermFirstWord ? startT : startT + 1;
+
+          HoleCollection holeColl(startS, endS); // empty hole collection
+          addHieroRule(startT, endT, startS, endS,
+                       ruleExist, holeColl, 0, initStartT,
+                       endT-startT+1, endS-startS+1);
+        }
+      }
+    }
+  }
+}
+
+/**
+ * Walks the source span startS..endS once. Words outside holes get their symbol
+ * position stored in indexS; every hole (taken in sorted source order) receives
+ * its source label and source symbol position and is then skipped as a whole.
+ *
+ * labelIndex holds the source hole labels behind the two head labels and the
+ * target hole labels, hence the offset 2 + number of holes.
+ * startT and endT are not used.
+ */
+void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
+    , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex)
+{
+  vector<Hole*>::iterator holeIt = holeColl.GetSortedSourceHoles().begin();
+  assert(holeIt != holeColl.GetSortedSourceHoles().end());
+
+  const int numHoles = holeColl.GetHoles().size();
+  int holesSeen = 0;
+  int symbolPos = 0;
+  for (int wordPos = startS; wordPos <= endS; ++wordPos, ++symbolPos) {
+    // does the next pending hole begin at this word?
+    bool atHole = false;
+    if (holeIt != holeColl.GetSortedSourceHoles().end()) {
+      const Hole &candidate = **holeIt;
+      atHole = (candidate.GetStart(0) == wordPos);
+    }
+
+    if (!atHole) {
+      // plain word: remember where it ends up in the rule
+      indexS[wordPos] = symbolPos;
+      continue;
+    }
+
+    Hole &hole = **holeIt;
+
+    const int labelI = labelIndex[ 2+holesSeen+numHoles ];
+    string label("X");
+    if (m_options.sourceSyntax) {
+      label = m_sentence->sourceTree.GetNodes(wordPos,hole.GetEnd(0))[ labelI ]->GetLabel();
+    }
+    hole.SetLabel(label, 0);
+
+    // jump to the last word of the hole; the loop increment steps past it
+    wordPos = hole.GetEnd(0);
+    hole.SetPos(symbolPos, 0);
+    ++holeIt;
+    ++holesSeen;
+  }
+
+  assert(holeIt == holeColl.GetSortedSourceHoles().end());
+}
+
+/**
+ * Builds the target side of a hierarchical rule for the span startT..endT.
+ * Words outside holes are copied and their symbol position is stored in indexT;
+ * each hole becomes "[source][target]" (or "[target]" in unpaired format) and
+ * receives its target label and target symbol position.
+ *
+ * With m_options.pcfgScore set, the PCFG score of every hole node is subtracted
+ * from logPCFGScore. Source labels of the holes must already be set.
+ * startS and endS are not used.
+ *
+ * @return the symbols separated by single spaces, without a trailing space
+ */
+string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
+    , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
+{
+  HoleList::iterator holeIt = holeColl.GetHoles().begin();
+  assert(holeIt != holeColl.GetHoles().end());
+
+  string phrase = "";
+  int holesSeen = 0;
+  int symbolPos = 0;
+  for (int wordPos = startT; wordPos <= endT; ++wordPos, ++symbolPos) {
+    // does the next pending hole begin at this word?
+    bool atHole = false;
+    if (holeIt != holeColl.GetHoles().end()) {
+      const Hole &candidate = *holeIt;
+      atHole = (candidate.GetStart(1) == wordPos);
+    }
+
+    if (!atHole) {
+      indexT[wordPos] = symbolPos;
+      phrase += m_sentence->target[wordPos] + " ";
+      continue;
+    }
+
+    Hole &hole = *holeIt;
+
+    const string &sourceLabel = hole.GetLabel(0);
+    assert(sourceLabel != "");
+
+    // target hole labels follow the two head labels in labelIndex
+    const int labelI = labelIndex[ 2+holesSeen ];
+    string targetLabel("X");
+    if (m_options.targetSyntax) {
+      targetLabel = m_sentence->targetTree.GetNodes(wordPos,hole.GetEnd(1))[ labelI ]->GetLabel();
+    }
+    hole.SetLabel(targetLabel, 1);
+
+    if (!m_options.unpairedExtractFormat) {
+      phrase += "[" + sourceLabel + "]";
+    }
+    phrase += "[" + targetLabel + "] ";
+
+    if (m_options.pcfgScore) {
+      logPCFGScore -= m_sentence->targetTree.GetNodes(wordPos,hole.GetEnd(1))[labelI]->GetPcfgScore();
+    }
+
+    // jump to the last word of the hole; the loop increment steps past it
+    wordPos = hole.GetEnd(1);
+    hole.SetPos(symbolPos, 1);
+    ++holeIt;
+    ++holesSeen;
+  }
+
+  assert(holeIt == holeColl.GetHoles().end());
+  // drop the trailing space
+  return phrase.erase(phrase.size()-1);
+}
+
+/**
+ * Builds the source side of a hierarchical rule for the span startS..endS.
+ * Words outside holes are copied; each hole (in sorted source order) becomes
+ * "[source][target]" (or "[source]" in unpaired format) and has its source
+ * symbol position set. Both labels of every hole must already be set.
+ * startT, endT and labelIndex are not used.
+ *
+ * @return the symbols separated by single spaces, without a trailing space
+ */
+string ExtractTask::printSourceHieroPhrase( int startT, int endT, int startS, int endS
+    , HoleCollection &holeColl, const LabelIndex &labelIndex)
+{
+  vector<Hole*>::iterator iterHoleList = holeColl.GetSortedSourceHoles().begin();
+  assert(iterHoleList != holeColl.GetSortedSourceHoles().end());
+
+  string out = "";
+  int outPos = 0;
+  for(int currPos = startS; currPos <= endS; currPos++) {
+    bool isHole = false;
+    if (iterHoleList != holeColl.GetSortedSourceHoles().end()) {
+      const Hole &hole = **iterHoleList;
+      isHole = hole.GetStart(0) == currPos;
+    }
+
+    if (isHole) {
+      Hole &hole = **iterHoleList;
+
+      const string &targetLabel = hole.GetLabel(1);
+      assert(targetLabel != "");
+
+      const string &sourceLabel = hole.GetLabel(0);
+      if (m_options.unpairedExtractFormat) {
+        out += "[" + sourceLabel + "] ";
+      } else {
+        out += "[" + sourceLabel + "][" + targetLabel + "] ";
+      }
+
+      // jump to the last word of the hole; the loop increment steps past it
+      currPos = hole.GetEnd(0);
+      hole.SetPos(outPos, 0);
+      ++iterHoleList;
+    } else {
+      out += m_sentence->source[currPos] + " ";
+    }
+
+    outPos++;
+  }
+
+  assert(iterHoleList == holeColl.GetSortedSourceHoles().end());
+  // drop the trailing space
+  return out.erase(out.size()-1);
+}
+
+/**
+ * Fills rule.alignment (and rule.alignmentInv unless m_options.onlyDirectFlag)
+ * with space separated "a-b" pairs of symbol positions: first the alignment
+ * points of the words that are still present in the rule, then one pair per
+ * hole. Also records the span lengths of every hole via rule.SetSpanLength().
+ *
+ * indexS / indexT map sentence positions of the surviving words to symbol
+ * positions in the rule. startS and endS are not used.
+ */
+void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int endS
+    , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule)
+{
+  // print alignment of words
+  for(int ti=startT; ti<=endT; ti++) {
+    WordIndex::const_iterator p = indexT.find(ti);
+    if (p == indexT.end())
+      continue; // word is covered by a hole
+
+    const std::string targetSymbolIndex = IntToString(p->second);
+    for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
+      int si = m_sentence->alignedToT[ti][i];
+      // a word outside every hole is expected to align only to source words
+      // that are outside every hole as well
+      WordIndex::const_iterator q = indexS.find(si);
+      assert(q != indexS.end());
+      const std::string sourceSymbolIndex = IntToString(q->second);
+      rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
+      if (! m_options.onlyDirectFlag)
+        rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
+    }
+  }
+
+  // print alignment of non terminals
+  HoleList::const_iterator iterHole;
+  for (iterHole = holeColl.GetHoles().begin(); iterHole != holeColl.GetHoles().end(); ++iterHole) {
+    const Hole &hole = *iterHole;
+
+    std::string sourceSymbolIndex = IntToString(hole.GetPos(0));
+    std::string targetSymbolIndex = IntToString(hole.GetPos(1));
+    rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
+    if (!m_options.onlyDirectFlag)
+      rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
+
+    rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ;
+  }
+
+  // drop the trailing space; an empty string has none (erase would throw)
+  if (!rule.alignment.empty()) {
+    rule.alignment.erase(rule.alignment.size()-1);
+  }
+  if (!m_options.onlyDirectFlag && !rule.alignmentInv.empty()) {
+    rule.alignmentInv.erase(rule.alignmentInv.size()-1);
+  }
+}
+
+/**
+ * Assembles one hierarchical rule for the given spans, holes and label choice
+ * and hands it to addRuleToCollection().
+ *
+ * labelIndex[0] / labelIndex[1] select the target / source head label; the
+ * remaining entries select the hole labels. The source holes are labelled
+ * first because the target side prints both labels of each hole.
+ */
+void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
+                                    , HoleCollection &holeColl, LabelIndex &labelIndex)
+{
+  WordIndex indexS, indexT; // to keep track of word positions in rule
+
+  ExtractedRule rule( startT, endT, startS, endS );
+
+  // head labels of the rule
+  string targetLabel("X");
+  if (m_options.targetSyntax) {
+    targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel();
+  }
+  string sourceLabel("X");
+  if (m_options.sourceSyntax) {
+    sourceLabel = m_sentence->sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel();
+  }
+
+  // create non-terms on the source side
+  preprocessSourceHieroPhrase(startT, endT, startS, endS, indexS, holeColl, labelIndex);
+
+  // target; the score starts at the head node's value only when PCFG scoring is on
+  double logPCFGScore = 0.0f;
+  if (m_options.pcfgScore) {
+    logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
+  }
+  rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+                + " [" + targetLabel + "]";
+  if (m_options.pcfgScore) {
+    rule.pcfgScore = std::exp(logPCFGScore);
+  }
+
+  // source; its head label is the target label when conditioning on the target LHS
+  rule.source = printSourceHieroPhrase(startT, endT, startS, endS, holeColl, labelIndex);
+  const string &sourceHead = m_options.conditionOnTargetLhs ? targetLabel : sourceLabel;
+  rule.source += " [" + sourceHead + "]";
+
+  // alignment
+  printHieroAlignment(startT, endT, startS, endS, indexS, indexT, holeColl, rule);
+
+  addRuleToCollection( rule );
+}
+
+/**
+ * Emits one rule (via printHieroPhrase) for every combination of labels of the
+ * phrase pair and its holes.
+ *
+ * Slot layout of the label vectors: [0] target head, [1] source head, then one
+ * slot per hole on the target side, then one slot per hole in sorted source
+ * order. Without syntax on a side, each slot of that side has one label.
+ */
+void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl)
+{
+  LabelIndex labelIndex,labelCount;
+
+  // number of target head labels
+  const int targetHeadLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(startT,endT).size() : 1;
+  labelCount.push_back(targetHeadLabels);
+  labelIndex.push_back(0);
+
+  // number of source head labels
+  const int sourceHeadLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(startS,endS).size() : 1;
+  labelCount.push_back(sourceHeadLabels);
+  labelIndex.push_back(0);
+
+  // number of target hole labels
+  for (HoleList::const_iterator holeIt = holeColl.GetHoles().begin();
+       holeIt != holeColl.GetHoles().end(); holeIt++) {
+    const int targetHoleLabels = m_options.targetSyntax ? m_sentence->targetTree.GetNodes(holeIt->GetStart(1),holeIt->GetEnd(1)).size() : 1 ;
+    labelCount.push_back(targetHoleLabels);
+    labelIndex.push_back(0);
+  }
+
+  // number of source hole labels
+  holeColl.SortSourceHoles();
+  for (vector<Hole*>::iterator sortedIt = holeColl.GetSortedSourceHoles().begin();
+       sortedIt != holeColl.GetSortedSourceHoles().end(); sortedIt++) {
+    const Hole &hole = **sortedIt;
+    const int sourceHoleLabels = m_options.sourceSyntax ? m_sentence->sourceTree.GetNodes(hole.GetStart(0),hole.GetEnd(0)).size() : 1 ;
+    labelCount.push_back(sourceHoleLabels);
+    labelIndex.push_back(0);
+  }
+
+  // count through all label combinations like an odometer, slot 0 turning fastest
+  for (;;) {
+    printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex );
+
+    unsigned int slot = 0;
+    while (slot < labelIndex.size() && ++labelIndex[slot] == labelCount[slot]) {
+      // slot wrapped around: reset it and carry into the next one
+      labelIndex[slot] = 0;
+      ++slot;
+    }
+    // every slot wrapped, so all combinations have been printed
+    if (slot == labelIndex.size())
+      break;
+  }
+}
+
+// this function is called recursively
+// it pokes a new hole into the phrase pair, and then calls itself for more holes
+void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
+                                , RuleExist &ruleExist, const HoleCollection &holeColl
+                                , int numHoles, int initStartT, int wordCountT, int wordCountS)
+{
+  // Pokes one more hole into the phrase pair target [startT,endT] /
+  // source [startS,endS], emits the resulting rule if it is within the
+  // symbol limits, and recurses to add further holes.
+  //
+  //   ruleExist    - queried for the source spans paired with a target span
+  //   holeColl     - holes already present in the phrase pair (not modified;
+  //                  each candidate works on its own copy)
+  //   numHoles     - number of holes already present
+  //   initStartT   - first target position at which the new hole may start
+  //   wordCountT/S - target/source words left uncovered by the existing holes
+
+  // done, if already the maximum number of non-terminals in phrase pair
+  if (numHoles >= m_options.maxNonTerm)
+    return;
+
+  // find a hole...
+  for (int startHoleT = initStartT; startHoleT <= endT; ++startHoleT) {
+    for (int endHoleT = startHoleT+(m_options.minHoleTarget-1); endHoleT <= endT; ++endHoleT) {
+      // determine the number of remaining target words
+      const int newWordCountT = wordCountT - (endHoleT-startHoleT+1);
+
+      // if last non-terminal, enforce word count limit
+      // (the hole size is measured from startHoleT, consistently with
+      // newWordCountT and with the source-side check below; candidates
+      // rejected here could neither be emitted nor extended anyway)
+      if (numHoles == m_options.maxNonTerm-1 && newWordCountT + (numHoles+1) > m_options.maxSymbolsTarget)
+        continue;
+
+      // always enforce min word count limit
+      if (newWordCountT < m_options.minWords)
+        continue;
+
+      // except the whole span
+      if (startHoleT == startT && endHoleT == endT)
+        continue;
+
+      // does a phrase cover this target span?
+      // if it does, then there should be a list of mapped source phrases
+      // (multiple possible due to unaligned words)
+      const HoleList &sourceHoles = ruleExist.GetSourceHoles(startHoleT, endHoleT);
+
+      // loop over sub phrase pairs
+      HoleList::const_iterator iterSourceHoles;
+      for (iterSourceHoles = sourceHoles.begin(); iterSourceHoles != sourceHoles.end(); ++iterSourceHoles) {
+        const Hole &sourceHole = *iterSourceHoles;
+
+        const int sourceHoleSize = sourceHole.GetEnd(0)-sourceHole.GetStart(0)+1;
+
+        // enforce minimum hole size
+        if (sourceHoleSize < m_options.minHoleSource)
+          continue;
+
+        // determine the number of remaining source words
+        const int newWordCountS = wordCountS - sourceHoleSize;
+
+        // if last non-terminal, enforce word count limit
+        if (numHoles == m_options.maxNonTerm-1 && newWordCountS + (numHoles+1) > m_options.maxSymbolsSource)
+          continue;
+
+        // enforce min word count limit
+        if (newWordCountS < m_options.minWords)
+          continue;
+
+        // hole must be subphrase of the source phrase
+        // (may be violated if subphrase contains additional unaligned source word)
+        if (startS > sourceHole.GetStart(0) || endS < sourceHole.GetEnd(0))
+          continue;
+
+        // make sure source side does not overlap with another hole
+        // (target holes cannot overlap: each new hole starts after the
+        // previous one ends, see nextInitStartT below)
+        if (holeColl.OverlapSource(sourceHole))
+          continue;
+
+        // if consecutive non-terminals are not allowed, also check for source
+        if (!m_options.nonTermConsecSource && holeColl.ConsecSource(sourceHole) )
+          continue;
+
+        // check that rule scope would not exceed limit if sourceHole
+        // were added
+        if (holeColl.Scope(sourceHole) > m_options.maxScope)
+          continue;
+
+        // require that at least one aligned word is left (unless there are no words at all)
+        if (m_options.requireAlignedWord && (newWordCountS > 0 || newWordCountT > 0)) {
+          HoleList::const_iterator iterHoleList = holeColl.GetHoles().begin();
+          bool foundAlignedWord = false;
+          // loop through all word positions
+          for(int pos = startT; pos <= endT && !foundAlignedWord; pos++) {
+            // new hole? moving on...
+            if (pos == startHoleT) {
+              pos = endHoleT;
+            }
+            // covered by hole? moving on...
+            else if (iterHoleList != holeColl.GetHoles().end() && iterHoleList->GetStart(1) == pos) {
+              pos = iterHoleList->GetEnd(1);
+              ++iterHoleList;
+            }
+            // covered by word? check if it is aligned
+            else {
+              if (m_sentence->alignedToT[pos].size() > 0)
+                foundAlignedWord = true;
+            }
+          }
+          if (!foundAlignedWord)
+            continue;
+        }
+
+        // update list of holes in this phrase pair
+        HoleCollection copyHoleColl(holeColl);
+        copyHoleColl.Add(startHoleT, endHoleT, sourceHole.GetStart(0), sourceHole.GetEnd(0));
+
+        // now some checks that disallow this phrase pair, but not further recursion
+        bool allowablePhrase = true;
+
+        // maximum words count violation?
+        if (newWordCountS + (numHoles+1) > m_options.maxSymbolsSource)
+          allowablePhrase = false;
+
+        if (newWordCountT + (numHoles+1) > m_options.maxSymbolsTarget)
+          allowablePhrase = false;
+
+        // passed all checks...
+        if (allowablePhrase)
+          printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl);
+
+        // recursively search for next hole; leave a one-word gap on the
+        // target side unless consecutive target non-terminals are allowed
+        int nextInitStartT = m_options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
+        addHieroRule(startT, endT, startS, endS
+                     , ruleExist, copyHoleColl, numHoles + 1, nextInitStartT
+                     , newWordCountT, newWordCountS);
+      }
+    }
+  }
+}
+
+void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist &ruleExist)
+{
+  // Builds the rule without non-terminals for target [startT,endT] /
+  // source [startS,endS] and hands it to addRuleToCollection().
+  // With onlyOutputSpanInfo set, only the span is printed to stdout.
+  // ruleExist is not used by this function.
+
+  if (m_options.onlyOutputSpanInfo) {
+    cout << startS << " " << endS << " " << startT << " " << endT << endl;
+    return;
+  }
+
+  ExtractedRule rule(startT, endT, startS, endS);
+
+  // phrase labels: the first node covering the span, or "X" without syntax;
+  // when conditioning on the target LHS both sides carry the target label
+  string targetLabel,sourceLabel;
+  if (m_options.targetSyntax && m_options.conditionOnTargetLhs) {
+    sourceLabel = targetLabel = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel();
+  }
+  else {
+    sourceLabel = m_options.sourceSyntax ?
+                  m_sentence->sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
+    targetLabel = m_options.targetSyntax ?
+                  m_sentence->targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+  }
+
+  // source
+  rule.source = "";
+  for(int si=startS; si<=endS; si++)
+    rule.source += m_sentence->source[si] + " ";
+  rule.source += "[" + sourceLabel + "]";
+
+  // target
+  rule.target = "";
+  for(int ti=startT; ti<=endT; ti++)
+    rule.target += m_sentence->target[ti] + " ";
+  rule.target += "[" + targetLabel + "]";
+
+  if (m_options.pcfgScore) {
+    double logPCFGScore = m_sentence->targetTree.GetNodes(startT,endT)[0]->GetPcfgScore();
+    rule.pcfgScore = std::exp(logPCFGScore);
+  }
+
+  // alignment, with positions relative to the start of the phrase
+  for(int ti=startT; ti<=endT; ti++) {
+    for(unsigned int i=0; i<m_sentence->alignedToT[ti].size(); i++) {
+      int si = m_sentence->alignedToT[ti][i];
+      std::string sourceSymbolIndex = IntToString(si-startS);
+      std::string targetSymbolIndex = IntToString(ti-startT);
+      rule.alignment += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
+      if (!m_options.onlyDirectFlag)
+        rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
+    }
+  }
+
+  // drop the trailing separator; if the span contributed no alignment point
+  // the strings are empty and erase(size()-1) would wrap around and throw
+  if (!rule.alignment.empty())
+    rule.alignment.erase(rule.alignment.size()-1);
+  if (!m_options.onlyDirectFlag && !rule.alignmentInv.empty())
+    rule.alignmentInv.erase(rule.alignmentInv.size()-1);
+
+  addRuleToCollection( rule );
+}
+
+void ExtractTask::addRuleToCollection( ExtractedRule &newRule )
+{
+  // Appends newRule to m_extractedRules. Unless duplicateRules is set, a rule
+  // is dropped when an already collected rule has the same source and target
+  // strings and an overlapping target span, so identical rules from
+  // overlapping spans are not counted twice.
+  if (!m_options.duplicateRules) {
+    for (vector<ExtractedRule>::const_iterator seen = m_extractedRules.begin();
+         seen != m_extractedRules.end(); ++seen) {
+      if (seen->source != newRule.source)
+        continue;
+      if (seen->target != newRule.target)
+        continue;
+      const bool overlapping = seen->endT >= newRule.startT && seen->startT <= newRule.endT;
+      if (overlapping)
+        return;
+    }
+  }
+  m_extractedRules.push_back( newRule );
+}
+
+void ExtractTask::consolidateRules()
+{
+  // Assigns a count to every collected rule and merges identical rules.
+  // With fractionalCounting each rule gets 1/(number of rules extracted from
+  // the same target/source span), otherwise 1. Rules that agree in source,
+  // target and alignment are then folded into their first occurrence; the
+  // later occurrences keep a count of 0, which marks them as merged.
+  const size_t numRules = m_extractedRules.size();
+  map<int, map<int, map<int, map<int,int> > > > spanCount;
+
+  // compute number of rules per span
+  if (m_options.fractionalCounting) {
+    for (size_t k = 0; k < numRules; ++k) {
+      const ExtractedRule &r = m_extractedRules[k];
+      spanCount[ r.startT ][ r.endT ][ r.startS ][ r.endS ]++;
+    }
+  }
+
+  // compute fractional counts
+  for (size_t k = 0; k < numRules; ++k) {
+    ExtractedRule &r = m_extractedRules[k];
+    r.count = 1.0/(float) (m_options.fractionalCounting ? spanCount[ r.startT ][ r.endT ][ r.startS ][ r.endS ] : 1.0 );
+  }
+
+  // consolidate counts
+  for (size_t first = 0; first < numRules; ++first) {
+    ExtractedRule &kept = m_extractedRules[first];
+    if (kept.count != 0) {
+      for (size_t later = first+1; later < numRules; ++later) {
+        ExtractedRule &other = m_extractedRules[later];
+        const bool identical = kept.source == other.source &&
+                               kept.target == other.target &&
+                               kept.alignment == other.alignment;
+        if (identical) {
+          kept.count += other.count;
+          other.count = 0;
+        }
+      }
+    }
+  }
+}
+
+void ExtractTask::writeRulesToFile()
+{
+  // Formats all collected rules with a non-zero count and passes the text to
+  // the output collectors under this task's id. The direct text is
+  // "source ||| target ||| alignment ||| count ||| " followed by the optional
+  // non-terminal lengths and PCFG score; the inverse text (skipped when
+  // onlyDirectFlag is set) swaps source and target and uses alignmentInv.
+  ostringstream direct;
+  ostringstream inverse;
+
+  for (vector<ExtractedRule>::const_iterator it = m_extractedRules.begin();
+       it != m_extractedRules.end(); ++it) {
+    const ExtractedRule &r = *it;
+    if (r.count == 0)
+      continue;   // count 0 marks a rule merged into another one
+
+    direct << r.source << " ||| " << r.target << " ||| ";
+    direct << r.alignment << " ||| " << r.count << " ||| ";
+    if (m_options.outputNTLengths)
+      r.OutputNTLengths(direct);
+    if (m_options.pcfgScore)
+      direct << " ||| " << r.pcfgScore;
+    direct << "\n";
+
+    if (m_options.onlyDirectFlag)
+      continue;
+    inverse << r.target << " ||| " << r.source << " ||| ";
+    inverse << r.alignmentInv << " ||| " << r.count << "\n";
+  }
+
+  // NOTE(review): the inverse collector is written to even with
+  // onlyDirectFlag (with an empty string) - confirm it is always allocated.
+  m_extractCollector->Write( m_id, direct.str() );
+  m_extractCollectorInv->Write( m_id, inverse.str() );
+}
+
+void writeGlueGrammar( const string & fileName, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection )
+{
+  // Writes the glue grammar to fileName.
+  // Without target syntax three fixed rules over [X] / [S] are written.
+  // With target syntax a fresh top label is chosen and one "top" rule per
+  // entry of targetTopLabelCollection plus one glue rule per entry of
+  // targetLabelCollection (and one for [X]) are written.
+  ofstream grammarFile(fileName.c_str());
+
+  if (options.targetSyntax) {
+    // choose a top label that is not already a label: the shortest prefix of
+    // "QQQQQQ" that is unused, or "QQQQQQ" itself if all prefixes are taken
+    const string candidate = "QQQQQQ";
+    string topLabel = candidate;
+    for (unsigned int len = 1; len <= candidate.length(); ++len) {
+      const string prefix = candidate.substr(0,len);
+      if (targetLabelCollection.find( prefix ) == targetLabelCollection.end()) {
+        topLabel = prefix;
+        break;
+      }
+    }
+
+    // basic rules
+    grammarFile << "<s> [X] ||| <s> [" << topLabel << "] ||| 1 ||| " << endl;
+    grammarFile << "[X][" << topLabel << "] </s> [X] ||| [X][" << topLabel << "] </s> [" << topLabel << "] ||| 1 ||| 0-0 " << endl;
+
+    // top rules
+    map<string,int>::const_iterator top = targetTopLabelCollection.begin();
+    while (top != targetTopLabelCollection.end()) {
+      grammarFile << "<s> [X][" << top->first << "] </s> [X] ||| <s> [X][" << top->first << "] </s> [" << topLabel << "] ||| 1 ||| 1-1" << endl;
+      ++top;
+    }
+
+    // glue rules
+    set<string>::const_iterator label = targetLabelCollection.begin();
+    while (label != targetLabelCollection.end()) {
+      grammarFile << "[X][" << topLabel << "] [X][" << *label << "] [X] ||| [X][" << topLabel << "] [X][" << *label << "] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1" << endl;
+      ++label;
+    }
+    // glue rule for unknown word...
+    grammarFile << "[X][" << topLabel << "] [X][X] [X] ||| [X][" << topLabel << "] [X][X] [" << topLabel << "] ||| 2.718 ||| 0-0 1-1 " << endl;
+  } else {
+    grammarFile << "<s> [X] ||| <s> [S] ||| 1 ||| ||| 0" << endl;
+    grammarFile << "[X][S] </s> [X] ||| [X][S] </s> [S] ||| 1 ||| 0-0 ||| 0" << endl;
+    grammarFile << "[X][S] [X][X] [X] ||| [X][S] [X][X] [S] ||| 2.718 ||| 0-0 1-1 ||| 0" << endl;
+  }
+
+  grammarFile.close();
+}
+
+// collect counts for labels for each word
+// ( labels of singleton words are used to estimate
+// the distribution of labels for unknown words )
+
+map<string,int> wordCount;
+map<string,string> wordLabel;
+void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
+{
+  // For every target word that has at least one tree node covering exactly
+  // that word, bump its occurrence count and remember the label of the first
+  // such node (a later occurrence overwrites an earlier label).
+  const int numTargetWords = sentence.target.size();
+  int pos = 0;
+  while (pos < numTargetWords) {
+    const vector< SyntaxNode* >& nodes = sentence.targetTree.GetNodes(pos,pos);
+    if (!nodes.empty()) {
+      const string &token = sentence.target[ pos ];
+      ++wordCount[ token ];
+      wordLabel[ token ] = nodes[0]->GetLabel();
+    }
+    ++pos;
+  }
+}
+
+void writeUnknownWordLabel(const string & fileName)
+{
+  // Writes "label ratio" lines to fileName, where ratio is the share of
+  // singleton words (words counted exactly once in wordCount) that carry the
+  // label; labels with a ratio of 0.03 or less are left out.
+  ofstream outFile(fileName.c_str());
+
+  // tally the labels of singleton words
+  map<string,int> labelTally;
+  int singletons = 0;
+  map<string,int>::const_iterator entry = wordCount.begin();
+  while (entry != wordCount.end()) {
+    if (entry->second == 1) {
+      ++labelTally[ wordLabel[ entry->first ] ];
+      ++singletons;
+    }
+    ++entry;
+  }
+
+  // labelTally is only non-empty when singletons > 0, so no division by zero
+  map<string,int>::const_iterator tally = labelTally.begin();
+  while (tally != labelTally.end()) {
+    const double ratio = ((double) tally->second / (double) singletons);
+    if (ratio > 0.03)
+      outFile << tally->first << " " << ratio << endl;
+    ++tally;
+  }
+
+  outFile.close();
+}
diff --git a/phrase-extract/extract-rules.vcxproj b/phrase-extract/extract-rules.vcxproj
new file mode 100644
index 000000000..ecd36fe50
--- /dev/null
+++ b/phrase-extract/extract-rules.vcxproj
@@ -0,0 +1,111 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{3C5CFAEC-6830-4491-9008-1C9E8C381C50}</ProjectGuid>
+ <RootNamespace>extractrules</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(SolutionDir)$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <TargetMachine>MachineX86</TargetMachine>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <Optimization>MaxSpeed</Optimization>
+ <IntrinsicFunctions>true</IntrinsicFunctions>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <FunctionLevelLinking>true</FunctionLevelLinking>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <TargetMachine>MachineX86</TargetMachine>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="extract-rules.cpp" />
+ <ClCompile Include="ExtractedRule.cpp" />
+ <ClCompile Include="HoleCollection.cpp" />
+ <ClCompile Include="InputFileStream.cpp" />
+ <ClCompile Include="SentenceAlignment.cpp" />
+ <ClCompile Include="SentenceAlignmentWithSyntax.cpp" />
+ <ClCompile Include="SyntaxTree.cpp" />
+ <ClCompile Include="tables-core.cpp" />
+ <ClCompile Include="XmlTree.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="ExtractedRule.h" />
+ <ClInclude Include="Hole.h" />
+ <ClInclude Include="HoleCollection.h" />
+ <ClInclude Include="SentenceAlignment.h" />
+ <ClInclude Include="SentenceAlignmentWithSyntax.h" />
+ <ClInclude Include="SyntaxTree.h" />
+ <ClInclude Include="tables-core.h" />
+ <ClInclude Include="XmlTree.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
new file mode 100644
index 000000000..16b413da9
--- /dev/null
+++ b/phrase-extract/extract.cpp
@@ -0,0 +1,699 @@
+/*
+ * extract.cpp
+ *
+ * Modified by: Nadi Tomeh - LIMSI/CNRS
+ * Machine Translation Marathon 2010, Dublin
+ */
+
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include <assert.h>
+#include <cstring>
+
+#include <map>
+#include <set>
+#include <vector>
+
+#include "SafeGetline.h"
+#include "SentenceAlignment.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+
+using namespace std;
+
+#define LINE_MAX_LENGTH 500000
+
+// HPhraseVertex represents a point in the alignment matrix: (source index, target index)
+typedef pair <int, int> HPhraseVertex;
+
+// HPhrase represents a bi-phrase; each bi-phrase is defined by two points in the alignment matrix:
+// bottom-left (startF,startE) and top-right (endF,endE)
+typedef pair<HPhraseVertex, HPhraseVertex> HPhrase;
+
+// HPhraseVector is a vector of HPhrases
+typedef vector < HPhrase > HPhraseVector;
+
+// SentenceVertices represents, from all extracted phrases, all vertices that have the same positioning
+// The key of the map is the English index and the value is a set of the source ones
+typedef map <int, set<int> > HSentenceVertices;
+
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; // selected by the msd / mslr / mono(tonicity) suffix of --model
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; // orientation classes returned by the getOrient*Model functions
+
+REO_POS getOrientWordModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ int, int, int, int, int, int, int,
+ bool (*)(int, int), bool (*)(int, int));
+REO_POS getOrientPhraseModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ int, int, int, int, int, int, int,
+ bool (*)(int, int), bool (*)(int, int),
+ const HSentenceVertices &, const HSentenceVertices &);
+REO_POS getOrientHierModel(SentenceAlignment &, REO_MODEL_TYPE, bool, bool,
+ int, int, int, int, int, int, int,
+ bool (*)(int, int), bool (*)(int, int),
+ const HSentenceVertices &, const HSentenceVertices &,
+ const HSentenceVertices &, const HSentenceVertices &,
+ REO_POS);
+
+void insertVertex(HSentenceVertices &, int, int);
+void insertPhraseVertices(HSentenceVertices &, HSentenceVertices &, HSentenceVertices &, HSentenceVertices &,
+ int, int, int, int);
+string getOrientString(REO_POS, REO_MODEL_TYPE);
+
+bool ge(int, int); // comparison callbacks handed to the getOrient*Model functions
+bool le(int, int);
+bool lt(int, int);
+
+void extractBase(SentenceAlignment &);
+void extract(SentenceAlignment &);
+void addPhrase(SentenceAlignment &, int, int, int, int, string &);
+bool isAligned (SentenceAlignment &, int, int);
+
+bool allModelsOutputFlag = false; // set in main() once a --model option has been parsed
+
+bool wordModel = false; // --model wbe-<type>; also the default for plain "orientation"
+REO_MODEL_TYPE wordType = REO_MSD;
+bool phraseModel = false; // --model phrase-<type>
+REO_MODEL_TYPE phraseType = REO_MSD;
+bool hierModel = false; // --model hier-<type>
+REO_MODEL_TYPE hierType = REO_MSD;
+
+
+Moses::OutputFileStream extractFile; // <extract>[.gz], opened in main() when translationFlag is set
+Moses::OutputFileStream extractFileInv; // <extract>.inv[.gz], opened together with extractFile
+Moses::OutputFileStream extractFileOrientation; // <extract>.o[.gz], opened when orientationFlag is set
+Moses::OutputFileStream extractFileSentenceId; // <extract>.sid[.gz], opened when sentenceIdFlag is set
+int maxPhraseLength; // read from argv[5] in main()
+bool orientationFlag = false; // set by "orientation" / --Orientation
+bool translationFlag = true; // cleared by --NoTTable
+bool sentenceIdFlag = false; //create extract file with sentence id
+bool onlyOutputSpanInfo = false; // set by --OnlyOutputSpanInfo
+bool gzOutput = false; // set by --GZOutput: ".gz" is appended to the output file names
+
+
+// Maps the <type> part of a "--model <name>-<type>" argument to its enum
+// value; prints a syntax error and exits on an unknown type.
+static REO_MODEL_TYPE parseReoModelType(const char *modelType)
+{
+  if (strcmp(modelType, "msd") == 0)
+    return REO_MSD;
+  if (strcmp(modelType, "mslr") == 0)
+    return REO_MSLR;
+  if (strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
+    return REO_MONO;
+
+  cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
+  exit(1);
+  return REO_MSD; // not reached
+}
+
+// Entry point: parses the command line, opens the three input files
+// (target, source, alignment) and the requested output files, then runs
+// extract() on every sentence pair that SentenceAlignment::create() accepts.
+int main(int argc, char* argv[])
+{
+  cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
+       << "phrase extraction from an aligned parallel corpus\n";
+
+  if (argc < 6) {
+    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
+    exit(1);
+  }
+  char* &fileNameE = argv[1];
+  char* &fileNameF = argv[2];
+  char* &fileNameA = argv[3];
+  string fileNameExtract = string(argv[4]);
+  maxPhraseLength = atoi(argv[5]);
+
+  for(int i=6; i<argc; i++) {
+    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
+      onlyOutputSpanInfo = true;
+    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
+      orientationFlag = true;
+    } else if (strcmp(argv[i],"--NoTTable") == 0) {
+      translationFlag = false;
+    } else if (strcmp(argv[i], "--SentenceId") == 0) {
+      sentenceIdFlag = true;
+    } else if (strcmp(argv[i], "--GZOutput") == 0) {
+      gzOutput = true;
+    } else if(strcmp(argv[i],"--model") == 0) {
+      if (i+1 >= argc) {
+        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
+        exit(1);
+      }
+      // the argument has the form <name>-<type>; strtok splits it in place
+      // and returns NULL for a missing part
+      char* modelParams = argv[++i];
+      char* modelName = strtok(modelParams, "-");
+      char* modelType = strtok(NULL, "-");
+
+      if (modelName == NULL) {
+        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
+        exit(1);
+      }
+
+      // pick the flag / type pair that belongs to the named model
+      bool *modelFlag = NULL;
+      REO_MODEL_TYPE *modelTypeSlot = NULL;
+      if(strcmp(modelName, "wbe") == 0) {
+        modelFlag = &wordModel;
+        modelTypeSlot = &wordType;
+      } else if(strcmp(modelName, "phrase") == 0) {
+        modelFlag = &phraseModel;
+        modelTypeSlot = &phraseType;
+      } else if(strcmp(modelName, "hier") == 0) {
+        modelFlag = &hierModel;
+        modelTypeSlot = &hierType;
+      } else {
+        cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
+        exit(1);
+      }
+
+      if (modelType == NULL) {
+        cerr << "extract: syntax error, no reordering model type given for model: " << modelName << endl;
+        exit(1);
+      }
+
+      *modelFlag = true;
+      *modelTypeSlot = parseReoModelType(modelType);
+
+      allModelsOutputFlag = true;
+    } else {
+      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
+      exit(1);
+    }
+  }
+
+  // default reordering model if no model selected
+  // allows for the old syntax to be used
+  if(orientationFlag && !allModelsOutputFlag) {
+    wordModel = true;
+    wordType = REO_MSD;
+  }
+
+  // open input files
+  Moses::InputFileStream eFile(fileNameE);
+  Moses::InputFileStream fFile(fileNameF);
+  Moses::InputFileStream aFile(fileNameA);
+
+  istream *eFileP = &eFile;
+  istream *fFileP = &fFile;
+  istream *aFileP = &aFile;
+
+  // open output files
+  if (translationFlag) {
+    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
+    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
+    extractFileInv.Open(fileNameExtractInv.c_str());
+  }
+  if (orientationFlag) {
+    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
+    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
+  }
+
+  if (sentenceIdFlag) {
+    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
+    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
+  }
+
+  // line buffers: static storage, since three LINE_MAX_LENGTH arrays
+  // (about 1.5 MB) do not fit into a small default thread stack
+  static char englishString[LINE_MAX_LENGTH];
+  static char foreignString[LINE_MAX_LENGTH];
+  static char alignmentString[LINE_MAX_LENGTH];
+
+  int i=0;
+  while(true) {
+    i++;
+    if (i%10000 == 0) cerr << "." << flush;
+    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (eFileP->eof()) break;
+    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
+    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
+    SentenceAlignment sentence;
+    //az: output src, tgt, and alignment line
+    if (onlyOutputSpanInfo) {
+      cout << "LOG: SRC: " << foreignString << endl;
+      cout << "LOG: TGT: " << englishString << endl;
+      cout << "LOG: ALT: " << alignmentString << endl;
+      cout << "LOG: PHRASES_BEGIN:" << endl;
+    }
+
+    if (sentence.create( englishString, foreignString, alignmentString, i)) {
+      extract(sentence);
+    }
+    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
+  }
+  eFile.Close();
+  fFile.Close();
+  aFile.Close();
+
+  // close exactly the files opened above; the open calls do not depend on
+  // onlyOutputSpanInfo, so neither do the close calls
+  if (translationFlag) {
+    extractFile.Close();
+    extractFileInv.Close();
+  }
+  if (orientationFlag) extractFileOrientation.Close();
+  if (sentenceIdFlag) {
+    extractFileSentenceId.Close();
+  }
+
+  return 0;
+}
+
+// Extracts all phrase pairs of one sentence pair that are consistent with
+// its word alignment and passes each one, with its orientation string, to
+// addPhrase().
+//
+// Without a phrase or hierarchical reordering model, phrases are emitted as
+// soon as they are found. Otherwise all phrases are first collected together
+// with their corner vertices (phrases within maxPhraseLength go into the
+// "in" maps, longer ones into the "out" maps) and are emitted in a second
+// pass, because the orientation functions need the complete vertex maps.
+void extract(SentenceAlignment &sentence)
+{
+  int countE = sentence.target.size();
+  int countF = sentence.source.size();
+
+  HPhraseVector inboundPhrases;
+
+  HSentenceVertices inTopLeft;
+  HSentenceVertices inTopRight;
+  HSentenceVertices inBottomLeft;
+  HSentenceVertices inBottomRight;
+
+  HSentenceVertices outTopLeft;
+  HSentenceVertices outTopRight;
+  HSentenceVertices outBottomLeft;
+  HSentenceVertices outBottomRight;
+
+  // the hierarchical model also needs phrases beyond the length limit
+  bool relaxLimit = hierModel;
+  bool buildExtraStructure = phraseModel || hierModel;
+
+  // check alignments for target phrase startE...endE
+  // loop over extracted phrases which are compatible with the word-alignments
+  for(int startE=0; startE<countE; startE++) {
+    for(int endE=startE;
+        (endE<countE && (relaxLimit || endE<startE+maxPhraseLength));
+        endE++) {
+
+      // source span covered by the alignment points of startE...endE;
+      // usedF[fi] ends up as the number of alignment points of source word
+      // fi that fall outside the target span
+      int minF = 9999;
+      int maxF = -1;
+      vector< int > usedF = sentence.alignedCountS;
+      for(int ei=startE; ei<=endE; ei++) {
+        for(size_t i=0; i<sentence.alignedToT[ei].size(); i++) {
+          int fi = sentence.alignedToT[ei][i];
+          if (fi<minF) {
+            minF = fi;
+          }
+          if (fi>maxF) {
+            maxF = fi;
+          }
+          usedF[ fi ]--;
+        }
+      }
+
+      if (maxF >= 0 && // aligned to any source words at all
+          (relaxLimit || maxF-minF < maxPhraseLength)) { // source phrase within limits
+
+        // check if source words are aligned to out of bound target words
+        bool out_of_bounds = false;
+        for(int fi=minF; fi<=maxF && !out_of_bounds; fi++)
+          if (usedF[fi]>0) {
+            out_of_bounds = true;
+          }
+
+        if (!out_of_bounds) {
+          // start point of source phrase may retreat over unaligned
+          for(int startF=minF;
+              (startF>=0 &&
+               (relaxLimit || startF>maxF-maxPhraseLength) && // within length limit
+               (startF==minF || sentence.alignedCountS[startF]==0)); // unaligned
+              startF--)
+            // end point of source phrase may advance over unaligned
+            for(int endF=maxF;
+                (endF<countF &&
+                 (relaxLimit || endF<startF+maxPhraseLength) && // within length limit
+                 (endF==maxF || sentence.alignedCountS[endF]==0)); // unaligned
+                endF++) { // at this point we have extracted a phrase
+              if(buildExtraStructure) { // phrase || hier
+                if(endE-startE < maxPhraseLength && endF-startF < maxPhraseLength) { // within limit
+                  inboundPhrases.push_back(HPhrase(HPhraseVertex(startF,startE),
+                                                   HPhraseVertex(endF,endE)));
+                  insertPhraseVertices(inTopLeft, inTopRight, inBottomLeft, inBottomRight,
+                                       startF, startE, endF, endE);
+                } else
+                  insertPhraseVertices(outTopLeft, outTopRight, outBottomLeft, outBottomRight,
+                                       startF, startE, endF, endE);
+              } else {
+                string orientationInfo = "";
+                if(wordModel) {
+                  REO_POS wordPrevOrient, wordNextOrient;
+                  bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
+                  bool connectedRightTopP = isAligned( sentence, endF+1,   startE-1 );
+                  bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
+                  bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
+                  wordPrevOrient = getOrientWordModel(sentence, wordType, connectedLeftTopP, connectedRightTopP, startF, endF, startE, endE, countF, 0, 1, &ge, &lt);
+                  wordNextOrient = getOrientWordModel(sentence, wordType, connectedLeftTopN, connectedRightTopN, endF, startF, endE, startE, 0, countF, -1, &lt, &ge);
+                  orientationInfo += getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType);
+                  // with --model, keep the "word | phrase | hier" layout used
+                  // by the second pass below; the phrase and hier fields are
+                  // empty here (this used to be a string literal statement
+                  // without effect)
+                  if(allModelsOutputFlag)
+                    orientationInfo += " | | ";
+                }
+                addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
+              }
+            }
+        }
+      }
+    }
+  }
+
+  if(buildExtraStructure) { // phrase || hier
+    string orientationInfo = "";
+    REO_POS wordPrevOrient, wordNextOrient, phrasePrevOrient, phraseNextOrient, hierPrevOrient, hierNextOrient;
+
+    for(size_t i = 0; i < inboundPhrases.size(); i++) {
+      int startF = inboundPhrases[i].first.first;
+      int startE = inboundPhrases[i].first.second;
+      int endF = inboundPhrases[i].second.first;
+      int endE = inboundPhrases[i].second.second;
+
+      bool connectedLeftTopP  = isAligned( sentence, startF-1, startE-1 );
+      bool connectedRightTopP = isAligned( sentence, endF+1,   startE-1 );
+      bool connectedLeftTopN  = isAligned( sentence, endF+1, endE+1 );
+      bool connectedRightTopN = isAligned( sentence, startF-1, endE+1 );
+
+      if(wordModel) {
+        wordPrevOrient = getOrientWordModel(sentence, wordType,
+                                            connectedLeftTopP, connectedRightTopP,
+                                            startF, endF, startE, endE, countF, 0, 1,
+                                            &ge, &lt);
+        wordNextOrient = getOrientWordModel(sentence, wordType,
+                                            connectedLeftTopN, connectedRightTopN,
+                                            endF, startF, endE, startE, 0, countF, -1,
+                                            &lt, &ge);
+      }
+      if (phraseModel) {
+        phrasePrevOrient = getOrientPhraseModel(sentence, phraseType,
+                                                connectedLeftTopP, connectedRightTopP,
+                                                startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft);
+        phraseNextOrient = getOrientPhraseModel(sentence, phraseType,
+                                                connectedLeftTopN, connectedRightTopN,
+                                                endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight);
+      } else {
+        // the hierarchical model still receives a phrase orientation argument
+        phrasePrevOrient = phraseNextOrient = UNKNOWN;
+      }
+      if(hierModel) {
+        hierPrevOrient = getOrientHierModel(sentence, hierType,
+                                            connectedLeftTopP, connectedRightTopP,
+                                            startF, endF, startE, endE, countF-1, 0, 1, &ge, &lt, inBottomRight, inBottomLeft, outBottomRight, outBottomLeft, phrasePrevOrient);
+        hierNextOrient = getOrientHierModel(sentence, hierType,
+                                            connectedLeftTopN, connectedRightTopN,
+                                            endF, startF, endE, startE, 0, countF-1, -1, &lt, &ge, inBottomLeft, inBottomRight, outBottomLeft, outBottomRight, phraseNextOrient);
+      }
+
+      // "word | phrase | hier"; the field of a model that is off stays empty
+      orientationInfo = ((wordModel)? getOrientString(wordPrevOrient, wordType) + " " + getOrientString(wordNextOrient, wordType) : "") + " | " +
+                        ((phraseModel)? getOrientString(phrasePrevOrient, phraseType) + " " + getOrientString(phraseNextOrient, phraseType) : "") + " | " +
+                        ((hierModel)? getOrientString(hierPrevOrient, hierType) + " " + getOrientString(hierNextOrient, hierType) : "");
+
+      addPhrase(sentence, startE, endE, startF, endF, orientationInfo);
+    }
+  }
+}
+
+// Word-based orientation of a phrase pair with respect to its neighbour.
+//
+// connectedLeftTop / connectedRightTop: whether an alignment point sits at the
+//   two diagonal corners next to the phrase (computed by the caller).
+// startF, endF, startE: span limits used for the scan; endE is not read here.
+// countF, zero, unit, ge, lt: scan bounds, step and bound predicates. Callers
+//   in this file pass (countF, 0, 1, &ge, &lt) for the previous orientation
+//   and the mirrored (0, countF, -1, &lt, &ge) for the next one.
+// modelType: REO_MONO stops after the LEFT test and REO_MSD after the RIGHT
+//   test; any other type continues with the discontinuous cases.
+//
+// Returns LEFT, RIGHT, DRIGHT, DLEFT or UNKNOWN.
+REO_POS getOrientWordModel(SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+                           bool connectedLeftTop, bool connectedRightTop,
+                           int startF, int endF, int startE, int endE, int countF, int zero, int unit,
+                           bool (*ge)(int, int), bool (*lt)(int, int) )
+{
+  if (!connectedRightTop && connectedLeftTop) {
+    return LEFT;
+  }
+  if (modelType == REO_MONO) {
+    return UNKNOWN;
+  }
+  if (connectedRightTop && !connectedLeftTop) {
+    return RIGHT;
+  }
+  if (modelType == REO_MSD) {
+    return UNKNOWN;
+  }
+
+  // Both corners agree at this point. Walk outwards along the neighbouring
+  // target position, two steps away from the phrase and further, until an
+  // alignment point is found or the bound is reached.
+  const int neighbourE = startE - unit;
+
+  int scanF = startF - 2*unit;
+  while ((*ge)(scanF, zero) && !connectedLeftTop) {
+    connectedLeftTop = isAligned(sentence, scanF, neighbourE);
+    scanF = scanF - unit;
+  }
+
+  scanF = endF + 2*unit;
+  while ((*lt)(scanF, countF) && !connectedRightTop) {
+    connectedRightTop = isAligned(sentence, scanF, neighbourE);
+    scanF = scanF + unit;
+  }
+
+  // Only an unambiguous hit on exactly one side yields a discontinuous label.
+  if (connectedLeftTop == connectedRightTop) {
+    return UNKNOWN;
+  }
+  return connectedLeftTop ? DRIGHT : DLEFT;
+}
+
+// Phrase-based orientation of a phrase pair with respect to its neighbour.
+//
+// In addition to the corner alignment flags, this consults the corners of
+// other phrases: inBottomRight and inBottomLeft are looked up by target
+// index (startE - unit) and hold the set of source indices recorded for it.
+//
+// connectedLeftTop / connectedRightTop: corner alignment flags from the caller.
+// startF, endF, startE: span limits used for the lookups; endE is not read.
+// countF, zero, unit, ge, lt: scan bounds, step and bound predicates, so the
+//   same code can scan in either direction.
+// modelType: REO_MONO stops after the LEFT test and REO_MSD after the RIGHT
+//   test; any other type continues with the discontinuous cases.
+//
+// Returns LEFT, RIGHT, DRIGHT, DLEFT or UNKNOWN.
+//
+// to be called with countF-1 instead of countF
+REO_POS getOrientPhraseModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+                              bool connectedLeftTop, bool connectedRightTop,
+                              int startF, int endF, int startE, int endE, int countF, int zero, int unit,
+                              bool (*ge)(int, int), bool (*lt)(int, int),
+                              const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft)
+{
+
+  HSentenceVertices::const_iterator it;
+
+  if((connectedLeftTop && !connectedRightTop) ||
+      //(startE == 0 && startF == 0) ||
+      //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
+      ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
+       it->second.find(startF-unit) != it->second.end()))
+    return LEFT;
+  if(modelType == REO_MONO)
+    return UNKNOWN;
+  if((!connectedLeftTop && connectedRightTop) ||
+      ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
+       it->second.find(endF + unit) != it->second.end()))
+    return RIGHT;
+  if(modelType == REO_MSD)
+    return UNKNOWN;
+
+  // Discontinuous cases. The row looked up does not depend on indexF, so it is
+  // fetched once per side. Each iterator is compared against end() of the
+  // container it came from; comparing against another container's end() is
+  // undefined behaviour.
+  const HSentenceVertices::const_iterator rightRow = inBottomRight.find(startE - unit);
+  if(rightRow != inBottomRight.end()) {
+    for(int indexF=startF-2*unit; (*ge)(indexF, zero); indexF=indexF-unit)
+      if(rightRow->second.find(indexF) != rightRow->second.end())
+        return DRIGHT;
+  }
+
+  const HSentenceVertices::const_iterator leftRow = inBottomLeft.find(startE - unit);
+  if(leftRow != inBottomLeft.end()) {
+    for(int indexF=endF+2*unit; (*lt)(indexF, countF); indexF=indexF+unit)
+      if(leftRow->second.find(indexF) != leftRow->second.end())
+        return DLEFT;
+  }
+  return UNKNOWN;
+}
+
+// Hierarchical orientation of a phrase pair with respect to its neighbour.
+//
+// Works like the phrase-based test but consults two pairs of corner
+// containers (inBottom* and outBottom*), and takes the phrase-based result
+// as phraseOrient: a LEFT or RIGHT phraseOrient is accepted directly, and
+// any other non-UNKNOWN value is returned before the discontinuous scan.
+// Each container is looked up by target index (startE - unit) and holds the
+// set of source indices recorded for it.
+//
+// connectedLeftTop / connectedRightTop: corner alignment flags from the caller.
+// startF, endF, startE: span limits used for the lookups; endE is not read.
+// countF, zero, unit, ge, lt: scan bounds, step and bound predicates, so the
+//   same code can scan in either direction.
+// modelType: REO_MONO stops after the LEFT test and REO_MSD after the RIGHT
+//   test; any other type continues with the discontinuous cases.
+//
+// Returns LEFT, RIGHT, DRIGHT, DLEFT or UNKNOWN.
+//
+// to be called with countF-1 instead of countF
+REO_POS getOrientHierModel (SentenceAlignment & sentence, REO_MODEL_TYPE modelType,
+                            bool connectedLeftTop, bool connectedRightTop,
+                            int startF, int endF, int startE, int endE, int countF, int zero, int unit,
+                            bool (*ge)(int, int), bool (*lt)(int, int),
+                            const HSentenceVertices & inBottomRight, const HSentenceVertices & inBottomLeft,
+                            const HSentenceVertices & outBottomRight, const HSentenceVertices & outBottomLeft,
+                            REO_POS phraseOrient)
+{
+
+  HSentenceVertices::const_iterator it;
+
+  if(phraseOrient == LEFT ||
+      (connectedLeftTop && !connectedRightTop) ||
+      // (startE == 0 && startF == 0) ||
+      //(startE == sentence.target.size()-1 && startF == sentence.source.size()-1) ||
+      ((it = inBottomRight.find(startE - unit)) != inBottomRight.end() &&
+       it->second.find(startF-unit) != it->second.end()) ||
+      ((it = outBottomRight.find(startE - unit)) != outBottomRight.end() &&
+       it->second.find(startF-unit) != it->second.end()))
+    return LEFT;
+  if(modelType == REO_MONO)
+    return UNKNOWN;
+  if(phraseOrient == RIGHT ||
+      (!connectedLeftTop && connectedRightTop) ||
+      ((it = inBottomLeft.find(startE - unit)) != inBottomLeft.end() &&
+       it->second.find(endF + unit) != it->second.end()) ||
+      ((it = outBottomLeft.find(startE - unit)) != outBottomLeft.end() &&
+       it->second.find(endF + unit) != it->second.end()))
+    return RIGHT;
+  if(modelType == REO_MSD)
+    return UNKNOWN;
+  if(phraseOrient != UNKNOWN)
+    return phraseOrient;
+
+  // Discontinuous cases. The rows do not depend on indexF, so they are looked
+  // up once. Each iterator is compared against end() of the container it came
+  // from; comparing against another container's end() is undefined behaviour.
+  const int row = startE - unit;
+
+  const HSentenceVertices::const_iterator inRight = inBottomRight.find(row);
+  const HSentenceVertices::const_iterator outRight = outBottomRight.find(row);
+  const bool hasInRight = inRight != inBottomRight.end();
+  const bool hasOutRight = outRight != outBottomRight.end();
+  if(hasInRight || hasOutRight) {
+    for(int indexF=startF-2*unit; (*ge)(indexF, zero); indexF=indexF-unit) {
+      if((hasInRight && inRight->second.find(indexF) != inRight->second.end()) ||
+          (hasOutRight && outRight->second.find(indexF) != outRight->second.end()))
+        return DRIGHT;
+    }
+  }
+
+  const HSentenceVertices::const_iterator inLeft = inBottomLeft.find(row);
+  const HSentenceVertices::const_iterator outLeft = outBottomLeft.find(row);
+  const bool hasInLeft = inLeft != inBottomLeft.end();
+  const bool hasOutLeft = outLeft != outBottomLeft.end();
+  if(hasInLeft || hasOutLeft) {
+    for(int indexF=endF+2*unit; (*lt)(indexF, countF); indexF=indexF+unit) {
+      if((hasInLeft && inLeft->second.find(indexF) != inLeft->second.end()) ||
+          (hasOutLeft && outLeft->second.find(indexF) != outLeft->second.end()))
+        return DLEFT;
+    }
+  }
+  return UNKNOWN;
+}
+
+// Tells whether source position fi and target position ei are linked.
+//
+// Two positions outside the sentence count as aligned: the one just before
+// both sides (-1, -1) and the one just past both sides (source.size(),
+// target.size()). Every other out-of-range position is reported as unaligned.
+// In-range positions are looked up in sentence.alignedToT[ei].
+bool isAligned ( SentenceAlignment &sentence, int fi, int ei )
+{
+  // position just before the sentence on both sides
+  if (fi == -1 && ei == -1) {
+    return true;
+  }
+  if (ei < 0 || fi < 0) {
+    return false;
+  }
+
+  const size_t targetPos = static_cast<size_t>(ei);
+  const size_t sourcePos = static_cast<size_t>(fi);
+
+  // position just past the sentence on both sides
+  if (targetPos == sentence.target.size() && sourcePos == sentence.source.size()) {
+    return true;
+  }
+  if (targetPos >= sentence.target.size() || sourcePos >= sentence.source.size()) {
+    return false;
+  }
+
+  size_t k = 0;
+  while (k < sentence.alignedToT[ei].size()) {
+    if (sentence.alignedToT[ei][k] == fi) {
+      return true;
+    }
+    ++k;
+  }
+  return false;
+}
+
+// Returns true when first is greater than or equal to second.
+bool ge(int first, int second)
+{
+  return !(first < second);
+}
+
+// Returns true when first is less than or equal to second.
+bool le(int first, int second)
+{
+  return !(second < first);
+}
+
+// Returns true when first is strictly less than second.
+bool lt(int first, int second)
+{
+  return second > first;
+}
+
+// Records the vertex (x, y): x is added to the set stored under key y,
+// and that set is created first when y has no entry yet.
+void insertVertex( HSentenceVertices & corners, int x, int y )
+{
+  corners[y].insert(x);
+}
+
+// Records the four corners of the block spanning startF..endF and
+// startE..endE, one corner in each of the four containers.
+void insertPhraseVertices(
+  HSentenceVertices & topLeft,
+  HSentenceVertices & topRight,
+  HSentenceVertices & bottomLeft,
+  HSentenceVertices & bottomRight,
+  int startF, int startE, int endF, int endE)
+{
+  // corners stored under startE
+  insertVertex(topRight, endF, startE);
+  insertVertex(topLeft, startF, startE);
+
+  // corners stored under endE
+  insertVertex(bottomRight, endF, endE);
+  insertVertex(bottomLeft, startF, endE);
+}
+
+// Maps an orientation to the label written to the orientation output.
+//
+// LEFT, RIGHT, DRIGHT and DLEFT have fixed labels. For UNKNOWN the label
+// depends on modelType. Any combination not listed yields an empty string.
+string getOrientString(REO_POS orient, REO_MODEL_TYPE modelType)
+{
+  if (orient == LEFT) {
+    return "mono";
+  }
+  if (orient == RIGHT) {
+    return "swap";
+  }
+  if (orient == DRIGHT) {
+    return "dright";
+  }
+  if (orient == DLEFT) {
+    return "dleft";
+  }
+  if (orient == UNKNOWN) {
+    if (modelType == REO_MONO) {
+      return "nomono";
+    }
+    if (modelType == REO_MSD) {
+      return "other";
+    }
+    // NOTE(review): an unknown orientation under REO_MSLR is reported as
+    // "dright"; presumably an intentional default - confirm.
+    if (modelType == REO_MSLR) {
+      return "dright";
+    }
+  }
+  return "";
+}
+
+// Writes one extracted phrase pair (source span startF..endF, target span
+// startE..endE) to every output that is switched on.
+//
+// onlyOutputSpanInfo: print the four span boundaries to stdout and stop.
+// translationFlag:    extractFile gets "source ||| target ||| alignment" and
+//                     extractFileInv gets "target ||| source ||| alignment",
+//                     with alignment points relative to the span starts.
+// orientationFlag:    extractFileOrientation gets "source ||| target ||| "
+//                     followed by orientationInfo.
+// sentenceIdFlag:     extractFileSentenceId gets "source ||| target ||| "
+//                     followed by sentence.sentenceID.
+void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, int endF , string &orientationInfo)
+{
+  if (onlyOutputSpanInfo) {
+    cout << startF << " " << endF << " " << startE << " " << endE << endl;
+    return;
+  }
+
+  if (translationFlag) {
+    // direct table: source ||| target |||
+    for (int f = startF; f <= endF; ++f)
+      extractFile << sentence.source[f] << " ";
+    extractFile << "||| ";
+    for (int e = startE; e <= endE; ++e)
+      extractFile << sentence.target[e] << " ";
+    extractFile << "|||";
+
+    // inverse table: target ||| source |||
+    for (int e = startE; e <= endE; ++e)
+      extractFileInv << sentence.target[e] << " ";
+    extractFileInv << "||| ";
+    for (int f = startF; f <= endF; ++f)
+      extractFileInv << sentence.source[f] << " ";
+    extractFileInv << "|||";
+
+    // alignment points, mirrored between the two tables
+    for (int e = startE; e <= endE; ++e) {
+      for (size_t k = 0; k < sentence.alignedToT[e].size(); ++k) {
+        int f = sentence.alignedToT[e][k];
+        extractFile << " " << f-startF << "-" << e-startE;
+        extractFileInv << " " << e-startE << "-" << f-startF;
+      }
+    }
+
+    extractFile << "\n";
+    extractFileInv << "\n";
+  }
+
+  if (orientationFlag) {
+    for (int f = startF; f <= endF; ++f)
+      extractFileOrientation << sentence.source[f] << " ";
+    extractFileOrientation << "||| ";
+    for (int e = startE; e <= endE; ++e)
+      extractFileOrientation << sentence.target[e] << " ";
+    extractFileOrientation << "||| ";
+    extractFileOrientation << orientationInfo;
+    extractFileOrientation << "\n";
+  }
+
+  if (sentenceIdFlag) {
+    for (int f = startF; f <= endF; ++f)
+      extractFileSentenceId << sentence.source[f] << " ";
+    extractFileSentenceId << "||| ";
+    for (int e = startE; e <= endE; ++e)
+      extractFileSentenceId << sentence.target[e] << " ";
+    extractFileSentenceId << "||| ";
+    extractFileSentenceId << sentence.sentenceID;
+    extractFileSentenceId << "\n";
+  }
+}
+
+// For proper conditioning we need the number of times a source phrase
+// occurred: every contiguous source span of at most maxPhraseLength words is
+// written to extractFile, and every such target span to extractFileInv, each
+// on its own line terminated by "|||".
+void extractBase( SentenceAlignment &sentence )
+{
+  const int sourceLen = sentence.source.size();
+  for (int from = 0; from < sourceLen; ++from) {
+    int to = from;
+    while (to < sourceLen && to < from+maxPhraseLength) {
+      for (int w = from; w <= to; ++w) {
+        extractFile << sentence.source[w] << " ";
+      }
+      extractFile << "|||" << endl;
+      ++to;
+    }
+  }
+
+  const int targetLen = sentence.target.size();
+  for (int from = 0; from < targetLen; ++from) {
+    int to = from;
+    while (to < targetLen && to < from+maxPhraseLength) {
+      for (int w = from; w <= to; ++w) {
+        extractFileInv << sentence.target[w] << " ";
+      }
+      extractFileInv << "|||" << endl;
+      ++to;
+    }
+  }
+}
diff --git a/phrase-extract/extract.vcxproj b/phrase-extract/extract.vcxproj
new file mode 100644
index 000000000..60a1128eb
--- /dev/null
+++ b/phrase-extract/extract.vcxproj
@@ -0,0 +1,103 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{2475F8E8-A5C9-4785-8B09-5F4E120FC518}</ProjectGuid>
+ <RootNamespace>extract</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <RandomizedBaseAddress>false</RandomizedBaseAddress>
+ <DataExecutionPrevention>
+ </DataExecutionPrevention>
+ <TargetMachine>MachineX86</TargetMachine>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <RandomizedBaseAddress>false</RandomizedBaseAddress>
+ <DataExecutionPrevention>
+ </DataExecutionPrevention>
+ <TargetMachine>MachineX86</TargetMachine>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="extract.cpp" />
+ <ClCompile Include="InputFileStream.cpp" />
+ <ClCompile Include="SentenceAlignment.cpp" />
+ <ClCompile Include="tables-core.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="SentenceAlignment.h" />
+ <ClInclude Include="tables-core.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/phrase-extract/extract.xcodeproj/project.pbxproj b/phrase-extract/extract.xcodeproj/project.pbxproj
new file mode 100644
index 000000000..1e02493cb
--- /dev/null
+++ b/phrase-extract/extract.xcodeproj/project.pbxproj
@@ -0,0 +1,788 @@
+// !$*UTF8*$!
+{
+ archiveVersion = 1;
+ classes = {
+ };
+ objectVersion = 46;
+ objects = {
+
+/* Begin PBXBuildFile section */
+ 1C05B9EC1174CC24003585B2 /* SentenceAlignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E51174CC24003585B2 /* SentenceAlignment.cpp */; };
+ 1C05BA161174CEA4003585B2 /* tables-core.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */; };
+ 1C05BA171174CEA4003585B2 /* tables-core.h in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4C0FC6EAA200924FEA /* tables-core.h */; };
+ 1C05BA191174CEC7003585B2 /* SentenceAlignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E51174CC24003585B2 /* SentenceAlignment.cpp */; };
+ 1C05BA1A1174CEC7003585B2 /* SentenceAlignment.h in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E61174CC24003585B2 /* SentenceAlignment.h */; };
+ 1C05BA1B1174CEC7003585B2 /* SentenceAlignmentWithSyntax.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E71174CC24003585B2 /* SentenceAlignmentWithSyntax.cpp */; };
+ 1C05BA1C1174CEC7003585B2 /* SentenceAlignmentWithSyntax.h in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E81174CC24003585B2 /* SentenceAlignmentWithSyntax.h */; };
+ 1C05BA1D1174CED3003585B2 /* SyntaxTree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9EA1174CC24003585B2 /* SyntaxTree.cpp */; };
+ 1C05BA1E1174CED3003585B2 /* SyntaxTree.h in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9EB1174CC24003585B2 /* SyntaxTree.h */; };
+ 1C05BA211174CEE8003585B2 /* XmlTree.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05BA1F1174CEE8003585B2 /* XmlTree.cpp */; };
+ 1C05BA221174CEF3003585B2 /* XmlTree.h in Sources */ = {isa = PBXBuildFile; fileRef = 1C05BA201174CEE8003585B2 /* XmlTree.h */; };
+ 1C05BA251174CF03003585B2 /* Hole.h in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9DF1174CC06003585B2 /* Hole.h */; };
+ 1C05BA261174CF03003585B2 /* HoleCollection.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E01174CC06003585B2 /* HoleCollection.cpp */; };
+ 1C05BA271174CF03003585B2 /* HoleCollection.h in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E11174CC06003585B2 /* HoleCollection.h */; };
+ 1C05BA281174CF10003585B2 /* extract-rules.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9DB1174CBF9003585B2 /* extract-rules.cpp */; };
+ 1C05BA331174CF88003585B2 /* statistics.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C05B9E91174CC24003585B2 /* statistics.cpp */; };
+ 1C05BA351174CF98003585B2 /* AlignmentPhrase.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C6A83031111F5A300059E7F /* AlignmentPhrase.cpp */; };
+ 1C05BA361174CF98003585B2 /* AlignmentPhrase.h in Sources */ = {isa = PBXBuildFile; fileRef = 1C6A83041111F5A300059E7F /* AlignmentPhrase.h */; };
+ 1C05BA381174CFAD003585B2 /* tables-core.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */; };
+ 1C05BA391174CFAD003585B2 /* tables-core.h in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4C0FC6EAA200924FEA /* tables-core.h */; };
+ 1C475795102B78DD00AB74DB /* score.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C475794102B78DD00AB74DB /* score.cpp */; };
+ 1C4757A8102B797900AB74DB /* tables-core.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */; };
+ 1C4757A9102B797900AB74DB /* tables-core.h in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4C0FC6EAA200924FEA /* tables-core.h */; };
+ 1C6A83051111F5A300059E7F /* AlignmentPhrase.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1C6A83031111F5A300059E7F /* AlignmentPhrase.cpp */; };
+ 1CE8CE2E0FC6EA0200924FEA /* extract.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE2C0FC6EA0200924FEA /* extract.cpp */; };
+ 1CE8CE4D0FC6EAA200924FEA /* tables-core.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */; };
+ 1CFE962711762A2A006FF13B /* consolidate.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CFE962411762A20006FF13B /* consolidate.cpp */; };
+ 1CFE962911762A3A006FF13B /* consolidate-direct.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CFE962311762A20006FF13B /* consolidate-direct.cpp */; };
+ 1E2C902D141FDED400EA06A6 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
+ 1E2C902E141FDF6D00EA06A6 /* tables-core.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */; };
+ 1E66AB3D155D8068009ABD7C /* OutputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E671A7E155C22C500119DD9 /* OutputFileStream.cpp */; };
+ 1E66AB3E155D8068009ABD7C /* OutputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1E671A7F155C22C500119DD9 /* OutputFileStream.h */; };
+ 1E66AB40155D8233009ABD7C /* OutputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E671A7E155C22C500119DD9 /* OutputFileStream.cpp */; };
+ 1E66AB41155D8233009ABD7C /* OutputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1E671A7F155C22C500119DD9 /* OutputFileStream.h */; };
+ 1E671A80155C22C500119DD9 /* OutputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E671A7E155C22C500119DD9 /* OutputFileStream.cpp */; };
+ 1E671A82155C234500119DD9 /* OutputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1E671A7E155C22C500119DD9 /* OutputFileStream.cpp */; };
+ 1E671A83155C234500119DD9 /* OutputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1E671A7F155C22C500119DD9 /* OutputFileStream.h */; };
+ 1EB1C8321200D5C00079FCBB /* PhraseAlignment.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB1C8311200D5C00079FCBB /* PhraseAlignment.cpp */; };
+ 1EB29A3B1511C253005BC4BA /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
+ 1EB29A3C1511C253005BC4BA /* InputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A210129C024C00041956 /* InputFileStream.h */; };
+ 1EB29A3E1511C2D9005BC4BA /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
+ 1EB29A3F1511C2D9005BC4BA /* InputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A210129C024C00041956 /* InputFileStream.h */; };
+ 1EB8A212129C024C00041956 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
+ 1EB8A261129C04C700041956 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
+ 1EB8A297129C06A300041956 /* gzfilebuf.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A20C129C022000041956 /* gzfilebuf.h */; };
+ 1EB8A29A129C06B300041956 /* gzfilebuf.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A20C129C022000041956 /* gzfilebuf.h */; };
+ 1EB8A29B129C06B300041956 /* InputFileStream.h in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A210129C024C00041956 /* InputFileStream.h */; };
+ 1EB8A29C129C06B300041956 /* InputFileStream.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1EB8A211129C024C00041956 /* InputFileStream.cpp */; };
+ 1ED896C3141FFDF500F43F46 /* ExtractedRule.cpp in Sources */ = {isa = PBXBuildFile; fileRef = 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */; };
+/* End PBXBuildFile section */
+
+/* Begin PBXCopyFilesBuildPhase section */
+ 8DD76F690486A84900D96B5E /* CopyFiles */ = {
+ isa = PBXCopyFilesBuildPhase;
+ buildActionMask = 8;
+ dstPath = /usr/share/man/man1/;
+ dstSubfolderSpec = 0;
+ files = (
+ );
+ runOnlyForDeploymentPostprocessing = 1;
+ };
+/* End PBXCopyFilesBuildPhase section */
+
+/* Begin PBXFileReference section */
+ 1C05B9DB1174CBF9003585B2 /* extract-rules.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = "extract-rules.cpp"; sourceTree = "<group>"; };
+ 1C05B9DD1174CC06003585B2 /* ExtractedRule.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = ExtractedRule.h; sourceTree = "<group>"; };
+ 1C05B9DE1174CC06003585B2 /* hierarchical.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = hierarchical.h; sourceTree = "<group>"; };
+ 1C05B9DF1174CC06003585B2 /* Hole.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = Hole.h; sourceTree = "<group>"; };
+ 1C05B9E01174CC06003585B2 /* HoleCollection.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = HoleCollection.cpp; sourceTree = "<group>"; };
+ 1C05B9E11174CC06003585B2 /* HoleCollection.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = HoleCollection.h; sourceTree = "<group>"; };
+ 1C05B9E31174CC24003585B2 /* RuleExist.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RuleExist.h; sourceTree = "<group>"; };
+ 1C05B9E41174CC24003585B2 /* RuleExtractionOptions.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = RuleExtractionOptions.h; sourceTree = "<group>"; };
+ 1C05B9E51174CC24003585B2 /* SentenceAlignment.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SentenceAlignment.cpp; sourceTree = "<group>"; };
+ 1C05B9E61174CC24003585B2 /* SentenceAlignment.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = SentenceAlignment.h; sourceTree = "<group>"; };
+ 1C05B9E71174CC24003585B2 /* SentenceAlignmentWithSyntax.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SentenceAlignmentWithSyntax.cpp; sourceTree = "<group>"; };
+ 1C05B9E81174CC24003585B2 /* SentenceAlignmentWithSyntax.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = SentenceAlignmentWithSyntax.h; sourceTree = "<group>"; };
+ 1C05B9E91174CC24003585B2 /* statistics.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = statistics.cpp; sourceTree = "<group>"; };
+ 1C05B9EA1174CC24003585B2 /* SyntaxTree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = SyntaxTree.cpp; sourceTree = "<group>"; };
+ 1C05B9EB1174CC24003585B2 /* SyntaxTree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = SyntaxTree.h; sourceTree = "<group>"; };
+ 1C05BA1F1174CEE8003585B2 /* XmlTree.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = XmlTree.cpp; sourceTree = "<group>"; };
+ 1C05BA201174CEE8003585B2 /* XmlTree.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = XmlTree.h; sourceTree = "<group>"; };
+ 1C475794102B78DD00AB74DB /* score.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = score.cpp; sourceTree = "<group>"; };
+ 1C6A83031111F5A300059E7F /* AlignmentPhrase.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = AlignmentPhrase.cpp; sourceTree = "<group>"; };
+ 1C6A83041111F5A300059E7F /* AlignmentPhrase.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = AlignmentPhrase.h; sourceTree = "<group>"; };
+ 1CE8CE2C0FC6EA0200924FEA /* extract.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = extract.cpp; sourceTree = "<group>"; };
+ 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = "tables-core.cpp"; sourceTree = "<group>"; };
+ 1CE8CE4C0FC6EAA200924FEA /* tables-core.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "tables-core.h"; sourceTree = "<group>"; };
+ 1CFE962311762A20006FF13B /* consolidate-direct.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = "consolidate-direct.cpp"; sourceTree = "<group>"; };
+ 1CFE962411762A20006FF13B /* consolidate.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = consolidate.cpp; sourceTree = "<group>"; };
+ 1E3EF29E13DBEAF300C1D54A /* extract-lex.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = "extract-lex.cpp"; sourceTree = "<group>"; };
+ 1E3EF29F13DBEAF300C1D54A /* extract-lex.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = "extract-lex.h"; sourceTree = "<group>"; };
+ 1E671A7E155C22C500119DD9 /* OutputFileStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = OutputFileStream.cpp; sourceTree = "<group>"; };
+ 1E671A7F155C22C500119DD9 /* OutputFileStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = OutputFileStream.h; sourceTree = "<group>"; };
+ 1E7C2CF611F1146200213451 /* extract */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = extract; sourceTree = BUILT_PRODUCTS_DIR; };
+ 1E7C2CF811F1146200213451 /* score */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = score; sourceTree = BUILT_PRODUCTS_DIR; };
+ 1E7C2CFA11F1146300213451 /* consolidate */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = consolidate; sourceTree = BUILT_PRODUCTS_DIR; };
+ 1E7C2CFC11F1146300213451 /* consolidate-direct */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "consolidate-direct"; sourceTree = BUILT_PRODUCTS_DIR; };
+ 1E7C2CFE11F1146300213451 /* extract-rules */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = "extract-rules"; sourceTree = BUILT_PRODUCTS_DIR; };
+ 1E7C2D0011F1146300213451 /* statistics */ = {isa = PBXFileReference; explicitFileType = "compiled.mach-o.executable"; includeInIndex = 0; path = statistics; sourceTree = BUILT_PRODUCTS_DIR; };
+ 1EB1C8301200D5C00079FCBB /* PhraseAlignment.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = PhraseAlignment.h; sourceTree = "<group>"; };
+ 1EB1C8311200D5C00079FCBB /* PhraseAlignment.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = PhraseAlignment.cpp; sourceTree = "<group>"; };
+ 1EB1C8491200D77E0079FCBB /* score.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = score.h; sourceTree = "<group>"; };
+ 1EB8A20C129C022000041956 /* gzfilebuf.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = gzfilebuf.h; sourceTree = "<group>"; };
+ 1EB8A210129C024C00041956 /* InputFileStream.h */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.c.h; path = InputFileStream.h; sourceTree = "<group>"; };
+ 1EB8A211129C024C00041956 /* InputFileStream.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = InputFileStream.cpp; sourceTree = "<group>"; };
+ 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.cpp; path = ExtractedRule.cpp; sourceTree = "<group>"; };
+/* End PBXFileReference section */
+
+/* Begin PBXFrameworksBuildPhase section */
+ 1C05B9F51174CE51003585B2 /* Frameworks */ = {
+ isa = PBXFrameworksBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1C05BA2C1174CF6C003585B2 /* Frameworks */ = {
+ isa = PBXFrameworksBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1C47578D102B78AD00AB74DB /* Frameworks */ = {
+ isa = PBXFrameworksBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1C4757C2102B7EAA00AB74DB /* Frameworks */ = {
+ isa = PBXFrameworksBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1CF9F718108C9FE700EABCE5 /* Frameworks */ = {
+ isa = PBXFrameworksBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 8DD76F660486A84900D96B5E /* Frameworks */ = {
+ isa = PBXFrameworksBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+/* End PBXFrameworksBuildPhase section */
+
+/* Begin PBXGroup section */
+ 08FB7794FE84155DC02AAC07 /* extract */ = {
+ isa = PBXGroup;
+ children = (
+ 08FB7795FE84155DC02AAC07 /* Source */,
+ C6859E8C029090F304C91782 /* Documentation */,
+ 1AB674ADFE9D54B511CA2CBB /* Products */,
+ );
+ name = extract;
+ sourceTree = "<group>";
+ };
+ 08FB7795FE84155DC02AAC07 /* Source */ = {
+ isa = PBXGroup;
+ children = (
+ 1E671A7E155C22C500119DD9 /* OutputFileStream.cpp */,
+ 1E671A7F155C22C500119DD9 /* OutputFileStream.h */,
+ 1E3EF29E13DBEAF300C1D54A /* extract-lex.cpp */,
+ 1E3EF29F13DBEAF300C1D54A /* extract-lex.h */,
+ 1C6A83031111F5A300059E7F /* AlignmentPhrase.cpp */,
+ 1C6A83041111F5A300059E7F /* AlignmentPhrase.h */,
+ 1CFE962311762A20006FF13B /* consolidate-direct.cpp */,
+ 1CFE962411762A20006FF13B /* consolidate.cpp */,
+ 1C05B9DB1174CBF9003585B2 /* extract-rules.cpp */,
+ 1CE8CE2C0FC6EA0200924FEA /* extract.cpp */,
+ 1C05B9DD1174CC06003585B2 /* ExtractedRule.h */,
+ 1ED896C2141FFDF500F43F46 /* ExtractedRule.cpp */,
+ 1C05B9DE1174CC06003585B2 /* hierarchical.h */,
+ 1C05B9DF1174CC06003585B2 /* Hole.h */,
+ 1C05B9E01174CC06003585B2 /* HoleCollection.cpp */,
+ 1C05B9E11174CC06003585B2 /* HoleCollection.h */,
+ 1EB1C8311200D5C00079FCBB /* PhraseAlignment.cpp */,
+ 1EB1C8301200D5C00079FCBB /* PhraseAlignment.h */,
+ 1C05B9E31174CC24003585B2 /* RuleExist.h */,
+ 1C05B9E41174CC24003585B2 /* RuleExtractionOptions.h */,
+ 1C475794102B78DD00AB74DB /* score.cpp */,
+ 1EB1C8491200D77E0079FCBB /* score.h */,
+ 1C05B9E51174CC24003585B2 /* SentenceAlignment.cpp */,
+ 1C05B9E61174CC24003585B2 /* SentenceAlignment.h */,
+ 1C05B9E71174CC24003585B2 /* SentenceAlignmentWithSyntax.cpp */,
+ 1C05B9E81174CC24003585B2 /* SentenceAlignmentWithSyntax.h */,
+ 1C05B9E91174CC24003585B2 /* statistics.cpp */,
+ 1C05B9EA1174CC24003585B2 /* SyntaxTree.cpp */,
+ 1C05B9EB1174CC24003585B2 /* SyntaxTree.h */,
+ 1CE8CE4B0FC6EAA200924FEA /* tables-core.cpp */,
+ 1CE8CE4C0FC6EAA200924FEA /* tables-core.h */,
+ 1C05BA1F1174CEE8003585B2 /* XmlTree.cpp */,
+ 1C05BA201174CEE8003585B2 /* XmlTree.h */,
+ 1EB8A20C129C022000041956 /* gzfilebuf.h */,
+ 1EB8A211129C024C00041956 /* InputFileStream.cpp */,
+ 1EB8A210129C024C00041956 /* InputFileStream.h */,
+ );
+ name = Source;
+ sourceTree = "<group>";
+ };
+ 1AB674ADFE9D54B511CA2CBB /* Products */ = {
+ isa = PBXGroup;
+ children = (
+ 1E7C2CF611F1146200213451 /* extract */,
+ 1E7C2CF811F1146200213451 /* score */,
+ 1E7C2CFA11F1146300213451 /* consolidate */,
+ 1E7C2CFC11F1146300213451 /* consolidate-direct */,
+ 1E7C2CFE11F1146300213451 /* extract-rules */,
+ 1E7C2D0011F1146300213451 /* statistics */,
+ );
+ name = Products;
+ sourceTree = "<group>";
+ };
+ C6859E8C029090F304C91782 /* Documentation */ = {
+ isa = PBXGroup;
+ children = (
+ );
+ name = Documentation;
+ sourceTree = "<group>";
+ };
+/* End PBXGroup section */
+
+/* Begin PBXNativeTarget section */
+ 1C05B9F61174CE51003585B2 /* extract-rules */ = {
+ isa = PBXNativeTarget;
+ buildConfigurationList = 1C05BA151174CE58003585B2 /* Build configuration list for PBXNativeTarget "extract-rules" */;
+ buildPhases = (
+ 1C05B9F41174CE51003585B2 /* Sources */,
+ 1C05B9F51174CE51003585B2 /* Frameworks */,
+ );
+ buildRules = (
+ );
+ dependencies = (
+ );
+ name = "extract-rules";
+ productName = "extract-rules";
+ productReference = 1E7C2CFE11F1146300213451 /* extract-rules */;
+ productType = "com.apple.product-type.tool";
+ };
+ 1C05BA2D1174CF6C003585B2 /* statistics */ = {
+ isa = PBXNativeTarget;
+ buildConfigurationList = 1C05BA561174CFB7003585B2 /* Build configuration list for PBXNativeTarget "statistics" */;
+ buildPhases = (
+ 1C05BA2B1174CF6C003585B2 /* Sources */,
+ 1C05BA2C1174CF6C003585B2 /* Frameworks */,
+ );
+ buildRules = (
+ );
+ dependencies = (
+ );
+ name = statistics;
+ productName = statistics;
+ productReference = 1E7C2D0011F1146300213451 /* statistics */;
+ productType = "com.apple.product-type.tool";
+ };
+ 1C47578E102B78AD00AB74DB /* score */ = {
+ isa = PBXNativeTarget;
+ buildConfigurationList = 1C475796102B78DD00AB74DB /* Build configuration list for PBXNativeTarget "score" */;
+ buildPhases = (
+ 1C47578C102B78AD00AB74DB /* Sources */,
+ 1C47578D102B78AD00AB74DB /* Frameworks */,
+ );
+ buildRules = (
+ );
+ dependencies = (
+ );
+ name = score;
+ productName = score;
+ productReference = 1E7C2CF811F1146200213451 /* score */;
+ productType = "com.apple.product-type.tool";
+ };
+ 1C4757C3102B7EAA00AB74DB /* consolidate */ = {
+ isa = PBXNativeTarget;
+ buildConfigurationList = 1C4757CB102B7EC600AB74DB /* Build configuration list for PBXNativeTarget "consolidate" */;
+ buildPhases = (
+ 1C4757C1102B7EAA00AB74DB /* Sources */,
+ 1C4757C2102B7EAA00AB74DB /* Frameworks */,
+ );
+ buildRules = (
+ );
+ dependencies = (
+ );
+ name = consolidate;
+ productName = consolidate;
+ productReference = 1E7C2CFA11F1146300213451 /* consolidate */;
+ productType = "com.apple.product-type.tool";
+ };
+ 1CF9F719108C9FE700EABCE5 /* consolidate-direct */ = {
+ isa = PBXNativeTarget;
+ buildConfigurationList = 1CF9F729108CA01F00EABCE5 /* Build configuration list for PBXNativeTarget "consolidate-direct" */;
+ buildPhases = (
+ 1CF9F717108C9FE700EABCE5 /* Sources */,
+ 1CF9F718108C9FE700EABCE5 /* Frameworks */,
+ );
+ buildRules = (
+ );
+ dependencies = (
+ );
+ name = "consolidate-direct";
+ productName = "consolidate-direct";
+ productReference = 1E7C2CFC11F1146300213451 /* consolidate-direct */;
+ productType = "com.apple.product-type.tool";
+ };
+ 8DD76F620486A84900D96B5E /* extract */ = {
+ isa = PBXNativeTarget;
+ buildConfigurationList = 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "extract" */;
+ buildPhases = (
+ 8DD76F640486A84900D96B5E /* Sources */,
+ 8DD76F660486A84900D96B5E /* Frameworks */,
+ 8DD76F690486A84900D96B5E /* CopyFiles */,
+ );
+ buildRules = (
+ );
+ dependencies = (
+ );
+ name = extract;
+ productInstallPath = "$(HOME)/bin";
+ productName = extract;
+ productReference = 1E7C2CF611F1146200213451 /* extract */;
+ productType = "com.apple.product-type.tool";
+ };
+/* End PBXNativeTarget section */
+
+/* Begin PBXProject section */
+ 08FB7793FE84155DC02AAC07 /* Project object */ = {
+ isa = PBXProject;
+ attributes = {
+ LastUpgradeCheck = 0410;
+ };
+ buildConfigurationList = 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "extract" */;
+ compatibilityVersion = "Xcode 3.2";
+ developmentRegion = English;
+ hasScannedForEncodings = 1;
+ knownRegions = (
+ English,
+ Japanese,
+ French,
+ German,
+ );
+ mainGroup = 08FB7794FE84155DC02AAC07 /* extract */;
+ projectDirPath = "";
+ projectRoot = "";
+ targets = (
+ 8DD76F620486A84900D96B5E /* extract */,
+ 1C47578E102B78AD00AB74DB /* score */,
+ 1C4757C3102B7EAA00AB74DB /* consolidate */,
+ 1CF9F719108C9FE700EABCE5 /* consolidate-direct */,
+ 1C05B9F61174CE51003585B2 /* extract-rules */,
+ 1C05BA2D1174CF6C003585B2 /* statistics */,
+ );
+ };
+/* End PBXProject section */
+
+/* Begin PBXSourcesBuildPhase section */
+ 1C05B9F41174CE51003585B2 /* Sources */ = {
+ isa = PBXSourcesBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ 1E66AB3D155D8068009ABD7C /* OutputFileStream.cpp in Sources */,
+ 1E66AB3E155D8068009ABD7C /* OutputFileStream.h in Sources */,
+ 1EB29A3B1511C253005BC4BA /* InputFileStream.cpp in Sources */,
+ 1EB29A3C1511C253005BC4BA /* InputFileStream.h in Sources */,
+ 1C05BA281174CF10003585B2 /* extract-rules.cpp in Sources */,
+ 1C05BA251174CF03003585B2 /* Hole.h in Sources */,
+ 1C05BA261174CF03003585B2 /* HoleCollection.cpp in Sources */,
+ 1C05BA271174CF03003585B2 /* HoleCollection.h in Sources */,
+ 1C05BA211174CEE8003585B2 /* XmlTree.cpp in Sources */,
+ 1C05BA221174CEF3003585B2 /* XmlTree.h in Sources */,
+ 1C05BA1D1174CED3003585B2 /* SyntaxTree.cpp in Sources */,
+ 1C05BA1E1174CED3003585B2 /* SyntaxTree.h in Sources */,
+ 1C05BA191174CEC7003585B2 /* SentenceAlignment.cpp in Sources */,
+ 1C05BA1A1174CEC7003585B2 /* SentenceAlignment.h in Sources */,
+ 1C05BA1B1174CEC7003585B2 /* SentenceAlignmentWithSyntax.cpp in Sources */,
+ 1C05BA1C1174CEC7003585B2 /* SentenceAlignmentWithSyntax.h in Sources */,
+ 1C05BA161174CEA4003585B2 /* tables-core.cpp in Sources */,
+ 1C05BA171174CEA4003585B2 /* tables-core.h in Sources */,
+ 1ED896C3141FFDF500F43F46 /* ExtractedRule.cpp in Sources */,
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1C05BA2B1174CF6C003585B2 /* Sources */ = {
+ isa = PBXSourcesBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ 1EB29A3E1511C2D9005BC4BA /* InputFileStream.cpp in Sources */,
+ 1EB29A3F1511C2D9005BC4BA /* InputFileStream.h in Sources */,
+ 1C05BA381174CFAD003585B2 /* tables-core.cpp in Sources */,
+ 1C05BA391174CFAD003585B2 /* tables-core.h in Sources */,
+ 1C05BA351174CF98003585B2 /* AlignmentPhrase.cpp in Sources */,
+ 1C05BA361174CF98003585B2 /* AlignmentPhrase.h in Sources */,
+ 1C05BA331174CF88003585B2 /* statistics.cpp in Sources */,
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1C47578C102B78AD00AB74DB /* Sources */ = {
+ isa = PBXSourcesBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ 1E671A82155C234500119DD9 /* OutputFileStream.cpp in Sources */,
+ 1E671A83155C234500119DD9 /* OutputFileStream.h in Sources */,
+ 1EB8A29A129C06B300041956 /* gzfilebuf.h in Sources */,
+ 1EB8A29B129C06B300041956 /* InputFileStream.h in Sources */,
+ 1EB8A29C129C06B300041956 /* InputFileStream.cpp in Sources */,
+ 1C4757A8102B797900AB74DB /* tables-core.cpp in Sources */,
+ 1C4757A9102B797900AB74DB /* tables-core.h in Sources */,
+ 1C475795102B78DD00AB74DB /* score.cpp in Sources */,
+ 1C6A83051111F5A300059E7F /* AlignmentPhrase.cpp in Sources */,
+ 1EB1C8321200D5C00079FCBB /* PhraseAlignment.cpp in Sources */,
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1C4757C1102B7EAA00AB74DB /* Sources */ = {
+ isa = PBXSourcesBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ 1E66AB40155D8233009ABD7C /* OutputFileStream.cpp in Sources */,
+ 1E66AB41155D8233009ABD7C /* OutputFileStream.h in Sources */,
+ 1E2C902E141FDF6D00EA06A6 /* tables-core.cpp in Sources */,
+ 1E2C902D141FDED400EA06A6 /* InputFileStream.cpp in Sources */,
+ 1CFE962711762A2A006FF13B /* consolidate.cpp in Sources */,
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 1CF9F717108C9FE700EABCE5 /* Sources */ = {
+ isa = PBXSourcesBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ 1EB8A297129C06A300041956 /* gzfilebuf.h in Sources */,
+ 1EB8A261129C04C700041956 /* InputFileStream.cpp in Sources */,
+ 1CFE962911762A3A006FF13B /* consolidate-direct.cpp in Sources */,
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+ 8DD76F640486A84900D96B5E /* Sources */ = {
+ isa = PBXSourcesBuildPhase;
+ buildActionMask = 2147483647;
+ files = (
+ 1CE8CE2E0FC6EA0200924FEA /* extract.cpp in Sources */,
+ 1CE8CE4D0FC6EAA200924FEA /* tables-core.cpp in Sources */,
+ 1C05B9EC1174CC24003585B2 /* SentenceAlignment.cpp in Sources */,
+ 1EB8A212129C024C00041956 /* InputFileStream.cpp in Sources */,
+ 1E671A80155C22C500119DD9 /* OutputFileStream.cpp in Sources */,
+ );
+ runOnlyForDeploymentPostprocessing = 0;
+ };
+/* End PBXSourcesBuildPhase section */
+
+/* Begin XCBuildConfiguration section */
+ 1C05B9F91174CE52003585B2 /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = NO;
+ GCC_DYNAMIC_NO_PIC = NO;
+ GCC_MODEL_TUNING = G5;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ INSTALL_PATH = /usr/local/bin;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = "extract-rules";
+ SDKROOT = macosx;
+ };
+ name = Debug;
+ };
+ 1C05B9FA1174CE52003585B2 /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = YES;
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+ GCC_MODEL_TUNING = G5;
+ INSTALL_PATH = /usr/local/bin;
+ ONLY_ACTIVE_ARCH = YES;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = "extract-rules";
+ SDKROOT = macosx;
+ ZERO_LINK = NO;
+ };
+ name = Release;
+ };
+ 1C05BA301174CF6C003585B2 /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = NO;
+ GCC_DYNAMIC_NO_PIC = NO;
+ GCC_MODEL_TUNING = G5;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ INSTALL_PATH = /usr/local/bin;
+ OTHER_LDFLAGS = "-lz";
+ PRODUCT_NAME = statistics;
+ SDKROOT = macosx;
+ };
+ name = Debug;
+ };
+ 1C05BA311174CF6C003585B2 /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = YES;
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+ GCC_MODEL_TUNING = G5;
+ INSTALL_PATH = /usr/local/bin;
+ ONLY_ACTIVE_ARCH = YES;
+ OTHER_LDFLAGS = "-lz";
+ PRODUCT_NAME = statistics;
+ SDKROOT = macosx;
+ ZERO_LINK = NO;
+ };
+ name = Release;
+ };
+ 1C475791102B78AE00AB74DB /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = NO;
+ GCC_DYNAMIC_NO_PIC = NO;
+ GCC_MODEL_TUNING = G5;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ INSTALL_PATH = /usr/local/bin;
+ ONLY_ACTIVE_ARCH = YES;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = score;
+ SDKROOT = macosx;
+ };
+ name = Debug;
+ };
+ 1C475792102B78AE00AB74DB /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = YES;
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+ GCC_MODEL_TUNING = G5;
+ INSTALL_PATH = /usr/local/bin;
+ ONLY_ACTIVE_ARCH = YES;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = score;
+ SDKROOT = macosx;
+ ZERO_LINK = NO;
+ };
+ name = Release;
+ };
+ 1C4757C6102B7EAB00AB74DB /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = NO;
+ GCC_DYNAMIC_NO_PIC = NO;
+ GCC_MODEL_TUNING = G5;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ INSTALL_PATH = /usr/local/bin;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = consolidate;
+ SDKROOT = macosx;
+ };
+ name = Debug;
+ };
+ 1C4757C7102B7EAB00AB74DB /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = YES;
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+ GCC_MODEL_TUNING = G5;
+ INSTALL_PATH = /usr/local/bin;
+ ONLY_ACTIVE_ARCH = YES;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = consolidate;
+ SDKROOT = macosx;
+ ZERO_LINK = NO;
+ };
+ name = Release;
+ };
+ 1CF9F71C108C9FE800EABCE5 /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = NO;
+ GCC_DYNAMIC_NO_PIC = NO;
+ GCC_MODEL_TUNING = G5;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ INSTALL_PATH = /usr/local/bin;
+ OTHER_LDFLAGS = "-lz";
+ PRODUCT_NAME = "consolidate-direct";
+ SDKROOT = macosx;
+ };
+ name = Debug;
+ };
+ 1CF9F71D108C9FE800EABCE5 /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = YES;
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+ GCC_MODEL_TUNING = G5;
+ INSTALL_PATH = /usr/local/bin;
+ ONLY_ACTIVE_ARCH = YES;
+ OTHER_LDFLAGS = "-lz";
+ PRODUCT_NAME = "consolidate-direct";
+ SDKROOT = macosx;
+ ZERO_LINK = NO;
+ };
+ name = Release;
+ };
+ 1DEB923208733DC60010E9CD /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ COPY_PHASE_STRIP = NO;
+ GCC_DYNAMIC_NO_PIC = NO;
+ GCC_MODEL_TUNING = G5;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ GCC_PREPROCESSOR_DEFINITIONS = (
+ "_GLIBCXX_DEBUG=1",
+ "_GLIBCXX_DEBUG_PEDANTIC=1",
+ );
+ INSTALL_PATH = /usr/local/bin;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = extract;
+ SDKROOT = macosx;
+ };
+ name = Debug;
+ };
+ 1DEB923308733DC60010E9CD /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ALWAYS_SEARCH_USER_PATHS = NO;
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ DEBUG_INFORMATION_FORMAT = "dwarf-with-dsym";
+ GCC_MODEL_TUNING = G5;
+ INSTALL_PATH = /usr/local/bin;
+ ONLY_ACTIVE_ARCH = YES;
+ OTHER_LDFLAGS = (
+ "-lz",
+ "-lboost_iostreams",
+ );
+ PRODUCT_NAME = extract;
+ SDKROOT = macosx;
+ };
+ name = Release;
+ };
+ 1DEB923608733DC60010E9CD /* Debug */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ GCC_C_LANGUAGE_STANDARD = c99;
+ GCC_OPTIMIZATION_LEVEL = 0;
+ GCC_WARN_ABOUT_RETURN_TYPE = YES;
+ GCC_WARN_UNUSED_VARIABLE = YES;
+ HEADER_SEARCH_PATHS = /opt/local/include;
+ LIBRARY_SEARCH_PATHS = /opt/local/lib;
+ ONLY_ACTIVE_ARCH = YES;
+ SDKROOT = macosx;
+ };
+ name = Debug;
+ };
+ 1DEB923708733DC60010E9CD /* Release */ = {
+ isa = XCBuildConfiguration;
+ buildSettings = {
+ ARCHS = "$(ARCHS_STANDARD_32_64_BIT)";
+ GCC_C_LANGUAGE_STANDARD = c99;
+ GCC_WARN_ABOUT_RETURN_TYPE = YES;
+ GCC_WARN_UNUSED_VARIABLE = YES;
+ HEADER_SEARCH_PATHS = /opt/local/include;
+ LIBRARY_SEARCH_PATHS = /opt/local/lib;
+ ONLY_ACTIVE_ARCH = YES;
+ SDKROOT = macosx;
+ };
+ name = Release;
+ };
+/* End XCBuildConfiguration section */
+
+/* Begin XCConfigurationList section */
+ 1C05BA151174CE58003585B2 /* Build configuration list for PBXNativeTarget "extract-rules" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 1C05B9F91174CE52003585B2 /* Debug */,
+ 1C05B9FA1174CE52003585B2 /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+ 1C05BA561174CFB7003585B2 /* Build configuration list for PBXNativeTarget "statistics" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 1C05BA301174CF6C003585B2 /* Debug */,
+ 1C05BA311174CF6C003585B2 /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+ 1C475796102B78DD00AB74DB /* Build configuration list for PBXNativeTarget "score" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 1C475791102B78AE00AB74DB /* Debug */,
+ 1C475792102B78AE00AB74DB /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+ 1C4757CB102B7EC600AB74DB /* Build configuration list for PBXNativeTarget "consolidate" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 1C4757C6102B7EAB00AB74DB /* Debug */,
+ 1C4757C7102B7EAB00AB74DB /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+ 1CF9F729108CA01F00EABCE5 /* Build configuration list for PBXNativeTarget "consolidate-direct" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 1CF9F71C108C9FE800EABCE5 /* Debug */,
+ 1CF9F71D108C9FE800EABCE5 /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+ 1DEB923108733DC60010E9CD /* Build configuration list for PBXNativeTarget "extract" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 1DEB923208733DC60010E9CD /* Debug */,
+ 1DEB923308733DC60010E9CD /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+ 1DEB923508733DC60010E9CD /* Build configuration list for PBXProject "extract" */ = {
+ isa = XCConfigurationList;
+ buildConfigurations = (
+ 1DEB923608733DC60010E9CD /* Debug */,
+ 1DEB923708733DC60010E9CD /* Release */,
+ );
+ defaultConfigurationIsVisible = 0;
+ defaultConfigurationName = Release;
+ };
+/* End XCConfigurationList section */
+ };
+ rootObject = 08FB7793FE84155DC02AAC07 /* Project object */;
+}
diff --git a/phrase-extract/gzfilebuf.h b/phrase-extract/gzfilebuf.h
new file mode 100644
index 000000000..b5b0ce87f
--- /dev/null
+++ b/phrase-extract/gzfilebuf.h
@@ -0,0 +1,85 @@
+#ifndef moses_gzfile_buf_h
+#define moses_gzfile_buf_h
+
+#include <streambuf>
+#include <zlib.h>
+#include <cstring>
+
+// Read-only std::streambuf that decompresses a gzip file through zlib.
+//
+// The first sizeof(int) bytes of _buff are reserved as a putback area; the
+// remainder holds the most recent gzread() result.  Writing and seeking are
+// not supported and simply report failure to the owning stream.
+class gzfilebuf : public std::streambuf
+{
+public:
+  // Opens filename for reading ("rb").  The handle returned by gzopen() is
+  // stored unchecked; when it is NULL every read reports end-of-file.
+  gzfilebuf(const char *filename) {
+    _gzf = gzopen(filename, "rb");
+    setg (_buff+sizeof(int),     // beginning of putback area
+          _buff+sizeof(int),     // read position
+          _buff+sizeof(int));    // end position
+  }
+  ~gzfilebuf() {
+    // Only close a handle that was actually opened.
+    if (_gzf) {
+      gzclose(_gzf);
+    }
+  }
+protected:
+  // Writing is unsupported: signal failure instead of executing a bare
+  // `throw;`, which (with no exception in flight) calls std::terminate.
+  virtual int_type overflow (int_type /*c*/) {
+    return traits_type::eof();
+  }
+
+  // write multiple characters -- unsupported, so nothing is ever consumed
+  virtual
+  std::streamsize xsputn (const char* /*s*/,
+                          std::streamsize /*num*/) {
+    return 0;
+  }
+
+  // Seeking is unsupported: return the invalid stream position.
+  virtual std::streampos seekpos ( std::streampos /*sp*/, std::ios_base::openmode /*which*/ = std::ios_base::in | std::ios_base::out ) {
+    return std::streampos(std::streamoff(-1));
+  }
+
+  //read one character
+  virtual int_type underflow () {
+    // is read position before end of _buff?
+    if (gptr() < egptr()) {
+      return traits_type::to_int_type(*gptr());
+    }
+
+    // nothing can be read from a file that failed to open
+    if (!_gzf) {
+      return traits_type::eof();
+    }
+
+    /* process size of putback area
+     * - use number of characters read
+     * - but at most sizeof(int)
+     */
+    unsigned int numPutback = gptr() - eback();
+    if (numPutback > sizeof(int)) {
+      numPutback = sizeof(int);
+    }
+
+    /* copy the characters previously read into the putback area
+     * (the first sizeof(int) bytes of _buff)
+     */
+    std::memmove (_buff+(sizeof(int)-numPutback), gptr()-numPutback,
+                  numPutback);
+
+    // read new characters
+    int num = gzread(_gzf, _buff+sizeof(int), _buffsize-sizeof(int));
+    if (num <= 0) {
+      // ERROR or EOF
+      return traits_type::eof();
+    }
+
+    // reset _buff pointers
+    setg (_buff+(sizeof(int)-numPutback),   // beginning of putback area
+          _buff+sizeof(int),                // read position
+          _buff+sizeof(int)+num);           // end of buffer
+
+    // return next character
+    return traits_type::to_int_type(*gptr());
+  }
+
+  // Reads up to num characters into s and returns how many were stored
+  // (never negative).  Data is always taken through the get area so that
+  // bytes already buffered by underflow() are delivered first and in order;
+  // reading straight from zlib here would skip over them.
+  std::streamsize xsgetn (char* s,
+                          std::streamsize num) {
+    std::streamsize total = 0;
+    while (total < num) {
+      // refill the get area when it is exhausted; stop on EOF or error
+      if (gptr() >= egptr() &&
+          traits_type::eq_int_type(underflow(), traits_type::eof())) {
+        break;
+      }
+      std::streamsize chunk = egptr() - gptr();
+      if (chunk > num - total) {
+        chunk = num - total;
+      }
+      std::memcpy(s + total, gptr(), static_cast<std::size_t>(chunk));
+      gbump(static_cast<int>(chunk));  // chunk never exceeds _buffsize
+      total += chunk;
+    }
+    return total;
+  }
+
+private:
+  // Not copyable: a copy would share _gzf and close it twice.
+  gzfilebuf(const gzfilebuf &);
+  gzfilebuf &operator=(const gzfilebuf &);
+
+  gzFile _gzf;
+  static const unsigned int _buffsize = 1024;
+  char _buff[_buffsize];
+};
+
+#endif
diff --git a/phrase-extract/hierarchical.h b/phrase-extract/hierarchical.h
new file mode 100644
index 000000000..61c899013
--- /dev/null
+++ b/phrase-extract/hierarchical.h
@@ -0,0 +1,33 @@
+/*
+ * hierarchical.h
+ *
+ * Created on: Jan 27, 2010
+ * Author: Nadi Tomeh - LIMSI/CNRS
+ * MT Marathon 2010, Dublin
+ */
+
+#ifndef HIERARCHICAL_H_
+#define HIERARCHICAL_H_
+
+#include <utility>
+#include <map>
+#include <set>
+#include <vector>
+
+// HPhraseVertex represents a point in the alignment matrix
+// (a pair of integer indices).
+typedef std::pair <int, int> HPhraseVertex;
+
+// HPhrase represents a bi-phrase; each bi-phrase is defined by two points in the
+// alignment matrix: bottom-left (first) and top-right (second).
+typedef std::pair<HPhraseVertex, HPhraseVertex> HPhrase;
+
+// HPhraseVector is a std::vector of phrases.
+// NOTE(review): an earlier version of this comment described a bool flag marking
+// whether a phrase is within the length limit; no such flag is stored in this type.
+typedef std::vector < HPhrase > HPhraseVector;
+
+// HSenteceVertices (sic - the misspelling is part of the identifier) represents all
+// vertices that have the same positioning of all extracted phrases.
+// The key of the std::map is the English index and the value is a std::set of the foreign ones
+typedef std::map <int, std::set<int> > HSenteceVertices;
+
+
+#endif /* HIERARCHICAL_H_ */
diff --git a/phrase-extract/pcfg-common/Jamfile b/phrase-extract/pcfg-common/Jamfile
new file mode 100644
index 000000000..3dc272a56
--- /dev/null
+++ b/phrase-extract/pcfg-common/Jamfile
@@ -0,0 +1 @@
+# Library target pcfg_common: built from every *.cc file in this directory, with ..//trees listed alongside the sources.
+lib pcfg_common : [ glob *.cc ] ..//trees ;
diff --git a/phrase-extract/pcfg-common/exception.h b/phrase-extract/pcfg-common/exception.h
new file mode 100644
index 000000000..3dbd59d0e
--- /dev/null
+++ b/phrase-extract/pcfg-common/exception.h
@@ -0,0 +1,41 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXCEPTION_H_
+#define PCFG_EXCEPTION_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Lightweight exception type carrying a human-readable message.
+// It does not derive from std::exception; handlers must catch it by name
+// and read the text through msg().
+class Exception {
+ public:
+  // Builds the exception from a C string.  A null pointer is treated as an
+  // empty message, because constructing a std::string from a null pointer
+  // is undefined behaviour.
+  Exception(const char *msg) : msg_(msg ? msg : "") {}
+  // Builds the exception from a std::string (the message is copied).
+  Exception(const std::string &msg) : msg_(msg) {}
+  // Returns the stored message.
+  const std::string &msg() const { return msg_; }
+ private:
+  std::string msg_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/numbered_set.h b/phrase-extract/pcfg-common/numbered_set.h
new file mode 100644
index 000000000..15e768b4c
--- /dev/null
+++ b/phrase-extract/pcfg-common/numbered_set.h
@@ -0,0 +1,109 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_NUMBERED_SET_H_
+#define PCFG_NUMBERED_SET_H_
+
+#include "exception.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <limits>
+#include <sstream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores a set of elements of type T, each of which is allocated an integral
+// ID of type I.  IDs are contiguous starting at 0.  Individual elements cannot
+// be removed once inserted (but the whole set can be cleared).
+//
+// Internally the elements live as keys of element_to_id_, and id_to_element_
+// holds a pointer to each key, indexed by ID.  Because of those internal
+// pointers a member-wise copy would leave the copy pointing into the source
+// object's map, so copying is implemented explicitly below.
+template<typename T, typename I=std::size_t>
+class NumberedSet {
+ private:
+  typedef boost::unordered_map<T, I> ElementToIdMap;
+  typedef std::vector<const T *> IdToElementMap;
+
+ public:
+  typedef I IdType;
+  // Iterates over `const T *` values in ID order.
+  typedef typename IdToElementMap::const_iterator const_iterator;
+
+  NumberedSet() {}
+
+  // Copies the elements and their IDs, then re-points the ID index at the
+  // copy's own keys.
+  NumberedSet(const NumberedSet &other)
+      : element_to_id_(other.element_to_id_)
+      , id_to_element_() {
+    RebuildIdToElement();
+  }
+
+  // Copy-and-swap: swapping containers keeps pointers to their elements
+  // valid, so the index built for the temporary stays correct after the swap.
+  NumberedSet &operator=(const NumberedSet &other) {
+    if (this != &other) {
+      NumberedSet tmp(other);
+      element_to_id_.swap(tmp.element_to_id_);
+      id_to_element_.swap(tmp.id_to_element_);
+    }
+    return *this;
+  }
+
+  const_iterator begin() const { return id_to_element_.begin(); }
+  const_iterator end() const { return id_to_element_.end(); }
+
+  // Static value returned by Lookup(const T &) when the element is absent.
+  static I NullId() { return std::numeric_limits<I>::max(); }
+
+  bool Empty() const { return id_to_element_.empty(); }
+  std::size_t Size() const { return id_to_element_.size(); }
+
+  // Insert the given object and return its ID.
+  I Insert(const T &);
+
+  // Element -> ID (NullId() if absent) and ID -> element (throws Exception
+  // if the ID is out of range).
+  I Lookup(const T &) const;
+  const T &Lookup(I) const;
+
+  void Clear();
+
+ private:
+  // Recomputes id_to_element_ from element_to_id_.  Relies on the IDs being
+  // exactly 0 .. Size()-1, which Insert() guarantees.
+  void RebuildIdToElement() {
+    id_to_element_.assign(element_to_id_.size(), static_cast<const T *>(0));
+    for (typename ElementToIdMap::const_iterator p = element_to_id_.begin();
+         p != element_to_id_.end(); ++p) {
+      id_to_element_[p->second] = &p->first;
+    }
+  }
+
+  ElementToIdMap element_to_id_;
+  IdToElementMap id_to_element_;
+};
+
+// Returns the ID stored for s, or NullId() when s has never been inserted.
+template<typename T, typename I>
+I NumberedSet<T, I>::Lookup(const T &s) const {
+  const typename ElementToIdMap::const_iterator hit = element_to_id_.find(s);
+  if (hit != element_to_id_.end()) {
+    return hit->second;
+  }
+  return NullId();
+}
+
+// Returns the element that was assigned the given ID.
+// Throws Exception ("Value not found: <id>") when id is negative or not
+// smaller than the number of stored elements.
+template<typename T, typename I>
+const T &NumberedSet<T, I>::Lookup(I id) const {
+  const bool in_range = !(id < 0) && id < id_to_element_.size();
+  if (in_range) {
+    return *(id_to_element_[id]);
+  }
+  std::ostringstream msg;
+  msg << "Value not found: " << id;
+  throw Exception(msg.str());
+}
+
+// Inserts x if it is not already present and returns its ID.  A new element
+// receives the next unused ID (the current element count); an existing
+// element keeps the ID it already had.
+template<typename T, typename I>
+I NumberedSet<T, I>::Insert(const T &x) {
+  typedef typename ElementToIdMap::iterator MapIter;
+  // Tentatively pair x with the next free ID; the map ignores the pair if x
+  // is already a key.
+  const std::pair<MapIter, bool> outcome =
+      element_to_id_.insert(std::pair<T, I>(x, id_to_element_.size()));
+  const MapIter slot = outcome.first;
+  if (outcome.second) {
+    // x is new: index the key that now lives inside the map.
+    id_to_element_.push_back(&slot->first);
+  }
+  return slot->second;
+}
+
+// Removes every element.  Both containers end up empty, so the next Insert()
+// hands out ID 0 again.
+template<typename T, typename I>
+void NumberedSet<T, I>::Clear() {
+  // Drop the key pointers before the map entries they point into.
+  id_to_element_.clear();
+  element_to_id_.clear();
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/pcfg.cc b/phrase-extract/pcfg-common/pcfg.cc
new file mode 100644
index 000000000..054e20a48
--- /dev/null
+++ b/phrase-extract/pcfg-common/pcfg.cc
@@ -0,0 +1,106 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg.h"
+
+#include "exception.h"
+
+#include <boost/algorithm/string.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Stores score for the rule identified by key, overwriting any score that
+// was previously recorded for the same key.
+void Pcfg::Add(const Key &key, double score) {
+  const std::pair<Map::iterator, bool> slot =
+      rules_.insert(Map::value_type(key, score));
+  if (!slot.second) {
+    // The key already existed: replace its score.
+    slot.first->second = score;
+  }
+}
+
+// Looks up the rule identified by key.  On success writes its score to
+// `score` and returns true; otherwise returns false and leaves `score`
+// untouched.
+bool Pcfg::Lookup(const Key &key, double &score) const {
+  const Map::const_iterator hit = rules_.find(key);
+  const bool found = (hit != rules_.end());
+  if (found) {
+    score = hit->second;
+  }
+  return found;
+}
+
+// Reads rules from input, one per line, in the form
+//   LHS ||| RHS-symbol RHS-symbol ... ||| score
+// Every symbol is inserted into vocab and the rule is stored via Add() under
+// the key [LHS id, RHS ids...].
+// Throws Exception when either "|||" delimiter is missing; a score that
+// cannot be parsed as a double makes boost::lexical_cast throw.
+void Pcfg::Read(std::istream &input, Vocabulary &vocab) {
+  const std::string delimiter("|||");
+  std::string line;
+  while (std::getline(input, line)) {
+    // Locate the two delimiters that split the line into three fields.
+    const std::size_t first = line.find(delimiter);
+    if (first == std::string::npos) {
+      throw Exception("missing first delimiter");
+    }
+    const std::size_t rhs_begin = first + delimiter.size();
+    const std::size_t second = line.find(delimiter, rhs_begin);
+    if (second == std::string::npos) {
+      throw Exception("missing second delimiter");
+    }
+
+    // Extract and trim the three fields.
+    std::string lhs = line.substr(0, first);
+    boost::trim(lhs);
+    std::string rhs = line.substr(rhs_begin, second - rhs_begin);
+    boost::trim(rhs);
+    std::string score_text = line.substr(second + delimiter.size());
+    boost::trim(score_text);
+
+    // Split the RHS on runs of whitespace.
+    std::vector<std::string> rhs_symbols;
+    boost::split(rhs_symbols, rhs, boost::algorithm::is_space(),
+                 boost::algorithm::token_compress_on);
+
+    // Construct key: LHS id followed by the RHS ids.
+    Key key;
+    key.reserve(rhs_symbols.size() + 1);
+    key.push_back(vocab.Insert(lhs));
+    for (std::size_t i = 0; i < rhs_symbols.size(); ++i) {
+      key.push_back(vocab.Insert(rhs_symbols[i]));
+    }
+
+    // Add rule.
+    Add(key, boost::lexical_cast<double>(score_text));
+  }
+}
+
+// Writes every rule to output, one per line, in the same
+//   LHS ||| RHS-symbol RHS-symbol ... ||| score
+// form that Read() accepts, translating IDs back to symbols through vocab.
+// Throws Exception if a stored key is empty (Add() accepts any key, and an
+// empty one has no LHS to print; dereferencing its begin() would be
+// undefined behaviour).  vocab.Lookup() throws Exception for unknown IDs.
+void Pcfg::Write(const Vocabulary &vocab, std::ostream &output) const {
+  for (const_iterator rule = begin(); rule != end(); ++rule) {
+    const Key &key = rule->first;
+    if (key.empty()) {
+      throw Exception("cannot write rule with empty key");
+    }
+    Key::const_iterator symbol = key.begin();
+    // First ID is the LHS ...
+    output << vocab.Lookup(*symbol) << " |||";
+    // ... and the remaining IDs are the RHS symbols.
+    for (++symbol; symbol != key.end(); ++symbol) {
+      output << " " << vocab.Lookup(*symbol);
+    }
+    output << " ||| " << rule->second << std::endl;
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-common/pcfg.h b/phrase-extract/pcfg-common/pcfg.h
new file mode 100644
index 000000000..b87336584
--- /dev/null
+++ b/phrase-extract/pcfg-common/pcfg.h
@@ -0,0 +1,61 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_H_
+#define PCFG_PCFG_H_
+
+#include "typedef.h"
+
+#include <istream>
+#include <map>
+#include <ostream>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// A table of scored rules.  Each rule is keyed by a vector of symbol IDs and
+// mapped to a double score; iteration visits rules in std::map key order.
+class Pcfg {
+ public:
+  typedef std::vector<std::size_t> Key;
+  typedef std::map<Key, double> Map;
+  typedef Map::const_iterator const_iterator;
+  typedef Map::iterator iterator;
+
+  Pcfg() {}
+
+  // Mutable iteration over (key, score) pairs.
+  iterator begin() { return rules_.begin(); }
+  iterator end() { return rules_.end(); }
+
+  // Read-only iteration over (key, score) pairs.
+  const_iterator begin() const { return rules_.begin(); }
+  const_iterator end() const { return rules_.end(); }
+
+  // Stores a score for a key.
+  void Add(const Key &, double);
+  // Fetches the score for a key; returns false if the key is unknown.
+  bool Lookup(const Key &, double &) const;
+  // Parses rules from a stream, registering symbols in the Vocabulary.
+  void Read(std::istream &, Vocabulary &);
+  // Prints all rules, mapping IDs to symbols through the Vocabulary.
+  void Write(const Vocabulary &, std::ostream &) const;
+
+ private:
+  Map rules_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/pcfg_tree.h b/phrase-extract/pcfg-common/pcfg_tree.h
new file mode 100644
index 000000000..bdac64dfc
--- /dev/null
+++ b/phrase-extract/pcfg-common/pcfg_tree.h
@@ -0,0 +1,77 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_PCFG_TREE_H_
+#define PCFG_PCFG_TREE_H_
+
+#include "syntax_tree.h"
+#include "xml_tree_writer.h"
+
+#include <map>
+#include <sstream>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Syntax tree node with a std::string label and an attached score.
+// DerivedType is the concrete node class, forwarded to SyntaxTreeBase.
+template<typename DerivedType>
+class PcfgTreeBase : public SyntaxTreeBase<std::string, DerivedType> {
+ public:
+  typedef std::string LabelType;
+  typedef SyntaxTreeBase<LabelType, DerivedType> BaseType;
+
+  // Creates a childless node with the given label and a score of 0.0.
+  PcfgTreeBase(const LabelType &label)
+      : BaseType(label)
+      , score_(0.0) {}
+
+  void set_score(double s) { score_ = s; }
+  double score() const { return score_; }
+
+ private:
+  double score_;
+};
+
+// Concrete scored tree node: PcfgTreeBase instantiated with itself as the
+// derived type.
+class PcfgTree : public PcfgTreeBase<PcfgTree> {
+ public:
+  typedef PcfgTreeBase<PcfgTree> BaseType;
+
+  // Creates a childless node carrying the given label.
+  PcfgTree(const BaseType::LabelType &label)
+      : BaseType(label) {}
+};
+
+// Specialisation of XmlOutputHandler for PcfgTree: supplies the label and
+// the attribute map for a tree node.
+template<>
+class XmlOutputHandler<PcfgTree> {
+ public:
+  typedef std::map<std::string, std::string> AttributeMap;
+
+  // Copies the node's label into `label`.
+  void GetLabel(const PcfgTree &tree, std::string &label) const {
+    label = tree.label();
+  }
+
+  // Empties attribute_map, then adds a "pcfg" attribute holding the node's
+  // score (default stream formatting) unless the score is exactly 0.0.
+  void GetAttributes(const PcfgTree &tree, AttributeMap &attribute_map) const {
+    attribute_map.clear();
+    if (tree.score() == 0.0) {
+      return;
+    }
+    std::ostringstream formatted;
+    formatted << tree.score();
+    attribute_map["pcfg"] = formatted.str();
+  }
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/syntax_tree.h b/phrase-extract/pcfg-common/syntax_tree.h
new file mode 100644
index 000000000..89c6ec0c3
--- /dev/null
+++ b/phrase-extract/pcfg-common/syntax_tree.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SYNTAX_TREE_H_
+#define PCFG_SYNTAX_TREE_H_
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Base class for SyntaxTree, AgreementTree, and friends.
+//
+// A node stores a label of type T, raw pointers to its children, and a raw
+// pointer to its parent.  The destructor deletes every child pointer, so a
+// node owns its children.  No copy constructor or assignment operator is
+// declared: a copied node would share its child pointers with the original
+// and both would delete them, so nodes should not be copied.
+template<typename T, typename DerivedType>
+class SyntaxTreeBase {
+ public:
+  // Constructors.  Both leave the parent pointer null.
+  SyntaxTreeBase(const T &label)
+      : label_(label), children_(), parent_(0) {}
+
+  SyntaxTreeBase(const T &label, const std::vector<DerivedType *> &children)
+      : label_(label), children_(children), parent_(0) {}
+
+  // Destructor (deletes the children).
+  virtual ~SyntaxTreeBase();
+
+  // Accessors.
+  const T &label() const { return label_; }
+  DerivedType *parent() { return parent_; }
+  const DerivedType *parent() const { return parent_; }
+  std::vector<DerivedType *> &children() { return children_; }
+  const std::vector<DerivedType *> &children() const { return children_; }
+
+  // Mutators.  set_children() overwrites the child list without deleting
+  // the pointers it previously held.
+  void set_label(const T &label) { label_ = label; }
+  void set_parent(DerivedType *parent) { parent_ = parent; }
+  void set_children(const std::vector<DerivedType *> &c) { children_ = c; }
+
+  // True when the node has no children.
+  bool IsLeaf() const { return children_.empty(); }
+
+  // True when the node has exactly one child and that child is a leaf.
+  bool IsPreterminal() const {
+    if (children_.size() != 1) {
+      return false;
+    }
+    return children_[0]->IsLeaf();
+  }
+
+  // Appends child to the child list; the child's parent pointer is not
+  // modified here.
+  void AddChild(DerivedType *child) { children_.push_back(child); }
+
+ private:
+  T label_;
+  std::vector<DerivedType *> children_;
+  DerivedType *parent_;
+};
+
+// Plain syntax tree node: SyntaxTreeBase instantiated with itself as the
+// derived type, adding nothing beyond forwarding constructors.
+template<typename T>
+class SyntaxTree : public SyntaxTreeBase<T, SyntaxTree<T> > {
+ public:
+  typedef SyntaxTreeBase<T, SyntaxTree<T> > BaseType;
+
+  // Childless node.
+  SyntaxTree(const T &label)
+      : BaseType(label) {}
+
+  // Node with an initial list of children.
+  SyntaxTree(const T &label, const std::vector<SyntaxTree *> &children)
+      : BaseType(label, children) {}
+};
+
+// Deletes every child pointer in order, which destroys each child's own
+// children in turn.
+template<typename T, typename DerivedType>
+SyntaxTreeBase<T, DerivedType>::~SyntaxTreeBase() {
+  typedef typename std::vector<DerivedType *>::iterator ChildIter;
+  for (ChildIter child = children_.begin(); child != children_.end();
+       ++child) {
+    delete *child;
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/tool.cc b/phrase-extract/pcfg-common/tool.cc
new file mode 100644
index 000000000..bebd220e1
--- /dev/null
+++ b/phrase-extract/pcfg-common/tool.cc
@@ -0,0 +1,80 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "tool.h"
+
+#include <sstream>
+
+namespace Moses {
+namespace PCFG {
+
+// Selects the tool's main input: standard input when filename is empty or
+// "-", otherwise the named file opened through input_file_stream_.  Calls
+// Error() if the file cannot be opened.  Returns the selected stream.
+std::istream &Tool::OpenInputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdin = filename.empty() || filename == "-";
+  if (use_stdin) {
+    input_ptr_ = &(std::cin);
+    return *input_ptr_;
+  }
+  input_file_stream_.open(filename.c_str());
+  if (input_file_stream_.fail()) {
+    Error("failed to open input file: " + filename);
+  }
+  input_ptr_ = &input_file_stream_;
+  return *input_ptr_;
+}
+
+// Selects the tool's main output: standard output when filename is empty or
+// "-", otherwise the named file opened through output_file_stream_.  Calls
+// Error() if the file cannot be opened.  Returns the selected stream.
+std::ostream &Tool::OpenOutputOrDie(const std::string &filename) {
+  // TODO Check that function is only called once?
+  const bool use_stdout = filename.empty() || filename == "-";
+  if (use_stdout) {
+    output_ptr_ = &(std::cout);
+    return *output_ptr_;
+  }
+  output_file_stream_.open(filename.c_str());
+  if (output_file_stream_.fail()) {
+    Error("failed to open output file: " + filename);
+  }
+  output_ptr_ = &output_file_stream_;
+  return *output_ptr_;
+}
+
+// Opens filename for reading on the caller-supplied stream; calls Error()
+// if the stream is in a failed state afterwards.
+void Tool::OpenNamedInputOrDie(const std::string &filename,
+                               std::ifstream &stream) {
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    Error("failed to open input file: " + filename);
+  }
+}
+
+// Opens filename for writing on the caller-supplied stream; calls Error()
+// if the stream is in a failed state afterwards.
+void Tool::OpenNamedOutputOrDie(const std::string &filename,
+                                std::ofstream &stream) {
+  stream.open(filename.c_str());
+  if (stream.fail()) {
+    Error("failed to open output file: " + filename);
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-common/tool.h b/phrase-extract/pcfg-common/tool.h
new file mode 100644
index 000000000..0af342569
--- /dev/null
+++ b/phrase-extract/pcfg-common/tool.h
@@ -0,0 +1,91 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TOOL_H_
+#define PCFG_TOOL_H_
+
+#include <boost/program_options/cmdline.hpp>
+
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Abstract base class for the command-line tools: holds the tool's name,
+// provides warning/error reporting, and manages the main input and output
+// streams.  Derived classes implement Main().
+class Tool {
+ public:
+  virtual ~Tool() {}
+
+  const std::string &name() const { return name_; }
+
+  // Entry point implemented by each concrete tool.
+  virtual int Main(int argc, char *argv[]) = 0;
+
+ protected:
+  // The stream pointers start out null; they are set by OpenInputOrDie()
+  // and OpenOutputOrDie().  (They were previously left uninitialised.)
+  Tool(const std::string &name)
+      : name_(name)
+      , input_ptr_(0)
+      , output_ptr_(0) {}
+
+  // Returns the boost::program_options style that should be used by all tools.
+  static int CommonOptionStyle() {
+    namespace cls = boost::program_options::command_line_style;
+    return cls::default_style & (~cls::allow_guessing);
+  }
+
+  // Prints "<name>: warning: <msg>" to standard error.
+  void Warn(const std::string &msg) const {
+    std::cerr << name_ << ": warning: " << msg << std::endl;
+  }
+
+  // Prints "<name>: error: <msg>" to standard error and terminates the
+  // process with exit status 1; it does not return.
+  void Error(const std::string &msg) const {
+    std::cerr << name_ << ": error: " << msg << std::endl;
+    std::exit(1);
+  }
+
+  // Initialises the tool's main input stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then input is standard input; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for reading.
+  std::istream &OpenInputOrDie(const std::string &filename);
+
+  // Initialises the tool's main output stream and returns a reference that is
+  // valid for the remainder of the tool's lifetime.  If filename is empty or
+  // "-" then output is standard output; otherwise it is the named file.  Calls
+  // Error() if the file cannot be opened for writing.
+  std::ostream &OpenOutputOrDie(const std::string &filename);
+
+  // Opens the named input file using the supplied ifstream.  Calls Error() if
+  // the file cannot be opened for reading.
+  void OpenNamedInputOrDie(const std::string &, std::ifstream &);
+
+  // Opens the named output file using the supplied ofstream.  Calls Error() if
+  // the file cannot be opened for writing.
+  void OpenNamedOutputOrDie(const std::string &, std::ofstream &);
+
+ private:
+  std::string name_;
+  std::istream *input_ptr_;            // null until OpenInputOrDie() runs
+  std::ifstream input_file_stream_;
+  std::ostream *output_ptr_;           // null until OpenOutputOrDie() runs
+  std::ofstream output_file_stream_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/typedef.h b/phrase-extract/pcfg-common/typedef.h
new file mode 100644
index 000000000..49a12d681
--- /dev/null
+++ b/phrase-extract/pcfg-common/typedef.h
@@ -0,0 +1,37 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_TYPEDEF_H_
+#define PCFG_TYPEDEF_H_
+
+#include "numbered_set.h"
+#include "syntax_tree.h"
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Vocabulary assigns each distinct std::string an integral ID through
+// NumberedSet (ID type std::size_t, NumberedSet's default).
+typedef NumberedSet<std::string> Vocabulary;
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.cc b/phrase-extract/pcfg-common/xml_tree_parser.cc
new file mode 100644
index 000000000..fd9d11334
--- /dev/null
+++ b/phrase-extract/pcfg-common/xml_tree_parser.cc
@@ -0,0 +1,88 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "xml_tree_parser.h"
+
+#include "exception.h"
+#include "tables-core.h"
+#include "XmlException.h"
+#include "XmlTree.h"
+
+#include <cassert>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Default constructor: every member is default-constructed.
+XmlTreeParser::XmlTreeParser() {}
+
+// Parses one line in Moses' XML parse tree format.
+//
+// Returns the converted PcfgTree, or a null auto_ptr when the line yields no
+// top node (i.e. there is no XML tree).  Throws Exception if the XML tags
+// cannot be processed or if ConvertTree rejects the tree.
+std::auto_ptr<PcfgTree> XmlTreeParser::Parse(const std::string &line)
+{
+  m_line = line;
+  m_tree.Clear();
+  try {
+    if (!ProcessAndStripXMLTags(m_line, m_tree, m_labelSet, m_topLabelSet)) {
+      // An empty message here left callers printing a bare "line N: " with
+      // no hint of what went wrong.
+      throw Exception("failed to process XML tags");
+    }
+  } catch (const XmlException &e) {
+    // Re-throw as the exception type the PCFG tools catch.
+    throw Exception(e.getMsg());
+  }
+  m_tree.ConnectNodes();
+  SyntaxNode *root = m_tree.GetTop();
+  if (!root) {
+    // There is no XML tree.
+    return std::auto_ptr<PcfgTree>();
+  }
+  // m_line is tokenized after tag processing (presumably it now holds only
+  // the words -- confirm against ProcessAndStripXMLTags).
+  m_words = tokenize(m_line.c_str());
+  return ConvertTree(*root, m_words);
+}
+
+// Converts a SyntaxNode tree to a Moses::PCFG::PcfgTree.
+//
+//   tree   root of the (sub)tree to convert
+//   words  the sentence's tokens; a childless node's start position indexes
+//          into this vector
+//
+// A childless SyntaxNode becomes a PcfgTree node with a single word child.
+// Throws Exception if a childless node spans more than one word or if its
+// position lies outside words.
+std::auto_ptr<PcfgTree> XmlTreeParser::ConvertTree(
+    const SyntaxNode &tree,
+    const std::vector<std::string> &words)
+{
+  std::auto_ptr<PcfgTree> root(new PcfgTree(tree.GetLabel()));
+  const std::vector<SyntaxNode*> &children = tree.GetChildren();
+  if (children.empty()) {
+    if (tree.GetStart() != tree.GetEnd()) {
+      std::ostringstream msg;
+      msg << "leaf node covers multiple words (" << tree.GetStart()
+          << "-" << tree.GetEnd() << "): this is currently unsupported";
+      throw Exception(msg.str());
+    }
+    // vector::operator[] is unchecked, so validate the position first.  The
+    // cast turns a negative position into a huge value, which is rejected
+    // by the same comparison.
+    const std::size_t pos = static_cast<std::size_t>(tree.GetStart());
+    if (pos >= words.size()) {
+      std::ostringstream msg;
+      msg << "leaf node position " << tree.GetStart()
+          << " is outside the sentence (" << words.size() << " words)";
+      throw Exception(msg.str());
+    }
+    std::auto_ptr<PcfgTree> leaf(new PcfgTree(words[pos]));
+    leaf->set_parent(root.get());
+    root->AddChild(leaf.release());
+  } else {
+    for (std::vector<SyntaxNode*>::const_iterator p = children.begin();
+         p != children.end(); ++p) {
+      assert(*p);
+      std::auto_ptr<PcfgTree> child = ConvertTree(**p, words);
+      child->set_parent(root.get());
+      // Ownership passes to root only after the parent link is set.
+      root->AddChild(child.release());
+    }
+  }
+  return root;
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-common/xml_tree_parser.h b/phrase-extract/pcfg-common/xml_tree_parser.h
new file mode 100644
index 000000000..6b418c44e
--- /dev/null
+++ b/phrase-extract/pcfg-common/xml_tree_parser.h
@@ -0,0 +1,56 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_PARSER_H_
+#define PCFG_XML_TREE_PARSER_H_
+
+#include "pcfg_tree.h"
+#include "SyntaxTree.h"
+
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Parses a string in Moses' XML parse tree format and returns a PcfgTree
+// object.
+//
+// The members below are working state reused across calls to Parse(); only
+// m_line, m_tree and m_words are reset per call, while m_labelSet and
+// m_topLabelSet accumulate for the lifetime of the parser.
+class XmlTreeParser {
+ public:
+ XmlTreeParser();
+ // Parses one line.  Returns a null auto_ptr if the line has no XML tree;
+ // throws Exception on malformed input.
+ std::auto_ptr<PcfgTree> Parse(const std::string &);
+ private:
+ // Recursively converts a SyntaxNode (sub)tree, using the given words for
+ // the leaves.
+ std::auto_ptr<PcfgTree> ConvertTree(const SyntaxNode &,
+ const std::vector<std::string> &);
+
+ // Passed to ProcessAndStripXMLTags on every call; never cleared by Parse.
+ std::set<std::string> m_labelSet;
+ std::map<std::string, int> m_topLabelSet;
+ // Copy of the current input line, modified in place during parsing.
+ std::string m_line;
+ // Syntax tree for the current line; cleared at the start of each Parse.
+ ::SyntaxTree m_tree;
+ // Tokens of the current line after tag processing.
+ std::vector<std::string> m_words;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-common/xml_tree_writer.h b/phrase-extract/pcfg-common/xml_tree_writer.h
new file mode 100644
index 000000000..6a9a3de05
--- /dev/null
+++ b/phrase-extract/pcfg-common/xml_tree_writer.h
@@ -0,0 +1,133 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_XML_TREE_WRITER_H_
+#define PCFG_XML_TREE_WRITER_H_
+
+#include "syntax_tree.h"
+
+#include "XmlTree.h"
+
+#include <cassert>
+#include <map>
+#include <memory>
+#include <ostream>
+#include <vector>
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Declares the two hooks XmlTreeWriter uses to obtain a node's label and its
+// extra XML attributes.  No definitions appear in this header; presumably
+// they are supplied per InputTree type elsewhere -- confirm.
+template<typename InputTree>
+class XmlOutputHandler {
+ public:
+ // Attribute name -> attribute value, iterated in key order by the writer.
+ typedef std::map<std::string, std::string> AttributeMap;
+
+ // Stores the node's label in the string argument.
+ void GetLabel(const InputTree &, std::string &) const;
+ // Fills the map with the node's attributes.
+ void GetAttributes(const InputTree &, AttributeMap &) const;
+};
+
+// Serialises an InputTree as nested <tree label="..."> elements, using the
+// label/attribute hooks inherited from XmlOutputHandler<InputTree>.
+template<typename InputTree>
+class XmlTreeWriter : public XmlOutputHandler<InputTree> {
+ public:
+ typedef XmlOutputHandler<InputTree> Base;
+ // Writes the (non-leaf) tree to the stream; a newline follows the element
+ // only when the tree has no parent.
+ void Write(const InputTree &, std::ostream &) const;
+ private:
+ // Returns a copy of the string with special characters replaced by
+ // entities.
+ std::string Escape(const std::string &) const;
+};
+
+// Writes tree to out as a <tree label="..."> element.
+//
+// Attributes from GetAttributes() follow the label in map order.  Leaf
+// children are written as escaped text and non-leaf children recursively,
+// each preceded by one space.  The root element (parent() == 0) is followed
+// by std::endl.  tree itself must not be a leaf.
+template<typename InputTree>
+void XmlTreeWriter<InputTree>::Write(const InputTree &tree,
+                                     std::ostream &out) const {
+  typedef typename Base::AttributeMap AttributeMap;
+  typedef std::vector<InputTree *> ChildList;
+
+  assert(!tree.IsLeaf());
+
+  // Opening tag: label first, then any further attributes.
+  std::string text;
+  Base::GetLabel(tree, text);
+  out << "<tree label=\"" << Escape(text) << "\"";
+
+  AttributeMap attributes;
+  Base::GetAttributes(tree, attributes);
+  typename AttributeMap::const_iterator attr = attributes.begin();
+  while (attr != attributes.end()) {
+    out << " " << attr->first << "=\"" << attr->second << "\"";
+    ++attr;
+  }
+  out << ">";
+
+  // Children, in order.
+  const ChildList &kids = tree.children();
+  for (typename ChildList::const_iterator it = kids.begin();
+       it != kids.end(); ++it) {
+    InputTree &kid = **it;
+    if (!kid.IsLeaf()) {
+      out << " ";
+      Write(kid, out);
+      continue;
+    }
+    // The same string object is reused for every label lookup.
+    Base::GetLabel(kid, text);
+    out << " " << Escape(text);
+  }
+
+  // Closing tag; only the root element ends the line.
+  out << " </tree>";
+  if (tree.parent() == 0) {
+    out << std::endl;
+  }
+}
+
+// Returns a copy of s in which each of the characters < > [ ] | & ' " is
+// replaced by its entity string; all other characters are copied unchanged.
+template<typename InputTree>
+std::string XmlTreeWriter<InputTree>::Escape(const std::string &s) const {
+  std::string escaped;
+  escaped.reserve(s.size());  // lower bound; grows if anything is replaced
+  for (std::string::const_iterator c = s.begin(); c != s.end(); ++c) {
+    switch (*c) {
+      case '<':
+        escaped += "&lt;";
+        break;
+      case '>':
+        escaped += "&gt;";
+        break;
+      case '[':
+        escaped += "&#91;";
+        break;
+      case ']':
+        escaped += "&#93;";
+        break;
+      case '|':
+        escaped += "&bar;";
+        break;
+      case '&':
+        escaped += "&amp;";
+        break;
+      case '\'':
+        escaped += "&apos;";
+        break;
+      case '"':
+        escaped += "&quot;";
+        break;
+      default:
+        escaped += *c;
+        break;
+    }
+  }
+  return escaped;
+}
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-extract/Jamfile b/phrase-extract/pcfg-extract/Jamfile
new file mode 100644
index 000000000..fc93f1a19
--- /dev/null
+++ b/phrase-extract/pcfg-extract/Jamfile
@@ -0,0 +1 @@
+# Builds the pcfg-extract executable from every .cc file in this directory,
+# using the pcfg-common target of the parent directory's Jamfile and the
+# boost_program_options target defined two directories up.
+exe pcfg-extract : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options ;
diff --git a/phrase-extract/pcfg-extract/main.cc b/phrase-extract/pcfg-extract/main.cc
new file mode 100644
index 000000000..47b45afc3
--- /dev/null
+++ b/phrase-extract/pcfg-extract/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+// Entry point for pcfg-extract: hands the command line to the PcfgExtract
+// tool and returns its result as the process exit status.
+int main(int argc, char *argv[]) {
+  return Moses::PCFG::PcfgExtract().Main(argc, argv);
+}
diff --git a/phrase-extract/pcfg-extract/options.h b/phrase-extract/pcfg-extract/options.h
new file mode 100644
index 000000000..3acb31b58
--- /dev/null
+++ b/phrase-extract/pcfg-extract/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_OPTIONS_H_
+#define PCFG_EXTRACT_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Command-line options for the pcfg-extract tool.
+struct Options {
+ // NOTE(review): PcfgExtract::ProcessOptions registers no option that writes
+ // to this field and PcfgExtract::Main reads the corpus from std::cin, so
+ // this appears to be unused -- confirm before relying on it.
+ std::string corpus_file;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.cc b/phrase-extract/pcfg-extract/pcfg_extract.cc
new file mode 100644
index 000000000..71c2e31c3
--- /dev/null
+++ b/phrase-extract/pcfg-extract/pcfg_extract.cc
@@ -0,0 +1,131 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_extract.h"
+
+#include "options.h"
+#include "rule_collection.h"
+#include "rule_extractor.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs pcfg-extract: reads one XML parse tree per line from standard input,
+// collects rule counts from each tree, then writes the resulting PCFG and
+// its non-terminal vocabulary to standard output.  Returns 0.
+//
+// A line that fails to parse is reported through Error(); a line with no
+// tree is reported through Warn() and skipped.
+int PcfgExtract::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Rule extraction state.
+  Vocabulary non_term_vocab;
+  RuleExtractor rule_extractor(non_term_vocab);
+  RuleCollection rule_collection;
+  XmlTreeParser parser;
+
+  // tree lives outside the loop, exactly as before, so its value persists
+  // across iterations if Parse() throws (Error() presumably does not return
+  // -- confirm).
+  std::auto_ptr<PcfgTree> tree;
+  std::string line;
+  for (std::size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+    if (tree.get()) {
+      rule_extractor.Extract(*tree, rule_collection);
+    } else {
+      std::ostringstream msg;
+      msg << "no tree at line " << line_num;
+      Warn(msg.str());
+    }
+  }
+
+  // Turn the counts into scores and emit the grammar.
+  Pcfg pcfg;
+  rule_collection.CreatePcfg(pcfg);
+  pcfg.Write(non_term_vocab, std::cout);
+
+  return 0;
+}
+
+// Parses the command line for pcfg-extract.
+//
+// The only recognised option is "help", which prints the usage text and
+// exits with status 0.  A parsing failure is reported through Error()
+// together with the usage text.  The options argument is not modified.
+void PcfgExtract::ProcessOptions(int argc, char *argv[],
+                                 Options &options) const {
+  namespace po = boost::program_options;
+
+  // Caption printed above the visible options.
+  std::ostringstream caption;
+  caption << "Usage: " << name() << "\n\n" << "Options";
+
+  // Options shown to the user.
+  po::options_description visible(caption.str());
+  visible.add_options()("help", "print help message and exit");
+
+  // Options hidden from the user (the place for positional arguments;
+  // none are defined for this tool).
+  po::options_description hidden("Hidden options");
+  hidden.add_options();
+
+  // Everything the parser should accept.
+  po::options_description all_options;
+  all_options.add(visible);
+  all_options.add(hidden);
+
+  // No positional options are registered.
+  po::positional_options_description positional;
+
+  po::variables_map vm;
+  try {
+    po::command_line_parser cmd_parser(argc, argv);
+    cmd_parser.style(CommonOptionStyle());
+    cmd_parser.options(all_options);
+    cmd_parser.positional(positional);
+    po::store(cmd_parser.run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") != 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-extract/pcfg_extract.h b/phrase-extract/pcfg-extract/pcfg_extract.h
new file mode 100644
index 000000000..1af6cb4fe
--- /dev/null
+++ b/phrase-extract/pcfg-extract/pcfg_extract.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_PCFG_EXTRACT_H_
+#define PCFG_EXTRACT_PCFG_EXTRACT_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Options is defined with the struct class-key in options.h; declaring it
+// with the same key avoids mismatched-tag diagnostics (and the differing
+// name decoration some compilers apply to class vs. struct).
+struct Options;
+
+// The pcfg-extract command-line tool, registered with the Tool base class
+// under the name "pcfg-extract".
+class PcfgExtract : public Tool {
+ public:
+  PcfgExtract() : Tool("pcfg-extract") {}
+  // Runs the tool with the given argc/argv and returns the exit status.
+  virtual int Main(int, char *[]);
+ private:
+  // Parses argc/argv; the Options argument receives the parsed settings.
+  void ProcessOptions(int, char *[], Options &) const;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-extract/rule_collection.cc b/phrase-extract/pcfg-extract/rule_collection.cc
new file mode 100644
index 000000000..32b63e0ef
--- /dev/null
+++ b/phrase-extract/pcfg-extract/rule_collection.cc
@@ -0,0 +1,58 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "rule_collection.h"
+
+#include "pcfg-common/pcfg.h"
+
+#include <cmath>
+
+namespace Moses {
+namespace PCFG {
+
+// Records one occurrence of the rule lhs -> rhs.  Entries that do not exist
+// yet are created with a zero count by operator[] before the increment.
+void RuleCollection::Add(std::size_t lhs, const std::vector<std::size_t> &rhs) {
+  RhsCountMap &counts_for_lhs = collection_[lhs];
+  counts_for_lhs[rhs] += 1;
+}
+
+// Adds every collected rule to pcfg.  Each rule's key is its LHS followed
+// by its RHS symbols, and its score is log(count / total), where total is
+// the summed count of all rules sharing that LHS.
+void RuleCollection::CreatePcfg(Pcfg &pcfg) {
+  std::vector<std::size_t> rule;  // reused for every key
+  for (const_iterator entry = begin(); entry != end(); ++entry) {
+    const RhsCountMap &counts = entry->second;
+    RhsCountMap::const_iterator r;
+
+    // Denominator: total count over all right-hand sides of this LHS.
+    std::size_t lhs_total = 0;
+    for (r = counts.begin(); r != counts.end(); ++r) {
+      lhs_total += r->second;
+    }
+    const double denominator = static_cast<double>(lhs_total);
+
+    // Emit one scored rule per right-hand side.
+    for (r = counts.begin(); r != counts.end(); ++r) {
+      rule.clear();
+      rule.push_back(entry->first);
+      rule.insert(rule.end(), r->first.begin(), r->first.end());
+      const double numerator = static_cast<double>(r->second);
+      pcfg.Add(rule, std::log(numerator / denominator));
+    }
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-extract/rule_collection.h b/phrase-extract/pcfg-extract/rule_collection.h
new file mode 100644
index 000000000..452fa0e97
--- /dev/null
+++ b/phrase-extract/pcfg-extract/rule_collection.h
@@ -0,0 +1,59 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_COLLECTION_H_
+#define PCFG_EXTRACT_RULE_COLLECTION_H_
+
+#include "pcfg-common/pcfg.h"
+
+#include <boost/unordered_map.hpp>
+
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Contains PCFG rules and their counts.
+//
+// Rules are stored as a two-level hash map: LHS symbol -> (RHS symbol
+// sequence -> count).  Symbols are std::size_t values.
+class RuleCollection {
+ public:
+ // RHS symbol sequence -> number of times the rule was added.
+ typedef boost::unordered_map<std::vector<std::size_t>, std::size_t> RhsCountMap;
+ // LHS symbol -> counts of its right-hand sides.
+ typedef boost::unordered_map<std::size_t, RhsCountMap> Map;
+ typedef Map::iterator iterator;
+ typedef Map::const_iterator const_iterator;
+
+ RuleCollection() {}
+
+ // Iteration over (LHS, RhsCountMap) entries of the underlying map.
+ iterator begin() { return collection_.begin(); }
+ const_iterator begin() const { return collection_.begin(); }
+
+ iterator end() { return collection_.end(); }
+ const_iterator end() const { return collection_.end(); }
+
+ // Increments the count of the rule with the given LHS and RHS.
+ void Add(std::size_t, const std::vector<std::size_t> &);
+ // Adds every stored rule to the Pcfg with score log(count / LHS total).
+ void CreatePcfg(Pcfg &);
+
+ private:
+ Map collection_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-extract/rule_extractor.cc b/phrase-extract/pcfg-extract/rule_extractor.cc
new file mode 100644
index 000000000..217574e7d
--- /dev/null
+++ b/phrase-extract/pcfg-extract/rule_extractor.cc
@@ -0,0 +1,51 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "rule_extractor.h"
+
+#include "pcfg-common/pcfg_tree.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the extractor to the caller's vocabulary.  Only a reference is
+// kept, so the vocabulary must outlive the extractor.
+RuleExtractor::RuleExtractor(Vocabulary &non_term_vocab)
+    : non_term_vocab_(non_term_vocab) {}
+
+// Adds one rule to rc for tree and, recursively, for each of its subtrees.
+// Preterminal and leaf nodes contribute no rule.  Labels are inserted into
+// the vocabulary in this order: the node's own label, then each child's
+// label immediately before that child is processed.
+void RuleExtractor::Extract(const PcfgTree &tree, RuleCollection &rc) const {
+  if (tree.IsPreterminal() || tree.IsLeaf()) {
+    return;
+  }
+
+  const std::size_t lhs_id = non_term_vocab_.Insert(tree.label());
+
+  const std::vector<PcfgTree *> &kids = tree.children();
+  std::vector<std::size_t> rhs_ids;
+  rhs_ids.reserve(kids.size());
+  for (std::size_t i = 0; i < kids.size(); ++i) {
+    const PcfgTree &kid = *kids[i];
+    rhs_ids.push_back(non_term_vocab_.Insert(kid.label()));
+    Extract(kid, rc);
+  }
+
+  // The rule for this node is added after those of its descendants.
+  rc.Add(lhs_id, rhs_ids);
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-extract/rule_extractor.h b/phrase-extract/pcfg-extract/rule_extractor.h
new file mode 100644
index 000000000..6bcffbc61
--- /dev/null
+++ b/phrase-extract/pcfg-extract/rule_extractor.h
@@ -0,0 +1,45 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_EXTRACT_RULE_EXTRACTOR_H_
+#define PCFG_EXTRACT_RULE_EXTRACTOR_H_
+
+#include "rule_collection.h"
+
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+class PcfgTree;
+
+// Extracts PCFG rules from syntax trees and adds them to a RuleCollection.
+// Node labels are converted to numeric symbols through the Vocabulary that
+// is supplied at construction.
+class RuleExtractor {
+ public:
+ // Keeps a reference to the vocabulary; it must outlive this object.
+ RuleExtractor(Vocabulary &);
+ // Adds the rules found in the tree to the collection.  Declared const, yet
+ // it inserts labels into the referenced vocabulary.
+ void Extract(const PcfgTree &, RuleCollection &) const;
+ private:
+ Vocabulary &non_term_vocab_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-score/Jamfile b/phrase-extract/pcfg-score/Jamfile
new file mode 100644
index 000000000..6b8c963ce
--- /dev/null
+++ b/phrase-extract/pcfg-score/Jamfile
@@ -0,0 +1 @@
+# Builds the pcfg-score executable from every .cc file in this directory,
+# using the pcfg-common target of the parent directory's Jamfile and the
+# boost_program_options target defined two directories up.
+exe pcfg-score : [ glob *.cc ] ..//pcfg-common ../..//boost_program_options ;
diff --git a/phrase-extract/pcfg-score/main.cc b/phrase-extract/pcfg-score/main.cc
new file mode 100644
index 000000000..da5392add
--- /dev/null
+++ b/phrase-extract/pcfg-score/main.cc
@@ -0,0 +1,25 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+// Entry point for pcfg-score: hands the command line to the PcfgScore tool
+// and returns its result as the process exit status.
+int main(int argc, char *argv[]) {
+  return Moses::PCFG::PcfgScore().Main(argc, argv);
+}
diff --git a/phrase-extract/pcfg-score/options.h b/phrase-extract/pcfg-score/options.h
new file mode 100644
index 000000000..e54b2a0b9
--- /dev/null
+++ b/phrase-extract/pcfg-score/options.h
@@ -0,0 +1,36 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_OPTIONS_H_
+#define PCFG_SCORE_OPTIONS_H_
+
+#include <string>
+
+namespace Moses {
+namespace PCFG {
+
+// Command-line options for the pcfg-score tool.
+struct Options {
+ // Path of the PCFG file; filled from the required positional argument by
+ // PcfgScore::ProcessOptions and opened for reading by PcfgScore::Main.
+ std::string pcfg_file;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-score/pcfg_score.cc b/phrase-extract/pcfg-score/pcfg_score.cc
new file mode 100644
index 000000000..345d7fc60
--- /dev/null
+++ b/phrase-extract/pcfg-score/pcfg_score.cc
@@ -0,0 +1,152 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "pcfg_score.h"
+
+#include "options.h"
+#include "tree_scorer.h"
+
+#include "pcfg-common/exception.h"
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/syntax_tree.h"
+#include "pcfg-common/typedef.h"
+#include "pcfg-common/xml_tree_parser.h"
+
+#include <boost/program_options.hpp>
+
+#include <cassert>
+#include <cstdlib>
+#include <fstream>
+#include <iostream>
+#include <map>
+#include <memory>
+#include <set>
+#include <string>
+#include <vector>
+
+namespace Moses {
+namespace PCFG {
+
+// Runs pcfg-score: loads the PCFG named on the command line, then reads one
+// XML parse tree per line from standard input, scores it and writes the
+// tree to standard output.  Returns 0.
+//
+// A line that has no tree, or whose tree cannot be scored, triggers a
+// warning and is echoed to standard output unchanged.  A line that fails
+// to parse is reported through Error().
+int PcfgScore::Main(int argc, char *argv[]) {
+  Options options;
+  ProcessOptions(argc, argv, options);
+
+  // Load the grammar and its non-terminal vocabulary.
+  std::ifstream pcfg_stream;
+  OpenNamedInputOrDie(options.pcfg_file, pcfg_stream);
+  Pcfg pcfg;
+  Vocabulary non_term_vocab;
+  pcfg.Read(pcfg_stream, non_term_vocab);
+
+  TreeScorer scorer(pcfg, non_term_vocab);
+  XmlTreeParser parser;
+  XmlTreeWriter<PcfgTree> writer;
+
+  // tree lives outside the loop, exactly as before, so its value persists
+  // across iterations if Parse() throws (Error() presumably does not return
+  // -- confirm).
+  std::auto_ptr<PcfgTree> tree;
+  std::string line;
+  for (std::size_t line_num = 1; std::getline(std::cin, line); ++line_num) {
+    try {
+      tree = parser.Parse(line);
+    } catch (Exception &e) {
+      std::ostringstream msg;
+      msg << "line " << line_num << ": " << e.msg();
+      Error(msg.str());
+    }
+
+    // Work out whether the line must be passed through; Score() is only
+    // attempted when a tree exists.
+    const char *problem = 0;
+    if (!tree.get()) {
+      problem = "no tree at line ";
+    } else if (!scorer.Score(*tree)) {
+      problem = "failed to score tree at line ";
+    }
+
+    if (problem == 0) {
+      writer.Write(*tree, std::cout);
+    } else {
+      std::ostringstream msg;
+      msg << problem << line_num;
+      Warn(msg.str());
+      std::cout << line << std::endl;
+    }
+  }
+
+  return 0;
+}
+
+// Parses the command line for pcfg-score.
+//
+// Accepts the "help" option, which prints the usage text and exits with
+// status 0, and one positional argument that is stored in
+// options.pcfg_file.  A parsing failure or a missing positional argument is
+// reported through Error() together with the usage text.
+void PcfgScore::ProcessOptions(int argc, char *argv[], Options &options) const {
+  namespace po = boost::program_options;
+
+  // Caption printed above the visible options.
+  std::ostringstream caption;
+  caption << "Usage: " << name() << " PCFG\n\n"
+          << "Options";
+
+  // Options shown to the user.
+  po::options_description visible(caption.str());
+  visible.add_options()("help", "print help message and exit");
+
+  // Options hidden from the user; these back the positional arguments.
+  po::options_description hidden("Hidden options");
+  hidden.add_options()
+      ("pcfg-file", po::value(&options.pcfg_file), "pcfg file");
+
+  // Everything the parser should accept.
+  po::options_description all_options;
+  all_options.add(visible);
+  all_options.add(hidden);
+
+  // The first positional argument is the PCFG file.
+  po::positional_options_description positional;
+  positional.add("pcfg-file", 1);
+
+  po::variables_map vm;
+  try {
+    po::command_line_parser cmd_parser(argc, argv);
+    cmd_parser.style(CommonOptionStyle());
+    cmd_parser.options(all_options);
+    cmd_parser.positional(positional);
+    po::store(cmd_parser.run(), vm);
+    po::notify(vm);
+  } catch (const std::exception &e) {
+    std::ostringstream msg;
+    msg << e.what() << "\n\n" << visible;
+    Error(msg.str());
+  }
+
+  if (vm.count("help") != 0) {
+    std::cout << visible << std::endl;
+    std::exit(0);
+  }
+
+  // The PCFG file is mandatory.
+  if (vm.count("pcfg-file") == 0) {
+    std::ostringstream msg;
+    msg << "missing required argument\n\n" << visible << std::endl;
+    Error(msg.str());
+  }
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-score/pcfg_score.h b/phrase-extract/pcfg-score/pcfg_score.h
new file mode 100644
index 000000000..5e506c39d
--- /dev/null
+++ b/phrase-extract/pcfg-score/pcfg_score.h
@@ -0,0 +1,42 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_PCFG_SCORE_H_
+#define PCFG_SCORE_PCFG_SCORE_H_
+
+#include "pcfg-common/tool.h"
+
+namespace Moses {
+namespace PCFG {
+
+class Options;
+
+// Command-line tool object for "pcfg-score"; the tool name is passed to the
+// Tool base class on construction.
+class PcfgScore : public Tool {
+ public:
+ PcfgScore() : Tool("pcfg-score") {}
+ // Runs the tool with the given argc/argv and returns the process exit
+ // status.
+ virtual int Main(int, char *[]);
+ private:
+ // Parses argc/argv into the supplied Options object (see pcfg_score.cc).
+ void ProcessOptions(int, char *[], Options &) const;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/pcfg-score/tree_scorer.cc b/phrase-extract/pcfg-score/tree_scorer.cc
new file mode 100644
index 000000000..f9ce97ae0
--- /dev/null
+++ b/phrase-extract/pcfg-score/tree_scorer.cc
@@ -0,0 +1,68 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "tree_scorer.h"
+
+#include <cassert>
+
+namespace Moses {
+namespace PCFG {
+
+// Binds the scorer to an existing grammar and non-terminal vocabulary. Both
+// are held by reference, so they must outlive the scorer.
+TreeScorer::TreeScorer(const Pcfg &pcfg, const Vocabulary &non_term_vocab)
+    : pcfg_(pcfg), non_term_vocab_(non_term_vocab) {}
+
+// Recursively scores the subtree rooted at `root`.
+//
+// For every node that is neither a leaf nor a preterminal, the score is the
+// sum of the scores of its non-preterminal children plus the score of the
+// rule (parent label followed by the child labels) looked up in the PCFG.
+// Returns false, leaving `root` unscored, as soon as a child cannot be scored
+// or the rule is not found in the PCFG.
+bool TreeScorer::Score(PcfgTree &root) const {
+  // Leaves and preterminals are accepted as they are.
+  if (root.IsPreterminal() || root.IsLeaf()) {
+    return true;
+  }
+
+  const std::vector<PcfgTree *> &kids = root.children();
+
+  // Rule key: parent label id followed by the label id of every child.
+  std::vector<std::size_t> rule_key;
+  rule_key.reserve(kids.size()+1);
+  rule_key.push_back(non_term_vocab_.Lookup(root.label()));
+
+  double total = 0.0;
+  for (std::size_t i = 0; i < kids.size(); ++i) {
+    PcfgTree &kid = *kids[i];
+    assert(!kid.IsLeaf());
+    rule_key.push_back(non_term_vocab_.Lookup(kid.label()));
+    if (!Score(kid)) {
+      return false;
+    }
+    // Preterminal children contribute no score of their own.
+    if (!kid.IsPreterminal()) {
+      total += kid.score();
+    }
+  }
+
+  double rule_score;
+  if (!pcfg_.Lookup(rule_key, rule_score)) {
+    return false;
+  }
+
+  total += rule_score;
+  root.set_score(total);
+  return true;
+}
+
+} // namespace PCFG
+} // namespace Moses
diff --git a/phrase-extract/pcfg-score/tree_scorer.h b/phrase-extract/pcfg-score/tree_scorer.h
new file mode 100644
index 000000000..36f4e1e99
--- /dev/null
+++ b/phrase-extract/pcfg-score/tree_scorer.h
@@ -0,0 +1,47 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2012 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+#ifndef PCFG_SCORE_TREE_SCORER_H_
+#define PCFG_SCORE_TREE_SCORER_H_
+
+#include "pcfg-common/pcfg.h"
+#include "pcfg-common/pcfg_tree.h"
+#include "pcfg-common/typedef.h"
+
+namespace Moses {
+namespace PCFG {
+
+// Assigns PCFG scores to the nodes of a PcfgTree.
+class TreeScorer {
+ public:
+ // The grammar and the non-terminal vocabulary are stored as references (see
+ // the members below), so both must outlive this object.
+ TreeScorer(const Pcfg &, const Vocabulary &);
+
+ // Score tree according to PCFG. Returns false if unsuccessful (due to
+ // missing rule).
+ bool Score(PcfgTree &) const;
+
+ private:
+ // Grammar used to look up rule scores.
+ const Pcfg &pcfg_;
+ // Vocabulary used to map node labels to the ids that form a rule key.
+ const Vocabulary &non_term_vocab_;
+};
+
+} // namespace PCFG
+} // namespace Moses
+
+#endif
diff --git a/phrase-extract/phrase-extract.sln b/phrase-extract/phrase-extract.sln
new file mode 100644
index 000000000..800c26192
--- /dev/null
+++ b/phrase-extract/phrase-extract.sln
@@ -0,0 +1,56 @@
+
+Microsoft Visual Studio Solution File, Format Version 11.00
+# Visual C++ Express 2010
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "score", "score.vcxproj", "{34AC84C7-62A1-4BBE-BCA9-4E95B2341039}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "extract", "extract.vcxproj", "{2475F8E8-A5C9-4785-8B09-5F4E120FC518}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "symal", "..\symal\symal.vcxproj", "{6716FB26-8298-47A3-A915-958AF0AC80F8}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "extract-rules", "extract-rules.vcxproj", "{3C5CFAEC-6830-4491-9008-1C9E8C381C50}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "consolidate", "consolidate.vcxproj", "{45410EFE-65C0-4078-82E4-D636258F9225}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "consolidate-direct", "consolidate-direct.vcxproj", "{33775109-60CF-4C1C-A869-5450B3DD88B3}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "extract-lex", "extract-lex.vcxproj", "{808BF985-CA18-4E55-8AAC-70E04DC25117}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {34AC84C7-62A1-4BBE-BCA9-4E95B2341039}.Debug|Win32.ActiveCfg = Debug|Win32
+ {34AC84C7-62A1-4BBE-BCA9-4E95B2341039}.Debug|Win32.Build.0 = Debug|Win32
+ {34AC84C7-62A1-4BBE-BCA9-4E95B2341039}.Release|Win32.ActiveCfg = Release|Win32
+ {34AC84C7-62A1-4BBE-BCA9-4E95B2341039}.Release|Win32.Build.0 = Release|Win32
+ {2475F8E8-A5C9-4785-8B09-5F4E120FC518}.Debug|Win32.ActiveCfg = Debug|Win32
+ {2475F8E8-A5C9-4785-8B09-5F4E120FC518}.Debug|Win32.Build.0 = Debug|Win32
+ {2475F8E8-A5C9-4785-8B09-5F4E120FC518}.Release|Win32.ActiveCfg = Release|Win32
+ {2475F8E8-A5C9-4785-8B09-5F4E120FC518}.Release|Win32.Build.0 = Release|Win32
+ {6716FB26-8298-47A3-A915-958AF0AC80F8}.Debug|Win32.ActiveCfg = Debug|Win32
+ {6716FB26-8298-47A3-A915-958AF0AC80F8}.Debug|Win32.Build.0 = Debug|Win32
+ {6716FB26-8298-47A3-A915-958AF0AC80F8}.Release|Win32.ActiveCfg = Release|Win32
+ {6716FB26-8298-47A3-A915-958AF0AC80F8}.Release|Win32.Build.0 = Release|Win32
+ {3C5CFAEC-6830-4491-9008-1C9E8C381C50}.Debug|Win32.ActiveCfg = Debug|Win32
+ {3C5CFAEC-6830-4491-9008-1C9E8C381C50}.Debug|Win32.Build.0 = Debug|Win32
+ {3C5CFAEC-6830-4491-9008-1C9E8C381C50}.Release|Win32.ActiveCfg = Release|Win32
+ {3C5CFAEC-6830-4491-9008-1C9E8C381C50}.Release|Win32.Build.0 = Release|Win32
+ {45410EFE-65C0-4078-82E4-D636258F9225}.Debug|Win32.ActiveCfg = Debug|Win32
+ {45410EFE-65C0-4078-82E4-D636258F9225}.Debug|Win32.Build.0 = Debug|Win32
+ {45410EFE-65C0-4078-82E4-D636258F9225}.Release|Win32.ActiveCfg = Release|Win32
+ {45410EFE-65C0-4078-82E4-D636258F9225}.Release|Win32.Build.0 = Release|Win32
+ {33775109-60CF-4C1C-A869-5450B3DD88B3}.Debug|Win32.ActiveCfg = Debug|Win32
+ {33775109-60CF-4C1C-A869-5450B3DD88B3}.Debug|Win32.Build.0 = Debug|Win32
+ {33775109-60CF-4C1C-A869-5450B3DD88B3}.Release|Win32.ActiveCfg = Release|Win32
+ {33775109-60CF-4C1C-A869-5450B3DD88B3}.Release|Win32.Build.0 = Release|Win32
+ {808BF985-CA18-4E55-8AAC-70E04DC25117}.Debug|Win32.ActiveCfg = Debug|Win32
+ {808BF985-CA18-4E55-8AAC-70E04DC25117}.Debug|Win32.Build.0 = Debug|Win32
+ {808BF985-CA18-4E55-8AAC-70E04DC25117}.Release|Win32.ActiveCfg = Release|Win32
+ {808BF985-CA18-4E55-8AAC-70E04DC25117}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+EndGlobal
diff --git a/phrase-extract/relax-parse.cpp b/phrase-extract/relax-parse.cpp
new file mode 100644
index 000000000..6e561b921
--- /dev/null
+++ b/phrase-extract/relax-parse.cpp
@@ -0,0 +1,277 @@
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored hierarchical phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include "relax-parse.h"
+
+#include "SafeGetline.h"
+#include "tables-core.h"
+
+using namespace std;
+
+// Reads one XML-annotated parse per line from stdin, applies the relaxation
+// schemes selected on the command line and writes the sentence plus all tree
+// nodes to stdout.
+int main(int argc, char* argv[])
+{
+  // translate the command line switches into the global flags
+  init( argc, argv );
+
+  char inBuffer[LINE_MAX_LENGTH];
+
+  // one sentence per input line; the counter is only used for progress output
+  for(int sentenceCount=1; ; sentenceCount++) {
+    if (sentenceCount%1000 == 0) cerr << "." << flush;
+    if (sentenceCount%10000 == 0) cerr << ":" << flush;
+    if (sentenceCount%100000 == 0) cerr << "!" << flush;
+
+    // fetch the next line; stop once the input is exhausted
+    SAFE_GETLINE( cin, inBuffer, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (cin.eof()) break;
+
+    // split the line into the plain sentence and its syntax tree
+    string sentence( inBuffer );
+    set< string > unusedLabels; // set of labels, not used
+    map< string, int > unusedTopLabels; // count of top labels, not used
+    SyntaxTree tree;
+    ProcessAndStripXMLTags( sentence, tree, unusedLabels, unusedTopLabels );
+    vector< string > inWords = tokenize( sentence.c_str() );
+
+    // cerr << "BEFORE:" << endl << tree;
+
+    ParentNodes parents = tree.Parse();
+
+    // apply whichever relaxation schemes were requested
+    if (leftBinarizeFlag) {
+      LeftBinarize( tree, parents );
+    }
+    if (rightBinarizeFlag) {
+      RightBinarize( tree, parents );
+    }
+    if (SAMTLevel>0) {
+      SAMT( tree, parents );
+    }
+
+    // cerr << "AFTER:" << endl << tree;
+
+    store( tree, inWords );
+  }
+}
+
+// initialize settings from switches
+
+// Parses the command line switches into the global flags leftBinarizeFlag,
+// rightBinarizeFlag and SAMTLevel.
+//
+// Prints a banner to stderr. Exits with status 1 (after printing a message)
+// when no switch is given, when an unknown switch is found, or when --SAMT is
+// not followed by a level.
+void init(int argc, char* argv[])
+{
+  cerr << "Parse Relaxer v1.0, written by Philipp Koehn\n";
+  cerr << "adds additional constituents to a parse tree\n";
+
+  if (argc < 2) {
+    cerr << "syntax: relax-parse < in-parse > out-parse ["
+         << " --LeftBinarize | --RightBinarize |"
+         << " --SAMT 1-4 ]" << endl;
+    exit(1);
+  }
+
+  for(int i=1; i<argc; i++) {
+    // add constituents with binarization
+    if (strcmp(argv[i],"--LeftBinarize") == 0) {
+      leftBinarizeFlag = true;
+    } else if (strcmp(argv[i],"--RightBinarize") == 0) {
+      rightBinarizeFlag = true;
+    }
+
+    // add constituents according to samt (Zollmann/Venugopal)
+    else if (strcmp(argv[i],"--SAMT") == 0) {
+      // the level is a separate argument; without this check argv[argc]
+      // (a null pointer) would be handed to atoi
+      if (i+1 >= argc) {
+        cerr << "relax-grammar: syntax error, option '--SAMT' requires a level (1-4)\n";
+        exit(1);
+      }
+      SAMTLevel = atoi( argv[++i] );
+      // SAMTLevel is a char: convert it so the level is printed as a number
+      // and not as a raw control character
+      cerr << "using SAMT grammar, level " << static_cast<int>(SAMTLevel) << endl;
+    }
+
+    // error
+    else {
+      cerr << "relax-grammar: syntax error, unknown option '" << string(argv[i]) << "'\n";
+      exit(1);
+    }
+  }
+}
+
+// Writes one output line to stdout: the words separated by single blanks,
+// followed by one <tree span="start-end" label="..."/> element for every
+// node of the tree.
+void store( SyntaxTree &tree, vector< string > &words )
+{
+  // the sentence itself
+  for( vector< string >::const_iterator word = words.begin(); word != words.end(); ++word ) {
+    if (word != words.begin()) {
+      cout << " ";
+    }
+    cout << *word;
+  }
+
+  // every tree node as a self-closing XML element
+  vector< SyntaxNode* > allNodes = tree.GetAllNodes();
+  for( vector< SyntaxNode* >::const_iterator it = allNodes.begin(); it != allNodes.end(); ++it ) {
+    const SyntaxNode *node = *it;
+    cout << " <tree span=\"" << node->GetStart()
+         << "-" << node->GetEnd()
+         << "\" label=\"" << node->GetLabel()
+         << "\"/>";
+  }
+  cout << endl;
+}
+
+// Left-binarizes every parent that has three or more children (more than
+// three split points): for each proper prefix of at least two children a
+// node labelled "^" + parent label is added, spanning from the parent's
+// start to the end of that prefix.
+void LeftBinarize( SyntaxTree &tree, ParentNodes &parents )
+{
+  for(ParentNodes::const_iterator parent = parents.begin(); parent != parents.end(); parent++) {
+    const SplitPoints &point = *parent;
+    if (point.size() <= 3) {
+      continue; // fewer than three children: nothing to binarize
+    }
+
+    const size_t last = point.size()-1;
+
+    // copy the parent's label before the tree is modified
+    const vector< SyntaxNode* >& topNodes
+      = tree.GetNodes( point[0], point[last]-1);
+    string topLabel = topNodes[0]->GetLabel();
+
+    for(size_t i=2; i<last; i++) {
+      // cerr << "LeftBin " << point[0] << "-" << (point[last]-1) << ": " << point[0] << "-" << point[i]-1 << " ^" << topLabel << endl;
+      tree.AddNode( point[0], point[i]-1, "^" + topLabel );
+    }
+  }
+}
+
+// Right-binarizes every parent that has three or more children (more than
+// three split points): for each proper suffix of at least two children a
+// node labelled "^" + parent label is added, spanning from the start of that
+// suffix to the parent's end.
+void RightBinarize( SyntaxTree &tree, ParentNodes &parents )
+{
+  for(ParentNodes::const_iterator parent = parents.begin(); parent != parents.end(); parent++) {
+    const SplitPoints &point = *parent;
+    if (point.size() <= 3) {
+      continue; // fewer than three children: nothing to binarize
+    }
+
+    // last word covered by the parent
+    int endPoint = point[point.size()-1]-1;
+
+    // copy the parent's label before the tree is modified
+    const vector< SyntaxNode* >& topNodes
+      = tree.GetNodes( point[0], endPoint);
+    string topLabel = topNodes[0]->GetLabel();
+
+    const size_t stop = point.size()-2;
+    for(size_t i=1; i<stop; i++) {
+      // cerr << "RightBin " << point[0] << "-" << endPoint << ": " << point[i] << "-" << endPoint << " ^" << topLabel << endl;
+      tree.AddNode( point[i], endPoint, "^" + topLabel );
+    }
+  }
+}
+
+// Adds SAMT-style (Zollmann/Venugopal) constituents to `tree`.
+//
+// New nodes are first collected in a scratch tree and copied into `tree` at
+// the very end, so the lookups below only ever see the original nodes.
+//
+// Step 1 (per parent, always): for every two neighbouring children A and B a
+// node "A+B" is added; for parents with at least three children (four split
+// points) the nodes "TOP\FIRST" (everything but the first child) and
+// "TOP/LAST" (everything but the last child) are added.
+//
+// Step 2 (only if the global SAMTLevel is above 1): every span of two or more
+// words, excluding the span over all words, that has no node in either tree
+// is tried as "A++B" (two adjacent constituents), then "A//B" (constituent A
+// minus a constituent B to the right), then "A\\B" (constituent A minus a
+// constituent B to the left). With SAMTLevel >= 4, spans that match nothing
+// get the label "_FAIL".
+void SAMT( SyntaxTree &tree, ParentNodes &parents )
+{
+ int numWords = tree.GetNumWords();
+
+ SyntaxTree newTree; // to store new nodes
+
+ // look through parents to combine children
+ for(ParentNodes::const_iterator p = parents.begin(); p != parents.end(); p++) {
+ const SplitPoints &point = *p;
+
+ // neighboring children: DET+ADJ
+ if (point.size() >= 3) {
+ // cerr << "complex parent: ";
+ // for(int i=0;i<point.size();i++) cerr << point[i] << " ";
+ // cerr << endl;
+
+ // children i and i+1 cover point[i]..point[i+1]-1 and
+ // point[i+1]..point[i+2]-1
+ for(size_t i = 0; i+2 < point.size(); i++) {
+ // cerr << "\tadding " << point[i] << ";" << point[i+1] << ";" << (point[i+2]-1) << ": " << tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel() << "+" << tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() << endl;
+
+ newTree.AddNode( point[i],point[i+2]-1,
+ tree.GetNodes(point[i ],point[i+1]-1)[0]->GetLabel()
+ + "+" +
+ tree.GetNodes(point[i+1],point[i+2]-1)[0]->GetLabel() );
+ }
+ }
+ // parent minus its first child, and parent minus its last child
+ if (point.size() >= 4) {
+ int ps = point.size();
+ string topLabel = tree.GetNodes(point[0],point[ps-1]-1)[0]->GetLabel();
+
+ // cerr << "\tadding " << topLabel + "\\" + tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() << endl;
+ newTree.AddNode( point[1],point[ps-1]-1,
+ topLabel
+ + "\\" +
+ tree.GetNodes(point[0],point[1]-1)[0]->GetLabel() );
+
+ // cerr << "\tadding " << topLabel + "/" + tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() << endl;
+ newTree.AddNode( point[0],point[ps-2]-1,
+ topLabel
+ + "/" +
+ tree.GetNodes(point[ps-2],point[ps-1]-1)[0]->GetLabel() );
+ }
+ }
+
+ // rules for any bordering constituents...
+ // (spans are enumerated by increasing length; [start,end] is inclusive)
+ for(int size = 2; size < numWords; size++) {
+ for(int start = 0; start < numWords-size+1; start++) {
+ int end = start+size-1;
+ bool done = false;
+
+ // skip spans that already carry a node; at SAMT level 1 this whole
+ // step is skipped
+ if (tree.HasNode( start,end ) || newTree.HasNode( start,end )
+ || SAMTLevel <= 1) {
+ continue;
+ }
+
+ // if matching two adjacent parse constituents: use ++
+
+ for(int mid=start+1; mid<=end && !done; mid++) {
+ if (tree.HasNode(start,mid-1) && tree.HasNode(mid,end)) {
+ // cerr << "\tadding " << tree.GetNodes(start,mid-1)[0]->GetLabel() << "++" << tree.GetNodes(mid, end )[0]->GetLabel() << endl;
+
+ newTree.AddNode( start, end,
+ tree.GetNodes(start,mid-1)[0]->GetLabel()
+ + "++" +
+ tree.GetNodes(mid, end )[0]->GetLabel() );
+ done = true;
+ }
+ }
+ if (done) continue;
+
+ // if matching a constituent A right-minus const. B: use A//B
+ for(int postEnd=end+1; postEnd<numWords && !done; postEnd++) {
+ if (tree.HasNode(start,postEnd) && tree.HasNode(end+1,postEnd)) {
+ newTree.AddNode( start, end,
+ tree.GetNodes(start,postEnd)[0]->GetLabel()
+ + "//" +
+ tree.GetNodes(end+1,postEnd)[0]->GetLabel() );
+ done = true;
+ }
+ }
+ if (done) continue;
+
+ // if matching a constituent A left-minus constituent B: use A\\B
+ // NOTE(review): unlike the two searches above, this loop does not stop
+ // at the first match (no "&& !done"), so several nodes may be added for
+ // the same span -- confirm whether that is intended.
+ for(int preStart=start-1; preStart>=0; preStart--) {
+ if (tree.HasNode(preStart,end) && tree.HasNode(preStart,start-1)) {
+ // cerr << "\tadding " << tree.GetNodes(preStart,end )[0]->GetLabel() << "\\\\" <<tree.GetNodes(preStart,start-1)[0]->GetLabel() << endl;
+ newTree.AddNode( start, end,
+ tree.GetNodes(preStart,end )[0]->GetLabel()
+ + "\\\\" +
+ tree.GetNodes(preStart,start-1)[0]->GetLabel() );
+ done = true;
+ }
+ }
+ if (done) continue;
+
+ // if matching three consecutive constituents, use double-plus
+ // SAMT Level 3, not yet implemented
+
+ // else: assign default category _FAIL
+ if (SAMTLevel>=4) {
+ newTree.AddNode( start, end, "_FAIL" );
+ }
+ }
+ }
+
+ // adding all new nodes
+ vector< SyntaxNode* > nodes = newTree.GetAllNodes();
+ for( size_t i=0; i<nodes.size(); i++ ) {
+ tree.AddNode( nodes[i]->GetStart(), nodes[i]->GetEnd(), nodes[i]->GetLabel());
+ }
+}
diff --git a/phrase-extract/relax-parse.h b/phrase-extract/relax-parse.h
new file mode 100644
index 000000000..ae5994641
--- /dev/null
+++ b/phrase-extract/relax-parse.h
@@ -0,0 +1,46 @@
+// vim:tabstop=2
+
+/***********************************************************************
+ Moses - factored hierarchical phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#pragma once
+
+#include <vector>
+#include <string>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <cstring>
+
+#include "SyntaxTree.h"
+#include "XmlTree.h"
+
+#define LINE_MAX_LENGTH 1000000
+
+// Settings filled in by init() from the command line switches.
+// NOTE(review): these are non-inline definitions in a header, so the header
+// can only be included from a single translation unit.
+bool leftBinarizeFlag = false; // set by --LeftBinarize
+bool rightBinarizeFlag = false; // set by --RightBinarize
+// Set by --SAMT <level>; 0 means SAMT is not applied. Stored as a char, so
+// streaming it to an ostream prints a character rather than a number.
+char SAMTLevel = 0;
+
+// functions
+// parse the command line switches into the settings above
+void init(int argc, char* argv[]);
+// write the words and all tree nodes of one sentence to stdout
+void store( SyntaxTree &tree, std::vector<std::string> &words );
+// grammar relaxation schemes; each adds nodes to the tree
+void LeftBinarize( SyntaxTree &tree, ParentNodes &parents );
+void RightBinarize( SyntaxTree &tree, ParentNodes &parents );
+void SAMT( SyntaxTree &tree, ParentNodes &parents );
+
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
new file mode 100644
index 000000000..5e0ade627
--- /dev/null
+++ b/phrase-extract/score.cpp
@@ -0,0 +1,761 @@
+/***********************************************************************
+ Moses - factored phrase-based language decoder
+ Copyright (C) 2009 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ ***********************************************************************/
+
+#include <sstream>
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <stdlib.h>
+#include <assert.h>
+#include <cstring>
+#include <set>
+
+#include "SafeGetline.h"
+#include "tables-core.h"
+#include "PhraseAlignment.h"
+#include "score.h"
+#include "InputFileStream.h"
+#include "OutputFileStream.h"
+
+using namespace std;
+
+#define LINE_MAX_LENGTH 100000
+
+Vocabulary vcbT;
+Vocabulary vcbS;
+
+// Lexical translation table: ltable[wordS][wordT] holds the stored value for
+// the word pair (wordS, wordT).
+class LexicalTable
+{
+public:
+  map< WORD_ID, map< WORD_ID, double > > ltable;
+
+  // Fills the table; the argument is passed the lexicon file name by main().
+  void load( char[] );
+
+  // Returns the stored value for (wordS, wordT), or 1.0 when the pair is not
+  // in the table. Uses find() only, so the lookup costs two map searches and
+  // never inserts into the table.
+  double permissiveLookup( WORD_ID wordS, WORD_ID wordT ) {
+    // cout << endl << vcbS.getWord( wordS ) << "-" << vcbT.getWord( wordT ) << ":";
+    map< WORD_ID, map< WORD_ID, double > >::const_iterator row = ltable.find( wordS );
+    if (row == ltable.end()) return 1.0;
+
+    map< WORD_ID, double >::const_iterator cell = row->second.find( wordT );
+    if (cell == row->second.end()) return 1.0;
+
+    // cout << cell->second;
+    return cell->second;
+  }
+};
+
+vector<string> tokenize( const char [] );
+
+void writeCountOfCounts( const string &fileNameCountOfCounts );
+void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
+PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+set<string> functionWordList;
+void loadFunctionWords( const char* fileNameFunctionWords );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
+ , map<size_t, map<size_t, float> > &sourceProb
+ , map<size_t, map<size_t, float> > &targetProb);
+void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
+void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
+
+// Global settings; main() sets them from the command line switches.
+LexicalTable lexTable; // loaded in main() unless --NoLex is given
+bool inverseFlag = false; // --Inverse (or "inverse")
+bool hierarchicalFlag = false; // --Hierarchical
+bool pcfgFlag = false; // --PCFG
+bool unpairedExtractFormatFlag = false; // --UnpairedExtractFormat
+bool conditionOnTargetLhsFlag = false; // --ConditionOnTargetLHS
+bool wordAlignmentFlag = false; // --WordAlignment
+bool goodTuringFlag = false; // --GoodTuring
+bool kneserNeyFlag = false; // --KneserNey
+// highest index used in countOfCounts[]
+#define COC_MAX 10
+bool logProbFlag = false; // --LogProb or --NegLogProb
+int negLogProb = 1; // set to -1 by --NegLogProb
+bool lexFlag = true; // cleared by --NoLex
+bool unalignedFlag = false; // --UnalignedPenalty
+bool unalignedFWFlag = false; // --UnalignedFunctionWordPenalty
+bool outputNTLengths = false; // --OutputNTLengths
+// entries 1..COC_MAX are written out by writeCountOfCounts()
+int countOfCounts[COC_MAX+1];
+// written as the first line of the count-of-counts file
+int totalDistinct = 0;
+// --MinCountHierarchical value minus 0.00001 (see main())
+float minCountHierarchical = 0;
+
+// Entry point of the phrase scorer.
+//
+// Usage: score extract lex phrase-table [options]
+//
+// Reads the extract file line by line, merges consecutive entries that
+// describe the same phrase pair (adding up their counts and PCFG sums),
+// collects all phrase pairs that share a source phrase and hands each such
+// batch to processPhrasePairs(), which writes to the phrase table ("-" means
+// stdout). With --GoodTuring or --KneserNey the count-of-counts statistics
+// are written to "<phrase-table>.coc" afterwards.
+//
+// Exits with status 1 on usage errors and when a file cannot be opened.
+int main(int argc, char* argv[])
+{
+  cerr << "Score v2.0 written by Philipp Koehn\n"
+       << "scoring methods for extracted rules\n";
+
+  if (argc < 4) {
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS]\n";
+    exit(1);
+  }
+  char* fileNameExtract = argv[1];
+  char* fileNameLex = argv[2];
+  char* fileNamePhraseTable = argv[3];
+  string fileNameCountOfCounts;
+  // only set (and only read) when --UnalignedFunctionWordPenalty is given
+  char* fileNameFunctionWords = NULL;
+
+  for(int i=4; i<argc; i++) {
+    if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
+      inverseFlag = true;
+      cerr << "using inverse mode\n";
+    } else if (strcmp(argv[i],"--Hierarchical") == 0) {
+      hierarchicalFlag = true;
+      cerr << "processing hierarchical rules\n";
+    } else if (strcmp(argv[i],"--PCFG") == 0) {
+      pcfgFlag = true;
+      cerr << "including PCFG scores\n";
+    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
+      unpairedExtractFormatFlag = true;
+      cerr << "processing unpaired extract format\n";
+    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
+      conditionOnTargetLhsFlag = true;
+      // the message used to be a copy of the --UnpairedExtractFormat one
+      cerr << "conditioning on target left-hand side\n";
+    } else if (strcmp(argv[i],"--WordAlignment") == 0) {
+      wordAlignmentFlag = true;
+      cerr << "outputing word alignment" << endl;
+    } else if (strcmp(argv[i],"--NoLex") == 0) {
+      lexFlag = false;
+      cerr << "not computing lexical translation score\n";
+    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
+      goodTuringFlag = true;
+      fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
+      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+    } else if (strcmp(argv[i],"--KneserNey") == 0) {
+      kneserNeyFlag = true;
+      fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
+      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+    } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
+      unalignedFlag = true;
+      cerr << "using unaligned word penalty\n";
+    } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
+      unalignedFWFlag = true;
+      if (i+1==argc) {
+        // the message used to talk about Kneser Ney count-of-count files
+        cerr << "ERROR: specify function words file for unaligned function word penalty!\n";
+        exit(1);
+      }
+      fileNameFunctionWords = argv[++i];
+      cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
+    } else if (strcmp(argv[i],"--LogProb") == 0) {
+      logProbFlag = true;
+      cerr << "using log-probabilities\n";
+    } else if (strcmp(argv[i],"--NegLogProb") == 0) {
+      logProbFlag = true;
+      negLogProb = -1;
+      cerr << "using negative log-probabilities\n";
+    } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
+      // the count is a separate argument; without this check argv[argc]
+      // (a null pointer) would be handed to atof
+      if (i+1==argc) {
+        cerr << "ERROR: specify count for --MinCountHierarchical!\n";
+        exit(1);
+      }
+      minCountHierarchical = atof(argv[++i]);
+      cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
+      minCountHierarchical -= 0.00001; // account for rounding
+    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
+      outputNTLengths = true;
+    } else {
+      cerr << "ERROR: unknown option " << argv[i] << endl;
+      exit(1);
+    }
+  }
+
+  // lexical translation table
+  if (lexFlag)
+    lexTable.load( fileNameLex );
+
+  // function word list
+  if (unalignedFWFlag)
+    loadFunctionWords( fileNameFunctionWords );
+
+  // compute count of counts for Good Turing discounting
+  if (goodTuringFlag || kneserNeyFlag) {
+    for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
+  }
+
+  // sorted phrase extraction file
+  Moses::InputFileStream extractFile(fileNameExtract);
+
+  if (extractFile.fail()) {
+    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
+    exit(1);
+  }
+  istream &extractFileP = extractFile;
+
+  // output file: phrase translation table
+  ostream *phraseTableFile;
+
+  if (strcmp(fileNamePhraseTable, "-") == 0) {
+    phraseTableFile = &cout;
+  }
+  else {
+    Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
+    bool success = outputFile->Open(fileNamePhraseTable);
+    if (!success) {
+      cerr << "ERROR: could not open file phrase table file "
+           << fileNamePhraseTable << endl;
+      exit(1);
+    }
+    phraseTableFile = outputFile;
+  }
+
+  // loop through all extracted phrase translations
+  float lastCount = 0.0f;
+  float lastPcfgSum = 0.0f;
+  vector< PhraseAlignment > phrasePairsWithSameF;
+  int i=0;
+  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
+  lastLine[0] = '\0';
+  // points at the most recently stored entry of phrasePairsWithSameF; it is
+  // refreshed after every push_back and reset whenever the vector is cleared
+  PhraseAlignment *lastPhrasePair = NULL;
+  while(true) {
+    if (extractFileP.eof()) break;
+    if (++i % 100000 == 0) cerr << "." << flush;
+    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (extractFileP.eof()) break;
+
+    // identical to last line? just add count
+    // (the NULL check covers an empty first line, which compares equal to
+    // the initially empty lastLine before any phrase pair exists)
+    if (lastPhrasePair != NULL && strcmp(line,lastLine) == 0) {
+      lastPhrasePair->count += lastCount;
+      lastPhrasePair->pcfgSum += lastPcfgSum;
+      continue;
+    }
+    strcpy( lastLine, line );
+
+    // create new phrase pair
+    PhraseAlignment phrasePair;
+    phrasePair.create( line, i );
+    lastCount = phrasePair.count;
+    lastPcfgSum = phrasePair.pcfgSum;
+
+    // only differs in count? just add count
+    if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
+      lastPhrasePair->count += phrasePair.count;
+      lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
+      continue;
+    }
+
+    // if new source phrase, process last batch
+    if (lastPhrasePair != NULL &&
+        lastPhrasePair->GetSource() != phrasePair.GetSource()) {
+      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+      phrasePairsWithSameF.clear();
+      lastPhrasePair = NULL;
+    }
+
+    // add phrase pairs to list, it's now the last one
+    phrasePairsWithSameF.push_back( phrasePair );
+    lastPhrasePair = &phrasePairsWithSameF.back();
+  }
+  // the final batch has not been written yet
+  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+
+  phraseTableFile->flush();
+  if (phraseTableFile != &cout) {
+    delete phraseTableFile;
+  }
+
+  // output count of count statistics
+  if (goodTuringFlag || kneserNeyFlag) {
+    writeCountOfCounts( fileNameCountOfCounts );
+  }
+}
+
+// Writes the count-of-counts statistics to the given file: first the total
+// number of distinct phrase pairs, then countOfCounts[1..COC_MAX], one value
+// per line. If the file cannot be opened, an error is printed to stderr and
+// nothing is written.
+void writeCountOfCounts( const string &fileNameCountOfCounts )
+{
+  Moses::OutputFileStream cocStream;
+  if (!cocStream.Open(fileNameCountOfCounts.c_str())) {
+    cerr << "ERROR: could not open count-of-counts file "
+         << fileNameCountOfCounts << endl;
+    return;
+  }
+
+  // Kneser-Ney needs the total number of phrase pairs
+  cocStream << totalDistinct << endl;
+
+  // one line per count, starting at count 1
+  for(int count=1; count<=COC_MAX; ++count) {
+    cocStream << countOfCounts[ count ] << endl;
+  }
+
+  cocStream.Close();
+}
+
+// Scores and writes all phrase pairs of one batch (main() passes pairs that
+// share a source phrase).
+//
+// The pairs are grouped through PhrasePairGroup, so that pairs the group
+// considers equivalent end up in the same collection; each collection is then
+// written with outputPhrasePair(), together with the summed count of the
+// whole batch and the number of groups.
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
+{
+  if (phrasePair.size() == 0) return;
+
+  // group phrase pairs based on alignments that matter
+  // (i.e. that re-arrange non-terminals)
+  PhrasePairGroup groups;
+
+  // summed count over the whole batch
+  float totalSource = 0;
+
+  for(vector< PhraseAlignment >::iterator it = phrasePair.begin(); it != phrasePair.end(); ++it) {
+    PhraseAlignment &current = *it;
+    totalSource += current.count;
+
+    // try to insert the pair as a group of its own ...
+    PhraseAlignmentCollection single;
+    single.push_back(&current);
+    pair<PhrasePairGroup::iterator, bool> inserted = groups.insert(single);
+
+    // ... and if an equivalent group is already there, join that one instead
+    if (!inserted.second) {
+      PhraseAlignmentCollection &existing = const_cast<PhraseAlignmentCollection&>(*inserted.first);
+      existing.push_back(&current);
+    }
+  }
+
+  // output the distinct phrase pairs, one at a time
+  const PhrasePairGroup::SortedColl &sorted = groups.GetSortedColl();
+  for(PhrasePairGroup::SortedColl::const_iterator g = sorted.begin(); g != sorted.end(); ++g) {
+    const PhraseAlignmentCollection &group = **g;
+    outputPhrasePair( group, totalSource, groups.GetSize(), phraseTableFile );
+  }
+}
+
+// Returns the element of `phrasePair` with the highest count; among equal
+// counts the first one wins.
+//
+// Returns NULL when the collection is empty or no element has a count above
+// -1 (previously an uninitialized pointer was returned in that case).
+PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+{
+  float bestAlignmentCount = -1;
+  PhraseAlignment* bestAlignment = NULL;
+
+  for(size_t i=0; i<phrasePair.size(); i++) {
+    if (phrasePair[i]->count > bestAlignmentCount) {
+      bestAlignmentCount = phrasePair[i]->count;
+      bestAlignment = phrasePair[i];
+    }
+  }
+
+  return bestAlignment;
+}
+
+
+// Converts length counts into relative frequencies.
+//
+// lengths - outer key: source position; inner key: length; value: count.
+// total   - denominator applied to every count.
+// probs   - receives count / total under the same (position, length) keys.
+//           Existing entries with other keys are left alone.
+void calcNTLengthProb(const map<size_t, map<size_t, size_t> > &lengths
+                      , size_t total
+                      , map<size_t, map<size_t, float> > &probs)
+{
+  typedef map<size_t, size_t> CountByLength;
+  typedef map<size_t, CountByLength> CountByPosition;
+
+  for (CountByPosition::const_iterator pos = lengths.begin(); pos != lengths.end(); ++pos) {
+    const CountByLength &histogram = pos->second;
+    for (CountByLength::const_iterator entry = histogram.begin(); entry != histogram.end(); ++entry) {
+      // position entry in probs is only created once there is a value to store
+      probs[pos->first][entry->first] = (float) entry->second / (float) total;
+    }
+  }
+}
+
+// Builds length distributions from the NT-length records of a set of
+// phrase pairs.
+//
+// phrasePairs - phrase pairs whose GetNTLengths() maps are tallied.
+// sourceProb  - receives, per source position, the relative frequency of each
+//               source-side length.
+// targetProb  - same, for the target-side length (still keyed by source position).
+//
+// Both outputs are left untouched when no phrase pair reports any length.
+void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
+                      , map<size_t, map<size_t, float> > &sourceProb
+                      , map<size_t, map<size_t, float> > &targetProb)
+{
+  typedef std::map<size_t, std::pair<size_t, size_t> > NTLengthMap;
+
+  // position in source phrase -> length -> number of occurrences
+  map<size_t, map<size_t, size_t> > srcHistogram, tgtHistogram;
+  // position in source phrase -> number of records seen for that position
+  map<size_t, size_t> perPositionTotal;
+
+  for (size_t p = 0; p < phrasePairs.size(); ++p) {
+    const NTLengthMap &spans = phrasePairs[p]->GetNTLengths();
+
+    for (NTLengthMap::const_iterator span = spans.begin(); span != spans.end(); ++span) {
+      const size_t pos = span->first;
+
+      ++srcHistogram[pos][span->second.first];
+      ++tgtHistogram[pos][span->second.second];
+      ++perPositionTotal[pos];
+    }
+  }
+
+  // no non-terminals at all: nothing to compute
+  if (perPositionTotal.empty())
+    return;
+
+  // the total of the first position is used as the denominator for all
+  // positions; only the second position is checked against it
+  const size_t total = perPositionTotal.begin()->second;
+  if (perPositionTotal.size() > 1) {
+    assert(total == (++perPositionTotal.begin())->second );
+  }
+
+  calcNTLengthProb(srcHistogram, total, sourceProb);
+  calcNTLengthProb(tgtHistogram, total, targetProb);
+}
+
+// Writes every (position, length, probability) triple in probs to
+// phraseTableFile as "<position>|<prefix>|<length>=<probability> ".
+//
+// Entries are written in key order of the two nested maps; each entry is
+// followed by a single space and no newline is emitted.
+void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t, float> > &probs, const string &prefix)
+{
+  typedef map<size_t, float> ProbByLength;
+  typedef map<size_t, ProbByLength> ProbByPosition;
+
+  for (ProbByPosition::const_iterator pos = probs.begin(); pos != probs.end(); ++pos) {
+    const ProbByLength &row = pos->second;
+    for (ProbByLength::const_iterator cell = row.begin(); cell != row.end(); ++cell) {
+      phraseTableFile << pos->first << "|" << prefix << "|" << cell->first << "=" << cell->second << " ";
+    }
+  }
+}
+
+// Writes one phrase-table line for a group of phrase pairs.
+//
+// phrasePair      - the group; element 0 supplies the source and target
+//                   phrases, and the element with the highest count supplies
+//                   the alignment. Nothing is written if the group is empty.
+// totalCount      - written as the first number of the counts field.
+// distinctCount   - written after the counts when kneserNeyFlag is set.
+// phraseTableFile - output stream.
+//
+// Line layout, fields separated by " ||| ":
+//   [source] ||| target ||| [source, if inverse] ||| scores ||| alignment
+//   ||| counts [||| NT length probabilities]
+// Which scores and optional fields appear is controlled by the global flags
+// read below. As a side effect the count-of-counts statistics are updated
+// when goodTuringFlag or kneserNeyFlag is set.
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
+{
+  if (phrasePair.size() == 0) return;
+
+  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+
+  // compute count: sum over all members of the group
+  float count = 0;
+  for(size_t i=0; i<phrasePair.size(); i++) {
+    count += phrasePair[i]->count;
+  }
+
+  // collect count of count statistics
+  if (goodTuringFlag || kneserNeyFlag) {
+    totalDistinct++;
+    // adding 0.99999 before truncation rounds fractional counts up
+    int countInt = count + 0.99999;
+    if(countInt <= COC_MAX)
+      countOfCounts[ countInt ]++;
+  }
+
+  // compute PCFG score (average of the members' pcfgSum, weighted by count).
+  // Initialised so that it never holds an indeterminate value; it is only
+  // printed under the same condition that computes it.
+  float pcfgScore = 0;
+  if (pcfgFlag && !inverseFlag) {
+    float pcfgSum = 0;
+    for(size_t i=0; i<phrasePair.size(); ++i) {
+      pcfgSum += phrasePair[i]->pcfgSum;
+    }
+    pcfgScore = pcfgSum / count;
+  }
+
+  // output phrases
+  const PHRASE &phraseS = phrasePair[0]->GetSource();
+  const PHRASE &phraseT = phrasePair[0]->GetTarget();
+
+  // do not output if hierarchical and count below threshold
+  // (applies only to rules whose source side contains a non-terminal;
+  // the last source symbol is not inspected)
+  if (hierarchicalFlag && count < minCountHierarchical) {
+    for(size_t j=0; j<phraseS.size()-1; j++) {
+      if (isNonTerminal(vcbS.getWord( phraseS[j] )))
+        return;
+    }
+  }
+
+  // source phrase (unless inverse)
+  if (! inverseFlag) {
+    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    phraseTableFile << " ||| ";
+  }
+
+  // target phrase
+  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+  phraseTableFile << " ||| ";
+
+  // source phrase (if inverse)
+  if (inverseFlag) {
+    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    phraseTableFile << " ||| ";
+  }
+
+  // lexical translation probability
+  if (lexFlag) {
+    double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
+    phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
+  }
+
+  // unaligned word penalty
+  if (unalignedFlag) {
+    double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
+    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+  }
+
+  // unaligned function word penalty
+  if (unalignedFWFlag) {
+    double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
+    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+  }
+
+  // target-side PCFG score
+  if (pcfgFlag && !inverseFlag) {
+    phraseTableFile << " " << pcfgScore;
+  }
+
+  phraseTableFile << " ||| ";
+
+  // alignment info for non-terminals
+  if (! inverseFlag) {
+    if (hierarchicalFlag) {
+      // always output alignment if hiero style, but only for non-terms
+      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+      for(size_t j = 0; j < phraseT.size() - 1; j++) {
+        if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
+          if (bestAlignment->alignedToT[ j ].size() != 1) {
+            cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
+            phraseTableFile.flush();
+            assert(bestAlignment->alignedToT[ j ].size() == 1);
+            // The assert disappears when NDEBUG is defined; stop explicitly
+            // so the code below never dereferences begin() of a set that
+            // may be empty.
+            exit(1);
+          }
+          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+          phraseTableFile << sourcePos << "-" << j << " ";
+        }
+      }
+    } else if (wordAlignmentFlag) {
+      // alignment info in pb model
+      for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
+        const set< size_t > &aligned = bestAlignment->alignedToT[j];
+        for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
+          phraseTableFile << *p << "-" << j << " ";
+        }
+      }
+    }
+  }
+
+  // counts
+
+  phraseTableFile << " ||| " << totalCount << " " << count;
+  if (kneserNeyFlag)
+    phraseTableFile << " " << distinctCount;
+
+  // nt lengths
+  if (outputNTLengths)
+  {
+    phraseTableFile << " ||| ";
+
+    if (!inverseFlag)
+    {
+      map<size_t, map<size_t, float> > sourceProb, targetProb;
+      // 1st sourcePos, 2nd = length, 3rd = prob
+
+      calcNTLengthProb(phrasePair, sourceProb, targetProb);
+
+      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
+      outputNTLengthProbs(phraseTableFile, targetProb, "T");
+    }
+  }
+
+  phraseTableFile << endl;
+}
+
+// Returns 2.718 raised to the number of target positions that have no
+// aligned source word (1.0 when every target position is aligned).
+//
+// phraseS and phraseT are not consulted; only alignment->alignedToT is read.
+// Source-side unaligned words are covered when the inverse table is computed.
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+{
+  double penalty = 1.0;
+  const size_t numTargetPositions = alignment->alignedToT.size();
+  for (size_t t = 0; t != numTargetPositions; ++t) {
+    if (!alignment->alignedToT[ t ].empty())
+      continue;
+    penalty *= 2.718;
+  }
+  return penalty;
+}
+
+// Like computeUnalignedPenalty, but a target position only contributes a
+// factor of 2.718 when it is unaligned AND its word is in functionWordList.
+//
+// phraseS is not consulted. Source-side words are covered when the inverse
+// table is computed.
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+{
+  double penalty = 1.0;
+  const size_t numTargetPositions = alignment->alignedToT.size();
+  for (size_t t = 0; t != numTargetPositions; ++t) {
+    // aligned words never count
+    if (!alignment->alignedToT[ t ].empty())
+      continue;
+    // the word is looked up only for unaligned positions
+    if (functionWordList.find( vcbT.getWord( phraseT[ t ] ) ) == functionWordList.end())
+      continue;
+    penalty *= 2.718;
+  }
+  return penalty;
+}
+
+// Reads a function word list from fileName into functionWordList.
+//
+// The first whitespace-separated token of every line is inserted; lines that
+// produce no token are ignored. Progress is reported on stderr. Terminates
+// the program with exit status 1 when the file cannot be opened.
+void loadFunctionWords( const char *fileName )
+{
+  cerr << "Loading function word list from " << fileName;
+  ifstream inFile;
+  inFile.open(fileName);
+  if (inFile.fail()) {
+    cerr << " - ERROR: could not open file\n";
+    exit(1);
+  }
+  istream *inFileP = &inFile;
+
+  char line[LINE_MAX_LENGTH];
+  while(true) {
+    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (inFileP->eof()) break;
+    vector<string> token = tokenize( line );
+    if (token.size() > 0)
+      functionWordList.insert( token[0] );
+  }
+  // close exactly once (the stream used to be closed a second time after
+  // the summary message)
+  inFile.close();
+
+  cerr << " - read " << functionWordList.size() << " function words\n";
+}
+
+// Lexical translation score of a phrase pair under the given alignment.
+//
+// The score is the product, over every target position, of
+//   - the lexTable entry for (NULL, target word) when the position is
+//     unaligned, or
+//   - the average of the lexTable entries for (source word, target word)
+//     over all source words aligned to that position.
+// Returns 1.0 when the alignment has no target positions.
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+{
+  typedef set< size_t > SourceIndexSet;
+
+  const int nullWordId = vcbS.getWordID("NULL");
+  double product = 1.0;
+
+  // all target words have to be explained
+  for (size_t t = 0; t < alignment->alignedToT.size(); ++t) {
+    const SourceIndexSet &sources = alignment->alignedToT[ t ];
+
+    if (sources.empty()) {
+      // unaligned target word: explained by NULL
+      product *= lexTable.permissiveLookup( nullWordId, phraseT[ t ] );
+      continue;
+    }
+
+    // aligned target word: average over its aligned source words
+    double sum = 0;
+    for (SourceIndexSet::const_iterator s = sources.begin(); s != sources.end(); ++s) {
+      sum += lexTable.permissiveLookup( phraseS[ *s ], phraseT[ t ] );
+    }
+    product *= sum / (double)sources.size();
+  }
+  return product;
+}
+
+// Loads a lexical translation table from fileName into ltable.
+//
+// Every line must hold exactly three whitespace-separated tokens:
+// target word, source word, probability. The value is stored as
+// ltable[source id][target id]; both words are added to their vocabulary if
+// new. Lines with a different token count are reported on stderr and
+// skipped. Terminates the program with exit status 1 when the file cannot
+// be opened.
+void LexicalTable::load( char *fileName )
+{
+  cerr << "Loading lexical translation table from " << fileName;
+  ifstream inFile;
+  inFile.open(fileName);
+  if (inFile.fail()) {
+    cerr << " - ERROR: could not open file\n";
+    exit(1);
+  }
+  istream *inFileP = &inFile;
+
+  char line[LINE_MAX_LENGTH];
+
+  int i=0;
+  while(true) {
+    i++;
+    // progress indicator
+    if (i%100000 == 0) cerr << "." << flush;
+    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (inFileP->eof()) break;
+
+    vector<string> token = tokenize( line );
+    if (token.size() != 3) {
+      // A blank line yields zero tokens, so token[0] must not be read
+      // unconditionally.
+      const string firstToken = token.empty() ? string() : token[0];
+      cerr << "line " << i << " in " << fileName
+           << " has wrong number of tokens, skipping:\n"
+           << token.size() << " " << firstToken << " " << line << endl;
+      continue;
+    }
+
+    double prob = atof( token[2].c_str() );
+    WORD_ID wordT = vcbT.storeIfNew( token[0] );
+    WORD_ID wordS = vcbS.storeIfNew( token[1] );
+    ltable[ wordS ][ wordT ] = prob;
+  }
+  cerr << endl;
+}
+
+// Writes the source side of a rule to out: every source symbol except the
+// last, each followed by a space, then the root symbol.
+//
+// When unpairedExtractFormatFlag is set, a source non-terminal is written
+// together with the target non-terminal it is aligned to (target symbol
+// first if inverseFlag is set, source symbol first otherwise).
+// The root is written as "[X]" when conditionOnTargetLhsFlag is set and
+// inverseFlag is not; otherwise the last source symbol is written.
+void printSourcePhrase(const PHRASE &phraseS, const PHRASE &phraseT,
+                       const PhraseAlignment &bestAlignment, ostream &out)
+{
+  // output source symbols, except root, in rule table format
+  for (std::size_t i = 0; i < phraseS.size()-1; ++i) {
+    const std::string &word = vcbS.getWord(phraseS[i]);
+    if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
+      out << word << " ";
+      continue;
+    }
+    // get corresponding target non-terminal and output pair.
+    // Bound by reference: the set used to be copied for every non-terminal.
+    const std::set<std::size_t> &alignmentPoints = bestAlignment.alignedToS[i];
+    assert(alignmentPoints.size() == 1);
+    const std::size_t j = *(alignmentPoints.begin());
+    if (inverseFlag) {
+      out << vcbT.getWord(phraseT[j]) << word << " ";
+    } else {
+      out << word << vcbT.getWord(phraseT[j]) << " ";
+    }
+  }
+  // output source root symbol
+  if (conditionOnTargetLhsFlag && !inverseFlag) {
+    out << "[X]";
+  } else {
+    out << vcbS.getWord(phraseS.back());
+  }
+}
+
+// Writes the target side of a rule to out: every target symbol except the
+// last, each followed by a space, then the root symbol.
+//
+// When unpairedExtractFormatFlag is set, a target non-terminal is written
+// together with the source non-terminal it is aligned to (target symbol
+// first if inverseFlag is set, source symbol first otherwise).
+// Root symbol: with conditionOnTargetLhsFlag it is "[X]" in inverse mode and
+// the last source symbol otherwise; without the flag it is the last target
+// symbol.
+void printTargetPhrase(const PHRASE &phraseS, const PHRASE &phraseT,
+                       const PhraseAlignment &bestAlignment, ostream &out)
+{
+  // output target symbols, except root, in rule table format
+  for (std::size_t i = 0; i < phraseT.size()-1; ++i) {
+    const std::string &word = vcbT.getWord(phraseT[i]);
+    if (!unpairedExtractFormatFlag || !isNonTerminal(word)) {
+      out << word << " ";
+      continue;
+    }
+    // get corresponding source non-terminal and output pair.
+    // Bound by reference: the set used to be copied for every non-terminal.
+    const std::set<std::size_t> &alignmentPoints = bestAlignment.alignedToT[i];
+    assert(alignmentPoints.size() == 1);
+    const std::size_t j = *(alignmentPoints.begin());
+    if (inverseFlag) {
+      out << word << vcbS.getWord(phraseS[j]) << " ";
+    } else {
+      out << vcbS.getWord(phraseS[j]) << word << " ";
+    }
+  }
+  // output target root symbol
+  if (conditionOnTargetLhsFlag) {
+    if (inverseFlag) {
+      out << "[X]";
+    } else {
+      out << vcbS.getWord(phraseS.back());
+    }
+  } else {
+    out << vcbT.getWord(phraseT.back());
+  }
+}
+
+// Inserts obj into the group set.
+//
+// Returns the set's own insert result: an iterator to the stored element and
+// a flag that is true when obj was newly added. A newly added element is also
+// appended (by address) to the vector returned by GetSortedColl().
+std::pair<PhrasePairGroup::Coll::iterator,bool> PhrasePairGroup::insert ( const PhraseAlignmentCollection& obj )
+{
+  const std::pair<iterator,bool> outcome = m_coll.insert(obj);
+
+  const bool isNewGroup = outcome.second;
+  if (isNewGroup) {
+    // remember the element stored inside the set, not the caller's object
+    m_sortedColl.push_back(&(*outcome.first));
+  }
+
+  return outcome;
+}
+
+
diff --git a/phrase-extract/score.h b/phrase-extract/score.h
new file mode 100644
index 000000000..9faa144c5
--- /dev/null
+++ b/phrase-extract/score.h
@@ -0,0 +1,65 @@
+#pragma once
+/*
+ * score.h
+ * extract
+ *
+ * Created by Hieu Hoang on 28/07/2010.
+ * Copyright 2010 __MyCompanyName__. All rights reserved.
+ *
+ */
+#include <string>
+#include <vector>
+
+class PhraseAlignment;
+
+typedef std::vector<PhraseAlignment*> PhraseAlignmentCollection;
+//typedef std::vector<PhraseAlignmentCollection> PhrasePairGroup;
+
+// Comparison functor for PhraseAlignmentCollection objects.
+// Only the first element of each collection takes part in the comparison.
+class PhraseAlignmentCollectionOrderer
+{
+public:
+  // Returns true when the first phrase alignment of collA compares less than
+  // the first phrase alignment of collB. Both collections must be non-empty.
+  bool operator()(const PhraseAlignmentCollection &collA, const PhraseAlignmentCollection &collB) const
+  {
+    assert(!collA.empty());
+    assert(!collB.empty());
+
+    return *collA.front() < *collB.front();
+  }
+};
+
+
+//typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> PhrasePairGroup;
+
+// Set of phrase-alignment collections, ordered by
+// PhraseAlignmentCollectionOrderer, plus a vector of pointers to the stored
+// collections.
+class PhrasePairGroup
+{
+private:
+  typedef std::set<PhraseAlignmentCollection, PhraseAlignmentCollectionOrderer> Coll;
+
+  // the groups themselves
+  Coll m_coll;
+
+public:
+  typedef Coll::iterator iterator;
+  typedef Coll::const_iterator const_iterator;
+  typedef std::vector<const PhraseAlignmentCollection *> SortedColl;
+
+  // Adds obj unless an equivalent collection is already stored; see the
+  // out-of-line definition for the return value.
+  std::pair<Coll::iterator,bool> insert ( const PhraseAlignmentCollection& obj );
+
+  // Pointers to the stored collections.
+  const SortedColl &GetSortedColl() const {
+    return m_sortedColl;
+  }
+
+  // Number of distinct collections stored.
+  size_t GetSize() const {
+    return m_coll.size();
+  }
+
+private:
+  SortedColl m_sortedColl;
+};
+
+// other functions *********************************************
+// Returns true when word is at least three characters long, starts with '['
+// and ends with ']'.
+inline bool isNonTerminal( const std::string &word )
+{
+  const std::string::size_type len = word.length();
+  if (len < 3)
+    return false;
+  return word[0] == '[' && word[len-1] == ']';
+}
diff --git a/phrase-extract/score.vcxproj b/phrase-extract/score.vcxproj
new file mode 100644
index 000000000..e5dc1df9a
--- /dev/null
+++ b/phrase-extract/score.vcxproj
@@ -0,0 +1,106 @@
+<?xml version="1.0" encoding="utf-8"?>
+<Project DefaultTargets="Build" ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
+ <ItemGroup Label="ProjectConfigurations">
+ <ProjectConfiguration Include="Debug|Win32">
+ <Configuration>Debug</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ <ProjectConfiguration Include="Release|Win32">
+ <Configuration>Release</Configuration>
+ <Platform>Win32</Platform>
+ </ProjectConfiguration>
+ </ItemGroup>
+ <PropertyGroup Label="Globals">
+ <ProjectGuid>{34AC84C7-62A1-4BBE-BCA9-4E95B2341039}</ProjectGuid>
+ <RootNamespace>score</RootNamespace>
+ <Keyword>Win32Proj</Keyword>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.Default.props" />
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ <WholeProgramOptimization>true</WholeProgramOptimization>
+ </PropertyGroup>
+ <PropertyGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="Configuration">
+ <ConfigurationType>Application</ConfigurationType>
+ <CharacterSet>Unicode</CharacterSet>
+ </PropertyGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
+ <ImportGroup Label="ExtensionSettings">
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <ImportGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'" Label="PropertySheets">
+ <Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
+ </ImportGroup>
+ <PropertyGroup Label="UserMacros" />
+ <PropertyGroup>
+ <_ProjectFileVersion>10.0.30319.1</_ProjectFileVersion>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">true</LinkIncremental>
+ <OutDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</OutDir>
+ <IntDir Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">$(Configuration)\</IntDir>
+ <LinkIncremental Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">false</LinkIncremental>
+ </PropertyGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Debug|Win32'">
+ <ClCompile>
+ <Optimization>Disabled</Optimization>
+ <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <MinimalRebuild>true</MinimalRebuild>
+ <BasicRuntimeChecks>EnableFastChecks</BasicRuntimeChecks>
+ <RuntimeLibrary>MultiThreadedDebugDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>EditAndContinue</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <RandomizedBaseAddress>false</RandomizedBaseAddress>
+ <DataExecutionPrevention>
+ </DataExecutionPrevention>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|Win32'">
+ <ClCompile>
+ <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+ <RuntimeLibrary>MultiThreadedDLL</RuntimeLibrary>
+ <PrecompiledHeader>
+ </PrecompiledHeader>
+ <WarningLevel>Level3</WarningLevel>
+ <DebugInformationFormat>ProgramDatabase</DebugInformationFormat>
+ </ClCompile>
+ <Link>
+ <AdditionalDependencies>zdll.lib;%(AdditionalDependencies)</AdditionalDependencies>
+ <GenerateDebugInformation>true</GenerateDebugInformation>
+ <SubSystem>Console</SubSystem>
+ <OptimizeReferences>true</OptimizeReferences>
+ <EnableCOMDATFolding>true</EnableCOMDATFolding>
+ <RandomizedBaseAddress>false</RandomizedBaseAddress>
+ <DataExecutionPrevention>
+ </DataExecutionPrevention>
+ <TargetMachine>MachineX86</TargetMachine>
+ </Link>
+ </ItemDefinitionGroup>
+ <ItemGroup>
+ <ClCompile Include="AlignmentPhrase.cpp" />
+ <ClCompile Include="InputFileStream.cpp" />
+ <ClCompile Include="PhraseAlignment.cpp" />
+ <ClCompile Include="score.cpp" />
+ <ClCompile Include="tables-core.cpp" />
+ </ItemGroup>
+ <ItemGroup>
+ <ClInclude Include="AlignmentPhrase.h" />
+ <ClInclude Include="InputFileStream.h" />
+ <ClInclude Include="PhraseAlignment.h" />
+ <ClInclude Include="tables-core.h" />
+ </ItemGroup>
+ <Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
+ <ImportGroup Label="ExtensionTargets">
+ </ImportGroup>
+</Project> \ No newline at end of file
diff --git a/phrase-extract/statistics.cpp b/phrase-extract/statistics.cpp
new file mode 100644
index 000000000..a39e98a5d
--- /dev/null
+++ b/phrase-extract/statistics.cpp
@@ -0,0 +1,340 @@
+// $Id$
+// vim:tabstop=2
+
+#include <sstream>
+#include <cstdio>
+#include <iostream>
+#include <fstream>
+#include <vector>
+#include <string>
+#include <stdlib.h>
+#include <assert.h>
+#include <time.h>
+
+#include "AlignmentPhrase.h"
+#include "SafeGetline.h"
+#include "tables-core.h"
+#include "InputFileStream.h"
+
+using namespace std;
+
+#define LINE_MAX_LENGTH 10000
+
+// One extracted phrase pair together with its word alignment.
+class PhraseAlignment
+{
+public:
+  // ids of the two phrases in phraseTableE / phraseTableF; assigned by
+  // create() when it stores the first in-bounds alignment point
+  int english;
+  int foreign;
+
+  // alignedToE[e] lists the foreign positions aligned to English position e;
+  // alignedToF[f] lists the English positions aligned to foreign position f
+  vector< vector<size_t> > alignedToE;
+  vector< vector<size_t> > alignedToF;
+
+  // Parses one line of the extract file; the int is the line number used in
+  // warnings. Returns true when the line holds more than two "|||"-separated
+  // fields.
+  bool create( char*, int );
+
+  // Empties both alignment tables.
+  void clear();
+
+  // Returns true when the other object has the same phrase ids and the same
+  // alignment points.
+  bool equals( const PhraseAlignment& );
+};
+
+// Lexical translation probabilities read from a file.
+class LexicalTable
+{
+public:
+  // ltable[foreign word id][english word id] = probability
+  map< WORD_ID, map< WORD_ID, double > > ltable;
+
+  // Fills ltable from the named file.
+  void load( char *fileName );
+};
+
+// writes the table entries for all phrase pairs sharing one foreign phrase
+void processPhrasePairs( vector< PhraseAlignment > & );
+
+// output stream of the phrase table being written
+ofstream phraseTableFile;
+
+// word vocabularies of the two languages
+Vocabulary vcbE;
+Vocabulary vcbF;
+// lexical translation table loaded at start-up
+LexicalTable lexTable;
+// phrase stores of the two languages; flushed whenever the foreign phrase changes
+PhraseTable phraseTableE;
+PhraseTable phraseTableF;
+// set in main() when a fourth command line argument is given
+bool inverseFlag = false;
+int phrasePairBase = 0; // only used for "proper" conditioning
+
+// Entry point of the statistics tool.
+//
+// Usage: statistics extract lex phrase-table [inverse]
+//   extract      - phrase extraction file, read line by line
+//   lex          - lexical translation table
+//   phrase-table - output file
+//   inverse      - any fourth argument switches on inverse mode
+//
+// Consecutive lines with the same foreign phrase id are collected and handed
+// to processPhrasePairs() as one batch. Exits with status 1 on a wrong
+// argument count or when a file cannot be opened.
+int main(int argc, char* argv[])
+{
+  cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
+       << "modifying PhraseScore v1.4 written by Philipp Koehn\n"
+       << "It computes statistics for extracted phrase pairs\n"
+       << "if (direct):\n"
+       << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
+       << "if (inverse)\n"
+       << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";
+
+  if (argc != 4 && argc != 5) {
+    cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
+    exit(1);
+  }
+  char* &fileNameExtract = argv[1];
+  char* &fileNameLex = argv[2];
+  char* &fileNamePhraseTable = argv[3];
+  inverseFlag = false;
+  if (argc > 4) {
+    inverseFlag = true;
+    cerr << "using inverse mode\n";
+  }
+
+  // lexical translation table
+  lexTable.load( fileNameLex );
+
+  // sorted phrase extraction file
+  Moses::InputFileStream extractFile(fileNameExtract);
+
+  if (extractFile.fail()) {
+    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
+    exit(1);
+  }
+  istream &extractFileP = extractFile;
+
+  // output file: phrase translation table
+  phraseTableFile.open(fileNamePhraseTable);
+  if (phraseTableFile.fail()) {
+    cerr << "ERROR: could not open file phrase table file "
+         << fileNamePhraseTable << endl;
+    exit(1);
+  }
+
+  // loop through all extracted phrase translations
+  int lastForeign = -1;
+  vector< PhraseAlignment > phrasePairsWithSameF;
+  int i=0;
+  while(true) {
+    if (extractFileP.eof()) break;
+    // progress indicator
+    if (++i % 100000 == 0) cerr << "." << flush;
+    char line[LINE_MAX_LENGTH];
+    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (extractFileP.eof())
+      break;
+    PhraseAlignment phrasePair;
+    bool isPhrasePair = phrasePair.create( line, i );
+    // NOTE(review): create() assigns `foreign` only once it has stored an
+    // in-bounds alignment point; for other lines the value compared here
+    // looks unset - confirm against the extract file format.
+    if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
+      // foreign phrase changed: write out the finished batch and reset
+      processPhrasePairs( phrasePairsWithSameF );
+      for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
+        phrasePairsWithSameF[j].clear();
+      phrasePairsWithSameF.clear();
+      phraseTableE.clear();
+      phraseTableF.clear();
+      phrasePair.clear(); // process line again, since phrase tables flushed
+      phrasePair.create( line, i );
+      phrasePairBase = 0;
+    }
+    lastForeign = phrasePair.foreign;
+    if (isPhrasePair)
+      phrasePairsWithSameF.push_back( phrasePair );
+    else
+      phrasePairBase++;
+  }
+  // flush the last batch
+  processPhrasePairs( phrasePairsWithSameF );
+  phraseTableFile.close();
+}
+
+// Writes every word of phrase, each followed by one space, to phraseTableFile.
+static void writePhraseWords( const PHRASE &phrase, Vocabulary &vcb )
+{
+  for(size_t j=0; j<phrase.size(); j++) {
+    phraseTableFile << vcb.getWord( phrase[j] );
+    phraseTableFile << " ";
+  }
+}
+
+// Writes one phrase-table line per distinct English phrase id found in
+// phrasePair.
+//
+// phrasePair - phrase pairs of one batch; element 0 supplies the foreign
+//              phrase printed on every line. Nothing is written when empty.
+//
+// Line layout: [foreign ||| ] english ||| [foreign ||| ] pair-frequency
+// batch-size foreign-length english-length. The foreign phrase comes first
+// unless inverseFlag is set, in which case it follows the English phrase.
+//
+// The per-phrase alignment bookkeeping that used to run here never
+// influenced the output and has been dropped; the lines written are
+// unchanged.
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair )
+{
+  if (phrasePair.size() == 0) return;
+
+  // english phrase id -> number of phrase pairs carrying it
+  map<int, int> countE;
+  for(size_t i=0; i<phrasePair.size(); i++) {
+    countE[ phrasePair[i].english ]++;
+  }
+
+  // output table
+  typedef map< int, int >::iterator II;
+  PHRASE phraseF = phraseTableF.getPhrase( phrasePair[0].foreign );
+  for(II i = countE.begin(); i != countE.end(); i++) {
+    // foreign phrase (unless inverse)
+    if (! inverseFlag) {
+      writePhraseWords( phraseF, vcbF );
+      phraseTableFile << "||| ";
+    }
+
+    // english phrase
+    PHRASE phraseE = phraseTableE.getPhrase( i->first );
+    writePhraseWords( phraseE, vcbE );
+    phraseTableFile << "||| ";
+
+    // foreign phrase (if inverse)
+    if (inverseFlag) {
+      writePhraseWords( phraseF, vcbF );
+      phraseTableFile << "||| ";
+    }
+
+    // phrase pair frequency
+    phraseTableFile << i->second;
+
+    //source phrase pair frequency
+    phraseTableFile << " " << phrasePair.size();
+
+    // source phrase length
+    phraseTableFile << " " << phraseF.size();
+
+    // target phrase length
+    phraseTableFile << " " << phraseE.size();
+
+    phraseTableFile << endl;
+  }
+}
+
+// Parses one line of the extract file into this object.
+//
+// line   - "foreign words ||| english words ||| f-e f-e ..." ; fields are
+//          separated by the token "|||".
+// lineID - line number, used only in warnings.
+//
+// Words are added to vcbF / vcbE. When the first in-bounds alignment point is
+// reached, the alignment tables are sized to the two phrases and the phrases
+// are stored in phraseTableF / phraseTableE, which sets `foreign` and
+// `english`. Alignment points that are out of bounds or cannot be parsed are
+// reported on stderr and skipped.
+//
+// Returns true when the line contains more than two fields, i.e. it is a
+// real phrase pair and not just a foreign phrase.
+bool PhraseAlignment::create( char line[], int lineID )
+{
+  vector< string > token = tokenize( line );
+  int item = 1;
+  PHRASE phraseF, phraseE;
+  for (size_t j=0; j<token.size(); j++) {
+    if (token[j] == "|||") item++;
+    else {
+      if (item == 1)
+        phraseF.push_back( vcbF.storeIfNew( token[j] ) );
+      else if (item == 2)
+        phraseE.push_back( vcbE.storeIfNew( token[j] ) );
+      else if (item == 3) {
+        int e = -1, f = -1;
+        // sscanf reports how many numbers it converted; anything but 2 means
+        // the token is not of the form "f-e" and e/f must not be used.
+        if (sscanf(token[j].c_str(), "%d-%d", &f, &e) != 2) {
+          cerr << "WARNING: sentence " << lineID << " has malformed alignment point '" << token[j] << "'\n";
+          continue;
+        }
+        // negative values become huge after the cast and are rejected here too
+        if ((size_t)e >= phraseE.size() || (size_t)f >= phraseF.size()) {
+          cerr << "WARNING: sentence " << lineID << " has alignment point (" << f << ", " << e << ") out of bounds (" << phraseF.size() << ", " << phraseE.size() << ")\n";
+        } else {
+          if (alignedToE.size() == 0) {
+            // first usable alignment point: both phrases are complete now
+            vector< size_t > dummy;
+            for(size_t i=0; i<phraseE.size(); i++)
+              alignedToE.push_back( dummy );
+            for(size_t i=0; i<phraseF.size(); i++)
+              alignedToF.push_back( dummy );
+            foreign = phraseTableF.storeIfNew( phraseF );
+            english = phraseTableE.storeIfNew( phraseE );
+          }
+          alignedToE[e].push_back( f );
+          alignedToF[f].push_back( e );
+        }
+      }
+    }
+  }
+  return (item>2); // real phrase pair, not just foreign phrase
+}
+
+// Removes all alignment points. The phrase ids are left as they are.
+void PhraseAlignment::clear()
+{
+  // clearing the outer vectors destroys the per-position vectors with them
+  alignedToE.clear();
+  alignedToF.clear();
+}
+
+// Returns true when other refers to the same phrase pair with the same
+// alignment.
+//
+// Both phrase ids must match; then, for every position of the English and of
+// the foreign phrase, the lists of aligned positions must be identical
+// (same length, same values in the same order).
+bool PhraseAlignment::equals( const PhraseAlignment& other )
+{
+  if (this == &other)
+    return true;
+  if (english != other.english || foreign != other.foreign)
+    return false;
+
+  // the number of positions compared comes from the stored phrases
+  const size_t lengthE = phraseTableE.getPhrase( english ).size();
+  const size_t lengthF = phraseTableF.getPhrase( foreign ).size();
+
+  for (size_t pos = 0; pos < lengthE; ++pos) {
+    if (alignedToE[pos] != other.alignedToE[pos])
+      return false;
+  }
+  for (size_t pos = 0; pos < lengthF; ++pos) {
+    if (alignedToF[pos] != other.alignedToF[pos])
+      return false;
+  }
+  return true;
+}
+
+// Loads a lexical translation table from fileName into ltable.
+//
+// Every line must hold exactly three whitespace-separated tokens:
+// English word, foreign word, probability. The value is stored as
+// ltable[foreign id][english id]; both words are added to their vocabulary
+// if new. Lines with a different token count are reported on stderr and
+// skipped. Terminates the program with exit status 1 when the file cannot
+// be opened.
+void LexicalTable::load( char *fileName )
+{
+  cerr << "Loading lexical translation table from " << fileName;
+  ifstream inFile;
+  inFile.open(fileName);
+  if (inFile.fail()) {
+    cerr << " - ERROR: could not open file\n";
+    exit(1);
+  }
+  istream *inFileP = &inFile;
+
+  char line[LINE_MAX_LENGTH];
+
+  int i=0;
+  while(true) {
+    i++;
+    // progress indicator
+    if (i%100000 == 0) cerr << "." << flush;
+    SAFE_GETLINE((*inFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
+    if (inFileP->eof()) break;
+
+    vector<string> token = tokenize( line );
+    if (token.size() != 3) {
+      // A blank line yields zero tokens, so token[0] must not be read
+      // unconditionally.
+      const string firstToken = token.empty() ? string() : token[0];
+      cerr << "line " << i << " in " << fileName << " has wrong number of tokens, skipping:\n" <<
+           token.size() << " " << firstToken << " " << line << endl;
+      continue;
+    }
+
+    double prob = atof( token[2].c_str() );
+    WORD_ID wordE = vcbE.storeIfNew( token[0] );
+    WORD_ID wordF = vcbF.storeIfNew( token[1] );
+    ltable[ wordF ][ wordE ] = prob;
+  }
+  cerr << endl;
+}
diff --git a/phrase-extract/tables-core.cpp b/phrase-extract/tables-core.cpp
new file mode 100644
index 000000000..93ad8b6a1
--- /dev/null
+++ b/phrase-extract/tables-core.cpp
@@ -0,0 +1,124 @@
+// $Id$
+//#include "beammain.h"
+#include "tables-core.h"
+
+#define TABLE_LINE_MAX_LENGTH 1000
+#define UNKNOWNSTR "UNK"
+
+using namespace std;
+
+// as in beamdecoder/tables.cpp
+// Splits a NUL-terminated string into words.
+//
+// Words are maximal runs of characters other than space and tab. Leading,
+// trailing and repeated separators produce no empty words.
+vector<string> tokenize( const char* input )
+{
+  vector< string > words;
+  const char *cursor = input;
+
+  while (*cursor != '\0') {
+    // skip the separators in front of the next word
+    while (*cursor == ' ' || *cursor == '\t')
+      ++cursor;
+    if (*cursor == '\0')
+      break;
+
+    // consume the word itself
+    const char *wordStart = cursor;
+    while (*cursor != '\0' && *cursor != ' ' && *cursor != '\t')
+      ++cursor;
+    words.push_back( string( wordStart, cursor - wordStart ) );
+  }
+  return words;
+}
+
+// Returns true when symbol starts with '[' and ends with ']'.
+// The empty string is not a non-terminal.
+bool isNonTerminal( const WORD &symbol )
+{
+  if (symbol.empty())
+    return false;
+  return symbol[0] == '[' && symbol[symbol.size()-1] == ']';
+}
+
+// Returns the id of word, adding it to the vocabulary first if it is not
+// known yet. A new word receives the current vocabulary size as its id.
+WORD_ID Vocabulary::storeIfNew( const WORD& word )
+{
+  map<WORD, WORD_ID>::iterator known = lookup.find( word );
+  if (known == lookup.end()) {
+    const WORD_ID freshId = vocab.size();
+    vocab.push_back( word );
+    lookup[ word ] = freshId;
+    return freshId;
+  }
+  return known->second;
+}
+
+// Returns the id of word, or 0 when the word is not in the vocabulary.
+// Note that 0 is also the id given to the first word stored.
+WORD_ID Vocabulary::getWordID( const WORD& word )
+{
+  map<WORD, WORD_ID>::iterator hit = lookup.find( word );
+  return (hit != lookup.end()) ? hit->second : 0;
+}
+
+// Returns the id of phrase, adding it to the table first if it is not known
+// yet. A new phrase receives the current table size as its id.
+PHRASE_ID PhraseTable::storeIfNew( const PHRASE& phrase )
+{
+  map< PHRASE, PHRASE_ID >::iterator known = lookup.find( phrase );
+  if (known == lookup.end()) {
+    const PHRASE_ID freshId = phraseTable.size();
+    phraseTable.push_back( phrase );
+    lookup[ phrase ] = freshId;
+    return freshId;
+  }
+  return known->second;
+}
+
+// Returns the id of phrase, or 0 when the phrase is not in the table.
+// Note that 0 is also the id given to the first phrase stored.
+PHRASE_ID PhraseTable::getPhraseID( const PHRASE& phrase )
+{
+  map< PHRASE, PHRASE_ID >::iterator hit = lookup.find( phrase );
+  return (hit != lookup.end()) ? hit->second : 0;
+}
+
+// Forgets every stored phrase; ids start again from 0 afterwards.
+void PhraseTable::clear()
+{
+  phraseTable.clear();
+  lookup.clear();
+}
+
+// Fills the table with default values: every distortion d from -10 up to
+// and including 9 is mapped to -|d|.
+void DTable::init()
+{
+  for (int distortion = -10; distortion != 10; ++distortion) {
+    const int magnitude = abs( distortion );
+    dtable[distortion] = -magnitude;
+  }
+}
+
+// Loads distortion values from fileName into dtable.
+//
+// Each line holds a distortion (integer) and a probability; the natural
+// logarithm of the probability is stored under the distortion. Lines with
+// fewer than two tokens are reported on stderr and skipped. Aborts the
+// program when reading stops for any reason other than end of file (this
+// includes a file that cannot be opened).
+void DTable::load( const string& fileName )
+{
+  ifstream inFile;
+  inFile.open(fileName.c_str());
+
+  std::string line;
+  int i=0;
+  // Looping on the result of getline also processes a final line that has
+  // no trailing newline: getline delivers it while setting eofbit, and the
+  // former eof test in front of the processing code threw it away.
+  while(getline(inFile, line)) {
+    i++;
+
+    vector<string> token = tokenize(line.c_str());
+    if (token.size() < 2) {
+      cerr << "line " << i << " in " << fileName << " too short, skipping\n";
+      continue;
+    }
+
+    int d = atoi( token[0].c_str() );
+    double prob = log( atof( token[1].c_str() ) );
+    dtable[ d ] = prob;
+  }
+
+  // the loop must have ended because the end of the file was reached
+  if (!inFile.eof()) {
+    std::cerr << "Error reading from " << fileName << std::endl;
+    abort();
+  }
+}
+
+// Returns the stored value for distortion, or log(0.00001) when the table
+// has no entry for it. The table is not modified.
+double DTable::get( int distortion )
+{
+  const map< int, double >::iterator hit = dtable.find( distortion );
+  if (hit != dtable.end())
+    return hit->second;
+  return log( 0.00001 );
+}
+
diff --git a/phrase-extract/tables-core.h b/phrase-extract/tables-core.h
new file mode 100644
index 000000000..1899b4d77
--- /dev/null
+++ b/phrase-extract/tables-core.h
@@ -0,0 +1,66 @@
+// $Id$
+
+#ifndef _TABLES_H
+#define _TABLES_H
+
+#include <iostream>
+#include <fstream>
+#include <assert.h>
+#include <stdlib.h>
+#include <string>
+#include <queue>
+#include <map>
+#include <cmath>
+
+extern std::vector<std::string> tokenize( const char*);
+
+typedef std::string WORD;
+typedef unsigned int WORD_ID;
+
+// Two-way mapping between words and numeric word ids.
+class Vocabulary
+{
+public:
+  // word -> id
+  std::map<WORD, WORD_ID> lookup;
+  // id -> word
+  std::vector< WORD > vocab;
+
+  // Returns the id of the word, adding the word first if necessary.
+  WORD_ID storeIfNew( const WORD& );
+  // Returns the id of the word, or 0 if the word is unknown.
+  WORD_ID getWordID( const WORD& );
+  // Returns the word stored under id; the id is not range-checked.
+  WORD &getWord( WORD_ID id ) { return vocab[ id ]; }
+};
+
+typedef std::vector< WORD_ID > PHRASE;
+typedef unsigned int PHRASE_ID;
+
+// Two-way mapping between phrases (sequences of word ids) and numeric
+// phrase ids.
+class PhraseTable
+{
+public:
+  // phrase -> id
+  std::map< PHRASE, PHRASE_ID > lookup;
+  // id -> phrase
+  std::vector< PHRASE > phraseTable;
+
+  // Returns the id of the phrase, adding the phrase first if necessary.
+  PHRASE_ID storeIfNew( const PHRASE& );
+  // Returns the id of the phrase, or 0 if the phrase is unknown.
+  PHRASE_ID getPhraseID( const PHRASE& );
+  // Removes every phrase.
+  void clear();
+  // Returns the phrase stored under id; the id is not range-checked.
+  PHRASE &getPhrase( const PHRASE_ID id ) { return phraseTable[ id ]; }
+};
+
+typedef std::vector< std::pair< PHRASE_ID, double > > PHRASEPROBVEC;
+
+// Containers that map a phrase id to a list of (phrase id, score) entries.
+class TTable
+{
+public:
+  // phrase id -> list of (phrase id, single value)
+  std::map< PHRASE_ID, PHRASEPROBVEC > ttable;
+  // phrase id -> list of (phrase id, several values)
+  std::map< PHRASE_ID, std::vector< std::pair< PHRASE_ID, std::vector< double > > > > ttableMulti;
+};
+
+// Table of values indexed by distortion.
+class DTable
+{
+public:
+  // distortion -> stored value
+  std::map< int, double > dtable;
+
+  // Fills the table with built-in default values.
+  void init();
+  // Fills the table from the named file.
+  void load( const std::string& );
+  // Returns the value stored for the given distortion.
+  double get( int );
+};
+
+#endif