Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- phrase-extract/ExtractionPhrasePair.cpp 90
-rw-r--r-- phrase-extract/ExtractionPhrasePair.h 6
-rw-r--r-- phrase-extract/PhraseExtractionOptions.h 1
-rw-r--r-- phrase-extract/extract-ghkm/ExtractGHKM.cpp 64
-rw-r--r-- phrase-extract/extract-ghkm/Options.h 2
-rw-r--r-- phrase-extract/extract-ghkm/PhraseOrientation.cpp 417
-rw-r--r-- phrase-extract/extract-ghkm/PhraseOrientation.h 102
-rw-r--r-- phrase-extract/extract-ghkm/ScfgRuleWriter.cpp 13
-rw-r--r-- phrase-extract/extract-ghkm/ScfgRuleWriter.h 2
-rw-r--r-- phrase-extract/extract-ghkm/XmlTreeParser.h 2
-rw-r--r-- phrase-extract/extract-main.cpp 2
-rw-r--r-- phrase-extract/score-main.cpp 124
-rwxr-xr-x scripts/ems/experiment.perl 16
-rwxr-xr-x scripts/generic/extract-parallel.perl 33
-rwxr-xr-x scripts/training/train-model.perl 14
15 files changed, 858 insertions, 30 deletions
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp
index 9564b1cfe..ccf0fc275 100644
--- a/phrase-extract/ExtractionPhrasePair.cpp
+++ b/phrase-extract/ExtractionPhrasePair.cpp
@@ -463,6 +463,96 @@ std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::s
}
+void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key, // Writes 8 space-separated smoothed orientation scores for property `key` to `out`: 4 L2R values, then 4 R2L values; writes nothing if GetProperty(key) returns NULL.
+ const std::vector<float> &orientationClassPriorsL2R, // one prior per class, order: mono swap dright dleft (size asserted to be 4)
+ const std::vector<float> &orientationClassPriorsR2L, // same layout for the R2L direction
+ double smoothingFactor, // multiplies the prior in the numerator and is added to m_count in the denominator
+ std::ostream &out) const
+{
+ assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dright dleft
+
+ const PROPERTY_VALUES *allPropertyValues = GetProperty( key );
+
+ if ( allPropertyValues == NULL ) { // nothing to score: return without touching `out`
+ return;
+ }
+
+ // bidirectional MSLR phrase orientation with 2x4 orientation classes:
+ // mono swap dright dleft
+ std::vector<float> orientationClassCountSumL2R(4,0);
+ std::vector<float> orientationClassCountSumR2L(4,0);
+
+ for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); // iter->first is parsed as "<l2r class> <r2l class>", iter->second is the count added to the sums
+ iter!=allPropertyValues->end(); ++iter) {
+ std::string l2rOrientationClass, r2lOrientationClass;
+ try {
+ istringstream tokenizer(iter->first);
+ tokenizer >> l2rOrientationClass;
+ tokenizer >> r2lOrientationClass;
+ if ( tokenizer.peek() != EOF ) { // any character left after the second token, including trailing whitespace, is rejected
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Too many tokens?");
+ }
+ } catch (const std::exception &e) { // NOTE(review): if util::Exception derives from std::exception, the "Too many tokens?" throw above lands here and is replaced by the message below - confirm
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Flawed property value in extract file?");
+ }
+
+ int l2rOrientationClassId = -1; // class name -> index: mono=0, swap=1, dright=2, dleft=3; -1 means unrecognized
+ if (!l2rOrientationClass.compare("mono")) {
+ l2rOrientationClassId = 0;
+ }
+ if (!l2rOrientationClass.compare("swap")) {
+ l2rOrientationClassId = 1;
+ }
+ if (!l2rOrientationClass.compare("dright")) {
+ l2rOrientationClassId = 2;
+ }
+ if (!l2rOrientationClass.compare("dleft")) {
+ l2rOrientationClassId = 3;
+ }
+ if (l2rOrientationClassId == -1) { // also reached when the token is empty because extraction failed
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Unknown orientation class \"" << l2rOrientationClass << "\"." );
+ }
+ int r2lOrientationClassId = -1; // same mapping for the second token
+ if (!r2lOrientationClass.compare("mono")) {
+ r2lOrientationClassId = 0;
+ }
+ if (!r2lOrientationClass.compare("swap")) {
+ r2lOrientationClassId = 1;
+ }
+ if (!r2lOrientationClass.compare("dright")) {
+ r2lOrientationClassId = 2;
+ }
+ if (!r2lOrientationClass.compare("dleft")) {
+ r2lOrientationClassId = 3;
+ }
+ if (r2lOrientationClassId == -1) {
+ UTIL_THROW(util::Exception, "ExtractionPhrasePair"
+ << ": Collecting phrase orientations failed. "
+ << "Unknown orientation class \"" << r2lOrientationClass << "\"." );
+ }
+
+ orientationClassCountSumL2R[l2rOrientationClassId] += iter->second; // the same count is credited to one class in each direction
+ orientationClassCountSumR2L[r2lOrientationClassId] += iter->second;
+ }
+
+ for (size_t i=0; i<4; ++i) { // L2R scores first; a single space separates values, none before the first
+ if (i>0) {
+ out << " ";
+ }
+ out << (float)( (smoothingFactor*orientationClassPriorsL2R[i] + orientationClassCountSumL2R[i]) / (smoothingFactor + m_count) ); // (smoothingFactor*prior + class count) / (smoothingFactor + m_count)
+ }
+ for (size_t i=0; i<4; ++i) { // R2L scores follow, each preceded by a space
+ out << " " << (float)( (smoothingFactor*orientationClassPriorsR2L[i] + orientationClassCountSumR2L[i]) / (smoothingFactor + m_count) );
+ }
+}
+
+
}
diff --git a/phrase-extract/ExtractionPhrasePair.h b/phrase-extract/ExtractionPhrasePair.h
index ba23ac1f2..e0f5dc5fb 100644
--- a/phrase-extract/ExtractionPhrasePair.h
+++ b/phrase-extract/ExtractionPhrasePair.h
@@ -131,6 +131,12 @@ public:
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts,
Vocabulary &vcbT) const;
+ void CollectAllPhraseOrientations(const std::string &key,
+ const std::vector<float> &orientationClassPriorsL2R,
+ const std::vector<float> &orientationClassPriorsR2L,
+ double smoothingFactor,
+ std::ostream &out) const;
+
void AddProperties( const std::string &str, float count );
void AddProperty( const std::string &key, const std::string &value, float count )
diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h
index 87712d6d3..7132974d4 100644
--- a/phrase-extract/PhraseExtractionOptions.h
+++ b/phrase-extract/PhraseExtractionOptions.h
@@ -18,7 +18,6 @@
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
***********************************************************************/
-/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/
#include <string>
#include <vector>
diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
index b86c28586..36dfee2e5 100644
--- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp
+++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp
@@ -27,6 +27,7 @@
#include "OutputFileStream.h"
#include "Options.h"
#include "ParseTree.h"
+#include "PhraseOrientation.h"
#include "ScfgRule.h"
#include "ScfgRuleWriter.h"
#include "Span.h"
@@ -66,11 +67,12 @@ int ExtractGHKM::Main(int argc, char *argv[])
// Open output files.
OutputFileStream fwdExtractStream;
OutputFileStream invExtractStream;
- std::ofstream glueGrammarStream;
- std::ofstream targetUnknownWordStream;
- std::ofstream sourceUnknownWordStream;
- std::ofstream sourceLabelSetStream;
- std::ofstream unknownWordSoftMatchesStream;
+ OutputFileStream glueGrammarStream;
+ OutputFileStream targetUnknownWordStream;
+ OutputFileStream sourceUnknownWordStream;
+ OutputFileStream sourceLabelSetStream;
+ OutputFileStream unknownWordSoftMatchesStream;
+
std::string fwdFileName = options.extractFile;
std::string invFileName = options.extractFile + std::string(".inv");
if (options.gzOutput) {
@@ -79,6 +81,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
OpenOutputFileOrDie(fwdFileName, fwdExtractStream);
OpenOutputFileOrDie(invFileName, invExtractStream);
+
if (!options.glueGrammarFile.empty()) {
OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream);
}
@@ -118,7 +121,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
std::string sourceLine;
std::string alignmentLine;
Alignment alignment;
- XmlTreeParser xmlTreeParser(targetLabelSet, targetTopLabelSet);
+ XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet);
// XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet);
ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options);
size_t lineNum = options.sentenceOffset;
@@ -144,7 +147,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
}
std::auto_ptr<ParseTree> targetParseTree;
try {
- targetParseTree = xmlTreeParser.Parse(targetLine);
+ targetParseTree = targetXmlTreeParser.Parse(targetLine);
assert(targetParseTree.get());
} catch (const Exception &e) {
std::ostringstream oss;
@@ -181,7 +184,7 @@ int ExtractGHKM::Main(int argc, char *argv[])
// Read source tokens.
std::vector<std::string> sourceTokens(ReadTokens(sourceLine));
- // Construct a source ParseTree object object from the SyntaxTree object.
+ // Construct a source ParseTree object from the SyntaxTree object.
std::auto_ptr<ParseTree> sourceParseTree;
if (options.sourceLabels) {
@@ -235,11 +238,26 @@ int ExtractGHKM::Main(int argc, char *argv[])
graph.ExtractComposedRules(options);
}
+ // Initialize phrase orientation scoring object
+ PhraseOrientation phraseOrientation( sourceTokens, targetXmlTreeParser.GetWords(), alignment);
+
// Write the rules, subject to scope pruning.
const std::vector<Node *> &targetNodes = graph.GetTargetNodes();
for (std::vector<Node *>::const_iterator p = targetNodes.begin();
p != targetNodes.end(); ++p) {
+
const std::vector<const Subgraph *> &rules = (*p)->GetRules();
+
+ REO_POS l2rOrientation, r2lOrientation;
+ if (options.phraseOrientation && !rules.empty()) {
+ int sourceSpanBegin = *((*p)->GetSpan().begin());
+ int sourceSpanEnd = *((*p)->GetSpan().rbegin());
+ l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,L2R);
+ r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,R2L);
+ // std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl;
+ // std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl;
+ }
+
for (std::vector<const Subgraph *>::const_iterator q = rules.begin();
q != rules.end(); ++q) {
ScfgRule *r = 0;
@@ -251,16 +269,34 @@ int ExtractGHKM::Main(int argc, char *argv[])
// TODO Can scope pruning be done earlier?
if (r->Scope() <= options.maxScope) {
if (!options.treeFragments) {
- writer.Write(*r);
+ writer.Write(*r,false);
} else {
- writer.Write(*r,**q);
+ writer.Write(*r,**q,false);
+ }
+ if (options.phraseOrientation) {
+ fwdExtractStream << " {{Orientation ";
+ phraseOrientation.WriteOrientation(fwdExtractStream,l2rOrientation);
+ fwdExtractStream << " ";
+ phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation);
+ fwdExtractStream << "}}";
+ phraseOrientation.IncrementPriorCount(L2R,l2rOrientation,1);
+ phraseOrientation.IncrementPriorCount(R2L,r2lOrientation,1);
}
+ fwdExtractStream << std::endl;
+ invExtractStream << std::endl;
}
delete r;
}
}
}
+ if (options.phraseOrientation) {
+ std::string phraseOrientationPriorsFileName = options.extractFile + std::string(".phraseOrientationPriors");
+ OutputFileStream phraseOrientationPriorsStream;
+ OpenOutputFileOrDie(phraseOrientationPriorsFileName, phraseOrientationPriorsStream);
+ PhraseOrientation::WritePriorCounts(phraseOrientationPriorsStream);
+ }
+
std::map<std::string,size_t> sourceLabels;
if (options.sourceLabels && !options.sourceLabelSetFile.empty()) {
@@ -398,6 +434,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
"extract minimal rules only")
("PCFG",
"include score based on PCFG scores in target corpus")
+ ("PhraseOrientation",
+ "output phrase orientation information")
("TreeFragments",
"output parse tree information")
("SourceLabels",
@@ -502,6 +540,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[],
if (vm.count("PCFG")) {
options.pcfg = true;
}
+ if (vm.count("PhraseOrientation")) {
+ options.phraseOrientation = true;
+ }
if (vm.count("TreeFragments")) {
options.treeFragments = true;
}
@@ -736,8 +777,7 @@ void ExtractGHKM::WriteUnknownWordSoftMatches(
const std::set<std::string> &labelSet,
std::ostream &out)
{
- std::set<std::string>::const_iterator p = labelSet.begin();
- for (p; p != labelSet.end(); ++p) {
+ for (std::set<std::string>::const_iterator p = labelSet.begin(); p != labelSet.end(); ++p) {
std::string label = *p;
out << "UNK " << label << std::endl;
}
diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h
index 28a581802..0102e2f64 100644
--- a/phrase-extract/extract-ghkm/Options.h
+++ b/phrase-extract/extract-ghkm/Options.h
@@ -40,6 +40,7 @@ public:
, maxScope(3)
, minimal(false)
, pcfg(false)
+ , phraseOrientation(false)
, treeFragments(false)
, sourceLabels(false)
, sentenceOffset(0)
@@ -64,6 +65,7 @@ public:
int maxScope;
bool minimal;
bool pcfg;
+ bool phraseOrientation;
bool treeFragments;
bool sourceLabels;
std::string sourceLabelSetFile;
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
new file mode 100644
index 000000000..a96e5361c
--- /dev/null
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp
@@ -0,0 +1,417 @@
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#include "PhraseOrientation.h"
+
+#include <iostream>
+
+#include <boost/assign/list_of.hpp>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0); // five zero-initialized slots; indexed by REO_POS value in IncrementPriorCount() and WritePriorCounts()
+std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0); // same layout for the R2L direction
+
+PhraseOrientation::PhraseOrientation(const std::vector<std::string> &source, // Builds the lookup tables used by the orientation queries: min/max aligned target position per source span, and the corner points of all phrases compatible with the word alignment.
+ const std::vector<std::string> &target,
+ const Alignment &alignment)
+ : m_source(source)
+ , m_target(target)
+ , m_alignment(alignment)
+{
+
+ int countF = m_source.size(); // number of source tokens
+ int countE = m_target.size(); // number of target tokens
+
+ // prepare data structures for alignments
+ std::vector<std::vector<int> > alignedToS; // alignedToS[f] = target positions aligned to source position f
+ for(int i=0; i<countF; ++i) {
+ std::vector< int > dummy;
+ alignedToS.push_back(dummy);
+ }
+ for(int i=0; i<countE; ++i) { // m_alignedToT[e] = source positions aligned to target position e
+ std::vector< int > dummy;
+ m_alignedToT.push_back(dummy);
+ }
+ std::vector<int> alignedCountS(countF,0); // alignedCountS[f] = number of alignment points on source position f
+
+ for (Alignment::const_iterator a=alignment.begin(); a!=alignment.end(); ++a) { // a->first is used as source index, a->second as target index; NOTE(review): neither is range-checked against countF/countE
+ m_alignedToT[a->second].push_back(a->first);
+ alignedCountS[a->first]++;
+ alignedToS[a->first].push_back(a->second);
+ }
+
+ for (int startF=0; startF<countF; ++startF) { // for every source span [startF,endF]: store the smallest and largest aligned target position
+ for (int endF=startF; endF<countF; ++endF) {
+
+ int minE = 9999; // sentinels: a span without any alignment point is stored as (9999,-1)
+ int maxE = -1;
+ for (int fi=startF; fi<=endF; ++fi) {
+ for (size_t i=0; i<alignedToS[fi].size(); ++i) {
+ int ei = alignedToS[fi][i];
+ if (ei<minE) {
+ minE = ei;
+ }
+ if (ei>maxE) {
+ maxE = ei;
+ }
+ }
+ }
+
+ m_minAndMaxAlignedToSourceSpan[ std::pair<int,int>(startF,endF) ] = std::pair<int,int>(minE,maxE);
+ }
+ }
+
+ // check alignments for target phrase startE...endE
+ // loop over continuous phrases which are compatible with the word alignments
+ for (int startE=0; startE<countE; ++startE) {
+ for (int endE=startE; endE<countE; ++endE) {
+
+ int minF = 9999;
+ int maxF = -1;
+ std::vector< int > usedF = alignedCountS; // after the loop below: usedF[f] = alignment points of f that lie outside [startE,endE]
+ for (int ei=startE; ei<=endE; ++ei) {
+ for (size_t i=0; i<m_alignedToT[ei].size(); ++i) {
+ int fi = m_alignedToT[ei][i];
+ if (fi<minF) {
+ minF = fi;
+ }
+ if (fi>maxF) {
+ maxF = fi;
+ }
+ usedF[fi]--;
+ }
+ }
+
+ if (maxF >= 0) { // aligned to any source words at all
+
+ // check if source words are aligned to out of bound target words
+ bool out_of_bounds = false;
+ for (int fi=minF; fi<=maxF && !out_of_bounds; ++fi)
+ if (usedF[fi]>0) {
+ // cout << "out of bounds: " << fi << "\n";
+ out_of_bounds = true;
+ }
+
+ // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n";
+ if (!out_of_bounds) {
+ // start point of source phrase may retreat over unaligned
+ for (int startF=minF;
+ (startF>=0 &&
+ (startF==minF || alignedCountS[startF]==0)); // unaligned
+ startF--) {
+ // end point of source phrase may advance over unaligned
+ for (int endF=maxF;
+ (endF<countF &&
+ (endF==maxF || alignedCountS[endF]==0)); // unaligned
+ endF++) { // at this point we have extracted a phrase
+
+ InsertPhraseVertices(m_topLeft, m_topRight, m_bottomLeft, m_bottomRight, // record the four corners of this phrase in the member corner maps
+ startF, startE, endF, endE);
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
+
+void PhraseOrientation::InsertVertex( HSentenceVertices & corners, int x, int y ) // Adds x to the set stored under key y in `corners`, creating that entry if y is not present yet.
+{
+ std::set<int> tmp;
+ tmp.insert(x);
+ std::pair< HSentenceVertices::iterator, bool > ret = corners.insert( std::pair<int, std::set<int> > (y, tmp) ); // insert {y -> {x}}; does nothing if key y already exists
+ if (ret.second == false) { // key y existed: add x to the existing set instead
+ ret.first->second.insert(x);
+ }
+}
+
+
+void PhraseOrientation::InsertPhraseVertices(HSentenceVertices & topLeft, // Records the four corners of the phrase (startF..endF, startE..endE), one corner per map.
+ HSentenceVertices & topRight,
+ HSentenceVertices & bottomLeft,
+ HSentenceVertices & bottomRight,
+ int startF, int startE, int endF, int endE)
+{
+
+ InsertVertex(topLeft, startF, startE); // "top" corners are keyed by startE, "bottom" corners by endE
+ InsertVertex(topRight, endF, startE); // "left" corners store startF, "right" corners store endF
+ InsertVertex(bottomLeft, startF, endE);
+ InsertVertex(bottomRight, endF, endE);
+}
+
+
+const std::string PhraseOrientation::GetOrientationInfoString(int startF, int endF, REO_DIR direction) const // Looks up the stored min/max aligned target positions for source span [startF,endF] and delegates to the four-index overload.
+{
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax
+ = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
+
+ if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) {
+ int startE = (foundMinMax->second).first; // NOTE(review): the constructor stores (9999,-1) for spans without alignment points, so startE/endE may be those sentinels
+ int endE = (foundMinMax->second).second;
+// std::cerr << "Phrase orientation for"
+// << " startF=" << startF
+// << " endF=" << endF
+// << " startE=" << startE
+// << " endE=" << endE
+// << std::endl;
+ return GetOrientationInfoString(startF, startE, endF, endE, direction);
+ } else { // span was never stored by the constructor: report on stderr and terminate the process
+ std::cerr << "Error: not able to determine phrase orientation" << std::endl;
+ std::exit(1);
+ }
+}
+
+
+const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const // Returns the REO_MSLR orientation name(s) of the given phrase: one name for L2R or R2L, "<L2R name> <R2L name>" for BIDIR.
+{
+ REO_POS hierPrevOrient, hierNextOrient; // L2R result and R2L result; each is assigned only if its direction is requested
+
+ bool connectedLeftTopP = IsAligned( startF-1, startE-1 ); // alignment points diagonally adjacent to the phrase corners
+ bool connectedRightTopP = IsAligned( endF+1, startE-1 );
+ bool connectedLeftTopN = IsAligned( endF+1, endE+1 );
+ bool connectedRightTopN = IsAligned( startF-1, endE+1 );
+
+ if ( direction == L2R || direction == BIDIR )
+ hierPrevOrient = GetOrientHierModel(REO_MSLR,
+ connectedLeftTopP, connectedRightTopP,
+ startF, endF, startE, endE, m_source.size()-1, 0, 1,
+ &ge, &lt,
+ m_bottomRight, m_bottomLeft);
+
+ if ( direction == R2L || direction == BIDIR ) // mirrored call: start/end swapped, unit=-1, comparators and corner maps exchanged
+ hierNextOrient = GetOrientHierModel(REO_MSLR,
+ connectedLeftTopN, connectedRightTopN,
+ endF, startF, endE, startE, 0, m_source.size()-1, -1,
+ &lt, &ge,
+ m_bottomLeft, m_bottomRight);
+
+ switch (direction) {
+ case L2R:
+ return GetOrientationString(hierPrevOrient, REO_MSLR);
+ break;
+ case R2L:
+ return GetOrientationString(hierNextOrient, REO_MSLR);
+ break;
+ case BIDIR:
+ return GetOrientationString(hierPrevOrient, REO_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MSLR);
+ break;
+ default: // NOTE(review): for a value outside L2R/R2L/BIDIR neither variable was assigned above, so this branch would read them uninitialized
+ return GetOrientationString(hierPrevOrient, REO_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MSLR);
+ break;
+ }
+ return "PhraseOrientationERROR"; // not reachable: every switch branch returns
+}
+
+
+REO_POS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const // Looks up the stored min/max aligned target positions for source span [startF,endF] and delegates to the four-index overload.
+{
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax
+ = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) );
+
+ if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) {
+ int startE = (foundMinMax->second).first; // NOTE(review): the constructor stores (9999,-1) for spans without alignment points, so startE/endE may be those sentinels
+ int endE = (foundMinMax->second).second;
+// std::cerr << "Phrase orientation for"
+// << " startF=" << startF
+// << " endF=" << endF
+// << " startE=" << startE
+// << " endE=" << endE
+// << std::endl;
+ return GetOrientationInfo(startF, startE, endF, endE, direction);
+ } else { // span was never stored by the constructor: report on stderr and terminate the process
+ std::cerr << "Error: not able to determine phrase orientation" << std::endl;
+ std::exit(1);
+ }
+}
+
+
+REO_POS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const // Returns the REO_MSLR orientation of the given phrase for exactly one direction (L2R or R2L).
+{
+ if ( direction != L2R && direction != R2L ) { // BIDIR and any other value are rejected: message on stderr, then exit(1)
+ std::cerr << "PhraseOrientation::GetOrientationInfo(): direction should be either L2R or R2L" << std::endl;
+ std::exit(1);
+ }
+
+ bool connectedLeftTopP = IsAligned( startF-1, startE-1 ); // alignment points diagonally adjacent to the phrase corners
+ bool connectedRightTopP = IsAligned( endF+1, startE-1 );
+ bool connectedLeftTopN = IsAligned( endF+1, endE+1 );
+ bool connectedRightTopN = IsAligned( startF-1, endE+1 );
+
+ if ( direction == L2R )
+ return GetOrientHierModel(REO_MSLR,
+ connectedLeftTopP, connectedRightTopP,
+ startF, endF, startE, endE, m_source.size()-1, 0, 1,
+ &ge, &lt,
+ m_bottomRight, m_bottomLeft);
+
+ if ( direction == R2L ) // mirrored call: start/end swapped, unit=-1, comparators and corner maps exchanged
+ return GetOrientHierModel(REO_MSLR,
+ connectedLeftTopN, connectedRightTopN,
+ endF, startF, endE, startE, 0, m_source.size()-1, -1,
+ &lt, &ge,
+ m_bottomLeft, m_bottomRight);
+
+ return UNKNOWN; // not reachable after the guard above: both accepted directions have returned
+}
+
+
+// to be called with countF-1 instead of countF
+REO_POS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType, // Classifies a phrase as LEFT, RIGHT, DRIGHT, DLEFT or UNKNOWN from the neighbouring alignment points and the recorded phrase corners; modelType limits which classes are tried.
+ bool connectedLeftTop, bool connectedRightTop,
+ int startF, int endF, int startE, int endE, int countF, int zero, int unit, // callers pass unit=1 for L2R and unit=-1 for R2L, with zero/countF as the scan limits
+ bool (*ge)(int, int), bool (*lt)(int, int),
+ const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const
+{
+ HSentenceVertices::const_iterator it;
+
+ if ((connectedLeftTop && !connectedRightTop) || // LEFT: only the left-top neighbour is aligned, or bottomRight holds a corner at key startE-unit with value startF-unit
+ ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
+ it->second.find(startF-unit) != it->second.end()))
+ return LEFT;
+
+ if (modelType == REO_MONO) // REO_MONO tests for LEFT only
+ return UNKNOWN;
+
+ if ((!connectedLeftTop && connectedRightTop) || // RIGHT: only the right-top neighbour is aligned, or bottomLeft holds a corner at key startE-unit with value endF+unit
+ ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
+ it->second.find(endF + unit) != it->second.end()))
+ return RIGHT;
+
+ if (modelType == REO_MSD) // REO_MSD stops after LEFT and RIGHT
+ return UNKNOWN;
+
+ connectedLeftTop = false;
+ for (int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) { // scan from startF-2*unit towards `zero` for a bottomRight corner under key startE-unit
+ if ((connectedLeftTop = ((it = bottomRight.find(startE - unit)) != bottomRight.end() &&
+ it->second.find(indexF) != it->second.end())))
+ return DRIGHT;
+ }
+
+ connectedRightTop = false;
+ for (int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) { // scan from endF+2*unit while (*lt)(indexF, countF) holds, looking for a bottomLeft corner under key startE-unit
+ if ((connectedRightTop = ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() &&
+ it->second.find(indexF) != it->second.end())))
+ return DLEFT;
+ }
+
+ return UNKNOWN; // no test matched
+}
+
+
+const std::string PhraseOrientation::GetOrientationString(const REO_POS orient, const REO_MODEL_TYPE modelType) // Returns as a string exactly what WriteOrientation() writes for (orient, modelType).
+{
+ std::ostringstream oss; // NOTE(review): <sstream> is not among the includes visible in this file - presumably pulled in transitively; confirm
+ WriteOrientation(oss, orient, modelType);
+ return oss.str();
+}
+
+
+void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_POS orient, const REO_MODEL_TYPE modelType) // Writes the name of `orient` to `out`; modelType only affects how UNKNOWN is written.
+{
+ switch(orient) {
+ case LEFT: // LEFT is written as "mono"
+ out << "mono";
+ break;
+ case RIGHT: // RIGHT is written as "swap"
+ out << "swap";
+ break;
+ case DRIGHT:
+ out << "dright";
+ break;
+ case DLEFT:
+ out << "dleft";
+ break;
+ case UNKNOWN: // the name written for UNKNOWN depends on the model type
+ switch(modelType) {
+ case REO_MONO:
+ out << "nomono";
+ break;
+ case REO_MSD:
+ out << "other";
+ break;
+ case REO_MSLR: // under REO_MSLR, UNKNOWN is written with the same string as DRIGHT
+ out << "dright";
+ break;
+ }
+ break;
+ }
+}
+
+
+bool PhraseOrientation::IsAligned(int fi, int ei) const // True if source position fi and target position ei share an alignment point; the positions just outside both sentence ends count as aligned to each other.
+{
+ if (ei == -1 && fi == -1) // position before the first token on both sides: treated as aligned
+ return true;
+
+ if (ei <= -1 || fi <= -1) // any other negative index: not aligned
+ return false;
+
+ if (ei == (int)m_target.size() && fi == (int)m_source.size()) // position after the last token on both sides: treated as aligned
+ return true;
+
+ if (ei >= (int)m_target.size() || fi >= (int)m_source.size()) // any other index past the end: not aligned
+ return false;
+
+ for (size_t i=0; i<m_alignedToT[ei].size(); ++i) // in range: linear search of the source positions aligned to ei
+ if (m_alignedToT[ei][i] == fi)
+ return true;
+
+ return false;
+}
+
+
+void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_POS orient, float increment) // Adds `increment` to the static prior count of `orient` for the given direction; the counters are static members, so the totals are shared by all instances.
+{
+ assert(direction==L2R || direction==R2L); // NOTE(review): <cassert> is not among the includes visible in this file - presumably pulled in transitively; confirm
+ if (direction == L2R) {
+ m_l2rOrientationPriorCounts[orient] += increment; // the enum value of `orient` is used directly as the vector index
+ } else if (direction == R2L) {
+ m_r2lOrientationPriorCounts[orient] += increment;
+ }
+}
+
+
+void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType) // Writes the accumulated prior counts, one "L2R_<name> <count>" or "R2L_<name> <count>" line per orientation name, all L2R lines first.
+{
+ std::map<std::string,float> l2rOrientationPriorCountsMap; // keyed by orientation name, so REO_POS values that share a name under modelType are summed into one entry
+ std::map<std::string,float> r2lOrientationPriorCountsMap;
+ for (int orient=0; orient<=UNKNOWN; ++orient) { // covers every REO_POS value from 0 up to and including UNKNOWN
+ l2rOrientationPriorCountsMap[GetOrientationString((REO_POS)orient, modelType)] += m_l2rOrientationPriorCounts[orient];
+ }
+ for (int orient=0; orient<=UNKNOWN; ++orient) {
+ r2lOrientationPriorCountsMap[GetOrientationString((REO_POS)orient, modelType)] += m_r2lOrientationPriorCounts[orient];
+ }
+ for (std::map<std::string,float>::const_iterator l2rOrientationPriorCountsMapIt = l2rOrientationPriorCountsMap.begin(); // std::map iteration: lines come out ordered by name
+ l2rOrientationPriorCountsMapIt != l2rOrientationPriorCountsMap.end(); ++l2rOrientationPriorCountsMapIt) {
+ out << "L2R_" << l2rOrientationPriorCountsMapIt->first << " " << l2rOrientationPriorCountsMapIt->second << std::endl;
+ }
+ for (std::map<std::string,float>::const_iterator r2lOrientationPriorCountsMapIt = r2lOrientationPriorCountsMap.begin();
+ r2lOrientationPriorCountsMapIt != r2lOrientationPriorCountsMap.end(); ++r2lOrientationPriorCountsMapIt) {
+ out << "R2L_" << r2lOrientationPriorCountsMapIt->first << " " << r2lOrientationPriorCountsMapIt->second << std::endl;
+ }
+}
+
+} // namespace GHKM
+} // namespace Moses
+
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h
new file mode 100644
index 000000000..6e83929f1
--- /dev/null
+++ b/phrase-extract/extract-ghkm/PhraseOrientation.h
@@ -0,0 +1,102 @@
+
+/***********************************************************************
+ Moses - statistical machine translation system
+ Copyright (C) 2006-2011 University of Edinburgh
+
+ This library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ This library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with this library; if not, write to the Free Software
+ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+***********************************************************************/
+
+#pragma once
+
+#include "Alignment.h"
+
+#include <map>
+#include <set>
+#include <string>
+#include <vector>
+#include <boost/unordered_map.hpp>
+
+namespace Moses
+{
+namespace GHKM
+{
+
+enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; // selects which orientation classes GetOrientHierModel() tries and how UNKNOWN is named
+enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; // orientation classes; values 0..4 are also used as indices into the prior-count vectors
+enum REO_DIR {L2R, R2L, BIDIR}; // direction of an orientation query
+
+// The key of the map is the English index and the value is a set of the source ones
+typedef std::map <int, std::set<int> > HSentenceVertices;
+
+
+class PhraseOrientation // Per-sentence-pair orientation lookup: the constructor precomputes alignment tables and phrase corners, the Get* methods classify source spans, and static counters accumulate prior counts.
+{
+public:
+
+ PhraseOrientation(const std::vector<std::string> &source, // the three arguments are stored as references (see members below), not copied
+ const std::vector<std::string> &target,
+ const Alignment &alignment);
+
+ REO_POS GetOrientationInfo(int startF, int endF, REO_DIR direction) const; // span overload: target extent is looked up from the precomputed table
+ REO_POS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const; // accepts L2R or R2L only; exits the process otherwise
+ const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=BIDIR) const; // string variants; BIDIR yields both names separated by a space
+ const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=BIDIR) const;
+ static const std::string GetOrientationString(const REO_POS orient, const REO_MODEL_TYPE modelType=REO_MSLR);
+ static void WriteOrientation(std::ostream& out, const REO_POS orient, const REO_MODEL_TYPE modelType=REO_MSLR);
+ void IncrementPriorCount(REO_DIR direction, REO_POS orient, float increment); // non-static, but updates the static counters declared below
+ static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MSLR);
+
+private:
+
+ void InsertVertex( HSentenceVertices & corners, int x, int y ); // adds x to the set under key y
+
+ void InsertPhraseVertices(HSentenceVertices & topLeft,
+ HSentenceVertices & topRight,
+ HSentenceVertices & bottomLeft,
+ HSentenceVertices & bottomRight,
+ int startF, int startE, int endF, int endE);
+
+ REO_POS GetOrientHierModel(REO_MODEL_TYPE modelType,
+ bool connectedLeftTop, bool connectedRightTop,
+ int startF, int endF, int startE, int endE, int countF, int zero, int unit,
+ bool (*ge)(int, int), bool (*lt)(int, int),
+ const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const;
+
+ bool IsAligned(int fi, int ei) const;
+
+ static bool ge(int first, int second) { return first >= second; }; // comparison helpers passed by pointer to GetOrientHierModel()
+ static bool le(int first, int second) { return first <= second; }; // NOTE(review): not referenced in the PhraseOrientation.cpp shown in this commit
+ static bool lt(int first, int second) { return first < second; };
+
+ const std::vector<std::string> &m_source; // reference members: the referenced objects must outlive this instance
+ const std::vector<std::string> &m_target;
+ const Alignment &m_alignment;
+
+ std::vector<std::vector<int> > m_alignedToT; // m_alignedToT[e] = source positions aligned to target position e
+
+ HSentenceVertices m_topLeft; // corner maps filled by the constructor via InsertPhraseVertices()
+ HSentenceVertices m_topRight;
+ HSentenceVertices m_bottomLeft;
+ HSentenceVertices m_bottomRight;
+
+ boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToSourceSpan; // (startF,endF) -> (min,max) aligned target position
+
+ static std::vector<float> m_l2rOrientationPriorCounts; // shared across all instances; indexed by REO_POS value
+ static std::vector<float> m_r2lOrientationPriorCounts;
+};
+
+} // namespace GHKM
+} // namespace Moses
+
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
index be373b67b..2fba6930b 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp
@@ -169,14 +169,17 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out)
}
}
-void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g)
+void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g, bool printEndl)
{
- Write(rule,false);
- m_fwd << " {{Tree ";
- g.PrintTree(m_fwd);
- m_fwd << "}}";
+ Write(rule,false);
+ m_fwd << " {{Tree ";
+ g.PrintTree(m_fwd);
+ m_fwd << "}}";
+
+ if (printEndl) {
m_fwd << std::endl;
m_inv << std::endl;
+ }
}
} // namespace GHKM
diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
index 18f423149..8a8564580 100644
--- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h
+++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h
@@ -44,7 +44,7 @@ public:
void Write(const ScfgRule &rule, bool printEndl=true);
- void Write(const ScfgRule &rule, const Subgraph &g);
+ void Write(const ScfgRule &rule, const Subgraph &g, bool printEndl=true);
private:
// Disallow copying
diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h
index e5bf5b463..d0209254f 100644
--- a/phrase-extract/extract-ghkm/XmlTreeParser.h
+++ b/phrase-extract/extract-ghkm/XmlTreeParser.h
@@ -49,6 +49,8 @@ public:
static std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &,
const std::vector<std::string> &);
+ const std::vector<std::string>& GetWords() { return m_words; }; // returns m_words by const reference; NOTE(review): the accessor itself is not declared const
+
private:
std::set<std::string> &m_labelSet;
diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp
index fe3d99cd2..552dcb739 100644
--- a/phrase-extract/extract-main.cpp
+++ b/phrase-extract/extract-main.cpp
@@ -363,8 +363,6 @@ void ExtractTask::extract(SentenceAlignment &sentence)
HSentenceVertices outBottomLeft;
HSentenceVertices outBottomRight;
- HSentenceVertices::const_iterator it;
-
bool relaxLimit = m_options.isHierModel();
bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel();
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index e8ba1d942..7f155f6ed 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -46,6 +46,7 @@ LexicalTable lexTable;
bool inverseFlag = false;
bool hierarchicalFlag = false;
bool pcfgFlag = false;
+bool phraseOrientationFlag = false;
bool treeFragmentsFlag = false;
bool sourceSyntaxLabelsFlag = false;
bool sourceSyntaxLabelSetFlag = false;
@@ -69,6 +70,7 @@ bool nonTermContext = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCountHierarchical = 0;
+bool phraseOrientationPriorsFlag = false;
boost::unordered_map<std::string,float> sourceLHSCounts;
boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > targetLHSAndSourceLHSJointCounts;
@@ -82,6 +84,9 @@ std::set<std::string> targetPreferenceLabelSet;
std::map<std::string,size_t> targetPreferenceLabels;
std::vector<std::string> targetPreferenceLabelsByIndex;
+std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dright dleft
+std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dright dleft
+
Vocabulary vcbT;
Vocabulary vcbS;
@@ -106,6 +111,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostrea
double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource );
set<std::string> functionWordList;
+void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L);
void loadFunctionWords( const string &fileNameFunctionWords );
double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource );
@@ -136,6 +142,7 @@ int main(int argc, char* argv[])
std::string fileNameTargetPreferenceLabelSet;
std::string fileNameLeftHandSideTargetPreferenceLabelCounts;
std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts;
+ std::string fileNamePhraseOrientationPriors;
std::vector<std::string> featureArgs; // all unknown args passed to feature manager
for(int i=4; i<argc; i++) {
@@ -148,9 +155,12 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--PCFG") == 0) {
pcfgFlag = true;
std::cerr << "including PCFG scores" << std::endl;
+ } else if (strcmp(argv[i],"--PhraseOrientation") == 0) {
+ phraseOrientationFlag = true;
+ std::cerr << "including phrase orientation information" << std::endl;
} else if (strcmp(argv[i],"--TreeFragments") == 0) {
treeFragmentsFlag = true;
- std::cerr << "including tree fragment information from syntactic parse\n";
+ std::cerr << "including tree fragment information from syntactic parse" << std::endl;
} else if (strcmp(argv[i],"--SourceLabels") == 0) {
sourceSyntaxLabelsFlag = true;
std::cerr << "including source label information" << std::endl;
@@ -216,6 +226,14 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
crossedNonTerm = true;
std::cerr << "crossed non-term reordering feature" << std::endl;
+ } else if (strcmp(argv[i],"--PhraseOrientationPriors") == 0) {
+ phraseOrientationPriorsFlag = true;
+ if (i+1==argc) {
+ std::cerr << "ERROR: specify priors file for phrase orientation!" << std::endl;
+ exit(1);
+ }
+ fileNamePhraseOrientationPriors = argv[++i];
+ std::cerr << "smoothing phrase orientation with priors from " << fileNamePhraseOrientationPriors << std::endl;
} else if (strcmp(argv[i],"--SpanLength") == 0) {
spanLength = true;
std::cerr << "span length feature" << std::endl;
@@ -254,6 +272,10 @@ int main(int argc, char* argv[])
for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
}
+ if (phraseOrientationPriorsFlag) {
+ loadOrientationPriors(fileNamePhraseOrientationPriors,orientationClassPriorsL2R,orientationClassPriorsR2L);
+ }
+
// sorted phrase extraction file
Moses::InputFileStream extractFile(fileNameExtract);
@@ -774,11 +796,6 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
if (kneserNeyFlag)
phraseTableFile << " " << distinctCount;
- if ((treeFragmentsFlag || sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) &&
- !inverseFlag) {
- phraseTableFile << " |||";
- }
-
phraseTableFile << " |||";
// tree fragments
@@ -832,6 +849,13 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
}
}
+ // phrase orientation
+ if (phraseOrientationFlag && !inverseFlag) {
+ phraseTableFile << " {{Orientation ";
+ phrasePair.CollectAllPhraseOrientations("Orientation",orientationClassPriorsL2R,orientationClassPriorsR2L,0.5,phraseTableFile);
+ phraseTableFile << "}}";
+ }
+
if (spanLength && !inverseFlag) {
string propValue = phrasePair.CollectAllPropertyValues("SpanLength");
if (!propValue.empty()) {
@@ -851,6 +875,94 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair,
+void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors,
+ std::vector<float> &orientationClassPriorsL2R,
+ std::vector<float> &orientationClassPriorsR2L)
+{
+ assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dright dleft
+
+ std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors;
+ ifstream inFile;
+ inFile.open(fileNamePhraseOrientationPriors.c_str());
+ if (inFile.fail()) {
+ std::cerr << " - ERROR: could not open file" << std::endl;
+ exit(1);
+ }
+
+ std::string line;
+ size_t linesRead = 0;
+ float l2rSum = 0;
+ float r2lSum = 0;
+ while (getline(inFile, line)) {
+ istringstream tokenizer(line);
+ std::string key;
+ tokenizer >> key;
+
+ bool l2rFlag = false;
+ bool r2lFlag = false;
+ if (!key.substr(0,4).compare("L2R_")) {
+ l2rFlag = true;
+ }
+ if (!key.substr(0,4).compare("R2L_")) {
+ r2lFlag = true;
+ }
+ if (!l2rFlag && !r2lFlag) {
+ std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
+ }
+ key.erase(0,4);
+
+ int orientationClassId = -1;
+ if (!key.compare("mono")) {
+ orientationClassId = 0;
+ }
+ if (!key.compare("swap")) {
+ orientationClassId = 1;
+ }
+ if (!key.compare("dright")) {
+ orientationClassId = 2;
+ }
+ if (!key.compare("dleft")) {
+ orientationClassId = 3;
+ }
+ if (orientationClassId == -1) {
+ std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl;
+ }
+
+ float count;
+ tokenizer >> count;
+
+ if (l2rFlag) {
+ orientationClassPriorsL2R[orientationClassId] += count;
+ l2rSum += count;
+ }
+ if (r2lFlag) {
+ orientationClassPriorsR2L[orientationClassId] += count;
+ r2lSum += count;
+ }
+
+ ++linesRead;
+ }
+
+ // normalization: return prior probabilities, not counts
+ if (l2rSum != 0) {
+ for (std::vector<float>::iterator orientationClassPriorsL2RIt = orientationClassPriorsL2R.begin();
+ orientationClassPriorsL2RIt != orientationClassPriorsL2R.end(); ++orientationClassPriorsL2RIt) {
+ *orientationClassPriorsL2RIt /= l2rSum;
+ }
+ }
+ if (r2lSum != 0) {
+ for (std::vector<float>::iterator orientationClassPriorsR2LIt = orientationClassPriorsR2L.begin();
+ orientationClassPriorsR2LIt != orientationClassPriorsR2L.end(); ++orientationClassPriorsR2LIt) {
+ *orientationClassPriorsR2LIt /= r2lSum;
+ }
+ }
+
+ std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl;
+ inFile.close();
+}
+
+
+
bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource )
{
for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) {
diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl
index bf0f0129a..1108bec1b 100755
--- a/scripts/ems/experiment.perl
+++ b/scripts/ems/experiment.perl
@@ -2195,9 +2195,10 @@ sub define_training_extract_phrases {
$cmd .= "-glue-grammar-file $glue_grammar_file ";
if (&get("GENERAL:output-parser") && (&get("TRAINING:use-unknown-word-labels") || &get("TRAINING:use-unknown-word-soft-matches"))) {
- my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""));
- $cmd .= "-unknown-word-label $unknown_word_label ";
+ my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model",""));
+ $cmd .= "-unknown-word-label $unknown_word_label ";
}
+
if (&get("GENERAL:output-parser") && &get("TRAINING:use-unknown-word-soft-matches")) {
my $unknown_word_soft_matches = &versionize(&long_file_name("unknown-word-soft-matches","model",""));
$cmd .= "-unknown-word-soft-matches $unknown_word_soft_matches ";
@@ -2210,6 +2211,12 @@ sub define_training_extract_phrases {
if (&get("TRAINING:ghkm-tree-fragments")) {
$cmd .= "-ghkm-tree-fragments ";
}
+
+ if (&get("TRAINING:ghkm-phrase-orientation")) {
+ $cmd .= "-ghkm-phrase-orientation ";
+ my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
+ $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
+ }
}
my $extract_settings = &get("TRAINING:extract-settings");
@@ -2242,6 +2249,11 @@ sub define_training_build_ttable {
if (&get("TRAINING:ghkm-tree-fragments")) {
$cmd .= "-ghkm-tree-fragments ";
}
+ if (&get("TRAINING:ghkm-phrase-orientation")) {
+ $cmd .= "-ghkm-phrase-orientation ";
+ my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model",""));
+ $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file ";
+ }
}
&create_step($step_id,$cmd);
diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl
index 7abada1de..ff6a058b5 100755
--- a/scripts/generic/extract-parallel.perl
+++ b/scripts/generic/extract-parallel.perl
@@ -29,6 +29,8 @@ my $otherExtractArgs= "";
my $weights = "";
my $baselineExtract;
my $glueFile;
+my $phraseOrientation = 0;
+my $phraseOrientationPriorsFile;
for (my $i = 8; $i < $#ARGV + 1; ++$i)
{
@@ -45,6 +47,11 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i)
$glueFile = $ARGV[++$i];
next;
}
+ $phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation";
+ if ($ARGV[$i] eq '--PhraseOrientationPriors') {
+ $phraseOrientationPriorsFile = $ARGV[++$i];
+ next;
+ }
$otherExtractArgs .= $ARGV[$i] ." ";
}
@@ -219,6 +226,32 @@ if (defined($glueFile)) {
print STDERR `$cmd`;
}
+# phrase orientation priors (GHKM extraction)
+if ($phraseOrientation && defined($phraseOrientationPriorsFile)) {
+ print STDERR "Merging phrase orientation priors\n";
+
+ my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors");
+ my %priorCounts;
+
+ foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) {
+ if (-f $filenamePhraseOrientationPriors) {
+ open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!";
+ while (my $line = <$infilePhraseOrientationPriors>) {
+ print $line;
+ my ($key, $value) = split / /, $line;
+ $priorCounts{$key} += $value;
+ }
+ close $infilePhraseOrientationPriors;
+ }
+ }
+
+ open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!";
+ foreach my $key (sort keys %priorCounts) {
+ print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n";
+ }
+ close($outPhraseOrientationPriors);
+}
+
# delete temporary files
$cmd = "rm -rf $TMPDIR \n";
print STDERR $cmd;
diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl
index 22ecc5ff9..da8e677bc 100755
--- a/scripts/training/train-model.perl
+++ b/scripts/training/train-model.perl
@@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_
$_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE,
@_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS,
$_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE,
- $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
+ $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS,
$_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE,
$_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES,
$_MEMSCORE, $_FINAL_ALIGNMENT_MODEL,
@@ -110,6 +110,8 @@ $_HELP = 1
'unknown-word-soft-matches-file=s' => \$_UNKNOWN_WORD_SOFT_MATCHES_FILE, # give dummy label to unknown word, and allow soft matches to all other labels (with cost determined by sparse features)
'ghkm' => \$_GHKM,
'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS,
+ 'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION,
+ 'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation
'pcfg' => \$_PCFG,
'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1,
'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2,
@@ -1426,6 +1428,8 @@ sub extract_phrase {
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
+ $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
+ $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE);
if (!defined($_GHKM)) {
$cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX;
$cmd .= " --TargetSyntax" if $_TARGET_SYNTAX;
@@ -1550,6 +1554,9 @@ sub score_phrase_phrase_extract {
my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/);
my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/);
my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef;
+ my $SOURCE_LABELS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabels/);
+ my $SOURCE_LABEL_COUNTS_LHS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelCountsLHS/);
+ my $SOURCE_LABEL_SET = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelSet/);
my $SPAN_LENGTH = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SpanLength/);
my $CORE_SCORE_OPTIONS = "";
$CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB;
@@ -1557,6 +1564,9 @@ sub score_phrase_phrase_extract {
$CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX;
$CORE_SCORE_OPTIONS .= " --Singleton" if $SINGLETON;
$CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM;
+ $CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS;
+ $CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS;
+ $CORE_SCORE_OPTIONS .= " --SourceLabelSet " if $SOURCE_LABEL_SET;
my $substep = 1;
my $isParent = 1;
@@ -1597,6 +1607,8 @@ sub score_phrase_phrase_extract {
$cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2;
$cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1;
$cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS;
+ $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION;
+ $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE);
$cmd .= " $DOMAIN" if $DOMAIN;
$cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS);
$cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE;