diff options
-rw-r--r-- | phrase-extract/ExtractionPhrasePair.cpp | 90 | ||||
-rw-r--r-- | phrase-extract/ExtractionPhrasePair.h | 6 | ||||
-rw-r--r-- | phrase-extract/PhraseExtractionOptions.h | 1 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/ExtractGHKM.cpp | 64 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/Options.h | 2 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/PhraseOrientation.cpp | 417 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/PhraseOrientation.h | 102 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/ScfgRuleWriter.cpp | 13 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/ScfgRuleWriter.h | 2 | ||||
-rw-r--r-- | phrase-extract/extract-ghkm/XmlTreeParser.h | 2 | ||||
-rw-r--r-- | phrase-extract/extract-main.cpp | 2 | ||||
-rw-r--r-- | phrase-extract/score-main.cpp | 124 | ||||
-rwxr-xr-x | scripts/ems/experiment.perl | 16 | ||||
-rwxr-xr-x | scripts/generic/extract-parallel.perl | 33 | ||||
-rwxr-xr-x | scripts/training/train-model.perl | 14 |
15 files changed, 858 insertions, 30 deletions
diff --git a/phrase-extract/ExtractionPhrasePair.cpp b/phrase-extract/ExtractionPhrasePair.cpp index 9564b1cfe..ccf0fc275 100644 --- a/phrase-extract/ExtractionPhrasePair.cpp +++ b/phrase-extract/ExtractionPhrasePair.cpp @@ -463,6 +463,96 @@ std::string ExtractionPhrasePair::CollectAllLabelsSeparateLHSAndRHS(const std::s } +void ExtractionPhrasePair::CollectAllPhraseOrientations(const std::string &key, + const std::vector<float> &orientationClassPriorsL2R, + const std::vector<float> &orientationClassPriorsR2L, + double smoothingFactor, + std::ostream &out) const +{ + assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dright dleft + + const PROPERTY_VALUES *allPropertyValues = GetProperty( key ); + + if ( allPropertyValues == NULL ) { + return; + } + + // bidirectional MSLR phrase orientation with 2x4 orientation classes: + // mono swap dright dleft + std::vector<float> orientationClassCountSumL2R(4,0); + std::vector<float> orientationClassCountSumR2L(4,0); + + for (PROPERTY_VALUES::const_iterator iter=allPropertyValues->begin(); + iter!=allPropertyValues->end(); ++iter) { + std::string l2rOrientationClass, r2lOrientationClass; + try { + istringstream tokenizer(iter->first); + tokenizer >> l2rOrientationClass; + tokenizer >> r2lOrientationClass; + if ( tokenizer.peek() != EOF ) { + UTIL_THROW(util::Exception, "ExtractionPhrasePair" + << ": Collecting phrase orientations failed. " + << "Too many tokens?"); + } + } catch (const std::exception &e) { + UTIL_THROW(util::Exception, "ExtractionPhrasePair" + << ": Collecting phrase orientations failed. " + << "Flawed property value in extract file?"); + } + + int l2rOrientationClassId = -1; + if (!l2rOrientationClass.compare("mono")) { + l2rOrientationClassId = 0; + } + if (!l2rOrientationClass.compare("swap")) { + l2rOrientationClassId = 1; + } + if (!l2rOrientationClass.compare("dright")) { + l2rOrientationClassId = 2; + } + if (!l2rOrientationClass.compare("dleft")) { + l2rOrientationClassId = 3; + } + if (l2rOrientationClassId == -1) { + UTIL_THROW(util::Exception, "ExtractionPhrasePair" + << ": Collecting phrase orientations failed. " + << "Unknown orientation class \"" << l2rOrientationClass << "\"." ); + } + int r2lOrientationClassId = -1; + if (!r2lOrientationClass.compare("mono")) { + r2lOrientationClassId = 0; + } + if (!r2lOrientationClass.compare("swap")) { + r2lOrientationClassId = 1; + } + if (!r2lOrientationClass.compare("dright")) { + r2lOrientationClassId = 2; + } + if (!r2lOrientationClass.compare("dleft")) { + r2lOrientationClassId = 3; + } + if (r2lOrientationClassId == -1) { + UTIL_THROW(util::Exception, "ExtractionPhrasePair" + << ": Collecting phrase orientations failed. " + << "Unknown orientation class \"" << r2lOrientationClass << "\"." ); + } + + orientationClassCountSumL2R[l2rOrientationClassId] += iter->second; + orientationClassCountSumR2L[r2lOrientationClassId] += iter->second; + } + + for (size_t i=0; i<4; ++i) { + if (i>0) { + out << " "; + } + out << (float)( (smoothingFactor*orientationClassPriorsL2R[i] + orientationClassCountSumL2R[i]) / (smoothingFactor + m_count) ); + } + for (size_t i=0; i<4; ++i) { + out << " " << (float)( (smoothingFactor*orientationClassPriorsR2L[i] + orientationClassCountSumR2L[i]) / (smoothingFactor + m_count) ); + } +} + + } diff --git a/phrase-extract/ExtractionPhrasePair.h b/phrase-extract/ExtractionPhrasePair.h index ba23ac1f2..e0f5dc5fb 100644 --- a/phrase-extract/ExtractionPhrasePair.h +++ b/phrase-extract/ExtractionPhrasePair.h @@ -131,6 +131,12 @@ public: boost::unordered_map<std::string, boost::unordered_map<std::string,float>* >& sourceRHSAndLHSJointCounts, Vocabulary &vcbT) const; + void CollectAllPhraseOrientations(const std::string &key, + const std::vector<float> &orientationClassPriorsL2R, + const std::vector<float> &orientationClassPriorsR2L, + double smoothingFactor, + std::ostream &out) const; + void AddProperties( const std::string &str, float count ); void AddProperty( const std::string &key, const std::string &value, float count ) diff --git a/phrase-extract/PhraseExtractionOptions.h b/phrase-extract/PhraseExtractionOptions.h index 87712d6d3..7132974d4 100644 --- a/phrase-extract/PhraseExtractionOptions.h +++ b/phrase-extract/PhraseExtractionOptions.h @@ -18,7 +18,6 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA ***********************************************************************/ -/* Created by Rohit Gupta, CDAC, Mumbai, India on 18 July, 2012*/ #include <string> #include <vector> diff --git a/phrase-extract/extract-ghkm/ExtractGHKM.cpp b/phrase-extract/extract-ghkm/ExtractGHKM.cpp index b86c28586..36dfee2e5 100644 --- a/phrase-extract/extract-ghkm/ExtractGHKM.cpp +++ b/phrase-extract/extract-ghkm/ExtractGHKM.cpp @@ -27,6 +27,7 @@ #include "OutputFileStream.h" #include "Options.h" #include "ParseTree.h" +#include "PhraseOrientation.h" #include "ScfgRule.h" #include "ScfgRuleWriter.h" #include "Span.h" @@ -66,11 +67,12 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Open output files. OutputFileStream fwdExtractStream; OutputFileStream invExtractStream; - std::ofstream glueGrammarStream; - std::ofstream targetUnknownWordStream; - std::ofstream sourceUnknownWordStream; - std::ofstream sourceLabelSetStream; - std::ofstream unknownWordSoftMatchesStream; + OutputFileStream glueGrammarStream; + OutputFileStream targetUnknownWordStream; + OutputFileStream sourceUnknownWordStream; + OutputFileStream sourceLabelSetStream; + OutputFileStream unknownWordSoftMatchesStream; + std::string fwdFileName = options.extractFile; std::string invFileName = options.extractFile + std::string(".inv"); if (options.gzOutput) { @@ -79,6 +81,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) } OpenOutputFileOrDie(fwdFileName, fwdExtractStream); OpenOutputFileOrDie(invFileName, invExtractStream); + if (!options.glueGrammarFile.empty()) { OpenOutputFileOrDie(options.glueGrammarFile, glueGrammarStream); } @@ -118,7 +121,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) std::string sourceLine; std::string alignmentLine; Alignment alignment; - XmlTreeParser xmlTreeParser(targetLabelSet, targetTopLabelSet); + XmlTreeParser targetXmlTreeParser(targetLabelSet, targetTopLabelSet); // XmlTreeParser sourceXmlTreeParser(sourceLabelSet, sourceTopLabelSet); ScfgRuleWriter writer(fwdExtractStream, invExtractStream, options); size_t lineNum = options.sentenceOffset; @@ -144,7 +147,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) } std::auto_ptr<ParseTree> targetParseTree; try { - targetParseTree = xmlTreeParser.Parse(targetLine); + targetParseTree = targetXmlTreeParser.Parse(targetLine); assert(targetParseTree.get()); } catch (const Exception &e) { std::ostringstream oss; @@ -181,7 +184,7 @@ int ExtractGHKM::Main(int argc, char *argv[]) // Read source tokens. std::vector<std::string> sourceTokens(ReadTokens(sourceLine)); - // Construct a source ParseTree object object from the SyntaxTree object. + // Construct a source ParseTree object from the SyntaxTree object. std::auto_ptr<ParseTree> sourceParseTree; if (options.sourceLabels) { @@ -235,11 +238,26 @@ int ExtractGHKM::Main(int argc, char *argv[]) graph.ExtractComposedRules(options); } + // Initialize phrase orientation scoring object + PhraseOrientation phraseOrientation( sourceTokens, targetXmlTreeParser.GetWords(), alignment); + // Write the rules, subject to scope pruning. const std::vector<Node *> &targetNodes = graph.GetTargetNodes(); for (std::vector<Node *>::const_iterator p = targetNodes.begin(); p != targetNodes.end(); ++p) { + const std::vector<const Subgraph *> &rules = (*p)->GetRules(); + + REO_POS l2rOrientation, r2lOrientation; + if (options.phraseOrientation && !rules.empty()) { + int sourceSpanBegin = *((*p)->GetSpan().begin()); + int sourceSpanEnd = *((*p)->GetSpan().rbegin()); + l2rOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,L2R); + r2lOrientation = phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd,R2L); + // std::cerr << "span " << sourceSpanBegin << " " << sourceSpanEnd << std::endl; + // std::cerr << "phraseOrientation " << phraseOrientation.GetOrientationInfo(sourceSpanBegin,sourceSpanEnd) << std::endl; + } + for (std::vector<const Subgraph *>::const_iterator q = rules.begin(); q != rules.end(); ++q) { ScfgRule *r = 0; @@ -251,16 +269,34 @@ int ExtractGHKM::Main(int argc, char *argv[]) // TODO Can scope pruning be done earlier? if (r->Scope() <= options.maxScope) { if (!options.treeFragments) { - writer.Write(*r); + writer.Write(*r,false); } else { - writer.Write(*r,**q); + writer.Write(*r,**q,false); + } + if (options.phraseOrientation) { + fwdExtractStream << " {{Orientation "; + phraseOrientation.WriteOrientation(fwdExtractStream,l2rOrientation); + fwdExtractStream << " "; + phraseOrientation.WriteOrientation(fwdExtractStream,r2lOrientation); + fwdExtractStream << "}}"; + phraseOrientation.IncrementPriorCount(L2R,l2rOrientation,1); + phraseOrientation.IncrementPriorCount(R2L,r2lOrientation,1); } + fwdExtractStream << std::endl; + invExtractStream << std::endl; } delete r; } } } + if (options.phraseOrientation) { + std::string phraseOrientationPriorsFileName = options.extractFile + std::string(".phraseOrientationPriors"); + OutputFileStream phraseOrientationPriorsStream; + OpenOutputFileOrDie(phraseOrientationPriorsFileName, phraseOrientationPriorsStream); + PhraseOrientation::WritePriorCounts(phraseOrientationPriorsStream); + } + std::map<std::string,size_t> sourceLabels; if (options.sourceLabels && !options.sourceLabelSetFile.empty()) { @@ -398,6 +434,8 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], "extract minimal rules only") ("PCFG", "include score based on PCFG scores in target corpus") + ("PhraseOrientation", + "output phrase orientation information") ("TreeFragments", "output parse tree information") ("SourceLabels", @@ -502,6 +540,9 @@ void ExtractGHKM::ProcessOptions(int argc, char *argv[], if (vm.count("PCFG")) { options.pcfg = true; } + if (vm.count("PhraseOrientation")) { + options.phraseOrientation = true; + } if (vm.count("TreeFragments")) { options.treeFragments = true; } @@ -736,8 +777,7 @@ void ExtractGHKM::WriteUnknownWordSoftMatches( const std::set<std::string> &labelSet, std::ostream &out) { - std::set<std::string>::const_iterator p = labelSet.begin(); - for (p; p != labelSet.end(); ++p) { + for (std::set<std::string>::const_iterator p = labelSet.begin(); p != labelSet.end(); ++p) { std::string label = *p; out << "UNK " << label << std::endl; } diff --git a/phrase-extract/extract-ghkm/Options.h b/phrase-extract/extract-ghkm/Options.h index 28a581802..0102e2f64 100644 --- a/phrase-extract/extract-ghkm/Options.h +++ b/phrase-extract/extract-ghkm/Options.h @@ -40,6 +40,7 @@ public: , maxScope(3) , minimal(false) , pcfg(false) + , phraseOrientation(false) , treeFragments(false) , sourceLabels(false) , sentenceOffset(0) @@ -64,6 +65,7 @@ public: int maxScope; bool minimal; bool pcfg; + bool phraseOrientation; bool treeFragments; bool sourceLabels; std::string sourceLabelSetFile; diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/extract-ghkm/PhraseOrientation.cpp new file mode 100644 index 000000000..a96e5361c --- /dev/null +++ b/phrase-extract/extract-ghkm/PhraseOrientation.cpp @@ -0,0 +1,417 @@ +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#include "PhraseOrientation.h" + +#include <iostream> + +#include <boost/assign/list_of.hpp> + +namespace Moses +{ +namespace GHKM +{ + +std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0); +std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0); + +PhraseOrientation::PhraseOrientation(const std::vector<std::string> &source, + const std::vector<std::string> &target, + const Alignment &alignment) + : m_source(source) + , m_target(target) + , m_alignment(alignment) +{ + + int countF = m_source.size(); + int countE = m_target.size(); + + // prepare data structures for alignments + std::vector<std::vector<int> > alignedToS; + for(int i=0; i<countF; ++i) { + std::vector< int > dummy; + alignedToS.push_back(dummy); + } + for(int i=0; i<countE; ++i) { + std::vector< int > dummy; + m_alignedToT.push_back(dummy); + } + std::vector<int> alignedCountS(countF,0); + + for (Alignment::const_iterator a=alignment.begin(); a!=alignment.end(); ++a) { + m_alignedToT[a->second].push_back(a->first); + alignedCountS[a->first]++; + alignedToS[a->first].push_back(a->second); + } + + for (int startF=0; startF<countF; ++startF) { + for (int endF=startF; endF<countF; ++endF) { + + int minE = 9999; + int maxE = -1; + for (int fi=startF; fi<=endF; ++fi) { + for (size_t i=0; i<alignedToS[fi].size(); ++i) { + int ei = alignedToS[fi][i]; + if (ei<minE) { + minE = ei; + } + if (ei>maxE) { + maxE = ei; + } + } + } + + m_minAndMaxAlignedToSourceSpan[ std::pair<int,int>(startF,endF) ] = std::pair<int,int>(minE,maxE); + } + } + + // check alignments for target phrase startE...endE + // loop over continuous phrases which are compatible with the word alignments + for (int startE=0; startE<countE; ++startE) { + for (int endE=startE; endE<countE; ++endE) { + + int minF = 9999; + int maxF = -1; + std::vector< int > usedF = alignedCountS; + for (int ei=startE; ei<=endE; ++ei) { + for (size_t i=0; i<m_alignedToT[ei].size(); ++i) { + int fi = m_alignedToT[ei][i]; + if (fi<minF) { + minF = fi; + } + if (fi>maxF) { + maxF = fi; + } + usedF[fi]--; + } + } + + if (maxF >= 0) { // aligned to any source words at all + + // check if source words are aligned to out of bound target words + bool out_of_bounds = false; + for (int fi=minF; fi<=maxF && !out_of_bounds; ++fi) + if (usedF[fi]>0) { + // cout << "ouf of bounds: " << fi << "\n"; + out_of_bounds = true; + } + + // cout << "doing if for ( " << minF << "-" << maxF << ", " << startE << "," << endE << ")\n"; + if (!out_of_bounds) { + // start point of source phrase may retreat over unaligned + for (int startF=minF; + (startF>=0 && + (startF==minF || alignedCountS[startF]==0)); // unaligned + startF--) { + // end point of source phrase may advance over unaligned + for (int endF=maxF; + (endF<countF && + (endF==maxF || alignedCountS[endF]==0)); // unaligned + endF++) { // at this point we have extracted a phrase + + InsertPhraseVertices(m_topLeft, m_topRight, m_bottomLeft, m_bottomRight, + startF, startE, endF, endE); + } + } + } + } + } + } +} + + +void PhraseOrientation::InsertVertex( HSentenceVertices & corners, int x, int y ) +{ + std::set<int> tmp; + tmp.insert(x); + std::pair< HSentenceVertices::iterator, bool > ret = corners.insert( std::pair<int, std::set<int> > (y, tmp) ); + if (ret.second == false) { + ret.first->second.insert(x); + } +} + + +void PhraseOrientation::InsertPhraseVertices(HSentenceVertices & topLeft, + HSentenceVertices & topRight, + HSentenceVertices & bottomLeft, + HSentenceVertices & bottomRight, + int startF, int startE, int endF, int endE) +{ + + InsertVertex(topLeft, startF, startE); + InsertVertex(topRight, endF, startE); + InsertVertex(bottomLeft, startF, endE); + InsertVertex(bottomRight, endF, endE); +} + + +const std::string PhraseOrientation::GetOrientationInfoString(int startF, int endF, REO_DIR direction) const +{ + boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax + = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) ); + + if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) { + int startE = (foundMinMax->second).first; + int endE = (foundMinMax->second).second; +// std::cerr << "Phrase orientation for" +// << " startF=" << startF +// << " endF=" << endF +// << " startE=" << startE +// << " endE=" << endE +// << std::endl; + return GetOrientationInfoString(startF, startE, endF, endE, direction); + } else { + std::cerr << "Error: not able to determine phrase orientation" << std::endl; + std::exit(1); + } +} + + +const std::string PhraseOrientation::GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction) const +{ + REO_POS hierPrevOrient, hierNextOrient; + + bool connectedLeftTopP = IsAligned( startF-1, startE-1 ); + bool connectedRightTopP = IsAligned( endF+1, startE-1 ); + bool connectedLeftTopN = IsAligned( endF+1, endE+1 ); + bool connectedRightTopN = IsAligned( startF-1, endE+1 ); + + if ( direction == L2R || direction == BIDIR ) + hierPrevOrient = GetOrientHierModel(REO_MSLR, + connectedLeftTopP, connectedRightTopP, + startF, endF, startE, endE, m_source.size()-1, 0, 1, + &ge, <, + m_bottomRight, m_bottomLeft); + + if ( direction == R2L || direction == BIDIR ) + hierNextOrient = GetOrientHierModel(REO_MSLR, + connectedLeftTopN, connectedRightTopN, + endF, startF, endE, startE, 0, m_source.size()-1, -1, + <, &ge, + m_bottomLeft, m_bottomRight); + + switch (direction) { + case L2R: + return GetOrientationString(hierPrevOrient, REO_MSLR); + break; + case R2L: + return GetOrientationString(hierNextOrient, REO_MSLR); + break; + case BIDIR: + return GetOrientationString(hierPrevOrient, REO_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MSLR); + break; + default: + return GetOrientationString(hierPrevOrient, REO_MSLR) + " " + GetOrientationString(hierNextOrient, REO_MSLR); + break; + } + return "PhraseOrientationERROR"; +} + + +REO_POS PhraseOrientation::GetOrientationInfo(int startF, int endF, REO_DIR direction) const +{ + boost::unordered_map< std::pair<int,int> , std::pair<int,int> >::const_iterator foundMinMax + = m_minAndMaxAlignedToSourceSpan.find( std::pair<int,int>(startF,endF) ); + + if ( foundMinMax != m_minAndMaxAlignedToSourceSpan.end() ) { + int startE = (foundMinMax->second).first; + int endE = (foundMinMax->second).second; +// std::cerr << "Phrase orientation for" +// << " startF=" << startF +// << " endF=" << endF +// << " startE=" << startE +// << " endE=" << endE +// << std::endl; + return GetOrientationInfo(startF, startE, endF, endE, direction); + } else { + std::cerr << "Error: not able to determine phrase orientation" << std::endl; + std::exit(1); + } +} + + +REO_POS PhraseOrientation::GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const +{ + if ( direction != L2R && direction != R2L ) { + std::cerr << "PhraseOrientation::GetOrientationInfo(): direction should be either L2R or R2L" << std::endl; + std::exit(1); + } + + bool connectedLeftTopP = IsAligned( startF-1, startE-1 ); + bool connectedRightTopP = IsAligned( endF+1, startE-1 ); + bool connectedLeftTopN = IsAligned( endF+1, endE+1 ); + bool connectedRightTopN = IsAligned( startF-1, endE+1 ); + + if ( direction == L2R ) + return GetOrientHierModel(REO_MSLR, + connectedLeftTopP, connectedRightTopP, + startF, endF, startE, endE, m_source.size()-1, 0, 1, + &ge, <, + m_bottomRight, m_bottomLeft); + + if ( direction == R2L ) + return GetOrientHierModel(REO_MSLR, + connectedLeftTopN, connectedRightTopN, + endF, startF, endE, startE, 0, m_source.size()-1, -1, + <, &ge, + m_bottomLeft, m_bottomRight); + + return UNKNOWN; +} + + +// to be called with countF-1 instead of countF +REO_POS PhraseOrientation::GetOrientHierModel(REO_MODEL_TYPE modelType, + bool connectedLeftTop, bool connectedRightTop, + int startF, int endF, int startE, int endE, int countF, int zero, int unit, + bool (*ge)(int, int), bool (*lt)(int, int), + const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const +{ + HSentenceVertices::const_iterator it; + + if ((connectedLeftTop && !connectedRightTop) || + ((it = bottomRight.find(startE - unit)) != bottomRight.end() && + it->second.find(startF-unit) != it->second.end())) + return LEFT; + + if (modelType == REO_MONO) + return UNKNOWN; + + if ((!connectedLeftTop && connectedRightTop) || + ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() && + it->second.find(endF + unit) != it->second.end())) + return RIGHT; + + if (modelType == REO_MSD) + return UNKNOWN; + + connectedLeftTop = false; + for (int indexF=startF-2*unit; (*ge)(indexF, zero) && !connectedLeftTop; indexF=indexF-unit) { + if ((connectedLeftTop = ((it = bottomRight.find(startE - unit)) != bottomRight.end() && + it->second.find(indexF) != it->second.end()))) + return DRIGHT; + } + + connectedRightTop = false; + for (int indexF=endF+2*unit; (*lt)(indexF, countF) && !connectedRightTop; indexF=indexF+unit) { + if ((connectedRightTop = ((it = bottomLeft.find(startE - unit)) != bottomLeft.end() && + it->second.find(indexF) != it->second.end()))) + return DLEFT; + } + + return UNKNOWN; +} + + +const std::string PhraseOrientation::GetOrientationString(const REO_POS orient, const REO_MODEL_TYPE modelType) +{ + std::ostringstream oss; + WriteOrientation(oss, orient, modelType); + return oss.str(); +} + + +void PhraseOrientation::WriteOrientation(std::ostream& out, const REO_POS orient, const REO_MODEL_TYPE modelType) +{ + switch(orient) { + case LEFT: + out << "mono"; + break; + case RIGHT: + out << "swap"; + break; + case DRIGHT: + out << "dright"; + break; + case DLEFT: + out << "dleft"; + break; + case UNKNOWN: + switch(modelType) { + case REO_MONO: + out << "nomono"; + break; + case REO_MSD: + out << "other"; + break; + case REO_MSLR: + out << "dright"; + break; + } + break; + } +} + + +bool PhraseOrientation::IsAligned(int fi, int ei) const +{ + if (ei == -1 && fi == -1) + return true; + + if (ei <= -1 || fi <= -1) + return false; + + if (ei == (int)m_target.size() && fi == (int)m_source.size()) + return true; + + if (ei >= (int)m_target.size() || fi >= (int)m_source.size()) + return false; + + for (size_t i=0; i<m_alignedToT[ei].size(); ++i) + if (m_alignedToT[ei][i] == fi) + return true; + + return false; +} + + +void PhraseOrientation::IncrementPriorCount(REO_DIR direction, REO_POS orient, float increment) +{ + assert(direction==L2R || direction==R2L); + if (direction == L2R) { + m_l2rOrientationPriorCounts[orient] += increment; + } else if (direction == R2L) { + m_r2lOrientationPriorCounts[orient] += increment; + } +} + + +void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType) +{ + std::map<std::string,float> l2rOrientationPriorCountsMap; + std::map<std::string,float> r2lOrientationPriorCountsMap; + for (int orient=0; orient<=UNKNOWN; ++orient) { + l2rOrientationPriorCountsMap[GetOrientationString((REO_POS)orient, modelType)] += m_l2rOrientationPriorCounts[orient]; + } + for (int orient=0; orient<=UNKNOWN; ++orient) { + r2lOrientationPriorCountsMap[GetOrientationString((REO_POS)orient, modelType)] += m_r2lOrientationPriorCounts[orient]; + } + for (std::map<std::string,float>::const_iterator l2rOrientationPriorCountsMapIt = l2rOrientationPriorCountsMap.begin(); + l2rOrientationPriorCountsMapIt != l2rOrientationPriorCountsMap.end(); ++l2rOrientationPriorCountsMapIt) { + out << "L2R_" << l2rOrientationPriorCountsMapIt->first << " " << l2rOrientationPriorCountsMapIt->second << std::endl; + } + for (std::map<std::string,float>::const_iterator r2lOrientationPriorCountsMapIt = r2lOrientationPriorCountsMap.begin(); + r2lOrientationPriorCountsMapIt != r2lOrientationPriorCountsMap.end(); ++r2lOrientationPriorCountsMapIt) { + out << "R2L_" << r2lOrientationPriorCountsMapIt->first << " " << r2lOrientationPriorCountsMapIt->second << std::endl; + } +} + +} // namespace GHKM +} // namespace Moses + diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/extract-ghkm/PhraseOrientation.h new file mode 100644 index 000000000..6e83929f1 --- /dev/null +++ b/phrase-extract/extract-ghkm/PhraseOrientation.h @@ -0,0 +1,102 @@ + +/*********************************************************************** + Moses - statistical machine translation system + Copyright (C) 2006-2011 University of Edinburgh + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA +***********************************************************************/ + +#pragma once + +#include "Alignment.h" + +#include <map> +#include <set> +#include <string> +#include <vector> +#include <boost/unordered_map.hpp> + +namespace Moses +{ +namespace GHKM +{ + +enum REO_MODEL_TYPE {REO_MSD, REO_MSLR, REO_MONO}; +enum REO_POS {LEFT, RIGHT, DLEFT, DRIGHT, UNKNOWN}; +enum REO_DIR {L2R, R2L, BIDIR}; + +// The key of the map is the English index and the value is a set of the source ones +typedef std::map <int, std::set<int> > HSentenceVertices; + + +class PhraseOrientation +{ +public: + + PhraseOrientation(const std::vector<std::string> &source, + const std::vector<std::string> &target, + const Alignment &alignment); + + REO_POS GetOrientationInfo(int startF, int endF, REO_DIR direction) const; + REO_POS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const; + const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=BIDIR) const; + const std::string GetOrientationInfoString(int startF, int startE, int endF, int endE, REO_DIR direction=BIDIR) const; + static const std::string GetOrientationString(const REO_POS orient, const REO_MODEL_TYPE modelType=REO_MSLR); + static void WriteOrientation(std::ostream& out, const REO_POS orient, const REO_MODEL_TYPE modelType=REO_MSLR); + void IncrementPriorCount(REO_DIR direction, REO_POS orient, float increment); + static void WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE modelType=REO_MSLR); + +private: + + void InsertVertex( HSentenceVertices & corners, int x, int y ); + + void InsertPhraseVertices(HSentenceVertices & topLeft, + HSentenceVertices & topRight, + HSentenceVertices & bottomLeft, + HSentenceVertices & bottomRight, + int startF, int startE, int endF, int endE); + + REO_POS GetOrientHierModel(REO_MODEL_TYPE modelType, + bool connectedLeftTop, bool connectedRightTop, + int startF, int endF, int startE, int endE, int countF, int zero, int unit, + bool (*ge)(int, int), bool (*lt)(int, int), + const HSentenceVertices & bottomRight, const HSentenceVertices & bottomLeft) const; + + bool IsAligned(int fi, int ei) const; + + static bool ge(int first, int second) { return first >= second; }; + static bool le(int first, int second) { return first <= second; }; + static bool lt(int first, int second) { return first < second; }; + + const std::vector<std::string> &m_source; + const std::vector<std::string> &m_target; + const Alignment &m_alignment; + + std::vector<std::vector<int> > m_alignedToT; + + HSentenceVertices m_topLeft; + HSentenceVertices m_topRight; + HSentenceVertices m_bottomLeft; + HSentenceVertices m_bottomRight; + + boost::unordered_map< std::pair<int,int> , std::pair<int,int> > m_minAndMaxAlignedToSourceSpan; + + static std::vector<float> m_l2rOrientationPriorCounts; + static std::vector<float> m_r2lOrientationPriorCounts; +}; + +} // namespace GHKM +} // namespace Moses + diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp index be373b67b..2fba6930b 100644 --- a/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.cpp @@ -169,14 +169,17 @@ void ScfgRuleWriter::WriteSymbol(const Symbol &symbol, std::ostream &out) } } -void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g) +void ScfgRuleWriter::Write(const ScfgRule &rule, const Subgraph &g, bool printEndl) { - Write(rule,false); - m_fwd << " {{Tree "; - g.PrintTree(m_fwd); - m_fwd << "}}"; + Write(rule,false); + m_fwd << " {{Tree "; + g.PrintTree(m_fwd); + m_fwd << "}}"; + + if (printEndl) { m_fwd << std::endl; m_inv << std::endl; + } } } // namespace GHKM diff --git a/phrase-extract/extract-ghkm/ScfgRuleWriter.h b/phrase-extract/extract-ghkm/ScfgRuleWriter.h index 18f423149..8a8564580 100644 --- a/phrase-extract/extract-ghkm/ScfgRuleWriter.h +++ b/phrase-extract/extract-ghkm/ScfgRuleWriter.h @@ -44,7 +44,7 @@ public: void Write(const ScfgRule &rule, bool printEndl=true); - void Write(const ScfgRule &rule, const Subgraph &g); + void Write(const ScfgRule &rule, const Subgraph &g, bool printEndl=true); private: // Disallow copying diff --git a/phrase-extract/extract-ghkm/XmlTreeParser.h b/phrase-extract/extract-ghkm/XmlTreeParser.h index e5bf5b463..d0209254f 100644 --- a/phrase-extract/extract-ghkm/XmlTreeParser.h +++ b/phrase-extract/extract-ghkm/XmlTreeParser.h @@ -49,6 +49,8 @@ public: static std::auto_ptr<ParseTree> ConvertTree(const MosesTraining::SyntaxNode &, const std::vector<std::string> &); + const std::vector<std::string>& GetWords() { return m_words; }; + private: std::set<std::string> &m_labelSet; diff --git a/phrase-extract/extract-main.cpp b/phrase-extract/extract-main.cpp index fe3d99cd2..552dcb739 100644 --- a/phrase-extract/extract-main.cpp +++ b/phrase-extract/extract-main.cpp @@ -363,8 +363,6 @@ void ExtractTask::extract(SentenceAlignment &sentence) HSentenceVertices outBottomLeft; HSentenceVertices outBottomRight; - HSentenceVertices::const_iterator it; - bool relaxLimit = m_options.isHierModel(); bool buildExtraStructure = m_options.isPhraseModel() || m_options.isHierModel(); diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp index e8ba1d942..7f155f6ed 100644 --- a/phrase-extract/score-main.cpp +++ b/phrase-extract/score-main.cpp @@ -46,6 +46,7 @@ LexicalTable lexTable; bool inverseFlag = false; bool hierarchicalFlag = false; bool pcfgFlag = false; +bool phraseOrientationFlag = false; bool treeFragmentsFlag = false; bool sourceSyntaxLabelsFlag = false; bool sourceSyntaxLabelSetFlag = false; @@ -69,6 +70,7 @@ bool nonTermContext = false; int countOfCounts[COC_MAX+1]; int totalDistinct = 0; float minCountHierarchical = 0; +bool phraseOrientationPriorsFlag = false; boost::unordered_map<std::string,float> sourceLHSCounts; boost::unordered_map<std::string, boost::unordered_map<std::string,float>* > targetLHSAndSourceLHSJointCounts; @@ -82,6 +84,9 @@ std::set<std::string> targetPreferenceLabelSet; std::map<std::string,size_t> targetPreferenceLabels; std::vector<std::string> targetPreferenceLabelsByIndex; +std::vector<float> orientationClassPriorsL2R(4,0); // mono swap dright dleft +std::vector<float> orientationClassPriorsR2L(4,0); // mono swap dright dleft + Vocabulary vcbT; Vocabulary vcbS; @@ -106,6 +111,7 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, float, int, ostrea double computeLexicalTranslation( const PHRASE *phraseSource, const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); double computeUnalignedPenalty( const ALIGNMENT *alignmentTargetToSource ); set<std::string> functionWordList; +void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, std::vector<float> &orientationClassPriorsL2R, std::vector<float> &orientationClassPriorsR2L); void loadFunctionWords( const string &fileNameFunctionWords ); double computeUnalignedFWPenalty( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); int calcCrossedNonTerm( const PHRASE *phraseTarget, const ALIGNMENT *alignmentTargetToSource ); @@ -136,6 +142,7 @@ int main(int argc, char* argv[]) std::string fileNameTargetPreferenceLabelSet; std::string fileNameLeftHandSideTargetPreferenceLabelCounts; std::string fileNameLeftHandSideRuleTargetTargetPreferenceLabelCounts; + std::string fileNamePhraseOrientationPriors; std::vector<std::string> featureArgs; // all unknown args passed to feature manager for(int i=4; i<argc; i++) { @@ -148,9 +155,12 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; std::cerr << "including PCFG scores" << std::endl; + } else if (strcmp(argv[i],"--PhraseOrientation") == 0) { + phraseOrientationFlag = true; + std::cerr << "including phrase orientation information" << std::endl; } else if (strcmp(argv[i],"--TreeFragments") == 0) { treeFragmentsFlag = true; - std::cerr << "including tree fragment information from syntactic parse\n"; + std::cerr << "including tree fragment information from syntactic parse" << std::endl; } else if (strcmp(argv[i],"--SourceLabels") == 0) { sourceSyntaxLabelsFlag = true; std::cerr << "including source label information" << std::endl; @@ -216,6 +226,14 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) { crossedNonTerm = true; std::cerr << "crossed non-term reordering feature" << std::endl; + } else if (strcmp(argv[i],"--PhraseOrientationPriors") == 0) { + phraseOrientationPriorsFlag = true; + if (i+1==argc) { + std::cerr << "ERROR: specify priors file for phrase orientation!" << std::endl; + exit(1); + } + fileNamePhraseOrientationPriors = argv[++i]; + std::cerr << "smoothing phrase orientation with priors from " << fileNamePhraseOrientationPriors << std::endl; } else if (strcmp(argv[i],"--SpanLength") == 0) { spanLength = true; std::cerr << "span length feature" << std::endl; @@ -254,6 +272,10 @@ int main(int argc, char* argv[]) for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0; } + if (phraseOrientationPriorsFlag) { + loadOrientationPriors(fileNamePhraseOrientationPriors,orientationClassPriorsL2R,orientationClassPriorsR2L); + } + // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); @@ -774,11 +796,6 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, if (kneserNeyFlag) phraseTableFile << " " << distinctCount; - if ((treeFragmentsFlag || sourceSyntaxLabelsFlag || targetPreferenceLabelsFlag) && - !inverseFlag) { - phraseTableFile << " |||"; - } - phraseTableFile << " |||"; // tree fragments @@ -832,6 +849,13 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, } } + // phrase orientation + if (phraseOrientationFlag && !inverseFlag) { + phraseTableFile << " {{Orientation "; + phrasePair.CollectAllPhraseOrientations("Orientation",orientationClassPriorsL2R,orientationClassPriorsR2L,0.5,phraseTableFile); + phraseTableFile << "}}"; + } + if (spanLength && !inverseFlag) { string propValue = phrasePair.CollectAllPropertyValues("SpanLength"); if (!propValue.empty()) { @@ -851,6 +875,94 @@ void outputPhrasePair(const ExtractionPhrasePair &phrasePair, +void loadOrientationPriors(const std::string &fileNamePhraseOrientationPriors, + std::vector<float> &orientationClassPriorsL2R, + std::vector<float> &orientationClassPriorsR2L) +{ + assert(orientationClassPriorsL2R.size()==4 && orientationClassPriorsR2L.size()==4); // mono swap dright dleft + + std::cerr << "Loading phrase orientation priors from " << fileNamePhraseOrientationPriors; + ifstream inFile; + inFile.open(fileNamePhraseOrientationPriors.c_str()); + if (inFile.fail()) { + std::cerr << " - ERROR: could not open file" << std::endl; + exit(1); + } + + std::string line; + size_t linesRead = 0; + float l2rSum = 0; + float r2lSum = 0; + while (getline(inFile, line)) { + istringstream tokenizer(line); + std::string key; + tokenizer >> key; + + bool l2rFlag = false; + bool r2lFlag = false; + if (!key.substr(0,4).compare("L2R_")) { + l2rFlag = true; + } + if (!key.substr(0,4).compare("R2L_")) { + r2lFlag = true; + } + if (!l2rFlag && !r2lFlag) { + std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl; + } + key.erase(0,4); + + int orientationClassId = -1; + if (!key.compare("mono")) { + orientationClassId = 0; + } + if (!key.compare("swap")) { + orientationClassId = 1; + } + if (!key.compare("dright")) { + orientationClassId = 2; + } + if (!key.compare("dleft")) { + orientationClassId = 3; + } + if (orientationClassId == -1) { + std::cerr << " - ERROR: malformed line in orientation priors file" << std::endl; + } + + float count; + tokenizer >> count; + + if (l2rFlag) { + orientationClassPriorsL2R[orientationClassId] += count; + l2rSum += count; + } + if (r2lFlag) { + orientationClassPriorsR2L[orientationClassId] += count; + r2lSum += count; + } + + ++linesRead; + } + + // normalization: return prior probabilities, not counts + if (l2rSum != 0) { + for (std::vector<float>::iterator orientationClassPriorsL2RIt = orientationClassPriorsL2R.begin(); + orientationClassPriorsL2RIt != orientationClassPriorsL2R.end(); ++orientationClassPriorsL2RIt) { + *orientationClassPriorsL2RIt /= l2rSum; + } + } + if (r2lSum != 0) { + for (std::vector<float>::iterator orientationClassPriorsR2LIt = orientationClassPriorsR2L.begin(); + orientationClassPriorsR2LIt != orientationClassPriorsR2L.end(); ++orientationClassPriorsR2LIt) { + *orientationClassPriorsR2LIt /= r2lSum; + } + } + + std::cerr << " - read " << linesRead << " lines from orientation priors file" << std::endl; + inFile.close(); +} + + + bool calcCrossedNonTerm( size_t targetPos, size_t sourcePos, const ALIGNMENT *alignmentTargetToSource ) { for (size_t currTarget = 0; currTarget < alignmentTargetToSource->size(); ++currTarget) { diff --git a/scripts/ems/experiment.perl b/scripts/ems/experiment.perl index bf0f0129a..1108bec1b 100755 --- a/scripts/ems/experiment.perl +++ b/scripts/ems/experiment.perl @@ -2195,9 +2195,10 @@ sub define_training_extract_phrases { $cmd .= "-glue-grammar-file $glue_grammar_file "; if (&get("GENERAL:output-parser") && (&get("TRAINING:use-unknown-word-labels") || &get("TRAINING:use-unknown-word-soft-matches"))) { - my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model","")); - $cmd .= "-unknown-word-label $unknown_word_label "; + my $unknown_word_label = &versionize(&long_file_name("unknown-word-label","model","")); + $cmd .= "-unknown-word-label $unknown_word_label "; } + if (&get("GENERAL:output-parser") && &get("TRAINING:use-unknown-word-soft-matches")) { my $unknown_word_soft_matches = &versionize(&long_file_name("unknown-word-soft-matches","model","")); $cmd .= "-unknown-word-soft-matches $unknown_word_soft_matches "; @@ -2210,6 +2211,12 @@ sub define_training_extract_phrases { if (&get("TRAINING:ghkm-tree-fragments")) { $cmd .= "-ghkm-tree-fragments "; } + + if (&get("TRAINING:ghkm-phrase-orientation")) { + $cmd .= "-ghkm-phrase-orientation "; + my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model","")); + $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file "; + } } my $extract_settings = &get("TRAINING:extract-settings"); @@ -2242,6 +2249,11 @@ sub define_training_build_ttable { if (&get("TRAINING:ghkm-tree-fragments")) { $cmd .= "-ghkm-tree-fragments "; } + if (&get("TRAINING:ghkm-phrase-orientation")) { + $cmd .= "-ghkm-phrase-orientation "; + my $phrase_orientation_priors_file = &versionize(&long_file_name("phrase-orientation-priors","model","")); + $cmd .= "-phrase-orientation-priors-file $phrase_orientation_priors_file "; + } } &create_step($step_id,$cmd); diff --git a/scripts/generic/extract-parallel.perl b/scripts/generic/extract-parallel.perl index 7abada1de..ff6a058b5 100755 --- a/scripts/generic/extract-parallel.perl +++ b/scripts/generic/extract-parallel.perl @@ -29,6 +29,8 @@ my $otherExtractArgs= ""; my $weights = ""; my $baselineExtract; my $glueFile; +my $phraseOrientation = 0; +my $phraseOrientationPriorsFile; for (my $i = 8; $i < $#ARGV + 1; ++$i) { @@ -45,6 +47,11 @@ for (my $i = 8; $i < $#ARGV + 1; ++$i) $glueFile = $ARGV[++$i]; next; } + $phraseOrientation = 1 if $ARGV[$i] eq "--PhraseOrientation"; + if ($ARGV[$i] eq '--PhraseOrientationPriors') { + $phraseOrientationPriorsFile = $ARGV[++$i]; + next; + } $otherExtractArgs .= $ARGV[$i] ." "; } @@ -219,6 +226,32 @@ if (defined($glueFile)) { print STDERR `$cmd`; } +# phrase orientation priors (GHKM extraction) +if ($phraseOrientation && defined($phraseOrientationPriorsFile)) { + print STDERR "Merging phrase orientation priors\n"; + + my @orientationPriorsCountFiles = glob("$TMPDIR/*.phraseOrientationPriors"); + my %priorCounts; + + foreach my $filenamePhraseOrientationPriors (@orientationPriorsCountFiles) { + if (-f $filenamePhraseOrientationPriors) { + open my $infilePhraseOrientationPriors, '<', $filenamePhraseOrientationPriors or die "cannot open $filenamePhraseOrientationPriors: $!"; + while (my $line = <$infilePhraseOrientationPriors>) { + print $line; + my ($key, $value) = split / /, $line; + $priorCounts{$key} += $value; + } + close $infilePhraseOrientationPriors; + } + } + + open my $outPhraseOrientationPriors, '>', $phraseOrientationPriorsFile or die "cannot open $phraseOrientationPriorsFile: $!"; + foreach my $key (sort keys %priorCounts) { + print $outPhraseOrientationPriors $key." ".$priorCounts{$key}."\n"; + } + close($outPhraseOrientationPriors); +} + # delete temporary files $cmd = "rm -rf $TMPDIR \n"; print STDERR $cmd; diff --git a/scripts/training/train-model.perl b/scripts/training/train-model.perl index 22ecc5ff9..da8e677bc 100755 --- a/scripts/training/train-model.perl +++ b/scripts/training/train-model.perl @@ -32,7 +32,7 @@ my($_EXTERNAL_BINDIR, $_ROOT_DIR, $_CORPUS_DIR, $_GIZA_E2F, $_GIZA_F2E, $_MODEL_ $_DECODING_STEPS, $_PARALLEL, $_FACTOR_DELIMITER, @_PHRASE_TABLE, @_REORDERING_TABLE, @_GENERATION_TABLE, @_GENERATION_TYPE, $_GENERATION_CORPUS, $_DONT_ZIP, $_MGIZA, $_MGIZA_CPUS, $_SNT2COOC, $_HMM_ALIGN, $_CONFIG, $_OSM, $_OSM_FACTORS, $_POST_DECODING_TRANSLIT, $_TRANSLITERATION_PHRASE_TABLE, - $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS, + $_HIERARCHICAL,$_XML,$_SOURCE_SYNTAX,$_TARGET_SYNTAX,$_GLUE_GRAMMAR,$_GLUE_GRAMMAR_FILE,$_UNKNOWN_WORD_LABEL_FILE,$_GHKM,$_GHKM_TREE_FRAGMENTS,$_GHKM_PHRASE_ORIENTATION,$_PHRASE_ORIENTATION_PRIORS_FILE,$_PCFG,@_EXTRACT_OPTIONS,@_SCORE_OPTIONS, $_ALT_DIRECT_RULE_SCORE_1, $_ALT_DIRECT_RULE_SCORE_2, $_UNKNOWN_WORD_SOFT_MATCHES_FILE, $_OMIT_WORD_ALIGNMENT,$_FORCE_FACTORED_FILENAMES, $_MEMSCORE, $_FINAL_ALIGNMENT_MODEL, @@ -110,6 +110,8 @@ $_HELP = 1 'unknown-word-soft-matches-file=s' => \$_UNKNOWN_WORD_SOFT_MATCHES_FILE, # give dummy label to unknown word, and allow soft matches to all other labels (with cost determined by sparse features) 'ghkm' => \$_GHKM, 'ghkm-tree-fragments' => \$_GHKM_TREE_FRAGMENTS, + 'ghkm-phrase-orientation' => \$_GHKM_PHRASE_ORIENTATION, + 'phrase-orientation-priors-file=s' => \$_PHRASE_ORIENTATION_PRIORS_FILE, # currently relevant for GHKM extraction only; phrase orientation for PBT has different implementation 'pcfg' => \$_PCFG, 'alt-direct-rule-score-1' => \$_ALT_DIRECT_RULE_SCORE_1, 'alt-direct-rule-score-2' => \$_ALT_DIRECT_RULE_SCORE_2, @@ -1426,6 +1428,8 @@ sub extract_phrase { $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2; $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1; $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS; + $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION; + $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if defined($_PHRASE_ORIENTATION_PRIORS_FILE); if (!defined($_GHKM)) { $cmd .= " --SourceSyntax" if $_SOURCE_SYNTAX; $cmd .= " --TargetSyntax" if $_TARGET_SYNTAX; @@ -1550,6 +1554,9 @@ sub score_phrase_phrase_extract { my $NEG_LOG_PROB = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NegLogProb/); my $NO_LEX = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /NoLex/); my $MIN_COUNT_HIERARCHICAL = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /MinCountHierarchical ([\d\.]+)/) ? $1 : undef; + my $SOURCE_LABELS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabels/); + my $SOURCE_LABEL_COUNTS_LHS = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelCountsLHS/); + my $SOURCE_LABEL_SET = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SourceLabelSet/); my $SPAN_LENGTH = (defined($_SCORE_OPTIONS) && $_SCORE_OPTIONS =~ /SpanLength/); my $CORE_SCORE_OPTIONS = ""; $CORE_SCORE_OPTIONS .= " --LogProb" if $LOG_PROB; @@ -1557,6 +1564,9 @@ sub score_phrase_phrase_extract { $CORE_SCORE_OPTIONS .= " --NoLex" if $NO_LEX; $CORE_SCORE_OPTIONS .= " --Singleton" if $SINGLETON; $CORE_SCORE_OPTIONS .= " --CrossedNonTerm" if $CROSSEDNONTERM; + $CORE_SCORE_OPTIONS .= " --SourceLabels" if $SOURCE_LABELS; + $CORE_SCORE_OPTIONS .= " --SourceLabelCountsLHS " if $SOURCE_LABEL_COUNTS_LHS; + $CORE_SCORE_OPTIONS .= " --SourceLabelSet " if $SOURCE_LABEL_SET; my $substep = 1; my $isParent = 1; @@ -1597,6 +1607,8 @@ sub score_phrase_phrase_extract { $cmd .= " --UnpairedExtractFormat" if $_ALT_DIRECT_RULE_SCORE_1 || $_ALT_DIRECT_RULE_SCORE_2; $cmd .= " --ConditionOnTargetLHS" if $_ALT_DIRECT_RULE_SCORE_1; $cmd .= " --TreeFragments" if $_GHKM_TREE_FRAGMENTS; + $cmd .= " --PhraseOrientation" if $_GHKM_PHRASE_ORIENTATION; + $cmd .= " --PhraseOrientationPriors $_PHRASE_ORIENTATION_PRIORS_FILE" if $_GHKM_PHRASE_ORIENTATION && defined($_PHRASE_ORIENTATION_PRIORS_FILE); $cmd .= " $DOMAIN" if $DOMAIN; $cmd .= " $CORE_SCORE_OPTIONS" if defined($_SCORE_OPTIONS); $cmd .= " --FlexibilityScore=$FLEX_SCORER" if $_FLEXIBILITY_SCORE; |