diff options
author | Matthias Huck <mhuck@inf.ed.ac.uk> | 2015-12-10 15:56:37 +0300 |
---|---|---|
committer | Matthias Huck <mhuck@inf.ed.ac.uk> | 2015-12-10 15:56:37 +0300 |
commit | bd3f5734524395cd87c9acb07a23c6c816a8ef27 (patch) | |
tree | 1e4e63d5a2f744a7478d75c66c9705a5ba8244b9 /phrase-extract | |
parent | 831dc83778f4594375e9de6a4e2831a02b3e1314 (diff) |
Hiero phrase orientation
Diffstat (limited to 'phrase-extract')
-rw-r--r-- | phrase-extract/Alignment.cpp (renamed from phrase-extract/extract-ghkm/Alignment.cpp) | 8 | ||||
-rw-r--r-- | phrase-extract/Alignment.h (renamed from phrase-extract/extract-ghkm/Alignment.h) | 6 | ||||
-rw-r--r-- | phrase-extract/ExtractedRule.h | 14 | ||||
-rw-r--r-- | phrase-extract/PhraseOrientation.cpp (renamed from phrase-extract/extract-ghkm/PhraseOrientation.cpp) | 18 | ||||
-rw-r--r-- | phrase-extract/PhraseOrientation.h (renamed from phrase-extract/extract-ghkm/PhraseOrientation.h) | 17 | ||||
-rw-r--r-- | phrase-extract/RuleExtractionOptions.h | 4 | ||||
-rw-r--r-- | phrase-extract/extract-rules-main.cpp | 54 |
7 files changed, 81 insertions, 40 deletions
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/Alignment.cpp index d12f9398b..cef7a4638 100644 --- a/phrase-extract/extract-ghkm/Alignment.cpp +++ b/phrase-extract/Alignment.cpp @@ -27,10 +27,6 @@ namespace MosesTraining { -namespace Syntax -{ -namespace GHKM -{ void ReadAlignment(const std::string &s, Alignment &a) { @@ -46,7 +42,7 @@ void ReadAlignment(const std::string &s, Alignment &a) } int src = std::atoi(s.substr(begin, end-begin).c_str()); if (end+1 == s.size()) { - throw Exception("Target index missing"); + throw Syntax::Exception("Target index missing"); } begin = end+1; @@ -71,6 +67,4 @@ void FlipAlignment(Alignment &a) } } -} // namespace GHKM -} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/Alignment.h index da1279f8f..c25896e6d 100644 --- a/phrase-extract/extract-ghkm/Alignment.h +++ b/phrase-extract/Alignment.h @@ -25,10 +25,6 @@ namespace MosesTraining { -namespace Syntax -{ -namespace GHKM -{ typedef std::vector<std::pair<int, int> > Alignment; @@ -36,6 +32,4 @@ void ReadAlignment(const std::string &, Alignment &); void FlipAlignment(Alignment &); -} // namespace GHKM -} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/ExtractedRule.h b/phrase-extract/ExtractedRule.h index cb2f2261d..e879b96c4 100644 --- a/phrase-extract/ExtractedRule.h +++ b/phrase-extract/ExtractedRule.h @@ -26,6 +26,8 @@ #include <sstream> #include <map> +#include "extract-ghkm/PhraseOrientation.h" + namespace MosesTraining { @@ -37,8 +39,6 @@ public: std::string target; std::string alignment; std::string alignmentInv; - std::string orientation; - std::string orientationForward; std::string sourceContextLeft; std::string sourceContextRight; std::string targetContextLeft; @@ -51,14 +51,14 @@ public: int endS; float count; double pcfgScore; + Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation; + Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation; ExtractedRule(int sT, int eT, int sS, int eS) : source() , target() , alignment() , alignmentInv() - , orientation() - , orientationForward() , sourceContextLeft() , sourceContextRight() , targetContextLeft() @@ -70,8 +70,10 @@ public: , startS(sS) , endS(eS) , count(0) - , pcfgScore(0.0) { - } + , pcfgScore(0.0) + , l2rOrientation(Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN) + , r2lOrientation(Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN) + { } }; } diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/PhraseOrientation.cpp index f07e19a46..20297b4de 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp +++ b/phrase-extract/PhraseOrientation.cpp @@ -28,10 +28,6 @@ namespace MosesTraining { -namespace Syntax -{ -namespace GHKM -{ std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0); std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0); @@ -100,6 +96,18 @@ PhraseOrientation::PhraseOrientation(int sourceSize, Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS); } +PhraseOrientation::PhraseOrientation(int sourceSize, + int targetSize, + const std::vector<std::vector<int> > &alignedToT, + const std::vector<std::vector<int> > &alignedToS, + const std::vector<int> &alignedCountS) + : m_countF(sourceSize) + , m_countE(targetSize) + , m_alignedToT(alignedToT) +{ + Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS); +} + void PhraseOrientation::Init(int sourceSize, int targetSize, @@ -470,6 +478,4 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE } } -} // namespace GHKM -} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/PhraseOrientation.h index d956e2bc8..1cdfb65be 100644 --- a/phrase-extract/extract-ghkm/PhraseOrientation.h +++ b/phrase-extract/PhraseOrientation.h @@ -32,10 +32,6 @@ namespace MosesTraining { -namespace Syntax -{ -namespace GHKM -{ // The key of the map is the English index and the value is a set of the source ones typedef std::map <int, std::set<int> > HSentenceVertices; @@ -49,6 +45,7 @@ public: enum REO_CLASS {REO_CLASS_LEFT, REO_CLASS_RIGHT, REO_CLASS_DLEFT, REO_CLASS_DRIGHT, REO_CLASS_UNKNOWN}; enum REO_DIR {REO_DIR_L2R, REO_DIR_R2L, REO_DIR_BIDIR}; + PhraseOrientation() {}; PhraseOrientation(int sourceSize, int targetSize, @@ -59,6 +56,12 @@ public: const Moses::AlignmentInfo &alignTerm, const Moses::AlignmentInfo &alignNonTerm); + PhraseOrientation(int sourceSize, + int targetSize, + const std::vector<std::vector<int> > &alignedToT, + const std::vector<std::vector<int> > &alignedToS, + const std::vector<int> &alignedCountS); + REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const; REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const; const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=REO_DIR_BIDIR) const; @@ -104,8 +107,8 @@ private: return first < second; }; - const int m_countF; - const int m_countE; + int m_countF; + int m_countE; std::vector<std::vector<int> > m_alignedToT; @@ -121,6 +124,4 @@ private: static std::vector<float> m_r2lOrientationPriorCounts; }; -} // namespace GHKM -} // namespace Syntax } // namespace MosesTraining diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h index b38258470..aab059cf9 100644 --- a/phrase-extract/RuleExtractionOptions.h +++ b/phrase-extract/RuleExtractionOptions.h @@ -54,6 +54,7 @@ public: bool conditionOnTargetLhs; bool boundaryRules; bool flexScoreFlag; + bool phraseOrientation; RuleExtractionOptions() : maxSpan(10) @@ -86,7 +87,8 @@ public: , unpairedExtractFormat(false) , conditionOnTargetLhs(false) , boundaryRules(false) - , flexScoreFlag(false) {} + , flexScoreFlag(false) + , phraseOrientation(false) {} }; } diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp index e6fff965d..49db5797c 100644 --- a/phrase-extract/extract-rules-main.cpp +++ b/phrase-extract/extract-rules-main.cpp @@ -46,6 +46,7 @@ #include "XmlTree.h" #include "InputFileStream.h" #include "OutputFileStream.h" +#include "extract-ghkm/PhraseOrientation.h" using namespace std; using namespace MosesTraining; @@ -62,6 +63,7 @@ private: Moses::OutputFileStream& m_extractFileInv; Moses::OutputFileStream& m_extractFileContext; Moses::OutputFileStream& m_extractFileContextInv; + Syntax::GHKM::PhraseOrientation m_phraseOrientation; vector< ExtractedRule > m_extractedRules; @@ -109,6 +111,7 @@ public: void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence ); void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection); void writeUnknownWordLabel(const string &); +void writePhraseOrientationPriors(const string &); double getPcfgScore(const SyntaxNode &); @@ -142,7 +145,8 @@ int main(int argc, char* argv[]) << " | --UnpairedExtractFormat" << " | --ConditionOnTargetLHS ]" << " | --BoundaryRules[" << options.boundaryRules << "]" - << " | --FlexibilityScore\n"; + << " | --FlexibilityScore" + << " | --PhraseOrientation\n"; exit(1); } @@ -267,6 +271,8 @@ int main(int argc, char* argv[]) options.conditionOnTargetLhs = true; } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.flexScoreFlag = true; + } else if (strcmp(argv[i],"--PhraseOrientation") == 0) { + options.phraseOrientation = true; } else if (strcmp(argv[i],"-threads") == 0 || strcmp(argv[i],"--threads") == 0 || strcmp(argv[i],"--Threads") == 0) { @@ -377,6 +383,11 @@ int main(int argc, char* argv[]) if (options.unknownWordLabelFlag) writeUnknownWordLabel(fileNameUnknownWordLabel); + + if (options.phraseOrientation) { + std::string fileNamePhraseOrientationPriors = fileNameExtract + string(".phraseOrientationPriors"); + writePhraseOrientationPriors(fileNamePhraseOrientationPriors); + } } void ExtractTask::Run() @@ -392,6 +403,12 @@ void ExtractTask::extractRules() int countT = m_sentence.target.size(); int countS = m_sentence.source.size(); + // initialize phrase orientation scoring object (for lexicalized reordering model) + if (m_options.phraseOrientation) { + m_sentence.invertAlignment(); // fill m_sentence.alignedToS + m_phraseOrientation = Syntax::GHKM::PhraseOrientation(countS, countT, m_sentence.alignedToT, m_sentence.alignedToS, m_sentence.alignedCountS); + } + // phrase repository for creating hiero phrases RuleExist ruleExist(countT); @@ -990,6 +1007,10 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count } } + rule.alignment.erase(rule.alignment.size()-1); + if (!m_options.onlyDirectFlag) + rule.alignmentInv.erase(rule.alignmentInv.size()-1); + // context (words to left and right) if (m_options.flexScoreFlag) { rule.sourceContextLeft = startS == 0 ? "<s>" : m_sentence.source[startS-1]; @@ -998,9 +1019,14 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count rule.targetContextRight = endT+1 == m_sentence.target.size() ? "<s>" : m_sentence.target[endT+1]; } - rule.alignment.erase(rule.alignment.size()-1); - if (!m_options.onlyDirectFlag) - rule.alignmentInv.erase(rule.alignmentInv.size()-1); + // phrase orientation (lexicalized reordering model) + if (m_options.phraseOrientation) { + rule.l2rOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_L2R); + rule.r2lOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_R2L); + // std::cerr << "span " << startS << " " << endS << std::endl; + // std::cerr << "phraseOrientationL2R " << m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_L2R) << std::endl; + // std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_R2L) << std::endl; + } addRuleToCollection( rule ); } @@ -1070,6 +1096,15 @@ void ExtractTask::writeRulesToFile() if (m_options.pcfgScore) { out << " ||| " << rule->pcfgScore; } + if (m_options.phraseOrientation) { + out << " {{Orientation "; + m_phraseOrientation.WriteOrientation(out,rule->l2rOrientation); + out << " "; + m_phraseOrientation.WriteOrientation(out,rule->r2lOrientation); + m_phraseOrientation.IncrementPriorCount(Syntax::GHKM::PhraseOrientation::REO_DIR_L2R,rule->l2rOrientation,1); + m_phraseOrientation.IncrementPriorCount(Syntax::GHKM::PhraseOrientation::REO_DIR_R2L,rule->r2lOrientation,1); + out << "}}"; + } out << "\n"; if (!m_options.onlyDirectFlag) { @@ -1171,8 +1206,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence ) } } -void writeUnknownWordLabel(const string & fileName) -{ +void writeUnknownWordLabel(const string & fileName) { ofstream outFile; outFile.open(fileName.c_str()); typedef map<string,int>::const_iterator I; @@ -1196,6 +1230,14 @@ void writeUnknownWordLabel(const string & fileName) outFile.close(); } +void writePhraseOrientationPriors(const string &fileName) +{ + ofstream outFile; + outFile.open(fileName.c_str()); + Syntax::GHKM::PhraseOrientation::WritePriorCounts(outFile); + outFile.close(); +} + double getPcfgScore(const SyntaxNode &node) { double score = 0.0f; |