Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Matthias Huck <mhuck@inf.ed.ac.uk>, 2015-12-10 15:56:37 +0300
committer: Matthias Huck <mhuck@inf.ed.ac.uk>, 2015-12-10 15:56:37 +0300
commit: bd3f5734524395cd87c9acb07a23c6c816a8ef27 (patch)
tree: 1e4e63d5a2f744a7478d75c66c9705a5ba8244b9 /phrase-extract
parent: 831dc83778f4594375e9de6a4e2831a02b3e1314 (diff)
Hiero phrase orientation
Diffstat (limited to 'phrase-extract')
-rw-r--r-- phrase-extract/Alignment.cpp (renamed from phrase-extract/extract-ghkm/Alignment.cpp) 8
-rw-r--r-- phrase-extract/Alignment.h (renamed from phrase-extract/extract-ghkm/Alignment.h) 6
-rw-r--r-- phrase-extract/ExtractedRule.h 14
-rw-r--r-- phrase-extract/PhraseOrientation.cpp (renamed from phrase-extract/extract-ghkm/PhraseOrientation.cpp) 18
-rw-r--r-- phrase-extract/PhraseOrientation.h (renamed from phrase-extract/extract-ghkm/PhraseOrientation.h) 17
-rw-r--r-- phrase-extract/RuleExtractionOptions.h 4
-rw-r--r-- phrase-extract/extract-rules-main.cpp 54
7 files changed, 81 insertions, 40 deletions
diff --git a/phrase-extract/extract-ghkm/Alignment.cpp b/phrase-extract/Alignment.cpp
index d12f9398b..cef7a4638 100644
--- a/phrase-extract/extract-ghkm/Alignment.cpp
+++ b/phrase-extract/Alignment.cpp
@@ -27,10 +27,6 @@
namespace MosesTraining
{
-namespace Syntax
-{
-namespace GHKM
-{
void ReadAlignment(const std::string &s, Alignment &a)
{
@@ -46,7 +42,7 @@ void ReadAlignment(const std::string &s, Alignment &a)
}
int src = std::atoi(s.substr(begin, end-begin).c_str());
if (end+1 == s.size()) {
- throw Exception("Target index missing");
+ throw Syntax::Exception("Target index missing");
}
begin = end+1;
@@ -71,6 +67,4 @@ void FlipAlignment(Alignment &a)
}
}
-} // namespace GHKM
-} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/Alignment.h b/phrase-extract/Alignment.h
index da1279f8f..c25896e6d 100644
--- a/phrase-extract/extract-ghkm/Alignment.h
+++ b/phrase-extract/Alignment.h
@@ -25,10 +25,6 @@
namespace MosesTraining
{
-namespace Syntax
-{
-namespace GHKM
-{
typedef std::vector<std::pair<int, int> > Alignment;
@@ -36,6 +32,4 @@ void ReadAlignment(const std::string &, Alignment &);
void FlipAlignment(Alignment &);
-} // namespace GHKM
-} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/ExtractedRule.h b/phrase-extract/ExtractedRule.h
index cb2f2261d..e879b96c4 100644
--- a/phrase-extract/ExtractedRule.h
+++ b/phrase-extract/ExtractedRule.h
@@ -26,6 +26,8 @@
#include <sstream>
#include <map>
+#include "extract-ghkm/PhraseOrientation.h"
+
namespace MosesTraining
{
@@ -37,8 +39,6 @@ public:
std::string target;
std::string alignment;
std::string alignmentInv;
- std::string orientation;
- std::string orientationForward;
std::string sourceContextLeft;
std::string sourceContextRight;
std::string targetContextLeft;
@@ -51,14 +51,14 @@ public:
int endS;
float count;
double pcfgScore;
+ Syntax::GHKM::PhraseOrientation::REO_CLASS l2rOrientation;
+ Syntax::GHKM::PhraseOrientation::REO_CLASS r2lOrientation;
ExtractedRule(int sT, int eT, int sS, int eS)
: source()
, target()
, alignment()
, alignmentInv()
- , orientation()
- , orientationForward()
, sourceContextLeft()
, sourceContextRight()
, targetContextLeft()
@@ -70,8 +70,10 @@ public:
, startS(sS)
, endS(eS)
, count(0)
- , pcfgScore(0.0) {
- }
+ , pcfgScore(0.0)
+ , l2rOrientation(Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN)
+ , r2lOrientation(Syntax::GHKM::PhraseOrientation::REO_CLASS_UNKNOWN)
+ { }
};
}
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.cpp b/phrase-extract/PhraseOrientation.cpp
index f07e19a46..20297b4de 100644
--- a/phrase-extract/extract-ghkm/PhraseOrientation.cpp
+++ b/phrase-extract/PhraseOrientation.cpp
@@ -28,10 +28,6 @@
namespace MosesTraining
{
-namespace Syntax
-{
-namespace GHKM
-{
std::vector<float> PhraseOrientation::m_l2rOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0);
std::vector<float> PhraseOrientation::m_r2lOrientationPriorCounts = boost::assign::list_of(0)(0)(0)(0)(0);
@@ -100,6 +96,18 @@ PhraseOrientation::PhraseOrientation(int sourceSize,
Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
}
+PhraseOrientation::PhraseOrientation(int sourceSize,
+ int targetSize,
+ const std::vector<std::vector<int> > &alignedToT,
+ const std::vector<std::vector<int> > &alignedToS,
+ const std::vector<int> &alignedCountS)
+ : m_countF(sourceSize)
+ , m_countE(targetSize)
+ , m_alignedToT(alignedToT)
+{
+ Init(sourceSize, targetSize, m_alignedToT, alignedToS, alignedCountS);
+}
+
void PhraseOrientation::Init(int sourceSize,
int targetSize,
@@ -470,6 +478,4 @@ void PhraseOrientation::WritePriorCounts(std::ostream& out, const REO_MODEL_TYPE
}
}
-} // namespace GHKM
-} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/extract-ghkm/PhraseOrientation.h b/phrase-extract/PhraseOrientation.h
index d956e2bc8..1cdfb65be 100644
--- a/phrase-extract/extract-ghkm/PhraseOrientation.h
+++ b/phrase-extract/PhraseOrientation.h
@@ -32,10 +32,6 @@
namespace MosesTraining
{
-namespace Syntax
-{
-namespace GHKM
-{
// The key of the map is the English index and the value is a set of the source ones
typedef std::map <int, std::set<int> > HSentenceVertices;
@@ -49,6 +45,7 @@ public:
enum REO_CLASS {REO_CLASS_LEFT, REO_CLASS_RIGHT, REO_CLASS_DLEFT, REO_CLASS_DRIGHT, REO_CLASS_UNKNOWN};
enum REO_DIR {REO_DIR_L2R, REO_DIR_R2L, REO_DIR_BIDIR};
+ PhraseOrientation() {};
PhraseOrientation(int sourceSize,
int targetSize,
@@ -59,6 +56,12 @@ public:
const Moses::AlignmentInfo &alignTerm,
const Moses::AlignmentInfo &alignNonTerm);
+ PhraseOrientation(int sourceSize,
+ int targetSize,
+ const std::vector<std::vector<int> > &alignedToT,
+ const std::vector<std::vector<int> > &alignedToS,
+ const std::vector<int> &alignedCountS);
+
REO_CLASS GetOrientationInfo(int startF, int endF, REO_DIR direction) const;
REO_CLASS GetOrientationInfo(int startF, int startE, int endF, int endE, REO_DIR direction) const;
const std::string GetOrientationInfoString(int startF, int endF, REO_DIR direction=REO_DIR_BIDIR) const;
@@ -104,8 +107,8 @@ private:
return first < second;
};
- const int m_countF;
- const int m_countE;
+ int m_countF;
+ int m_countE;
std::vector<std::vector<int> > m_alignedToT;
@@ -121,6 +124,4 @@ private:
static std::vector<float> m_r2lOrientationPriorCounts;
};
-} // namespace GHKM
-} // namespace Syntax
} // namespace MosesTraining
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index b38258470..aab059cf9 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -54,6 +54,7 @@ public:
bool conditionOnTargetLhs;
bool boundaryRules;
bool flexScoreFlag;
+ bool phraseOrientation;
RuleExtractionOptions()
: maxSpan(10)
@@ -86,7 +87,8 @@ public:
, unpairedExtractFormat(false)
, conditionOnTargetLhs(false)
, boundaryRules(false)
- , flexScoreFlag(false) {}
+ , flexScoreFlag(false)
+ , phraseOrientation(false) {}
};
}
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index e6fff965d..49db5797c 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -46,6 +46,7 @@
#include "XmlTree.h"
#include "InputFileStream.h"
#include "OutputFileStream.h"
+#include "extract-ghkm/PhraseOrientation.h"
using namespace std;
using namespace MosesTraining;
@@ -62,6 +63,7 @@ private:
Moses::OutputFileStream& m_extractFileInv;
Moses::OutputFileStream& m_extractFileContext;
Moses::OutputFileStream& m_extractFileContextInv;
+ Syntax::GHKM::PhraseOrientation m_phraseOrientation;
vector< ExtractedRule > m_extractedRules;
@@ -109,6 +111,7 @@ public:
void collectWordLabelCounts(SentenceAlignmentWithSyntax &sentence );
void writeGlueGrammar(const string &, RuleExtractionOptions &options, set< string > &targetLabelCollection, map< string, int > &targetTopLabelCollection);
void writeUnknownWordLabel(const string &);
+void writePhraseOrientationPriors(const string &);
double getPcfgScore(const SyntaxNode &);
@@ -142,7 +145,8 @@ int main(int argc, char* argv[])
<< " | --UnpairedExtractFormat"
<< " | --ConditionOnTargetLHS ]"
<< " | --BoundaryRules[" << options.boundaryRules << "]"
- << " | --FlexibilityScore\n";
+ << " | --FlexibilityScore"
+ << " | --PhraseOrientation\n";
exit(1);
}
@@ -267,6 +271,8 @@ int main(int argc, char* argv[])
options.conditionOnTargetLhs = true;
} else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
options.flexScoreFlag = true;
+ } else if (strcmp(argv[i],"--PhraseOrientation") == 0) {
+ options.phraseOrientation = true;
} else if (strcmp(argv[i],"-threads") == 0 ||
strcmp(argv[i],"--threads") == 0 ||
strcmp(argv[i],"--Threads") == 0) {
@@ -377,6 +383,11 @@ int main(int argc, char* argv[])
if (options.unknownWordLabelFlag)
writeUnknownWordLabel(fileNameUnknownWordLabel);
+
+ if (options.phraseOrientation) {
+ std::string fileNamePhraseOrientationPriors = fileNameExtract + string(".phraseOrientationPriors");
+ writePhraseOrientationPriors(fileNamePhraseOrientationPriors);
+ }
}
void ExtractTask::Run()
@@ -392,6 +403,12 @@ void ExtractTask::extractRules()
int countT = m_sentence.target.size();
int countS = m_sentence.source.size();
+ // initialize phrase orientation scoring object (for lexicalized reordering model)
+ if (m_options.phraseOrientation) {
+ m_sentence.invertAlignment(); // fill m_sentence.alignedToS
+ m_phraseOrientation = Syntax::GHKM::PhraseOrientation(countS, countT, m_sentence.alignedToT, m_sentence.alignedToS, m_sentence.alignedCountS);
+ }
+
// phrase repository for creating hiero phrases
RuleExist ruleExist(countT);
@@ -990,6 +1007,10 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
}
}
+ rule.alignment.erase(rule.alignment.size()-1);
+ if (!m_options.onlyDirectFlag)
+ rule.alignmentInv.erase(rule.alignmentInv.size()-1);
+
// context (words to left and right)
if (m_options.flexScoreFlag) {
rule.sourceContextLeft = startS == 0 ? "<s>" : m_sentence.source[startS-1];
@@ -998,9 +1019,14 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, int count
rule.targetContextRight = endT+1 == m_sentence.target.size() ? "<s>" : m_sentence.target[endT+1];
}
- rule.alignment.erase(rule.alignment.size()-1);
- if (!m_options.onlyDirectFlag)
- rule.alignmentInv.erase(rule.alignmentInv.size()-1);
+ // phrase orientation (lexicalized reordering model)
+ if (m_options.phraseOrientation) {
+ rule.l2rOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_L2R);
+ rule.r2lOrientation = m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_R2L);
+ // std::cerr << "span " << startS << " " << endS << std::endl;
+ // std::cerr << "phraseOrientationL2R " << m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_L2R) << std::endl;
+ // std::cerr << "phraseOrientationR2L " << m_phraseOrientation.GetOrientationInfo(startS,endS,Syntax::GHKM::PhraseOrientation::REO_DIR_R2L) << std::endl;
+ }
addRuleToCollection( rule );
}
@@ -1070,6 +1096,15 @@ void ExtractTask::writeRulesToFile()
if (m_options.pcfgScore) {
out << " ||| " << rule->pcfgScore;
}
+ if (m_options.phraseOrientation) {
+ out << " {{Orientation ";
+ m_phraseOrientation.WriteOrientation(out,rule->l2rOrientation);
+ out << " ";
+ m_phraseOrientation.WriteOrientation(out,rule->r2lOrientation);
+ m_phraseOrientation.IncrementPriorCount(Syntax::GHKM::PhraseOrientation::REO_DIR_L2R,rule->l2rOrientation,1);
+ m_phraseOrientation.IncrementPriorCount(Syntax::GHKM::PhraseOrientation::REO_DIR_R2L,rule->r2lOrientation,1);
+ out << "}}";
+ }
out << "\n";
if (!m_options.onlyDirectFlag) {
@@ -1171,8 +1206,7 @@ void collectWordLabelCounts( SentenceAlignmentWithSyntax &sentence )
}
}
-void writeUnknownWordLabel(const string & fileName)
-{
+void writeUnknownWordLabel(const string & fileName) {
ofstream outFile;
outFile.open(fileName.c_str());
typedef map<string,int>::const_iterator I;
@@ -1196,6 +1230,14 @@ void writeUnknownWordLabel(const string & fileName)
outFile.close();
}
+void writePhraseOrientationPriors(const string &fileName)
+{
+ ofstream outFile;
+ outFile.open(fileName.c_str());
+ Syntax::GHKM::PhraseOrientation::WritePriorCounts(outFile);
+ outFile.close();
+}
+
double getPcfgScore(const SyntaxNode &node)
{
double score = 0.0f;