Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <fishandfrolick@gmail.com>, 2012-08-23 22:40:09 +0400
committer: Hieu Hoang <fishandfrolick@gmail.com>, 2012-08-23 22:40:09 +0400
commit: 5dbb0e66ce36de4a27737a47d11276cc028d961c (patch)
tree: 63f3b546e6412ae29fca5efdc72dfa5678000144 /phrase-extract
parent: 20341c091fb48770fd438d0ed5e97d0538746d7f (diff)
Add an option to produce rules that include the boundary words <s> and </s>, like Chris Dyer's extraction.
Diffstat (limited to 'phrase-extract')
-rw-r--r-- phrase-extract/RuleExtractionOptions.h 2
-rw-r--r-- phrase-extract/SentenceAlignment.cpp 38
-rw-r--r-- phrase-extract/SentenceAlignment.h 7
-rw-r--r-- phrase-extract/SentenceAlignmentWithSyntax.cpp 8
-rw-r--r-- phrase-extract/SentenceAlignmentWithSyntax.h 4
-rw-r--r-- phrase-extract/extract-rules.cpp 76
-rw-r--r-- phrase-extract/extract.cpp 2
7 files changed, 99 insertions, 38 deletions
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index bb2d97580..431be58b0 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -53,6 +53,7 @@ public:
bool gzOutput;
bool unpairedExtractFormat;
bool conditionOnTargetLhs;
+ bool boundaryRules;
RuleExtractionOptions()
: maxSpan(10)
@@ -85,6 +86,7 @@ public:
, gzOutput(false)
, unpairedExtractFormat(false)
, conditionOnTargetLhs(false)
+ , boundaryRules(false)
{}
};
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index 8e44bddc4..af1cfa953 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -25,33 +25,45 @@
#include "tables-core.h"
+using namespace std;
+
namespace MosesTraining
{
SentenceAlignment::~SentenceAlignment() {}
-bool SentenceAlignment::processTargetSentence(const char * targetString, int)
+void addBoundaryWords(vector<string> &phrase)
+{
+ phrase.insert(phrase.begin(), "<s>");
+ phrase.push_back("</s>");
+}
+
+bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
{
target = tokenize(targetString);
+ if (boundaryRules)
+ addBoundaryWords(target);
return true;
}
-bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
+bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
{
source = tokenize(sourceString);
+ if (boundaryRules)
+ addBoundaryWords(source);
return true;
}
-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
+bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules)
{
using namespace std;
this->sentenceID = sentenceID;
// process sentence strings and store in target and source members.
- if (!processTargetSentence(targetString, sentenceID)) {
+ if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
return false;
}
- if (!processSourceSentence(sourceString, sentenceID)) {
+ if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
return false;
}
@@ -81,6 +93,12 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
return false;
}
+
+ if (boundaryRules) {
+ ++s;
+ ++t;
+ }
+
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
@@ -90,6 +108,16 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
alignedToT[t].push_back( s );
alignedCountS[s]++;
}
+
+ if (boundaryRules) {
+ alignedToT[0].push_back(0);
+ alignedCountS[0]++;
+
+ alignedToT.back().push_back(alignedCountS.size() - 1);
+ alignedCountS.back()++;
+
+ }
+
return true;
}
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index b1fb5933a..7c2988780 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -38,12 +38,13 @@ public:
virtual ~SentenceAlignment();
- virtual bool processTargetSentence(const char *, int);
+ virtual bool processTargetSentence(const char *, int, bool boundaryRules);
- virtual bool processSourceSentence(const char *, int);
+ virtual bool processSourceSentence(const char *, int, bool boundaryRules);
bool create(char targetString[], char sourceString[],
- char alignmentString[], int sentenceID);
+ char alignmentString[], int sentenceID, bool boundaryRules);
+
};
}
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 83a048757..5d866edfb 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -32,10 +32,10 @@ using namespace std;
namespace MosesTraining
{
-bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
{
if (!m_options.targetSyntax) {
- return SentenceAlignment::processTargetSentence(targetString, sentenceID);
+ return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
}
string targetStringCPP(targetString);
@@ -52,10 +52,10 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
return true;
}
-bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
{
if (!m_options.sourceSyntax) {
- return SentenceAlignment::processSourceSentence(sourceString, sentenceID);
+ return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
}
string sourceStringCPP(sourceString);
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
index 38fa77907..28eef57b7 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.h
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -59,10 +59,10 @@ public:
virtual ~SentenceAlignmentWithSyntax() {}
bool
- processTargetSentence(const char *, int);
+ processTargetSentence(const char *, int, bool boundaryRules);
bool
- processSourceSentence(const char *, int);
+ processSourceSentence(const char *, int, bool boundaryRules);
};
}
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index 4f6b5b436..96d2bfc93 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -72,20 +72,20 @@ private:
void writeRulesToFile();
// subs
- void addRule( int, int, int, int, RuleExist &ruleExist);
+ void addRule( int, int, int, int, int, RuleExist &ruleExist);
void addHieroRule( int startT, int endT, int startS, int endS
, RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
void printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex);
+ , HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
string printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
string printSourceHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, const LabelIndex &labelIndex);
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
, WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
void printHieroAlignment( int startT, int endT, int startS, int endS
, const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
- void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
+ void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
inline string IntToString( int i )
{
@@ -135,7 +135,9 @@ int main(int argc, char* argv[])
<< " | --SourceSyntax | --TargetSyntax"
<< " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
<< " | --UnpairedExtractFormat"
- << " | --ConditionOnTargetLHS ]\n";
+ << " | --ConditionOnTargetLHS ]"
+ << " | --BoundaryRules[" << options.boundaryRules << "]";
+
exit(1);
}
char* &fileNameT = argv[1];
@@ -260,6 +262,8 @@ int main(int argc, char* argv[])
options.unpairedExtractFormat = true;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
options.conditionOnTargetLhs = true;
+ } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
+ options.boundaryRules = true;
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -315,7 +319,7 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence.create(targetString, sourceString, alignmentString, i)) {
+ if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) {
if (options.unknownWordLabelFlag) {
collectWordLabelCounts(sentence);
}
@@ -427,7 +431,7 @@ void ExtractTask::extractRules()
// if within length limits, add as fully-lexical phrase pair
if (endT-startT < m_options.maxSymbolsTarget && endS-startS < m_options.maxSymbolsSource) {
- addRule(startT,endT,startS,endS, ruleExist);
+ addRule(startT,endT,startS,endS, countS, ruleExist);
}
// take note that this is a valid phrase alignment
@@ -487,7 +491,8 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
}
string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
+ , int countS)
{
HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
assert(iterHoleList != holeColl.GetHoles().end());
@@ -509,8 +514,15 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
assert(sourceLabel != "");
int labelI = labelIndex[ 2+holeCount ];
- string targetLabel = m_options.targetSyntax ?
- m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+ string targetLabel;
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
+
hole.SetLabel(targetLabel, 1);
if (m_options.unpairedExtractFormat) {
@@ -624,15 +636,22 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
}
void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex)
+ , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
{
WordIndex indexS, indexT; // to keep track of word positions in rule
ExtractedRule rule( startT, endT, startS, endS );
// phrase labels
- string targetLabel = m_options.targetSyntax ?
- m_sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+ string targetLabel;
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
+
string sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
@@ -642,12 +661,12 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
// target
if (m_options.pcfgScore) {
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
} else {
double logPCFGScore = 0.0f;
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
}
@@ -665,7 +684,7 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
addRuleToCollection( rule );
}
-void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl)
+void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
{
LabelIndex labelIndex,labelCount;
@@ -700,7 +719,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
// loop through the holes
bool done = false;
while(!done) {
- printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex );
+ printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
for(unsigned int i=0; i<labelIndex.size(); i++) {
labelIndex[i]++;
if(labelIndex[i] == labelCount[i]) {
@@ -828,7 +847,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
// passed all checks...
if (allowablePhrase)
- printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl);
+ printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl, wordCountS);
// recursively search for next hole
int nextInitStartT = m_options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
@@ -840,10 +859,15 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
}
}
-void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist &ruleExist)
+void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist)
{
- // source
-
+ // contains only <s> or </s>. Don't output
+ if (m_options.boundaryRules
+ && ( (startS == 0 && endS == 0)
+ || (startS == countS-1 && endS == countS-1))) {
+ return;
+ }
+
if (m_options.onlyOutputSpanInfo) {
cout << startS << " " << endS << " " << startT << " " << endT << endl;
return;
@@ -859,8 +883,14 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
else {
sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
- targetLabel = m_options.targetSyntax ?
- m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
}
// source
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index fe55788e8..5142ebc37 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -253,7 +253,7 @@ int main(int argc, char* argv[])
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence.create( englishString, foreignString, alignmentString, i)) {
+ if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileSentenceId);
task->Run();
delete task;