Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author phikoehn <pkoehn@inf.ed.ac.uk> 2012-09-03 10:27:41 +0400
committer phikoehn <pkoehn@inf.ed.ac.uk> 2012-09-03 10:27:41 +0400
commit 5d9859ba0e742bb5207c8ee78f50252241723cdb (patch)
tree 7c447da9fd6acb41117238d443fcae67eeeb5d35 /phrase-extract
parent 19ef78514693a5557bae5614c8a2cc31a77a47d3 (diff)
parent 92b15c103fa542a19789c043d47b629d2563bad8 (diff)
merge issues
Diffstat (limited to 'phrase-extract')
-rw-r--r-- phrase-extract/RuleExtractionOptions.h 2
-rw-r--r-- phrase-extract/SentenceAlignment.cpp 38
-rw-r--r-- phrase-extract/SentenceAlignment.h 7
-rw-r--r-- phrase-extract/SentenceAlignmentWithSyntax.cpp 8
-rw-r--r-- phrase-extract/SentenceAlignmentWithSyntax.h 4
-rw-r--r-- phrase-extract/extract-rules.cpp 78
-rw-r--r-- phrase-extract/extract.cpp 2
-rw-r--r-- phrase-extract/score.cpp 152
8 files changed, 218 insertions, 73 deletions
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index bb2d97580..431be58b0 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -53,6 +53,7 @@ public:
bool gzOutput;
bool unpairedExtractFormat;
bool conditionOnTargetLhs;
+ bool boundaryRules;
RuleExtractionOptions()
: maxSpan(10)
@@ -85,6 +86,7 @@ public:
, gzOutput(false)
, unpairedExtractFormat(false)
, conditionOnTargetLhs(false)
+ , boundaryRules(false)
{}
};
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index 8e44bddc4..af1cfa953 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -25,33 +25,45 @@
#include "tables-core.h"
+using namespace std;
+
namespace MosesTraining
{
SentenceAlignment::~SentenceAlignment() {}
-bool SentenceAlignment::processTargetSentence(const char * targetString, int)
+void addBoundaryWords(vector<string> &phrase)
+{
+ phrase.insert(phrase.begin(), "<s>");
+ phrase.push_back("</s>");
+}
+
+bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules)
{
target = tokenize(targetString);
+ if (boundaryRules)
+ addBoundaryWords(target);
return true;
}
-bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
+bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules)
{
source = tokenize(sourceString);
+ if (boundaryRules)
+ addBoundaryWords(source);
return true;
}
-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
+bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules)
{
using namespace std;
this->sentenceID = sentenceID;
// process sentence strings and store in target and source members.
- if (!processTargetSentence(targetString, sentenceID)) {
+ if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
return false;
}
- if (!processSourceSentence(sourceString, sentenceID)) {
+ if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
return false;
}
@@ -81,6 +93,12 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
return false;
}
+
+ if (boundaryRules) {
+ ++s;
+ ++t;
+ }
+
// cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
@@ -90,6 +108,16 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
alignedToT[t].push_back( s );
alignedCountS[s]++;
}
+
+ if (boundaryRules) {
+ alignedToT[0].push_back(0);
+ alignedCountS[0]++;
+
+ alignedToT.back().push_back(alignedCountS.size() - 1);
+ alignedCountS.back()++;
+
+ }
+
return true;
}
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index b1fb5933a..7c2988780 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -38,12 +38,13 @@ public:
virtual ~SentenceAlignment();
- virtual bool processTargetSentence(const char *, int);
+ virtual bool processTargetSentence(const char *, int, bool boundaryRules);
- virtual bool processSourceSentence(const char *, int);
+ virtual bool processSourceSentence(const char *, int, bool boundaryRules);
bool create(char targetString[], char sourceString[],
- char alignmentString[], int sentenceID);
+ char alignmentString[], int sentenceID, bool boundaryRules);
+
};
}
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 83a048757..5d866edfb 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -32,10 +32,10 @@ using namespace std;
namespace MosesTraining
{
-bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
{
if (!m_options.targetSyntax) {
- return SentenceAlignment::processTargetSentence(targetString, sentenceID);
+ return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
}
string targetStringCPP(targetString);
@@ -52,10 +52,10 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
return true;
}
-bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
{
if (!m_options.sourceSyntax) {
- return SentenceAlignment::processSourceSentence(sourceString, sentenceID);
+ return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
}
string sourceStringCPP(sourceString);
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
index 38fa77907..28eef57b7 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.h
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -59,10 +59,10 @@ public:
virtual ~SentenceAlignmentWithSyntax() {}
bool
- processTargetSentence(const char *, int);
+ processTargetSentence(const char *, int, bool boundaryRules);
bool
- processSourceSentence(const char *, int);
+ processSourceSentence(const char *, int, bool boundaryRules);
};
}
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index f031df8e4..52a141917 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -72,20 +72,20 @@ private:
void writeRulesToFile();
// subs
- void addRule( int, int, int, int, RuleExist &ruleExist);
+ void addRule( int, int, int, int, int, RuleExist &ruleExist);
void addHieroRule( int startT, int endT, int startS, int endS
, RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
void printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex);
+ , HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
string printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
string printSourceHieroPhrase( int startT, int endT, int startS, int endS
, HoleCollection &holeColl, const LabelIndex &labelIndex);
void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
, WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
void printHieroAlignment( int startT, int endT, int startS, int endS
, const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
- void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
+ void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
inline string IntToString( int i )
{
@@ -138,7 +138,9 @@ int main(int argc, char* argv[])
<< " | --SourceSyntax | --TargetSyntax"
<< " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
<< " | --UnpairedExtractFormat"
- << " | --ConditionOnTargetLHS ]\n";
+ << " | --ConditionOnTargetLHS ]"
+ << " | --BoundaryRules[" << options.boundaryRules << "]";
+
exit(1);
}
char* &fileNameT = argv[1];
@@ -263,18 +265,18 @@ int main(int argc, char* argv[])
options.unpairedExtractFormat = true;
} else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
options.conditionOnTargetLhs = true;
-#ifdef WITH_THREADS
} else if (strcmp(argv[i],"-threads") == 0 ||
strcmp(argv[i],"--threads") == 0 ||
strcmp(argv[i],"--Threads") == 0) {
thread_count = atoi(argv[++i]);
-#endif
} else if (strcmp(argv[i], "--SentenceOffset") == 0) {
if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
exit(1);
}
sentenceOffset = atoi(argv[++i]);
+ } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
+ options.boundaryRules = true;
} else {
cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
exit(1);
@@ -330,7 +332,7 @@ int main(int argc, char* argv[])
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence.create(targetString, sourceString, alignmentString, i)) {
+ if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) {
if (options.unknownWordLabelFlag) {
collectWordLabelCounts(sentence);
}
@@ -442,7 +444,7 @@ void ExtractTask::extractRules()
// if within length limits, add as fully-lexical phrase pair
if (endT-startT < m_options.maxSymbolsTarget && endS-startS < m_options.maxSymbolsSource) {
- addRule(startT,endT,startS,endS, ruleExist);
+ addRule(startT,endT,startS,endS, countS, ruleExist);
}
// take note that this is a valid phrase alignment
@@ -502,7 +504,8 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
}
string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
- , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
+ , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
+ , int countS)
{
HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
assert(iterHoleList != holeColl.GetHoles().end());
@@ -524,8 +527,15 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
assert(sourceLabel != "");
int labelI = labelIndex[ 2+holeCount ];
- string targetLabel = m_options.targetSyntax ?
- m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+ string targetLabel;
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
+
hole.SetLabel(targetLabel, 1);
if (m_options.unpairedExtractFormat) {
@@ -639,15 +649,22 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
}
void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
- , HoleCollection &holeColl, LabelIndex &labelIndex)
+ , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
{
WordIndex indexS, indexT; // to keep track of word positions in rule
ExtractedRule rule( startT, endT, startS, endS );
// phrase labels
- string targetLabel = m_options.targetSyntax ?
- m_sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+ string targetLabel;
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
+
string sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
@@ -657,12 +674,12 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
// target
if (m_options.pcfgScore) {
double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
rule.pcfgScore = std::exp(logPCFGScore);
} else {
double logPCFGScore = 0.0f;
- rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+ rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
+ " [" + targetLabel + "]";
}
@@ -680,7 +697,7 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
addRuleToCollection( rule );
}
-void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl)
+void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
{
LabelIndex labelIndex,labelCount;
@@ -715,7 +732,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
// loop through the holes
bool done = false;
while(!done) {
- printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex );
+ printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
for(unsigned int i=0; i<labelIndex.size(); i++) {
labelIndex[i]++;
if(labelIndex[i] == labelCount[i]) {
@@ -843,7 +860,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
// passed all checks...
if (allowablePhrase)
- printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl);
+ printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl, wordCountS);
// recursively search for next hole
int nextInitStartT = m_options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
@@ -855,10 +872,15 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
}
}
-void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist &ruleExist)
+void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist)
{
- // source
-
+ // contains only <s> or </s>. Don't output
+ if (m_options.boundaryRules
+ && ( (startS == 0 && endS == 0)
+ || (startS == countS-1 && endS == countS-1))) {
+ return;
+ }
+
if (m_options.onlyOutputSpanInfo) {
cout << startS << " " << endS << " " << startT << " " << endT << endl;
return;
@@ -874,8 +896,14 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
else {
sourceLabel = m_options.sourceSyntax ?
m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
- targetLabel = m_options.targetSyntax ?
- m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+
+ if (m_options.targetSyntax) {
+ targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+ } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+ targetLabel = "S";
+ } else {
+ targetLabel = "X";
+ }
}
// source
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 58eb4b2f3..6a1ee77ab 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
cout << "LOG: ALT: " << alignmentString << endl;
cout << "LOG: PHRASES_BEGIN:" << endl;
}
- if (sentence.create( englishString, foreignString, alignmentString, i)) {
+ if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileSentenceId);
task->Run();
delete task;
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index 8348a44bc..4de7acc0f 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -59,6 +59,8 @@ bool lexFlag = true;
bool unalignedFlag = false;
bool unalignedFWFlag = false;
bool outputNTLengths = false;
+bool singletonFeature = false;
+bool crossedNonTerm = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCountHierarchical = 0;
@@ -77,14 +79,14 @@ Vocabulary vcbS;
vector<string> tokenize( const char [] );
void writeCountOfCounts( const string &fileNameCountOfCounts );
-void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
set<string> functionWordList;
void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
, map<size_t, map<size_t, float> > &sourceProb
, map<size_t, map<size_t, float> > &targetProb);
@@ -97,7 +99,7 @@ int main(int argc, char* argv[])
<< "scoring methods for extracted rules\n";
if (argc < 4) {
- cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]\n";
+ cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n";
exit(1);
}
string fileNameExtract = argv[1];
@@ -177,6 +179,12 @@ int main(int argc, char* argv[])
minCountHierarchical -= 0.00001; // account for rounding
} else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
outputNTLengths = true;
+ } else if (strcmp(argv[i],"--Singleton") == 0) {
+ singletonFeature = true;
+ cerr << "binary singleton feature\n";
+ } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
+ crossedNonTerm = true;
+ cerr << "crossed non-term reordering feature\n";
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
@@ -238,6 +246,7 @@ int main(int argc, char* argv[])
float lastCount = 0.0f;
float lastPcfgSum = 0.0f;
vector< PhraseAlignment > phrasePairsWithSameF;
+ bool isSingleton = true;
int i=0;
char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
lastLine[0] = '\0';
@@ -272,16 +281,22 @@ int main(int argc, char* argv[])
// if new source phrase, process last batch
if (lastPhrasePair != NULL &&
lastPhrasePair->GetSource() != phrasePair.GetSource()) {
- processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+ processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
+
phrasePairsWithSameF.clear();
+ isSingleton = false;
lastPhrasePair = NULL;
}
+ else
+ {
+ isSingleton = true;
+ }
// add phrase pairs to list, it's now the last one
phrasePairsWithSameF.push_back( phrasePair );
lastPhrasePair = &phrasePairsWithSameF.back();
}
- processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+ processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
phraseTableFile->flush();
if (phraseTableFile != &cout) {
@@ -315,7 +330,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
countOfCountsFile.Close();
}
-void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton )
{
if (phrasePair.size() == 0) return;
@@ -356,12 +371,12 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter)
{
const PhraseAlignmentCollection &group = **iter;
- outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );
+ outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton );
}
}
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
{
float bestAlignmentCount = -1;
PhraseAlignment* bestAlignment = NULL;
@@ -382,7 +397,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
}
}
- return bestAlignment;
+ return *bestAlignment;
}
@@ -473,11 +488,65 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
}
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
+bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS)
+{
+ for (int currSource = 0; currSource < alignedToS.size(); ++currSource)
+ {
+ if (currSource == sourcePos)
+ { // skip
+ }
+ else
+ {
+ const std::set<size_t> &targetSet = alignedToS[currSource];
+ std::set<size_t>::const_iterator iter;
+ for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+ {
+ size_t currTarget = *iter;
+
+ if ((currSource < sourcePos && currTarget > targetPos)
+ || (currSource > sourcePos && currTarget < targetPos)
+ )
+ {
+ return true;
+ }
+ }
+
+ }
+ }
+
+ return false;
+}
+
+int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
+{
+ const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
+
+ for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
+ {
+ const std::set<size_t> &targetSet = alignedToS[sourcePos];
+
+ WORD_ID wordId = phraseS[sourcePos];
+ const WORD &word = vcbS.getWord(wordId);
+ bool isNonTerm = isNonTerminal(word);
+
+ if (isNonTerm)
+ {
+ assert(targetSet.size() == 1);
+ int targetPos = *targetSet.begin();
+ bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
+ if (ret)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
{
if (phrasePair.size() == 0) return;
- PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+ const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
// compute count
float count = 0;
@@ -529,17 +598,17 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
// source phrase (unless inverse)
if (! inverseFlag) {
- printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
}
// target phrase
- printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
// source phrase (if inverse)
if (inverseFlag) {
- printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
}
@@ -561,6 +630,14 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
phraseTableFile << " " << maybeLogProb( penalty );
}
+ if (singletonFeature) {
+ phraseTableFile << " " << (isSingleton ? 1 : 0);
+ }
+
+ if (crossedNonTerm && !inverseFlag) {
+ phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
+ }
+
// target-side PCFG score
if (pcfgFlag && !inverseFlag) {
phraseTableFile << " " << maybeLogProb( pcfgScore );
@@ -632,22 +709,31 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
if (! inverseFlag) {
if (hierarchicalFlag) {
// always output alignment if hiero style, but only for non-terms
- assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+ assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
for(size_t j = 0; j < phraseT.size() - 1; j++) {
if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
- if (bestAlignment->alignedToT[ j ].size() != 1) {
+ if (bestAlignment.alignedToT[ j ].size() != 1) {
cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
phraseTableFile.flush();
- assert(bestAlignment->alignedToT[ j ].size() == 1);
+ assert(bestAlignment.alignedToT[ j ].size() == 1);
}
- int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+ int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
phraseTableFile << sourcePos << "-" << j << " ";
}
+ else if (wordAlignmentFlag) {
+ const std::set<size_t> &sourceSet = bestAlignment.alignedToT[ j ];
+ std::set<size_t>::const_iterator iter;
+ for (iter = sourceSet.begin(); iter != sourceSet.end(); ++iter)
+ {
+ int sourcePos = *iter;
+ phraseTableFile << sourcePos << "-" << j << " ";
+ }
+ }
}
} else if (wordAlignmentFlag) {
// alignment info in pb model
- for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
- const set< size_t > &aligned = bestAlignment->alignedToT[j];
+ for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
+ const set< size_t > &aligned = bestAlignment.alignedToT[j];
for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
phraseTableFile << *p << "-" << j << " ";
}
@@ -681,13 +767,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
phraseTableFile << endl;
}
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty()) {
unaligned *= 2.718;
}
@@ -695,13 +781,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
return unaligned;
}
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
unaligned *= 2.718;
}
@@ -734,14 +820,14 @@ void loadFunctionWords( const string &fileName )
inFile.close();
}
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// lexical translation probability
double lexScore = 1.0;
int null = vcbS.getWordID("NULL");
// all target words have to be explained
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty()) {
// explain unaligned word by NULL
lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );