Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Hieu Hoang <fishandfrolick@gmail.com>, 2012-08-25 03:47:57 +0400
committer: Hieu Hoang <fishandfrolick@gmail.com>, 2012-08-25 03:47:57 +0400
commit: 33c03edfbbb2c80fd1c02e24ef56357b92546c08 (patch)
tree: 898aed945ba3d8c5c5a24b96a319afef04c26c9c /phrase-extract
parent: 1931bfe959b2ab0088229996a4e7f61cca61909c (diff)
Binary hiero reordering feature. Implementation of the one described in NIST 2012: 1 if a non-terminal is reordered with respect to other words or non-terminals, 0 otherwise.
Diffstat (limited to 'phrase-extract')
-rw-r--r-- phrase-extract/score.cpp 137
1 file changed, 106 insertions, 31 deletions
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index 225b5f2c9..8ce704f7d 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -58,6 +58,7 @@ bool unalignedFlag = false;
bool unalignedFWFlag = false;
bool outputNTLengths = false;
bool singletonFeature = false;
+bool crossedNonTerm = false;
int countOfCounts[COC_MAX+1];
int totalDistinct = 0;
float minCountHierarchical = 0;
@@ -71,13 +72,13 @@ vector<string> tokenize( const char [] );
void writeCountOfCounts( const string &fileNameCountOfCounts );
void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
set<string> functionWordList;
void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
, map<size_t, map<size_t, float> > &sourceProb
, map<size_t, map<size_t, float> > &targetProb);
@@ -90,7 +91,7 @@ int main(int argc, char* argv[])
<< "scoring methods for extracted rules\n";
if (argc < 4) {
- cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] \n";
+ cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
exit(1);
}
string fileNameExtract = argv[1];
@@ -156,6 +157,9 @@ int main(int argc, char* argv[])
} else if (strcmp(argv[i],"--Singleton") == 0) {
singletonFeature = true;
cerr << "binary singleton feature\n";
+ } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
+ crossedNonTerm = true;
+ cerr << "crossed non-term reordering feature\n";
} else {
cerr << "ERROR: unknown option " << argv[i] << endl;
exit(1);
@@ -243,12 +247,12 @@ int main(int argc, char* argv[])
processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
phrasePairsWithSameF.clear();
- isSingleton = true;
+ isSingleton = false;
lastPhrasePair = NULL;
}
else
{
- isSingleton = false;
+ isSingleton = true;
}
// add phrase pairs to list, it's now the last one
@@ -336,7 +340,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
}
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
{
float bestAlignmentCount = -1;
PhraseAlignment* bestAlignment;
@@ -357,7 +361,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
}
}
- return bestAlignment;
+ return *bestAlignment;
}
@@ -448,11 +452,73 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
}
+bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS)
+{
+ for (int currSource = 0; currSource < alignedToS.size(); ++currSource)
+ {
+ if (currSource == sourcePos)
+ { // skip
+ }
+ else
+ {
+ const std::set<size_t> &targetSet = alignedToS[currSource];
+ std::set<size_t>::const_iterator iter;
+ for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+ {
+ size_t currTarget = *iter;
+
+ if ((currSource < sourcePos && currTarget > targetPos)
+ || (currSource > sourcePos && currTarget < targetPos)
+ )
+ {
+ return true;
+ }
+ }
+
+ }
+ }
+
+ return false;
+}
+
+int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
+{
+ const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
+
+ for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
+ {
+ const std::set<size_t> &targetSet = alignedToS[sourcePos];
+ cerr << "size=" << targetSet.size() << " ";
+ std::set<size_t>::const_iterator iter;
+ for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+ {
+ size_t targetPos = *iter;
+ cerr << sourcePos << "-" << targetPos << " ";
+ }
+ cerr << endl;
+
+ WORD_ID wordId = phraseS[sourcePos];
+ const WORD &word = vcbS.getWord(wordId);
+ bool isNonTerm = isNonTerminal(word);
+
+ if (isNonTerm)
+ {
+ assert(targetSet.size() == 1);
+ int targetPos = *targetSet.begin();
+ bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
+ if (ret)
+ return 1;
+ }
+ }
+
+ return 0;
+}
+
void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
{
if (phrasePair.size() == 0) return;
- PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+ const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
// compute count
float count = 0;
@@ -492,17 +558,17 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
// source phrase (unless inverse)
if (! inverseFlag) {
- printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
}
// target phrase
- printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
// source phrase (if inverse)
if (inverseFlag) {
- printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+ printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
phraseTableFile << " ||| ";
}
@@ -525,7 +591,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
}
if (singletonFeature) {
- phraseTableFile << " " << (isSingleton?1:0);
+ phraseTableFile << " " << (isSingleton ? 1 : 0);
+ }
+
+ if (crossedNonTerm && !inverseFlag) {
+ phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
}
// target-side PCFG score
@@ -539,26 +609,31 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
if (! inverseFlag) {
if (hierarchicalFlag) {
// always output alignment if hiero style, but only for non-terms
- assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+ assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
for(size_t j = 0; j < phraseT.size() - 1; j++) {
if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
- if (bestAlignment->alignedToT[ j ].size() != 1) {
+ if (bestAlignment.alignedToT[ j ].size() != 1) {
cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
phraseTableFile.flush();
- assert(bestAlignment->alignedToT[ j ].size() == 1);
+ assert(bestAlignment.alignedToT[ j ].size() == 1);
}
- int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+ int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
phraseTableFile << sourcePos << "-" << j << " ";
}
else if (wordAlignmentFlag) {
- int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
- phraseTableFile << sourcePos << "-" << j << " ";
+ const std::set<size_t> &sourceSet = bestAlignment.alignedToT[ j ];
+ std::set<size_t>::const_iterator iter;
+ for (iter = sourceSet.begin(); iter != sourceSet.end(); ++iter)
+ {
+ int sourcePos = *iter;
+ phraseTableFile << sourcePos << "-" << j << " ";
+ }
}
}
} else if (wordAlignmentFlag) {
// alignment info in pb model
- for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
- const set< size_t > &aligned = bestAlignment->alignedToT[j];
+ for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
+ const set< size_t > &aligned = bestAlignment.alignedToT[j];
for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
phraseTableFile << *p << "-" << j << " ";
}
@@ -592,13 +667,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
phraseTableFile << endl;
}
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty()) {
unaligned *= 2.718;
}
@@ -606,13 +681,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
return unaligned;
}
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// unaligned word counter
double unaligned = 1.0;
// only checking target words - source words are caught when computing inverse
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
unaligned *= 2.718;
}
@@ -645,14 +720,14 @@ void loadFunctionWords( const string &fileName )
inFile.close();
}
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
{
// lexical translation probability
double lexScore = 1.0;
int null = vcbS.getWordID("NULL");
// all target words have to be explained
- for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
- const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+ for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+ const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
if (srcIndices.empty()) {
// explain unaligned word by NULL
lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );