diff options
author | Hieu Hoang <fishandfrolick@gmail.com> | 2012-08-25 03:47:57 +0400 |
---|---|---|
committer | Hieu Hoang <fishandfrolick@gmail.com> | 2012-08-25 03:47:57 +0400 |
commit | 33c03edfbbb2c80fd1c02e24ef56357b92546c08 (patch) | |
tree | 898aed945ba3d8c5c5a24b96a319afef04c26c9c /phrase-extract | |
parent | 1931bfe959b2ab0088229996a4e7f61cca61909c (diff) |
Binary hiero reordering feature. Implementation of 1 described in NIST 2012. The feature is 1 if a non-terminal is reordered with respect to other words or non-terminals, and 0 otherwise.
Diffstat (limited to 'phrase-extract')
-rw-r--r-- | phrase-extract/score.cpp | 137 |
1 files changed, 106 insertions, 31 deletions
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp index 225b5f2c9..8ce704f7d 100644 --- a/phrase-extract/score.cpp +++ b/phrase-extract/score.cpp @@ -58,6 +58,7 @@ bool unalignedFlag = false; bool unalignedFWFlag = false; bool outputNTLengths = false; bool singletonFeature = false; +bool crossedNonTerm = false; int countOfCounts[COC_MAX+1]; int totalDistinct = 0; float minCountHierarchical = 0; @@ -71,13 +72,13 @@ vector<string> tokenize( const char [] ); void writeCountOfCounts( const string &fileNameCountOfCounts ); void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton); -PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair ); +const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair ); void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton ); -double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * ); -double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * ); +double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & ); +double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & ); set<string> functionWordList; void loadFunctionWords( const string &fileNameFunctionWords ); -double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * ); +double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & ); void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs , map<size_t, map<size_t, float> > &sourceProb , map<size_t, map<size_t, float> > &targetProb); @@ -90,7 +91,7 @@ int main(int argc, char* argv[]) << "scoring methods for extracted rules\n"; if (argc < 4) { - cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] 
[--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] \n"; + cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n"; exit(1); } string fileNameExtract = argv[1]; @@ -156,6 +157,9 @@ int main(int argc, char* argv[]) } else if (strcmp(argv[i],"--Singleton") == 0) { singletonFeature = true; cerr << "binary singleton feature\n"; + } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) { + crossedNonTerm = true; + cerr << "crossed non-term reordering feature\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); @@ -243,12 +247,12 @@ int main(int argc, char* argv[]) processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton ); phrasePairsWithSameF.clear(); - isSingleton = true; + isSingleton = false; lastPhrasePair = NULL; } else { - isSingleton = false; + isSingleton = true; } // add phrase pairs to list, it's now the last one @@ -336,7 +340,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT } -PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair ) +const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair ) { float bestAlignmentCount = -1; PhraseAlignment* bestAlignment; @@ -357,7 +361,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair ) } } - return bestAlignment; + return *bestAlignment; } @@ -448,11 +452,73 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t, } +bool calcCrossedNonTerm(int sourcePos, int 
targetPos, const std::vector< std::set<size_t> > &alignedToS) +{ + for (int currSource = 0; currSource < alignedToS.size(); ++currSource) + { + if (currSource == sourcePos) + { // skip + } + else + { + const std::set<size_t> &targetSet = alignedToS[currSource]; + std::set<size_t>::const_iterator iter; + for (iter = targetSet.begin(); iter != targetSet.end(); ++iter) + { + size_t currTarget = *iter; + + if ((currSource < sourcePos && currTarget > targetPos) + || (currSource > sourcePos && currTarget < targetPos) + ) + { + return true; + } + } + + } + } + + return false; +} + +int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment) +{ + const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS; + + for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos) + { + const std::set<size_t> &targetSet = alignedToS[sourcePos]; + cerr << "size=" << targetSet.size() << " "; + std::set<size_t>::const_iterator iter; + for (iter = targetSet.begin(); iter != targetSet.end(); ++iter) + { + size_t targetPos = *iter; + cerr << sourcePos << "-" << targetPos << " "; + } + cerr << endl; + + WORD_ID wordId = phraseS[sourcePos]; + const WORD &word = vcbS.getWord(wordId); + bool isNonTerm = isNonTerminal(word); + + if (isNonTerm) + { + assert(targetSet.size() == 1); + int targetPos = *targetSet.begin(); + bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS); + if (ret) + return 1; + } + } + + return 0; +} + void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton ) { if (phrasePair.size() == 0) return; - PhraseAlignment *bestAlignment = findBestAlignment( phrasePair ); + const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair ); // compute count float count = 0; @@ -492,17 +558,17 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo // source phrase (unless inverse) if (! 
inverseFlag) { - printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); + printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile); phraseTableFile << " ||| "; } // target phrase - printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); + printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile); phraseTableFile << " ||| "; // source phrase (if inverse) if (inverseFlag) { - printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile); + printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile); phraseTableFile << " ||| "; } @@ -525,7 +591,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo } if (singletonFeature) { - phraseTableFile << " " << (isSingleton?1:0); + phraseTableFile << " " << (isSingleton ? 1 : 0); + } + + if (crossedNonTerm && !inverseFlag) { + phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment); } // target-side PCFG score @@ -539,26 +609,31 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo if (! inverseFlag) { if (hierarchicalFlag) { // always output alignment if hiero style, but only for non-terms - assert(phraseT.size() == bestAlignment->alignedToT.size() + 1); + assert(phraseT.size() == bestAlignment.alignedToT.size() + 1); for(size_t j = 0; j < phraseT.size() - 1; j++) { if (isNonTerminal(vcbT.getWord( phraseT[j] ))) { - if (bestAlignment->alignedToT[ j ].size() != 1) { + if (bestAlignment.alignedToT[ j ].size() != 1) { cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." 
<< endl; phraseTableFile.flush(); - assert(bestAlignment->alignedToT[ j ].size() == 1); + assert(bestAlignment.alignedToT[ j ].size() == 1); } - int sourcePos = *(bestAlignment->alignedToT[ j ].begin()); + int sourcePos = *(bestAlignment.alignedToT[ j ].begin()); phraseTableFile << sourcePos << "-" << j << " "; } else if (wordAlignmentFlag) { - int sourcePos = *(bestAlignment->alignedToT[ j ].begin()); - phraseTableFile << sourcePos << "-" << j << " "; + const std::set<size_t> &sourceSet = bestAlignment.alignedToT[ j ]; + std::set<size_t>::const_iterator iter; + for (iter = sourceSet.begin(); iter != sourceSet.end(); ++iter) + { + int sourcePos = *iter; + phraseTableFile << sourcePos << "-" << j << " "; + } } } } else if (wordAlignmentFlag) { // alignment info in pb model - for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) { - const set< size_t > &aligned = bestAlignment->alignedToT[j]; + for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) { + const set< size_t > &aligned = bestAlignment.alignedToT[j]; for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) { phraseTableFile << *p << "-" << j << " "; } @@ -592,13 +667,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo phraseTableFile << endl; } -double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment ) +double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment ) { // unaligned word counter double unaligned = 1.0; // only checking target words - source words are caught when computing inverse - for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) { - const set< size_t > & srcIndices = alignment->alignedToT[ ti ]; + for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) { + const set< size_t > & srcIndices = alignment.alignedToT[ ti ]; if (srcIndices.empty()) { unaligned *= 2.718; } @@ -606,13 +681,13 @@ double computeUnalignedPenalty( const 
PHRASE &phraseS, const PHRASE &phraseT, Ph return unaligned; } -double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment ) +double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment ) { // unaligned word counter double unaligned = 1.0; // only checking target words - source words are caught when computing inverse - for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) { - const set< size_t > & srcIndices = alignment->alignedToT[ ti ]; + for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) { + const set< size_t > & srcIndices = alignment.alignedToT[ ti ]; if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) { unaligned *= 2.718; } @@ -645,14 +720,14 @@ void loadFunctionWords( const string &fileName ) inFile.close(); } -double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment ) +double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment ) { // lexical translation probability double lexScore = 1.0; int null = vcbS.getWordID("NULL"); // all target words have to be explained - for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) { - const set< size_t > & srcIndices = alignment->alignedToT[ ti ]; + for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) { + const set< size_t > & srcIndices = alignment.alignedToT[ ti ]; if (srcIndices.empty()) { // explain unaligned word by NULL lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] ); |