Binary hiero reordering feature. Implementation of one described in NIST 2012. 1 if a non-terminal is reordered with respect to other words or non-terminals, 0 otherwise.

author: Hieu Hoang <fishandfrolick@gmail.com> 2012-08-25 03:47:57 +0400
committer: Hieu Hoang <fishandfrolick@gmail.com> 2012-08-25 03:47:57 +0400
commit: 33c03edfbbb2c80fd1c02e24ef56357b92546c08 (patch)
tree: 898aed945ba3d8c5c5a24b96a319afef04c26c9c /phrase-extract
parent: 1931bfe959b2ab0088229996a4e7f61cca61909c (diff)
1 files changed, 106 insertions, 31 deletions
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index 225b5f2c9..8ce704f7d 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -58,6 +58,7 @@ bool unalignedFlag = false;
 bool unalignedFWFlag = false;
 bool outputNTLengths = false;
 bool singletonFeature = false;
+bool crossedNonTerm = false; // set to true by --CrossedNonTerm; when set (and inverseFlag is not), outputPhrasePair appends the calcCrossedNonTerm() value to each rule
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
@@ -71,13 +72,13 @@ vector<string> tokenize( const char [] );
 
 void writeCountOfCounts( const string &fileNameCountOfCounts );
 void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
 void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 set<string> functionWordList;
 void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
                       , map<size_t, map<size_t, float> > &sourceProb
                       , map<size_t, map<size_t, float> > &targetProb);
@@ -90,7 +91,7 @@ int main(int argc, char* argv[])
        << "scoring methods for extracted rules\n";
 
   if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] \n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
     exit(1);
   }
   string fileNameExtract = argv[1];
@@ -156,6 +157,9 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--Singleton") == 0) {
       singletonFeature = true;
       cerr << "binary singleton feature\n";
+    } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
+      crossedNonTerm = true;
+      cerr << "crossed non-term reordering feature\n";
     } else {
       cerr << "ERROR: unknown option " << argv[i] << endl;
       exit(1);
@@ -243,12 +247,12 @@ int main(int argc, char* argv[])
       processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
       
       phrasePairsWithSameF.clear();
-      isSingleton = true;
+      isSingleton = false;
       lastPhrasePair = NULL;
     }
     else
     {
-      isSingleton = false;
+      isSingleton = true;
     }
 
     // add phrase pairs to list, it's now the last one
@@ -336,7 +340,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
   
 }
 
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
 {
   float bestAlignmentCount = -1;
   PhraseAlignment* bestAlignment;
@@ -357,7 +361,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
     }
   }    
 
-  return bestAlignment;
+  return *bestAlignment;
 }
 
 
@@ -448,11 +452,73 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
 
 }
 
+bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS)
+{ // Returns true if an alignment point belonging to any other source position crosses the link sourcePos-targetPos; false otherwise.
+  for (int currSource = 0; currSource < alignedToS.size(); ++currSource) // NOTE(review): int index compared against size_t size()
+  {
+    if (currSource == sourcePos)
+    { // skip: the queried source position is never compared against its own links
+    }
+    else 
+    {
+      const std::set<size_t> &targetSet = alignedToS[currSource]; // positions aligned to currSource
+      std::set<size_t>::const_iterator iter;
+      for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+      {
+        size_t currTarget = *iter;
+        
+        if ((currSource < sourcePos && currTarget > targetPos) // earlier source position linked to a later target position
+            || (currSource > sourcePos && currTarget < targetPos) // later source position linked to an earlier target position
+          ) // comparisons are strict, so a link sharing targetPos is not a crossing; NOTE(review): size_t vs int comparison, assumes targetPos >= 0
+        {
+          return true;
+        }
+      }
+      
+    }
+  }
+  
+  return false; // no other link crosses sourcePos-targetPos
+}
+
+int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
+{ // Returns 1 if any non-terminal in phraseS has its alignment link crossed by another link of bestAlignment.alignedToS, 0 otherwise.
+  const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
+  
+  for (int sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos) // NOTE(review): int index compared against size_t size()
+  {
+    const std::set<size_t> &targetSet = alignedToS[sourcePos];
+    cerr << "size=" << targetSet.size() << " "; // NOTE(review): writes to cerr on every call - looks like leftover debug output, confirm it is wanted
+    std::set<size_t>::const_iterator iter;
+    for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+    {
+      size_t targetPos = *iter;
+      cerr << sourcePos << "-" << targetPos << " "; // prints each link of this source position as "source-target"
+    }
+    cerr << endl;
+    
+    WORD_ID wordId = phraseS[sourcePos]; // NOTE(review): assumes phraseS has at least alignedToS.size() entries - confirm against caller
+    const WORD &word = vcbS.getWord(wordId);
+    bool isNonTerm = isNonTerminal(word);
+    
+    if (isNonTerm) // only non-terminals are tested for crossing; other words are skipped
+    {
+      assert(targetSet.size() == 1); // a non-terminal is expected to have exactly one link; if asserts are compiled out, an empty set would dereference end() below
+      int targetPos = *targetSet.begin(); // size_t narrowed to int to match the helper's parameter type
+      bool ret = calcCrossedNonTerm(sourcePos, targetPos, alignedToS);
+      if (ret)
+        return 1; // stop at the first crossed non-terminal
+    }
+  }
+  
+  return 0; // no non-terminal had a crossing link
+}
+
 void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
 {
   if (phrasePair.size() == 0) return;
 
-  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+  const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
     
   // compute count
   float count = 0;
@@ -492,17 +558,17 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
 
   // source phrase (unless inverse)
   if (! inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
   // target phrase
-  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+  printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
   phraseTableFile << " ||| ";
 
   // source phrase (if inverse)
   if (inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
@@ -525,7 +591,11 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   }
 
   if (singletonFeature) {
-    phraseTableFile << " " << (isSingleton?1:0);
+    phraseTableFile << " " << (isSingleton ? 1 : 0);
+  }
+  
+  if (crossedNonTerm && !inverseFlag) {
+    phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
   }
   
   // target-side PCFG score
@@ -539,26 +609,31 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   if (! inverseFlag) {
     if (hierarchicalFlag) {
       // always output alignment if hiero style, but only for non-terms
-      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+      assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
       for(size_t j = 0; j < phraseT.size() - 1; j++) {
         if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
-          if (bestAlignment->alignedToT[ j ].size() != 1) {
+          if (bestAlignment.alignedToT[ j ].size() != 1) {
             cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
             phraseTableFile.flush();
-            assert(bestAlignment->alignedToT[ j ].size() == 1);
+            assert(bestAlignment.alignedToT[ j ].size() == 1);
           }
-          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+          int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
           phraseTableFile << sourcePos << "-" << j << " ";
         }
         else if (wordAlignmentFlag) {
-          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
-          phraseTableFile << sourcePos << "-" << j << " ";
+          const std::set<size_t> &sourceSet = bestAlignment.alignedToT[ j ];
+          std::set<size_t>::const_iterator iter;
+          for (iter = sourceSet.begin(); iter != sourceSet.end(); ++iter)
+          {
+            int sourcePos = *iter;
+            phraseTableFile << sourcePos << "-" << j << " ";            
+          }
         }
       }
     } else if (wordAlignmentFlag) {
       // alignment info in pb model
-      for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
-        const set< size_t > &aligned = bestAlignment->alignedToT[j];
+      for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
+        const set< size_t > &aligned = bestAlignment.alignedToT[j];
         for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
           phraseTableFile << *p << "-" << j << " ";
         }
@@ -592,13 +667,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   phraseTableFile << endl;
 }
 
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty()) {
       unaligned *= 2.718;
     }
@@ -606,13 +681,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
   return unaligned;
 }
 
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
       unaligned *= 2.718;
     }
@@ -645,14 +720,14 @@ void loadFunctionWords( const string &fileName )
   inFile.close();
 }
 
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // lexical translation probability
   double lexScore = 1.0;
   int null = vcbS.getWordID("NULL");
   // all target words have to be explained
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty()) {
       // explain unaligned word by NULL
       lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
author	Hieu Hoang <fishandfrolick@gmail.com>	2012-08-25 03:47:57 +0400
committer	Hieu Hoang <fishandfrolick@gmail.com>	2012-08-25 03:47:57 +0400
commit	33c03edfbbb2c80fd1c02e24ef56357b92546c08 (patch)
tree	898aed945ba3d8c5c5a24b96a319afef04c26c9c /phrase-extract
parent	1931bfe959b2ab0088229996a4e7f61cca61909c (diff)