merge issues

author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-09-03 10:27:41 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-09-03 10:27:41 +0400
commit: 5d9859ba0e742bb5207c8ee78f50252241723cdb (patch)
tree: 7c447da9fd6acb41117238d443fcae67eeeb5d35 /phrase-extract
parent: 19ef78514693a5557bae5614c8a2cc31a77a47d3 (diff)
parent: 92b15c103fa542a19789c043d47b629d2563bad8 (diff)
8 files changed, 218 insertions, 73 deletions
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index bb2d97580..431be58b0 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -53,6 +53,7 @@ public:
   bool gzOutput;
   bool unpairedExtractFormat;
   bool conditionOnTargetLhs;
+  bool boundaryRules;
   
   RuleExtractionOptions()
     : maxSpan(10)
@@ -85,6 +86,7 @@ public:
     , gzOutput(false)
     , unpairedExtractFormat(false)
     , conditionOnTargetLhs(false)
+    , boundaryRules(false)
   {}
 };
 
diff --git a/phrase-extract/SentenceAlignment.cpp b/phrase-extract/SentenceAlignment.cpp
index 8e44bddc4..af1cfa953 100644
--- a/phrase-extract/SentenceAlignment.cpp
+++ b/phrase-extract/SentenceAlignment.cpp
@@ -25,33 +25,45 @@
 
 #include "tables-core.h"
 
+using namespace std;
+
 namespace MosesTraining
 {
 
 SentenceAlignment::~SentenceAlignment() {}
 
-bool SentenceAlignment::processTargetSentence(const char * targetString, int)
+void addBoundaryWords(vector<string> &phrase) // wraps phrase in sentence-boundary tokens, in place
+{
+  phrase.push_back("</s>");              // end-of-sentence marker becomes the last token
+  phrase.insert(phrase.begin(), "<s>");  // start-of-sentence marker becomes the first token
+}
+
+bool SentenceAlignment::processTargetSentence(const char * targetString, int, bool boundaryRules) // tokenizes targetString into the `target` member; the int argument is unused here; always returns true
 {
   target = tokenize(targetString);
+  if (boundaryRules) // when requested, surround the tokens with <s> ... </s>
+    addBoundaryWords(target);
   return true;
 }
 
-bool SentenceAlignment::processSourceSentence(const char * sourceString, int)
+bool SentenceAlignment::processSourceSentence(const char * sourceString, int, bool boundaryRules) // tokenizes sourceString into the `source` member; the int argument is unused here; always returns true
 {
   source = tokenize(sourceString);
+  if (boundaryRules) // when requested, surround the tokens with <s> ... </s>
+    addBoundaryWords(source);
   return true;
 }
 
-bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID)
+bool SentenceAlignment::create( char targetString[], char sourceString[], char alignmentString[], int sentenceID, bool boundaryRules)
 {
   using namespace std;
   this->sentenceID = sentenceID;
 
   // process sentence strings and store in target and source members.
-  if (!processTargetSentence(targetString, sentenceID)) {
+  if (!processTargetSentence(targetString, sentenceID, boundaryRules)) {
     return false;
   }
-  if (!processSourceSentence(sourceString, sentenceID)) {
+  if (!processSourceSentence(sourceString, sentenceID, boundaryRules)) {
     return false;
   }
 
@@ -81,6 +93,12 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
       cerr << "T: " << targetString << endl << "S: " << sourceString << endl;
       return false;
     }
+    
+    if (boundaryRules) {
+      ++s;
+      ++t;
+    }
+    
     // cout << "alignmentSequence[i] " << alignmentSequence[i] << " is " << s << ", " << t << endl;
     if ((size_t)t >= target.size() || (size_t)s >= source.size()) {
       cerr << "WARNING: sentence " << sentenceID << " has alignment point (" << s << ", " << t << ") out of bounds (" << source.size() << ", " << target.size() << ")\n";
@@ -90,6 +108,16 @@ bool SentenceAlignment::create( char targetString[], char sourceString[], char a
     alignedToT[t].push_back( s );
     alignedCountS[s]++;
   }
+  
+  if (boundaryRules) {
+    alignedToT[0].push_back(0);
+    alignedCountS[0]++;
+    
+    alignedToT.back().push_back(alignedCountS.size() - 1);
+    alignedCountS.back()++;
+    
+  }
+  
   return true;
 }
 
diff --git a/phrase-extract/SentenceAlignment.h b/phrase-extract/SentenceAlignment.h
index b1fb5933a..7c2988780 100644
--- a/phrase-extract/SentenceAlignment.h
+++ b/phrase-extract/SentenceAlignment.h
@@ -38,12 +38,13 @@ public:
 
   virtual ~SentenceAlignment();
 
-  virtual bool processTargetSentence(const char *, int);
+  virtual bool processTargetSentence(const char *, int, bool boundaryRules);
 
-  virtual bool processSourceSentence(const char *, int);
+  virtual bool processSourceSentence(const char *, int, bool boundaryRules);
 
   bool create(char targetString[], char sourceString[],
-              char alignmentString[], int sentenceID);
+              char alignmentString[], int sentenceID, bool boundaryRules);
+  
 };
 
 }
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.cpp b/phrase-extract/SentenceAlignmentWithSyntax.cpp
index 83a048757..5d866edfb 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.cpp
+++ b/phrase-extract/SentenceAlignmentWithSyntax.cpp
@@ -32,10 +32,10 @@ using namespace std;
 namespace MosesTraining
 {
 
-bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetString, int sentenceID, bool boundaryRules)
 {
   if (!m_options.targetSyntax) {
-    return SentenceAlignment::processTargetSentence(targetString, sentenceID);
+    return SentenceAlignment::processTargetSentence(targetString, sentenceID, boundaryRules);
   }
 
   string targetStringCPP(targetString);
@@ -52,10 +52,10 @@ bool SentenceAlignmentWithSyntax::processTargetSentence(const char * targetStrin
   return true;
 }
 
-bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID)
+bool SentenceAlignmentWithSyntax::processSourceSentence(const char * sourceString, int sentenceID, bool boundaryRules)
 {
   if (!m_options.sourceSyntax) {
-    return SentenceAlignment::processSourceSentence(sourceString, sentenceID);
+    return SentenceAlignment::processSourceSentence(sourceString, sentenceID, boundaryRules);
   }
 
   string sourceStringCPP(sourceString);
diff --git a/phrase-extract/SentenceAlignmentWithSyntax.h b/phrase-extract/SentenceAlignmentWithSyntax.h
index 38fa77907..28eef57b7 100644
--- a/phrase-extract/SentenceAlignmentWithSyntax.h
+++ b/phrase-extract/SentenceAlignmentWithSyntax.h
@@ -59,10 +59,10 @@ public:
   virtual ~SentenceAlignmentWithSyntax() {}
 
   bool
-  processTargetSentence(const char *, int);
+  processTargetSentence(const char *, int, bool boundaryRules);
 
   bool
-  processSourceSentence(const char *, int);
+  processSourceSentence(const char *, int, bool boundaryRules);
 };
 
 }
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index f031df8e4..52a141917 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -72,20 +72,20 @@ private:
   void writeRulesToFile();
   
   // subs
-  void addRule( int, int, int, int, RuleExist &ruleExist);
+  void addRule( int, int, int, int, int, RuleExist &ruleExist);
   void addHieroRule( int startT, int endT, int startS, int endS
                     , RuleExist &ruleExist, const HoleCollection &holeColl, int numHoles, int initStartF, int wordCountT, int wordCountS);
   void printHieroPhrase( int startT, int endT, int startS, int endS
-                        , HoleCollection &holeColl, LabelIndex &labelIndex);
+                        , HoleCollection &holeColl, LabelIndex &labelIndex, int countS);
   string printTargetHieroPhrase(  int startT, int endT, int startS, int endS
-                                , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore);
+                                , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore, int countS);
   string printSourceHieroPhrase( int startT, int endT, int startS, int endS
                                 , HoleCollection &holeColl, const LabelIndex &labelIndex);
   void preprocessSourceHieroPhrase( int startT, int endT, int startS, int endS
                                    , WordIndex &indexS, HoleCollection &holeColl, const LabelIndex &labelIndex);
   void printHieroAlignment(  int startT, int endT, int startS, int endS
                            , const WordIndex &indexS, const WordIndex &indexT, HoleCollection &holeColl, ExtractedRule &rule);
-  void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl);
+  void printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS);
   
   inline string IntToString( int i )
   {
@@ -138,7 +138,9 @@ int main(int argc, char* argv[])
          << " | --SourceSyntax | --TargetSyntax"
          << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource |  --NoNonTermFirstWord | --NoFractionalCounting"
          << " | --UnpairedExtractFormat"
-         << " | --ConditionOnTargetLHS ]\n";
+         << " | --ConditionOnTargetLHS ]"
+        << " | --BoundaryRules[" << options.boundaryRules << "]";
+    
     exit(1);
   }
   char* &fileNameT = argv[1];
@@ -263,18 +265,18 @@ int main(int argc, char* argv[])
       options.unpairedExtractFormat = true;
     } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
       options.conditionOnTargetLhs = true;
-#ifdef WITH_THREADS
     } else if (strcmp(argv[i],"-threads") == 0 || 
                strcmp(argv[i],"--threads") == 0 ||
                strcmp(argv[i],"--Threads") == 0) {
       thread_count = atoi(argv[++i]);
-#endif
     } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
       if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
         cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
         exit(1);
       }
       sentenceOffset = atoi(argv[++i]);
+    } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
+      options.boundaryRules = true;
     } else {
       cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
       exit(1);
@@ -330,7 +332,7 @@ int main(int argc, char* argv[])
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
 
-    if (sentence.create(targetString, sourceString, alignmentString, i)) {
+    if (sentence.create(targetString, sourceString, alignmentString, i, options.boundaryRules)) {
       if (options.unknownWordLabelFlag) {
         collectWordLabelCounts(sentence);
       }
@@ -442,7 +444,7 @@ void ExtractTask::extractRules()
 
           // if within length limits, add as fully-lexical phrase pair
           if (endT-startT < m_options.maxSymbolsTarget && endS-startS < m_options.maxSymbolsSource) {
-            addRule(startT,endT,startS,endS, ruleExist);
+            addRule(startT,endT,startS,endS, countS, ruleExist);
           }
 
           // take note that this is a valid phrase alignment
@@ -502,7 +504,8 @@ void ExtractTask::preprocessSourceHieroPhrase( int startT, int endT, int startS,
 }
 
 string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, int endS
-                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore)
+                              , WordIndex &indexT, HoleCollection &holeColl, const LabelIndex &labelIndex, double &logPCFGScore
+                              , int countS)
 {
   HoleList::iterator iterHoleList = holeColl.GetHoles().begin();
   assert(iterHoleList != holeColl.GetHoles().end());
@@ -524,8 +527,15 @@ string ExtractTask::printTargetHieroPhrase( int startT, int endT, int startS, in
       assert(sourceLabel != "");
 
       int labelI = labelIndex[ 2+holeCount ];
-      string targetLabel = m_options.targetSyntax ?
-                           m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[ labelI ]->GetLabel() : "X";
+      string targetLabel;
+      if (m_options.targetSyntax) {
+        targetLabel = m_sentence.targetTree.GetNodes(currPos,hole.GetEnd(1))[labelI]->GetLabel();
+      } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+         targetLabel = "S"; 
+      } else {
+        targetLabel = "X";
+      }
+      
       hole.SetLabel(targetLabel, 1);
 
       if (m_options.unpairedExtractFormat) {
@@ -639,15 +649,22 @@ void ExtractTask::printHieroAlignment( int startT, int endT, int startS, int end
 }
 
 void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
-                       , HoleCollection &holeColl, LabelIndex &labelIndex)
+                       , HoleCollection &holeColl, LabelIndex &labelIndex, int countS)
 {
   WordIndex indexS, indexT; // to keep track of word positions in rule
 
   ExtractedRule rule( startT, endT, startS, endS );
 
   // phrase labels
-  string targetLabel = m_options.targetSyntax ?
-                       m_sentence.targetTree.GetNodes(startT,endT)[ labelIndex[0] ]->GetLabel() : "X";
+  string targetLabel;
+  if (m_options.targetSyntax) {
+    targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0] ]->GetLabel();
+  } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+    targetLabel = "S";
+  } else {
+    targetLabel = "X";
+  }
+
   string sourceLabel = m_options.sourceSyntax ?
                        m_sentence.sourceTree.GetNodes(startS,endS)[ labelIndex[1] ]->GetLabel() : "X";
 
@@ -657,12 +674,12 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
   // target
   if (m_options.pcfgScore) {
     double logPCFGScore = m_sentence.targetTree.GetNodes(startT,endT)[labelIndex[0]]->GetPcfgScore();
-    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
                 + " [" + targetLabel + "]";
     rule.pcfgScore = std::exp(logPCFGScore);
   } else {
     double logPCFGScore = 0.0f;
-    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore)
+    rule.target = printTargetHieroPhrase(startT, endT, startS, endS, indexT, holeColl, labelIndex, logPCFGScore, countS)
                 + " [" + targetLabel + "]";
   }
 
@@ -680,7 +697,7 @@ void ExtractTask::printHieroPhrase( int startT, int endT, int startS, int endS
   addRuleToCollection( rule );
 }
 
-void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl)
+void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int endS, HoleCollection &holeColl, int countS)
 {
   LabelIndex labelIndex,labelCount;
 
@@ -715,7 +732,7 @@ void ExtractTask::printAllHieroPhrases( int startT, int endT, int startS, int en
   // loop through the holes
   bool done = false;
   while(!done) {
-    printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex );
+    printHieroPhrase( startT, endT, startS, endS, holeColl, labelIndex, countS );
     for(unsigned int i=0; i<labelIndex.size(); i++) {
       labelIndex[i]++;
       if(labelIndex[i] == labelCount[i]) {
@@ -843,7 +860,7 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
 
         // passed all checks...
         if (allowablePhrase)
-          printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl);
+          printAllHieroPhrases(startT, endT, startS, endS, copyHoleColl, wordCountS);
 
         // recursively search for next hole
         int nextInitStartT = m_options.nonTermConsecTarget ? endHoleT + 1 : endHoleT + 2;
@@ -855,10 +872,15 @@ void ExtractTask::addHieroRule( int startT, int endT, int startS, int endS
   }
 }
 
-void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist &ruleExist)
+void ExtractTask::addRule( int startT, int endT, int startS, int endS, int countS, RuleExist &ruleExist)
 {
-  // source
-
+  // contains only <s> or </s>. Don't output
+  if (m_options.boundaryRules 
+      && (   (startS == 0         && endS == 0) 
+          || (startS == countS-1  && endS == countS-1))) {
+    return;
+  }
+  
   if (m_options.onlyOutputSpanInfo) {
     cout << startS << " " << endS << " " << startT << " " << endT << endl;
     return;
@@ -874,8 +896,14 @@ void ExtractTask::addRule( int startT, int endT, int startS, int endS, RuleExist
   else {
     sourceLabel = m_options.sourceSyntax ?
                   m_sentence.sourceTree.GetNodes(startS,endS)[0]->GetLabel() : "X";
-    targetLabel = m_options.targetSyntax ?
-                  m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel() : "X";
+    
+    if (m_options.targetSyntax) {
+      targetLabel = m_sentence.targetTree.GetNodes(startT,endT)[0]->GetLabel();
+    } else if (m_options.boundaryRules && (startS == 0 || endS == countS - 1)) {
+      targetLabel = "S";
+    } else {
+      targetLabel = "X";
+    }
   }
 
   // source
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 58eb4b2f3..6a1ee77ab 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -261,7 +261,7 @@ int main(int argc, char* argv[])
       cout << "LOG: ALT: " << alignmentString << endl;
       cout << "LOG: PHRASES_BEGIN:" << endl;
     }
-	if (sentence.create( englishString, foreignString, alignmentString, i)) {
+	if (sentence.create( englishString, foreignString, alignmentString, i, false)) {
    	ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFile , extractFileInv, extractFileOrientation, extractFileSentenceId);
       task->Run();
       delete task;
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index 8348a44bc..4de7acc0f 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -59,6 +59,8 @@ bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
 bool outputNTLengths = false;
+bool singletonFeature = false;
+bool crossedNonTerm = false;
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
@@ -77,14 +79,14 @@ Vocabulary vcbS;
 vector<string> tokenize( const char [] );
 
 void writeCountOfCounts( const string &fileNameCountOfCounts );
-void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile);
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair );
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile );
-double computeLexicalTranslation( const PHRASE &, const PHRASE &, PhraseAlignment * );
-double computeUnalignedPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+void processPhrasePairs( vector< PhraseAlignment > & , ostream &phraseTableFile, bool isSingleton);
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair );
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float, int, ostream &phraseTableFile, bool isSingleton );
+double computeLexicalTranslation( const PHRASE &, const PHRASE &, const PhraseAlignment & );
+double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 set<string> functionWordList;
 void loadFunctionWords( const string &fileNameFunctionWords );
-double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, PhraseAlignment * );
+double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
 void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
                       , map<size_t, map<size_t, float> > &sourceProb
                       , map<size_t, map<size_t, float> > &targetProb);
@@ -97,7 +99,7 @@ int main(int argc, char* argv[])
        << "scoring methods for extracted rules\n";
 
   if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]\n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n";
     exit(1);
   }
   string fileNameExtract = argv[1];
@@ -177,6 +179,12 @@ int main(int argc, char* argv[])
       minCountHierarchical -= 0.00001; // account for rounding
     } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
       outputNTLengths = true;
+    } else if (strcmp(argv[i],"--Singleton") == 0) {
+      singletonFeature = true;
+      cerr << "binary singleton feature\n";
+    } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
+      crossedNonTerm = true;
+      cerr << "crossed non-term reordering feature\n";
     } else {
       cerr << "ERROR: unknown option " << argv[i] << endl;
       exit(1);
@@ -238,6 +246,7 @@ int main(int argc, char* argv[])
   float lastCount = 0.0f;
   float lastPcfgSum = 0.0f;
   vector< PhraseAlignment > phrasePairsWithSameF;
+  bool isSingleton = true;
   int i=0;
   char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
   lastLine[0] = '\0';
@@ -272,16 +281,22 @@ int main(int argc, char* argv[])
     // if new source phrase, process last batch
     if (lastPhrasePair != NULL &&
         lastPhrasePair->GetSource() != phrasePair.GetSource()) {
-      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
+      
       phrasePairsWithSameF.clear();
+      isSingleton = false;
       lastPhrasePair = NULL;
     }
+    else
+    {
+      isSingleton = true;
+    }
 
     // add phrase pairs to list, it's now the last one
     phrasePairsWithSameF.push_back( phrasePair );
     lastPhrasePair = &phrasePairsWithSameF.back();
   }
-  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
+  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
 	
 	phraseTableFile->flush();
 	if (phraseTableFile != &cout) {
@@ -315,7 +330,7 @@ void writeCountOfCounts( const string &fileNameCountOfCounts )
 	countOfCountsFile.Close();
 }
 
-void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile )
+void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseTableFile, bool isSingleton )
 {
   if (phrasePair.size() == 0) return;
 
@@ -356,12 +371,12 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
   for(iter = sortedColl.begin(); iter != sortedColl.end(); ++iter) 
   {
     const PhraseAlignmentCollection &group = **iter;
-    outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );
+    outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile, isSingleton );
   }
   
 }
 
-PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
+const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrasePair )
 {
   float bestAlignmentCount = -1;
   PhraseAlignment* bestAlignment = NULL;
@@ -382,7 +397,7 @@ PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
     }
   }    
 
-  return bestAlignment;
+  return *bestAlignment;
 }
 
 
@@ -473,11 +488,65 @@ void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t,
 
 }
 
-void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile )
+// Returns true if the alignment point (sourcePos, targetPos) is crossed by
+// any other alignment point in alignedToS, i.e. some other source position
+// lies on one side of sourcePos while one of its aligned target positions
+// lies on the opposite side of targetPos.
+// alignedToS[i] holds the target positions aligned to source position i.
+bool calcCrossedNonTerm(int sourcePos, int targetPos, const std::vector< std::set<size_t> > &alignedToS)
+{
+  for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource)
+  {
+    // cast to int so every position comparison is signed (no int/size_t mixing)
+    const int currS = static_cast<int>(currSource);
+    if (currS == sourcePos)
+      continue; // a point cannot cross itself
+
+    const std::set<size_t> &targetSet = alignedToS[currSource];
+    std::set<size_t>::const_iterator iter;
+    for (iter = targetSet.begin(); iter != targetSet.end(); ++iter)
+    {
+      const int currT = static_cast<int>(*iter);
+      const bool before = currS < sourcePos;
+      // crossing: opposite relative order on the source and target sides
+      if ((before && currT > targetPos) || (!before && currT < targetPos))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+// Crossed-non-terminal feature: returns 1 if any non-terminal in phraseS
+// has an alignment point that is crossed by another alignment point of
+// bestAlignment, and 0 otherwise.
+int calcCrossedNonTerm(const PHRASE &phraseS, const PhraseAlignment &bestAlignment)
+{
+  const std::vector< std::set<size_t> > &alignedToS = bestAlignment.alignedToS;
+
+  for (size_t sourcePos = 0; sourcePos < alignedToS.size(); ++sourcePos)
+  {
+    // only non-terminals are of interest; every other symbol is skipped
+    if (!isNonTerminal(vcbS.getWord(phraseS[sourcePos])))
+      continue;
+
+    const std::set<size_t> &targetSet = alignedToS[sourcePos];
+    assert(targetSet.size() == 1);
+    if (targetSet.empty())
+      continue; // assert is compiled out under NDEBUG: never dereference begin() of an empty set
+
+    if (calcCrossedNonTerm(static_cast<int>(sourcePos), static_cast<int>(*targetSet.begin()), alignedToS))
+      return 1;
+  }
+
+  return 0;
+}
+
+void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCount, int distinctCount, ostream &phraseTableFile, bool isSingleton )
 {
   if (phrasePair.size() == 0) return;
 
-  PhraseAlignment *bestAlignment = findBestAlignment( phrasePair );
+  const PhraseAlignment &bestAlignment = findBestAlignment( phrasePair );
     
   // compute count
   float count = 0;
@@ -529,17 +598,17 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
 
   // source phrase (unless inverse)
   if (! inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
   // target phrase
-  printTargetPhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+  printTargetPhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
   phraseTableFile << " ||| ";
 
   // source phrase (if inverse)
   if (inverseFlag) {
-    printSourcePhrase(phraseS, phraseT, *bestAlignment, phraseTableFile);
+    printSourcePhrase(phraseS, phraseT, bestAlignment, phraseTableFile);
     phraseTableFile << " ||| ";
   }
 
@@ -561,6 +630,14 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
     phraseTableFile << " " << maybeLogProb( penalty );
   }
 
+  if (singletonFeature) {
+    phraseTableFile << " " << (isSingleton ? 1 : 0);
+  }
+  
+  if (crossedNonTerm && !inverseFlag) {
+    phraseTableFile << " " << calcCrossedNonTerm(phraseS, bestAlignment);
+  }
+  
   // target-side PCFG score
   if (pcfgFlag && !inverseFlag) {
     phraseTableFile << " " << maybeLogProb( pcfgScore );
@@ -632,22 +709,31 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   if (! inverseFlag) {
     if (hierarchicalFlag) {
       // always output alignment if hiero style, but only for non-terms
-      assert(phraseT.size() == bestAlignment->alignedToT.size() + 1);
+      assert(phraseT.size() == bestAlignment.alignedToT.size() + 1);
       for(size_t j = 0; j < phraseT.size() - 1; j++) {
         if (isNonTerminal(vcbT.getWord( phraseT[j] ))) {
-          if (bestAlignment->alignedToT[ j ].size() != 1) {
+          if (bestAlignment.alignedToT[ j ].size() != 1) {
             cerr << "Error: unequal numbers of non-terminals. Make sure the text does not contain words in square brackets (like [xxx])." << endl;
             phraseTableFile.flush();
-            assert(bestAlignment->alignedToT[ j ].size() == 1);
+            assert(bestAlignment.alignedToT[ j ].size() == 1);
           }
-          int sourcePos = *(bestAlignment->alignedToT[ j ].begin());
+          int sourcePos = *(bestAlignment.alignedToT[ j ].begin());
           phraseTableFile << sourcePos << "-" << j << " ";
         }
+        else if (wordAlignmentFlag) {
+          const std::set<size_t> &sourceSet = bestAlignment.alignedToT[ j ];
+          std::set<size_t>::const_iterator iter;
+          for (iter = sourceSet.begin(); iter != sourceSet.end(); ++iter)
+          {
+            int sourcePos = *iter;
+            phraseTableFile << sourcePos << "-" << j << " ";            
+          }
+        }
       }
     } else if (wordAlignmentFlag) {
       // alignment info in pb model
-      for(size_t j=0; j<bestAlignment->alignedToT.size(); j++) {
-        const set< size_t > &aligned = bestAlignment->alignedToT[j];
+      for(size_t j=0; j<bestAlignment.alignedToT.size(); j++) {
+        const set< size_t > &aligned = bestAlignment.alignedToT[j];
         for (set< size_t >::const_iterator p(aligned.begin()); p != aligned.end(); ++p) {
           phraseTableFile << *p << "-" << j << " ";
         }
@@ -681,13 +767,13 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   phraseTableFile << endl;
 }
 
-double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty()) {
       unaligned *= 2.718;
     }
@@ -695,13 +781,13 @@ double computeUnalignedPenalty( const PHRASE &phraseS, const PHRASE &phraseT, Ph
   return unaligned;
 }
 
-double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeUnalignedFWPenalty( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // unaligned word counter
   double unaligned = 1.0;
   // only checking target words - source words are caught when computing inverse
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty() && functionWordList.find( vcbT.getWord( phraseT[ ti ] ) ) != functionWordList.end()) {
       unaligned *= 2.718;
     }
@@ -734,14 +820,14 @@ void loadFunctionWords( const string &fileName )
   inFile.close();
 }
 
-double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, PhraseAlignment *alignment )
+double computeLexicalTranslation( const PHRASE &phraseS, const PHRASE &phraseT, const PhraseAlignment &alignment )
 {
   // lexical translation probability
   double lexScore = 1.0;
   int null = vcbS.getWordID("NULL");
   // all target words have to be explained
-  for(size_t ti=0; ti<alignment->alignedToT.size(); ti++) {
-    const set< size_t > & srcIndices = alignment->alignedToT[ ti ];
+  for(size_t ti=0; ti<alignment.alignedToT.size(); ti++) {
+    const set< size_t > & srcIndices = alignment.alignedToT[ ti ];
     if (srcIndices.empty()) {
       // explain unaligned word by NULL
       lexScore *= lexTable.permissiveLookup( null, phraseT[ ti ] );
author	phikoehn <pkoehn@inf.ed.ac.uk>	2012-09-03 10:27:41 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2012-09-03 10:27:41 +0400
commit	5d9859ba0e742bb5207c8ee78f50252241723cdb (patch)
tree	7c447da9fd6acb41117238d443fcae67eeeb5d35 /phrase-extract
parent	19ef78514693a5557bae5614c8a2cc31a77a47d3 (diff)
parent	92b15c103fa542a19789c043d47b629d2563bad8 (diff)