a lot of changes

author: phikoehn <pkoehn@inf.ed.ac.uk> 2012-08-19 02:47:05 +0400
committer: phikoehn <pkoehn@inf.ed.ac.uk> 2012-08-19 02:47:05 +0400
commit: 366ab93f8aa53b7b065fe8366201bd59dafc51ba (patch)
tree: da32d2aaaa424d633ad55574dac4257067fd73ab /phrase-extract
parent: 388f5a2046d4c45f3b311bbe133be02c3fa560d8 (diff)
7 files changed, 218 insertions, 35 deletions
diff --git a/phrase-extract/Jamfile b/phrase-extract/Jamfile
index d834674b8..50f03a739 100644
--- a/phrase-extract/Jamfile
+++ b/phrase-extract/Jamfile
@@ -2,6 +2,7 @@ obj InputFileStream.o : InputFileStream.cpp : <include>. ;
 alias InputFileStream : InputFileStream.o ..//z ;
 
 obj tables-core.o : tables-core.cpp : <include>. ;
+obj domain.o : domain.cpp : <include>. ;
 obj AlignmentPhrase.o : AlignmentPhrase.cpp : <include>. ;
 obj SentenceAlignment.o : SentenceAlignment.cpp : <include>. ;
 obj SyntaxTree.o : SyntaxTree.cpp : <include>. ;
@@ -16,7 +17,7 @@ exe extract-rules : tables-core.o SentenceAlignment.o SyntaxTree.o XmlTree.o Sen
 
 exe extract-lex : extract-lex.cpp InputFileStream ;
 
-exe score : tables-core.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
+exe score : tables-core.o domain.o AlignmentPhrase.o score.cpp PhraseAlignment.cpp OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
 
 exe consolidate : consolidate.cpp tables-core.o OutputFileStream.cpp InputFileStream ..//boost_iostreams ;
 
@@ -24,7 +25,7 @@ exe consolidate-direct : consolidate-direct.cpp OutputFileStream.cpp InputFileSt
 
 exe consolidate-reverse :  consolidate-reverse.cpp tables-core.o InputFileStream ;
 
-exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp ;
+exe relax-parse : tables-core.o SyntaxTree.o XmlTree.o relax-parse.cpp InputFileStream ;
 
 exe statistics : tables-core.o AlignmentPhrase.o statistics.cpp InputFileStream ;
 
diff --git a/phrase-extract/PhraseAlignment.cpp b/phrase-extract/PhraseAlignment.cpp
index e432294b9..bdfead082 100644
--- a/phrase-extract/PhraseAlignment.cpp
+++ b/phrase-extract/PhraseAlignment.cpp
@@ -79,12 +79,11 @@ inline void Tokenize( std::vector<T> &output
 }
 
 // read in a phrase pair and store it
-void PhraseAlignment::create( char line[], int lineID )
+void PhraseAlignment::create( char line[], int lineID, bool includeSentenceIdFlag )
 {
   assert(phraseS.empty());
   assert(phraseT.empty());
 
-  //cerr << "processing " << line;
   vector< string > token = tokenize( line );
   int item = 1;
   for (size_t j=0; j<token.size(); j++) {
@@ -111,12 +110,13 @@ void PhraseAlignment::create( char line[], int lineID )
         alignedToT[t].insert( s );
         alignedToS[s].insert( t );
       }
-    } else if (item == 4) { // count
+    } else if (includeSentenceIdFlag && item == 4) { // optional sentence id
+      sscanf(token[j].c_str(), "%d", &sentenceId);
+    } else if (item + (includeSentenceIdFlag?-1:0) == 4) { // count
       sscanf(token[j].c_str(), "%f", &count);
-    }
-    else if (item == 5) { // non-term lengths
+    } else if (item + (includeSentenceIdFlag?-1:0) == 5) { // non-term lengths
       addNTLength(token[j]);
-    } else if (item == 6) { // target syntax PCFG score
+    } else if (item + (includeSentenceIdFlag?-1:0) == 6) { // target syntax PCFG score
       float pcfgScore = std::atof(token[j].c_str());
       pcfgSum = pcfgScore * count;
     }
@@ -124,7 +124,7 @@ void PhraseAlignment::create( char line[], int lineID )
 
   createAlignVec(phraseS.size(), phraseT.size());
 
-  if (item == 3) {
+  if (item + (includeSentenceIdFlag?-1:0) == 3) {
     count = 1.0;
   }
   if (item < 3 || item > 6) {
diff --git a/phrase-extract/PhraseAlignment.h b/phrase-extract/PhraseAlignment.h
index 9763b7a52..ce8562a5c 100644
--- a/phrase-extract/PhraseAlignment.h
+++ b/phrase-extract/PhraseAlignment.h
@@ -30,10 +30,13 @@ protected:
 public:
   float pcfgSum;
   float count;
+  size_t sentenceId;
+  std::string domain;
+
   std::vector< std::set<size_t> > alignedToT;
   std::vector< std::set<size_t> > alignedToS;
 
-  void create( char*, int );
+  void create( char*, int, bool );
   void clear();
   bool equals( const PhraseAlignment& );
   bool match( const PhraseAlignment& );
diff --git a/phrase-extract/consolidate.cpp b/phrase-extract/consolidate.cpp
index 1b724bd28..82807fd42 100644
--- a/phrase-extract/consolidate.cpp
+++ b/phrase-extract/consolidate.cpp
@@ -47,9 +47,11 @@ inline float maybeLogProb( float a ) { return logProbFlag ? log(a) : a; }
 char line[LINE_MAX_LENGTH];
 void processFiles( char*, char*, char*, char* );
 void loadCountOfCounts( char* );
+void breakdownCoreAndSparse( string combined, string &core, string &sparse );
 bool getLine( istream &fileP, vector< string > &item );
 vector< string > splitLine();
 vector< int > countBin;
+bool sparseCountBinFeatureFlag = false;
 
 int main(int argc, char* argv[])
 {
@@ -94,8 +96,11 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
       lowCountFlag = true;
       cerr << "including the low count feature\n";
-    } else if (strcmp(argv[i],"--CountBinFeature") == 0) {
-      cerr << "include count bin feature:";
+    } else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
+               strcmp(argv[i],"--SparseCountBinFeature") == 0) {
+      if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
+        sparseCountBinFeatureFlag = true;
+      cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
       int prev = 0;
       while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
         int binCount = atoi(argv[++i]);
@@ -223,10 +228,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     }
 
     // output hierarchical phrase pair (with separated labels)
-    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1];
+    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";
 
     // SCORES ...
-    fileConsolidated << " |||";
+    string directScores, directSparseScores, indirectScores, indirectSparseScores;
+    breakdownCoreAndSparse( itemDirect[2], directScores, directSparseScores );
+    breakdownCoreAndSparse( itemIndirect[2], indirectScores, indirectSparseScores );
+
     vector<string> directCounts = tokenize(itemDirect[4].c_str());
     vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
     float countF = atof(directCounts[0].c_str());
@@ -264,12 +272,12 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     // prob indirect
     if (!onlyDirectFlag) {
       fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
-      fileConsolidated << " " << itemIndirect[2];
+      fileConsolidated << " " << directScores;
     }
 
     // prob direct
     fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
-    fileConsolidated << " " << itemDirect[2];
+    fileConsolidated << " " << indirectScores;
 
     // phrase count feature
     if (phraseCountFlag) {
@@ -281,8 +289,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
       fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
     }
 
-    // count bin feature
-    if (countBin.size()>0) {
+    // count bin feature (as a core feature)
+    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
       bool foundBin = false;
       for(size_t i=0; i < countBin.size(); i++) {
         if (!foundBin && countEF <= countBin[i]) {
@@ -307,6 +315,35 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
       fileConsolidated << " ||| " << itemDirect[5];
     }
     
+    // count bin feature (as a sparse feature)
+    if (sparseCountBinFeatureFlag || 
+        directSparseScores.compare("") != 0 || 
+        indirectSparseScores.compare("") != 0)
+    {
+      fileConsolidated << " |||";
+      if (directSparseScores.compare("") != 0)
+        fileConsolidated << " " << directSparseScores;
+      if (indirectSparseScores.compare("") != 0)
+        fileConsolidated << " " << indirectSparseScores;
+      if (sparseCountBinFeatureFlag) {
+        bool foundBin = false;
+        for(size_t i=0; i < countBin.size(); i++) {
+          if (!foundBin && countEF <= countBin[i]) {
+            fileConsolidated << " cb_";
+            if (i == 0 && countBin[i] > 1)
+              fileConsolidated << "1_";
+            else if (i > 0 && countBin[i-1]+1 < countBin[i])
+              fileConsolidated << (countBin[i-1]+1) << "_";
+            fileConsolidated << countBin[i] << " 1";
+            foundBin = true;
+          }
+        }
+        if (!foundBin) {
+          fileConsolidated << " cb_max 1";
+        }
+      }
+    }
+
     fileConsolidated << endl;
   }
   fileDirect.Close();
@@ -314,6 +351,22 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
   fileConsolidated.Close();
 }
 
+void breakdownCoreAndSparse( string combined, string &core, string &sparse ) // split a score field into plain values (core) and name/value pairs (sparse)
+{
+  core = ""; // both outputs are overwritten, not appended to
+  sparse = "";
+  vector<string> score = tokenize( combined.c_str() ); // tokens of the combined field, as produced by tokenize() (defined elsewhere)
+  for(size_t i=0; i<score.size(); i++) {
+    if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size()) // first char is a digit, or this is the final token: core value
+      core += " " + score[i];
+    else { // NOTE(review): a non-final token starting with '-' (e.g. a negative log value) also lands here - confirm inputs never contain one
+      sparse += " " + score[i]; // taken as the feature name
+      sparse += " " + score[++i]; // next token taken as its value; ++i also skips it in the loop (in range: the final token never reaches this branch)
+    }
+  }
+  if (core.size() > 0 ) core = core.substr(1); // drop the leading separator space
+  if (sparse.size() > 0 ) sparse = sparse.substr(1); // drop the leading separator space
+}
 
 bool getLine( istream &fileP, vector< string > &item )
 {
diff --git a/phrase-extract/extract-rules.cpp b/phrase-extract/extract-rules.cpp
index c333040f6..0abf548c3 100644
--- a/phrase-extract/extract-rules.cpp
+++ b/phrase-extract/extract-rules.cpp
@@ -120,6 +120,7 @@ int main(int argc, char* argv[])
        << "rule extraction from an aligned parallel corpus\n";
 
   RuleExtractionOptions options;
+  int sentenceOffset = 0;
 #ifdef WITH_THREADS
   int thread_count = 1;
 #endif
@@ -274,6 +275,12 @@ int main(int argc, char* argv[])
                strcmp(argv[i],"--Threads") == 0) {
       thread_count = atoi(argv[++i]);
 #endif
+    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+        exit(1);
+      }
+      sentenceOffset = atoi(argv[++i]);
     } else {
       cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
       exit(1);
@@ -314,7 +321,7 @@ int main(int argc, char* argv[])
 #endif
 
   // loop through all sentence pairs
-  size_t i=0;
+  size_t i=sentenceOffset;
   while(true) {
     i++;
     if (i%1000 == 0) cerr << "." << flush;
diff --git a/phrase-extract/extract.cpp b/phrase-extract/extract.cpp
index 6583fd077..89c45a2e6 100644
--- a/phrase-extract/extract.cpp
+++ b/phrase-extract/extract.cpp
@@ -95,6 +95,8 @@ int maxPhraseLength;
 bool orientationFlag = false;
 bool translationFlag = true;
 bool sentenceIdFlag = false; //create extract file with sentence id
+int sentenceOffset = 0;
+bool includeSentenceIdFlag = false; //include sentence id in extract file
 bool onlyOutputSpanInfo = false;
 bool gzOutput = false;
 
@@ -106,7 +108,7 @@ int main(int argc, char* argv[])
         << "phrase extraction from an aligned parallel corpus\n";
 
   if (argc < 6) {
-    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
+    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId | --IncludeSentenceId | --SentenceOffset n ]\n";
     exit(1);
   }
   char* &fileNameE = argv[1];
@@ -124,6 +126,14 @@ int main(int argc, char* argv[])
       translationFlag = false;
     } else if (strcmp(argv[i], "--SentenceId") == 0) {
       sentenceIdFlag = true;  
+    } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
+      includeSentenceIdFlag = true;  
+    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
+      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
+        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
+        exit(1);
+      }
+      sentenceOffset = atoi(argv[++i]);
     } else if (strcmp(argv[i], "--GZOutput") == 0) {
       gzOutput = true;  
     } else if(strcmp(argv[i],"--model") == 0) {
@@ -135,7 +145,7 @@ int main(int argc, char* argv[])
       char* modelName = strtok(modelParams, "-");
       char* modelType = strtok(NULL, "-");
 
-      REO_MODEL_TYPE intModelType;
+      // REO_MODEL_TYPE intModelType;
 
       if(strcmp(modelName, "wbe") == 0) {
         wordModel = true;
@@ -217,7 +227,7 @@ int main(int argc, char* argv[])
     extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
   }
 
-  int i=0;
+  int i = sentenceOffset;
   while(true) {
     i++;
     if (i%10000 == 0) cerr << "." << flush;
@@ -669,9 +679,11 @@ void addPhrase( SentenceAlignment &sentence, int startE, int endE, int startF, i
   if (orientationFlag)
     extractFileOrientation << orientationInfo;
 
-  if (sentenceIdFlag) {
+  if (sentenceIdFlag)
     extractFileSentenceId << sentence.sentenceID;
-  }
+
+  if (includeSentenceIdFlag)
+    extractFile << " ||| " << sentence.sentenceID;
 
   if (translationFlag) extractFile << "\n";
   if (translationFlag) extractFileInv << "\n";
@@ -708,4 +720,3 @@ void extractBase( SentenceAlignment &sentence )
 }
 
 }
-
diff --git a/phrase-extract/score.cpp b/phrase-extract/score.cpp
index bfc95de09..f02b6b3b0 100644
--- a/phrase-extract/score.cpp
+++ b/phrase-extract/score.cpp
@@ -29,6 +29,7 @@
 
 #include "SafeGetline.h"
 #include "tables-core.h"
+#include "domain.h"
 #include "PhraseAlignment.h"
 #include "score.h"
 #include "InputFileStream.h"
@@ -53,6 +54,7 @@ bool kneserNeyFlag = false;
 #define COC_MAX 10
 bool logProbFlag = false;
 int negLogProb = 1;
+inline float maybeLogProb( float a ) { return logProbFlag ? negLogProb*log(a) : a; } // yields negLogProb*log(a) when logProbFlag is set, otherwise a unchanged; takes and returns float
 bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
@@ -60,6 +62,12 @@ bool outputNTLengths = false;
 int countOfCounts[COC_MAX+1];
 int totalDistinct = 0;
 float minCountHierarchical = 0;
+bool domainFlag = false;
+bool domainRatioFlag = false;
+bool domainSubsetFlag = false;
+bool domainSparseFlag = false;
+Domain *domain;
+bool includeSentenceIdFlag = false;
 
 Vocabulary vcbT;
 Vocabulary vcbS;
@@ -89,14 +97,15 @@ int main(int argc, char* argv[])
        << "scoring methods for extracted rules\n";
 
   if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS]\n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]]\n";
     exit(1);
   }
   char* fileNameExtract = argv[1];
   char* fileNameLex = argv[2];
   char* fileNamePhraseTable = argv[3];
   string fileNameCountOfCounts;
-  char* fileNameFunctionWords;
+  char* fileNameFunctionWords = NULL;
+  char* fileNameDomain = NULL;
 
   for(int i=4; i<argc; i++) {
     if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
@@ -139,6 +148,22 @@ int main(int argc, char* argv[])
       }
       fileNameFunctionWords = argv[++i];
       cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
+    } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 ||
+               strcmp(argv[i],"--SparseDomainRatio") == 0 ||
+               strcmp(argv[i],"--SparseDomainSubset") == 0 ||
+               strcmp(argv[i],"--DomainIndicator") == 0 ||
+               strcmp(argv[i],"--DomainRatio") == 0 ||
+               strcmp(argv[i],"--DomainSubset") == 0) {
+      includeSentenceIdFlag = true;
+      domainFlag = true;
+      domainSparseFlag = strstr( argv[i], "Sparse" );
+      domainRatioFlag = strstr( argv[i], "Ratio" );
+      domainSubsetFlag = strstr( argv[i], "Subset" );
+      if (i+1==argc) {
+        cerr << "ERROR: specify domain info file with " << argv[i] << endl;
+        exit(1);
+      }
+      fileNameDomain = argv[++i];
     } else if (strcmp(argv[i],"--LogProb") == 0) {
       logProbFlag = true;
       cerr << "using log-probabilities\n";
@@ -166,6 +191,18 @@ int main(int argc, char* argv[])
   if (unalignedFWFlag)
     loadFunctionWords( fileNameFunctionWords );
 
+  // load domain information
+  if (domainFlag) {
+    if (inverseFlag) {
+      domainFlag = false;
+      includeSentenceIdFlag = false;
+    }
+    else {
+      domain = new Domain;
+      domain->load( fileNameDomain );
+    }
+  }
+
   // compute count of counts for Good Turing discounting
   if (goodTuringFlag || kneserNeyFlag) {
     for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
@@ -221,7 +258,7 @@ int main(int argc, char* argv[])
 
     // create new phrase pair
     PhraseAlignment phrasePair;
-    phrasePair.create( line, i );
+    phrasePair.create( line, i, includeSentenceIdFlag );
     lastCount = phrasePair.count;
     lastPcfgSum = phrasePair.pcfgSum;
 
@@ -320,7 +357,6 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
   {
     const PhraseAlignmentCollection &group = **iter;
     outputPhrasePair( group, totalSource, phrasePairGroup.GetSize(), phraseTableFile );
-
   }
   
 }
@@ -328,7 +364,7 @@ void processPhrasePairs( vector< PhraseAlignment > &phrasePair, ostream &phraseT
 PhraseAlignment* findBestAlignment(const PhraseAlignmentCollection &phrasePair )
 {
   float bestAlignmentCount = -1;
-  PhraseAlignment* bestAlignment;
+  PhraseAlignment* bestAlignment = NULL;
   
   for(size_t i=0; i<phrasePair.size(); i++) {
     if (phrasePair[i]->count > bestAlignmentCount) {
@@ -440,6 +476,18 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
     count += phrasePair[i]->count;
   }
 
+  // compute domain counts
+  map< string, float > domainCount;
+  if (domainFlag) {
+    for(size_t i=0; i<phrasePair.size(); i++) {
+      string d = domain->getDomainOfSentence( phrasePair[i]->sentenceId );
+      if (domainCount.find( d ) == domainCount.end())
+        domainCount[ d ] = phrasePair[i]->count;
+      else
+        domainCount[ d ] += phrasePair[i]->count;
+    }
+  }
+
   // collect count of count statistics
   if (goodTuringFlag || kneserNeyFlag) {
     totalDistinct++;
@@ -449,7 +497,7 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   }
 
   // compute PCFG score
-  float pcfgScore;
+  float pcfgScore = 0;
   if (pcfgFlag && !inverseFlag) {
     float pcfgSum = 0;
     for(size_t i=0; i<phrasePair.size(); ++i) {
@@ -489,24 +537,84 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   // lexical translation probability
   if (lexFlag) {
     double lexScore = computeLexicalTranslation( phraseS, phraseT, bestAlignment);
-    phraseTableFile << ( logProbFlag ? negLogProb*log(lexScore) : lexScore );
+    phraseTableFile << maybeLogProb( lexScore );
   }
 
   // unaligned word penalty
   if (unalignedFlag) {
     double penalty = computeUnalignedPenalty( phraseS, phraseT, bestAlignment);
-    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+    phraseTableFile << " " << maybeLogProb( penalty );
   }
 
   // unaligned function word penalty
   if (unalignedFWFlag) {
     double penalty = computeUnalignedFWPenalty( phraseS, phraseT, bestAlignment);
-    phraseTableFile << " " << ( logProbFlag ? negLogProb*log(penalty) : penalty );
+    phraseTableFile << " " << maybeLogProb( penalty );
   }
 
   // target-side PCFG score
   if (pcfgFlag && !inverseFlag) {
-    phraseTableFile << " " << pcfgScore;
+    phraseTableFile << " " << maybeLogProb( pcfgScore );
+  }
+
+  // domain count features
+  if (domainFlag) {
+    if (domainSparseFlag) {
+      // sparse, subset
+      if (domainSubsetFlag) {
+        typedef vector< string >::const_iterator I;
+        phraseTableFile << " doms";
+        for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+          if (domainCount.find( *i ) != domainCount.end() ) {
+            phraseTableFile << "_" << *i;
+          }
+        }
+        phraseTableFile << " 1";
+      }
+      // sparse, indicator or ratio
+      else {
+        typedef map< string, float >::const_iterator I;
+        for (I i=domainCount.begin(); i != domainCount.end(); i++) {
+          if (domainRatioFlag) {
+            phraseTableFile << " domr_" << i->first << " " << (i->second / count);
+          }
+          else {
+            phraseTableFile << " dom_" << i->first << " 1";
+          }
+        }
+      }
+    }
+    // core, subset
+    else if (domainSubsetFlag) {
+      if (domain->list.size() > 6) {
+        cerr << "ERROR: too many domains for core domain subset features\n";
+        exit(1);
+      }
+      size_t bitmap = 0;
+      for(size_t bit = 0; bit < domain->list.size(); bit++) {
+        if (domainCount.find( domain->list[ bit ] ) != domainCount.end()) {
+          bitmap += 1 << bit;
+        }
+      }
+      for(size_t i = 1; i < (1 << domain->list.size()); i++) {
+        phraseTableFile << " " << maybeLogProb( (bitmap == i) ? 2.718 : 1 );
+      }
+    }
+    // core, indicator or ratio
+    else {
+      typedef vector< string >::const_iterator I;
+      for (I i = domain->list.begin(); i != domain->list.end(); i++ ) {
+        if (domainCount.find( *i ) == domainCount.end() ) {
+          phraseTableFile << " " << maybeLogProb( 1 );
+        }
+        else if (domainRatioFlag) {
+          phraseTableFile << " " << maybeLogProb( exp( domainCount[ *i ] / count ) );
+        }
+        else {
+          phraseTableFile << " " << maybeLogProb( 2.718 );
+        }
+      }
+    }
   }
 
   phraseTableFile << " ||| ";
author	phikoehn <pkoehn@inf.ed.ac.uk>	2012-08-19 02:47:05 +0400
committer	phikoehn <pkoehn@inf.ed.ac.uk>	2012-08-19 02:47:05 +0400
commit	366ab93f8aa53b7b065fe8366201bd59dafc51ba (patch)
tree	da32d2aaaa424d633ad55574dac4257067fd73ab /phrase-extract
parent	388f5a2046d4c45f3b311bbe133be02c3fa560d8 (diff)