conservative update of some old code in phrase-extract/consolidate-main.cpp

author: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2015-03-09 21:47:28 +0300
committer: Matthias Huck <huck@i6.informatik.rwth-aachen.de> 2015-03-09 21:47:28 +0300
commit: 973fd980524060487d2a6efa5459697801163cec (patch)
tree: 476b39afe15f217ff8d33e35504be1607bdcc71f /phrase-extract/consolidate-main.cpp
parent: 0c79e19ff933d469d2ccf442f1cec343830c4c2d (diff)
1 file changed, 115 insertions, 158 deletions
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 0648f3017..7f17eb1c8 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -17,141 +17,132 @@
   Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  ***********************************************************************/
 
-#include <cstdio>
-#include <iostream>
-#include <fstream>
 #include <vector>
 #include <string>
-#include <cstdlib>
-#include <cstring>
 
+#include "util/exception.hh"
 #include "moses/Util.h"
 #include "InputFileStream.h"
 #include "OutputFileStream.h"
 #include "PropertiesConsolidator.h"
 
-using namespace std;
 
+bool countsProperty = false;
+bool goodTuringFlag = false;
 bool hierarchicalFlag = false;
+bool kneserNeyFlag = false;
+bool logProbFlag = false;
+bool lowCountFlag = false;
 bool onlyDirectFlag = false;
+bool partsOfSpeechFlag = false;
 bool phraseCountFlag = false;
-bool lowCountFlag = false;
-bool goodTuringFlag = false;
-bool kneserNeyFlag = false;
 bool sourceLabelsFlag = false;
-bool partsOfSpeechFlag = false;
-bool logProbFlag = false;
-bool countsProperty = false;
+bool sparseCountBinFeatureFlag = false;
+
+std::vector< int > countBin;
 float minScore0 = 0;
 float minScore2 = 0;
 
+std::vector< float > countOfCounts;
+std::vector< float > goodTuringDiscount;
+float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
+
+
+void processFiles( const std::string&, const std::string&, const std::string&, const std::string&, const std::string&, const std::string& );
+void loadCountOfCounts( const std::string& );
+void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse );
+bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item );
+
+
 inline float maybeLogProb( float a )
 {
   return logProbFlag ? std::log(a) : a;
 }
 
+
 inline bool isNonTerminal( const std::string &word )
 {
   return (word.length()>=3 && word[0] == '[' && word[word.length()-1] == ']');
 }
 
-void processFiles( char*, char*, char*, char*, char*, char* );
-void loadCountOfCounts( char* );
-void breakdownCoreAndSparse( string combined, string &core, string &sparse );
-bool getLine( istream &fileP, vector< string > &item );
-vector< string > splitLine(const char *line);
-vector< int > countBin;
-bool sparseCountBinFeatureFlag = false;
 
 int main(int argc, char* argv[])
 {
-  cerr << "Consolidate v2.0 written by Philipp Koehn\n"
-       << "consolidating direct and indirect rule tables\n";
+  std::cerr << "Consolidate v2.0 written by Philipp Koehn" << std::endl
+            << "consolidating direct and indirect rule tables" << std::endl;
 
   if (argc < 4) {
-    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file]  [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]\n";
+    std::cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--PhraseCount] [--GoodTuring counts-of-counts-file] [--KneserNey counts-of-counts-file] [--LowCountFeature] [--SourceLabels source-labels-file] [--PartsOfSpeech parts-of-speech-file] [--MinScore id:threshold[,id:threshold]*]" << std::endl;
     exit(1);
   }
-  char* &fileNameDirect = argv[1];
-  char* &fileNameIndirect = argv[2];
-  char* &fileNameConsolidated = argv[3];
-  char* fileNameCountOfCounts = 0;
-  char* fileNameSourceLabelSet = 0;
-  char* fileNamePartsOfSpeechVocabulary = 0;
+  const std::string fileNameDirect = argv[1];
+  const std::string fileNameIndirect = argv[2];
+  const std::string fileNameConsolidated = argv[3];
+  std::string fileNameCountOfCounts;
+  std::string fileNameSourceLabelSet;
+  std::string fileNamePartsOfSpeechVocabulary;
 
   for(int i=4; i<argc; i++) {
     if (strcmp(argv[i],"--Hierarchical") == 0) {
       hierarchicalFlag = true;
-      cerr << "processing hierarchical rules\n";
+      std::cerr << "processing hierarchical rules" << std::endl;
     } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
       onlyDirectFlag = true;
-      cerr << "only including direct translation scores p(e|f)\n";
+      std::cerr << "only including direct translation scores p(e|f)" << std::endl;
     } else if (strcmp(argv[i],"--PhraseCount") == 0) {
       phraseCountFlag = true;
-      cerr << "including the phrase count feature\n";
+      std::cerr << "including the phrase count feature" << std::endl;
     } else if (strcmp(argv[i],"--GoodTuring") == 0) {
       goodTuringFlag = true;
-      if (i+1==argc) {
-        cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
-        exit(1);
-      }
+      UTIL_THROW_IF2(i+1==argc, "specify count of count files for Good Turing discounting!");
       fileNameCountOfCounts = argv[++i];
-      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
+      std::cerr << "adjusting phrase translation probabilities with Good Turing discounting" << std::endl;
     } else if (strcmp(argv[i],"--KneserNey") == 0) {
       kneserNeyFlag = true;
-      if (i+1==argc) {
-        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
-        exit(1);
-      }
+      UTIL_THROW_IF2(i+1==argc, "specify count of count files for Kneser Ney discounting!");
       fileNameCountOfCounts = argv[++i];
-      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
+      std::cerr << "adjusting phrase translation probabilities with Kneser Ney discounting" << std::endl;
     } else if (strcmp(argv[i],"--LowCountFeature") == 0) {
       lowCountFlag = true;
-      cerr << "including the low count feature\n";
+      std::cerr << "including the low count feature" << std::endl;
     } else if (strcmp(argv[i],"--CountBinFeature") == 0 ||
                strcmp(argv[i],"--SparseCountBinFeature") == 0) {
       if (strcmp(argv[i],"--SparseCountBinFeature") == 0)
         sparseCountBinFeatureFlag = true;
-      cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
+      std::cerr << "include "<< (sparseCountBinFeatureFlag ? "sparse " : "") << "count bin feature:";
       int prev = 0;
       while(i+1<argc && argv[i+1][0]>='0' && argv[i+1][0]<='9') {
-        int binCount = atoi(argv[++i]);
+        int binCount = Moses::Scan<int>(argv[++i]);
         countBin.push_back( binCount );
         if (prev+1 == binCount) {
-          cerr << " " << binCount;
+          std::cerr << " " << binCount;
         } else {
-          cerr << " " << (prev+1) << "-" << binCount;
+          std::cerr << " " << (prev+1) << "-" << binCount;
         }
         prev = binCount;
       }
-      cerr << " " << (prev+1) << "+\n";
+      std::cerr << " " << (prev+1) << "+" << std::endl;
     } else if (strcmp(argv[i],"--LogProb") == 0) {
       logProbFlag = true;
-      cerr << "using log-probabilities\n";
+      std::cerr << "using log-probabilities" << std::endl;
     } else if (strcmp(argv[i],"--Counts") == 0) {
       countsProperty = true;
-      cerr << "output counts as a property\n";
+      std::cerr << "output counts as a property" << std::endl;;
     } else if (strcmp(argv[i],"--SourceLabels") == 0) {
       sourceLabelsFlag = true;
-      if (i+1==argc) {
-        cerr << "ERROR: specify source label set file!\n";
-        exit(1);
-      }
+      UTIL_THROW_IF2(i+1==argc, "specify source label set file!");
       fileNameSourceLabelSet = argv[++i];
-      cerr << "processing source labels property\n";
+      std::cerr << "processing source labels property" << std::endl;
     } else if (strcmp(argv[i],"--PartsOfSpeech") == 0) {
       partsOfSpeechFlag = true;
-      if (i+1==argc) {
-        cerr << "ERROR: specify parts-of-speech file!\n";
-        exit(1);
-      }
+      UTIL_THROW_IF2(i+1==argc, "specify parts-of-speech file!");
       fileNamePartsOfSpeechVocabulary = argv[++i];
-      cerr << "processing parts-of-speech property\n";
+      std::cerr << "processing parts-of-speech property" << std::endl;
     } else if (strcmp(argv[i],"--MinScore") == 0) {
-      string setting = argv[++i];
+      std::string setting = argv[++i];
       bool done = false;
       while (!done) {
-        string single_setting;
+        std::string single_setting;
         size_t pos;
         if ((pos = setting.find(",")) != std::string::npos) {
           single_setting = setting.substr(0, pos);
@@ -160,52 +151,42 @@ int main(int argc, char* argv[])
           single_setting = setting;
           done = true;
         }
-        if ((pos = single_setting.find(":")) == std::string::npos) {
-          cerr << "ERROR: faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'" << endl;
-          exit(1);
-        }
-        unsigned int field = atoi( single_setting.substr(0,pos).c_str() );
-        float threshold = atof( single_setting.substr(pos+1).c_str() );
+        pos = single_setting.find(":");
+        UTIL_THROW_IF2(pos == std::string::npos, "faulty MinScore setting '" << single_setting << "' in '" << argv[i] << "'");
+        unsigned int field = Moses::Scan<unsigned int>( single_setting.substr(0,pos) );
+        float threshold = Moses::Scan<float>( single_setting.substr(pos+1) );
         if (field == 0) {
           minScore0 = threshold;
-          cerr << "setting minScore0 to " << threshold << endl;
+          std::cerr << "setting minScore0 to " << threshold << std::endl;
         } else if (field == 2) {
           minScore2 = threshold;
-          cerr << "setting minScore2 to " << threshold << endl;
+          std::cerr << "setting minScore2 to " << threshold << std::endl;
         } else {
-          cerr << "ERROR: MinScore currently only supported for indirect (0) and direct (2) phrase translation probabilities" << endl;
-          exit(1);
+          UTIL_THROW2("MinScore currently only supported for indirect (0) and direct (2) phrase translation probabilities");
         }
       }
     } else {
-      cerr << "ERROR: unknown option " << argv[i] << endl;
-      exit(1);
+      UTIL_THROW2("unknown option " << argv[i]);
     }
   }
 
   processFiles( fileNameDirect, fileNameIndirect, fileNameConsolidated, fileNameCountOfCounts, fileNameSourceLabelSet, fileNamePartsOfSpeechVocabulary );
 }
 
-vector< float > countOfCounts;
-vector< float > goodTuringDiscount;
-float kneserNey_D1, kneserNey_D2, kneserNey_D3, totalCount = -1;
-void loadCountOfCounts( char* fileNameCountOfCounts )
+
+void loadCountOfCounts( const std::string& fileNameCountOfCounts )
 {
   Moses::InputFileStream fileCountOfCounts(fileNameCountOfCounts);
-  if (fileCountOfCounts.fail()) {
-    cerr << "ERROR: could not open count of counts file " << fileNameCountOfCounts << endl;
-    exit(1);
-  }
-  istream &fileP = fileCountOfCounts;
+  UTIL_THROW_IF2(fileCountOfCounts.fail(), "could not open count of counts file " << fileNameCountOfCounts);
 
   countOfCounts.push_back(0.0);
 
-  string line;
-  while (getline(fileP, line)) {
+  std::string line;
+  while (getline(fileCountOfCounts, line)) {
     if (totalCount < 0)
-      totalCount = atof(line.c_str()); // total number of distinct phrase pairs
+      totalCount = Moses::Scan<float>(line); // total number of distinct phrase pairs
     else
-      countOfCounts.push_back( atof(line.c_str()) );
+      countOfCounts.push_back( Moses::Scan<float>(line) );
   }
   fileCountOfCounts.Close();
 
@@ -232,34 +213,27 @@ void loadCountOfCounts( char* fileNameCountOfCounts )
   if (kneserNey_D3 > 2.9) kneserNey_D3 = 2.9;
 }
 
-void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet, char* fileNamePartsOfSpeechVocabulary )
+
+void processFiles( const std::string& fileNameDirect, 
+                   const std::string& fileNameIndirect, 
+                   const std::string& fileNameConsolidated, 
+                   const std::string& fileNameCountOfCounts, 
+                   const std::string& fileNameSourceLabelSet, 
+                   const std::string& fileNamePartsOfSpeechVocabulary )
 {
   if (goodTuringFlag || kneserNeyFlag)
     loadCountOfCounts( fileNameCountOfCounts );
 
   // open input files
   Moses::InputFileStream fileDirect(fileNameDirect);
+  UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect);
   Moses::InputFileStream fileIndirect(fileNameIndirect);
-
-  if (fileDirect.fail()) {
-    cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
-    exit(1);
-  }
-  istream &fileDirectP = fileDirect;
-
-  if (fileIndirect.fail()) {
-    cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
-    exit(1);
-  }
-  istream &fileIndirectP = fileIndirect;
+  UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect);
 
   // open output file: consolidated phrase table
   Moses::OutputFileStream fileConsolidated;
   bool success = fileConsolidated.Open(fileNameConsolidated);
-  if (!success) {
-    cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
-    exit(1);
-  }
+  UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated);
 
   // create properties consolidator
   // (in case any additional phrase property requires further processing)
@@ -275,43 +249,38 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
   int i=0;
   while(true) {
     i++;
-    if (i%100000 == 0) cerr << "." << flush;
+    if (i%100000 == 0) std::cerr << "." << std::flush;
 
-    vector< string > itemDirect, itemIndirect;
-    if (! getLine(fileIndirectP,itemIndirect) ||
-        ! getLine(fileDirectP,  itemDirect  ))
+    std::vector< std::string > itemDirect, itemIndirect;
+    if (! getLine(fileIndirect, itemIndirect) ||
+        ! getLine(fileDirect, itemDirect))
       break;
 
     // direct: target source alignment probabilities
     // indirect: source target probabilities
 
     // consistency checks
-    if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
-      cerr << "ERROR: target phrase does not match in line " << i << ": '"
-           << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
-      exit(1);
-    }
-
-    if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
-      cerr << "ERROR: source phrase does not match in line " << i << ": '"
-           << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
-      exit(1);
-    }
+    UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0, 
+                   "target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'");
+    UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0, 
+                   "source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'");
 
     // SCORES ...
-    string directScores, directSparseScores, indirectScores, indirectSparseScores;
+    std::string directScores, directSparseScores, indirectScores, indirectSparseScores;
     breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
     breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );
 
-    vector<string> directCounts = Moses::Tokenize(itemDirect[4]);
-    vector<string> indirectCounts = Moses::Tokenize(itemIndirect[4]);
-    float countF = atof(directCounts[0].c_str());
-    float countE = atof(indirectCounts[0].c_str());
-    float countEF = atof(indirectCounts[1].c_str());
+    std::vector<std::string> directCounts;
+    Moses::Tokenize( directCounts, itemDirect[4] );
+    std::vector<std::string> indirectCounts;
+    Moses::Tokenize( indirectCounts, itemIndirect[4] );
+    float countF = Moses::Scan<float>(directCounts[0]);
+    float countE = Moses::Scan<float>(indirectCounts[0]);
+    float countEF = Moses::Scan<float>(indirectCounts[1]);
     float n1_F, n1_E;
     if (kneserNeyFlag) {
-      n1_F = atof(directCounts[2].c_str());
-      n1_E = atof(indirectCounts[2].c_str());
+      n1_F = Moses::Scan<float>(directCounts[2]);
+      n1_E = Moses::Scan<float>(indirectCounts[2]);
     }
 
     // Good Turing discounting
@@ -348,7 +317,8 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
 
     if (partsOfSpeechFlag) {
       // write POS factor from property
-      std::vector<std::string> targetTokens = Moses::Tokenize(itemDirect[1]);
+      std::vector<std::string> targetTokens;
+      Moses::Tokenize( targetTokens, itemDirect[1] );
       std::vector<std::string> propertyValuePOS;
       propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS);
       size_t targetTerminalIndex = 0;
@@ -405,10 +375,13 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     }
 
     // alignment
-    fileConsolidated << " ||| " << itemDirect[2];
+    fileConsolidated << " |||";
+    if (!itemDirect[2].empty()) {
+      fileConsolidated << " " << itemDirect[2];;
+    }
 
     // counts, for debugging
-    fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
+    fileConsolidated << " ||| " << countE << " " << countF << " " << countEF;
 
     // sparse features
     fileConsolidated << " |||";
@@ -416,6 +389,7 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
       fileConsolidated << " " << directSparseScores;
     if (indirectSparseScores.compare("") != 0)
       fileConsolidated << " " << indirectSparseScores;
+
     // count bin feature (as a sparse feature)
     if (sparseCountBinFeatureFlag) {
       bool foundBin = false;
@@ -445,18 +419,21 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
       fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}";
     }
 
-    fileConsolidated << endl;
+    fileConsolidated << std::endl;
   }
+
   fileDirect.Close();
   fileIndirect.Close();
   fileConsolidated.Close();
 }
 
-void breakdownCoreAndSparse( string combined, string &core, string &sparse )
+
+void breakdownCoreAndSparse( const std::string &combined, std::string &core, std::string &sparse )
 {
   core = "";
   sparse = "";
-  vector<string> score = Moses::Tokenize( combined );
+  std::vector<std::string> score;
+  Moses::Tokenize( score, combined );
   for(size_t i=0; i<score.size(); i++) {
     if ((score[i][0] >= '0' && score[i][0] <= '9') || i+1 == score.size())
       core += " " + score[i];
@@ -469,38 +446,18 @@ void breakdownCoreAndSparse( string combined, string &core, string &sparse )
   if (sparse.size() > 0 ) sparse = sparse.substr(1);
 }
 
-bool getLine( istream &fileP, vector< string > &item )
+
+bool getLine( Moses::InputFileStream &file, std::vector< std::string > &item )
 {
-  if (fileP.eof())
+  if (file.eof())
     return false;
 
-  string line;
-  if (!getline(fileP, line))
+  std::string line;
+  if (!getline(file, line))
     return false;
 
-  item = splitLine(line.c_str());
+  Moses::TokenizeMultiCharSeparator(item, line, " ||| ");
 
   return true;
 }
 
-vector< string > splitLine(const char *line)
-{
-  vector< string > item;
-  int start=0;
-  int i=0;
-  for(; line[i] != '\0'; i++) {
-    if (line[i] == ' ' &&
-        line[i+1] == '|' &&
-        line[i+2] == '|' &&
-        line[i+3] == '|' &&
-        line[i+4] == ' ') {
-      if (start > i) start = i; // empty item
-      item.push_back( string( line+start, i-start ) );
-      start = i+5;
-      i += 3;
-    }
-  }
-  item.push_back( string( line+start, i-start ) );
-
-  return item;
-}
author	Matthias Huck <huck@i6.informatik.rwth-aachen.de>	2015-03-09 21:47:28 +0300
committer	Matthias Huck <huck@i6.informatik.rwth-aachen.de>	2015-03-09 21:47:28 +0300
commit	973fd980524060487d2a6efa5459697801163cec (patch)
tree	476b39afe15f217ff8d33e35504be1607bdcc71f /phrase-extract/consolidate-main.cpp
parent	0c79e19ff933d469d2ccf442f1cec343830c4c2d (diff)