Remove --OutputNTLengths from extract-rules, etc.

The option isn't used in master and the output is incompatible with the current rule table format. If anyone wants this in master it should probably be fixed in the span-length branch then merged.
author: Phil Williams <philip.williams@mac.com> 2013-09-14 01:16:42 +0400
committer: Phil Williams <philip.williams@mac.com> 2013-09-14 01:16:42 +0400
commit: cdd9df19d26723645454f8ddef467489d1ed341b (patch)
tree: 1fd033efb62e16fc5fbb37077c7723cf7255bde3 /phrase-extract
parent: 3b03d803d98194772e4b0ac6df7041c8c98c0119 (diff)
6 files changed, 2 insertions, 177 deletions
diff --git a/phrase-extract/ExtractedRule.cpp b/phrase-extract/ExtractedRule.cpp
deleted file mode 100644
index 50d9085e6..000000000
--- a/phrase-extract/ExtractedRule.cpp
+++ /dev/null
@@ -1,44 +0,0 @@
-//
-//  ExtractedRule.cpp
-//  extract
-//
-//  Created by Hieu Hoang on 13/09/2011.
-//  Copyright 2011 __MyCompanyName__. All rights reserved.
-//
-
-#include "ExtractedRule.h"
-
-using namespace std;
-
-namespace MosesTraining
-{
-
-void ExtractedRule::OutputNTLengths(std::ostream &out) const
-{
-  ostringstream outString;
-  OutputNTLengths(outString);
-  out << outString;
-}
-
-void ExtractedRule::OutputNTLengths(std::ostringstream &outString) const
-{
-  std::map<size_t, std::pair<size_t, size_t> >::const_iterator iter;
-  for (iter = m_ntLengths.begin(); iter != m_ntLengths.end(); ++iter) {
-    size_t sourcePos = iter->first;
-    const std::pair<size_t, size_t> &spanLengths = iter->second;
-    outString << sourcePos << "=" << spanLengths.first << "," <<spanLengths.second << " ";
-  }
-}
-
-std::ostream& operator<<(std::ostream &out, const ExtractedRule &obj)
-{
-  out << obj.source << " ||| " << obj.target << " ||| "
-      << obj.alignment << " ||| "
-      << obj.alignmentInv << " ||| ";
-
-  obj.OutputNTLengths(out);
-
-  return out;
-}
-
-} // namespace
diff --git a/phrase-extract/ExtractedRule.h b/phrase-extract/ExtractedRule.h
index a6cd5074d..cb2f2261d 100644
--- a/phrase-extract/ExtractedRule.h
+++ b/phrase-extract/ExtractedRule.h
@@ -32,8 +32,6 @@ namespace MosesTraining
 // sentence-level collection of rules
 class ExtractedRule
 {
-  friend std::ostream& operator<<(std::ostream &, const ExtractedRule &);
-
 public:
   std::string source;
   std::string target;
@@ -54,8 +52,6 @@ public:
   float count;
   double pcfgScore;
 
-  std::map<size_t, std::pair<size_t, size_t> > m_ntLengths;
-
   ExtractedRule(int sT, int eT, int sS, int eS)
     : source()
     , target()
@@ -76,13 +72,6 @@ public:
     , count(0)
     , pcfgScore(0.0) {
   }
-
-  void SetSpanLength(size_t sourcePos, size_t sourceLength, size_t targetLength) {
-    m_ntLengths[sourcePos] = std::pair<size_t, size_t>(sourceLength, targetLength);
-  }
-
-  void OutputNTLengths(std::ostream &out) const;
-  void OutputNTLengths(std::ostringstream &out) const;
 };
 
 }
diff --git a/phrase-extract/RuleExtractionOptions.h b/phrase-extract/RuleExtractionOptions.h
index a9b0ce9e6..b38258470 100644
--- a/phrase-extract/RuleExtractionOptions.h
+++ b/phrase-extract/RuleExtractionOptions.h
@@ -49,7 +49,6 @@ public:
   bool duplicateRules;
   bool fractionalCounting;
   bool pcfgScore;
-  bool outputNTLengths;
   bool gzOutput;
   bool unpairedExtractFormat;
   bool conditionOnTargetLhs;
@@ -83,7 +82,6 @@ public:
     , duplicateRules(true)
     , fractionalCounting(true)
     , pcfgScore(false)
-    , outputNTLengths(false)
     , gzOutput(false)
     , unpairedExtractFormat(false)
     , conditionOnTargetLhs(false)
diff --git a/phrase-extract/consolidate-main.cpp b/phrase-extract/consolidate-main.cpp
index 221b7048c..b0e2c8594 100644
--- a/phrase-extract/consolidate-main.cpp
+++ b/phrase-extract/consolidate-main.cpp
@@ -41,7 +41,6 @@ bool lowCountFlag = false;
 bool goodTuringFlag = false;
 bool kneserNeyFlag = false;
 bool logProbFlag = false;
-bool outputNTLengths = false;
 inline float maybeLogProb( float a )
 {
   return logProbFlag ? log(a) : a;
@@ -62,7 +61,7 @@ int main(int argc, char* argv[])
        << "consolidating direct and indirect rule tables\n";
 
   if (argc < 4) {
-    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] [--OutputNTLengths] \n";
+    cerr << "syntax: consolidate phrase-table.direct phrase-table.indirect phrase-table.consolidated [--Hierarchical] [--OnlyDirect] \n";
     exit(1);
   }
   char* &fileNameDirect = argv[1];
@@ -119,8 +118,6 @@ int main(int argc, char* argv[])
     } else if (strcmp(argv[i],"--LogProb") == 0) {
       logProbFlag = true;
       cerr << "using log-probabilities\n";
-    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
-      outputNTLengths = true;
     } else {
       cerr << "ERROR: unknown option " << argv[i] << endl;
       exit(1);
@@ -315,10 +312,6 @@ void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameC
     // counts, for debugging
     fileConsolidated << "||| " << countE << " " << countF << " " << countEF;
 
-    if (outputNTLengths) {
-      fileConsolidated << " ||| " << itemDirect[5];
-    }
-
     // count bin feature (as a sparse feature)
     if (sparseCountBinFeatureFlag ||
         directSparseScores.compare("") != 0 ||
diff --git a/phrase-extract/extract-rules-main.cpp b/phrase-extract/extract-rules-main.cpp
index 97a593085..c625c3582 100644
--- a/phrase-extract/extract-rules-main.cpp
+++ b/phrase-extract/extract-rules-main.cpp
@@ -129,7 +129,6 @@ int main(int argc, char* argv[])
          << " --GlueGrammar FILE"
          << " | --UnknownWordLabel FILE"
          << " | --OnlyDirect"
-         << " | --OutputNTLengths"
          << " | --MaxSpan[" << options.maxSpan << "]"
          << " | --MinHoleTarget[" << options.minHoleTarget << "]"
          << " | --MinHoleSource[" << options.minHoleSource << "]"
@@ -262,8 +261,6 @@ int main(int argc, char* argv[])
       options.fractionalCounting = false;
     } else if (strcmp(argv[i],"--PCFG") == 0) {
       options.pcfgScore = true;
-    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
-      options.outputNTLengths = true;
     } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
       options.unpairedExtractFormat = true;
     } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
@@ -663,9 +660,6 @@ void ExtractTask::saveHieroAlignment( int startT, int endT, int startS, int endS
     rule.alignment      += sourceSymbolIndex + "-" + targetSymbolIndex + " ";
     if (!m_options.onlyDirectFlag)
       rule.alignmentInv += targetSymbolIndex + "-" + sourceSymbolIndex + " ";
-
-    rule.SetSpanLength(hole.GetPos(0), hole.GetSize(0), hole.GetSize(1) ) ;
-
   }
 
   rule.alignment.erase(rule.alignment.size()-1);
@@ -1077,9 +1071,6 @@ void ExtractTask::writeRulesToFile()
         << rule->target << " ||| "
         << rule->alignment << " ||| "
         << rule->count << " ||| ";
-    if (m_options.outputNTLengths) {
-      rule->OutputNTLengths(out);
-    }
     if (m_options.pcfgScore) {
       out << " ||| " << rule->pcfgScore;
     }
diff --git a/phrase-extract/score-main.cpp b/phrase-extract/score-main.cpp
index 3042cbe3e..b7ccea0fc 100644
--- a/phrase-extract/score-main.cpp
+++ b/phrase-extract/score-main.cpp
@@ -59,7 +59,6 @@ int negLogProb = 1;
 bool lexFlag = true;
 bool unalignedFlag = false;
 bool unalignedFWFlag = false;
-bool outputNTLengths = false;
 bool singletonFeature = false;
 bool crossedNonTerm = false;
 int countOfCounts[COC_MAX+1];
@@ -82,9 +81,6 @@ double computeUnalignedPenalty( const PHRASE &, const PHRASE &, const PhraseAlig
 set<string> functionWordList;
 void loadFunctionWords( const string &fileNameFunctionWords );
 double computeUnalignedFWPenalty( const PHRASE &, const PHRASE &, const PhraseAlignment & );
-void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
-                      , map<size_t, map<size_t, float> > &sourceProb
-                      , map<size_t, map<size_t, float> > &targetProb);
 void printSourcePhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
 void printTargetPhrase(const PHRASE &, const PHRASE &, const PhraseAlignment &, ostream &);
 
@@ -95,7 +91,7 @@ int main(int argc, char* argv[])
 
   ScoreFeatureManager featureManager;
   if (argc < 4) {
-    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
+    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--NoWordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--Singleton] [--CrossedNonTerm] \n";
     cerr << featureManager.usage() << endl;
     exit(1);
   }
@@ -158,8 +154,6 @@ int main(int argc, char* argv[])
       minCountHierarchical = atof(argv[++i]);
       cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
       minCountHierarchical -= 0.00001; // account for rounding
-    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
-      outputNTLengths = true;
     } else if (strcmp(argv[i],"--Singleton") == 0) {
       singletonFeature = true;
       cerr << "binary singleton feature\n";
@@ -375,87 +369,6 @@ const PhraseAlignment &findBestAlignment(const PhraseAlignmentCollection &phrase
   return *bestAlignment;
 }
 
-
-void calcNTLengthProb(const map<size_t, map<size_t, size_t> > &lengths
-                      , size_t total
-                      , map<size_t, map<size_t, float> > &probs)
-{
-  map<size_t, map<size_t, size_t> >::const_iterator iterOuter;
-  for (iterOuter = lengths.begin(); iterOuter != lengths.end(); ++iterOuter) {
-    size_t sourcePos = iterOuter->first;
-    const map<size_t, size_t> &inner = iterOuter->second;
-
-    map<size_t, size_t>::const_iterator iterInner;
-    for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) {
-      size_t length = iterInner->first;
-      size_t count = iterInner->second;
-      float prob = (float) count / (float) total;
-      probs[sourcePos][length] = prob;
-    }
-  }
-}
-
-void calcNTLengthProb(const vector< PhraseAlignment* > &phrasePairs
-                      , map<size_t, map<size_t, float> > &sourceProb
-                      , map<size_t, map<size_t, float> > &targetProb)
-{
-  map<size_t, map<size_t, size_t> > sourceLengths, targetLengths;
-  // 1st = position in source phrase, 2nd = length, 3rd = count
-  map<size_t, size_t> totals;
-  // 1st = position in source phrase, 2nd = total counts
-  // each source pos should have same count?
-
-  vector< PhraseAlignment* >::const_iterator iterOuter;
-  for (iterOuter = phrasePairs.begin(); iterOuter != phrasePairs.end(); ++iterOuter) {
-    const PhraseAlignment &phrasePair = **iterOuter;
-    const std::map<size_t, std::pair<size_t, size_t> > &ntLengths = phrasePair.GetNTLengths();
-
-    std::map<size_t, std::pair<size_t, size_t> >::const_iterator iterInner;
-    for (iterInner = ntLengths.begin(); iterInner != ntLengths.end(); ++iterInner) {
-      size_t sourcePos = iterInner->first;
-      size_t sourceLength = iterInner->second.first;
-      size_t targetLength = iterInner->second.second;
-
-      sourceLengths[sourcePos][sourceLength]++;
-      targetLengths[sourcePos][targetLength]++;
-
-      totals[sourcePos]++;
-    }
-  }
-
-  if (totals.size() == 0) {
-    // no non-term. Don't bother
-    return;
-  }
-
-  size_t total = totals.begin()->second;
-  if (totals.size() > 1) {
-    assert(total == (++totals.begin())->second );
-  }
-
-  calcNTLengthProb(sourceLengths, total, sourceProb);
-  calcNTLengthProb(targetLengths, total, targetProb);
-
-}
-
-void outputNTLengthProbs(ostream &phraseTableFile, const map<size_t, map<size_t, float> > &probs, const string &prefix)
-{
-  map<size_t, map<size_t, float> >::const_iterator iterOuter;
-  for (iterOuter = probs.begin(); iterOuter != probs.end(); ++iterOuter) {
-    size_t sourcePos = iterOuter->first;
-    const map<size_t, float> &inner = iterOuter->second;
-
-    map<size_t, float>::const_iterator iterInner;
-    for (iterInner = inner.begin(); iterInner != inner.end(); ++iterInner) {
-      size_t length = iterInner->first;
-      float prob = iterInner->second;
-
-      phraseTableFile << sourcePos << "|" << prefix << "|" << length << "=" << prob << " ";
-    }
-  }
-
-}
-
 bool calcCrossedNonTerm(size_t sourcePos, size_t targetPos, const std::vector< std::set<size_t> > &alignedToS)
 {
   for (size_t currSource = 0; currSource < alignedToS.size(); ++currSource) {
@@ -664,21 +577,6 @@ void outputPhrasePair(const PhraseAlignmentCollection &phrasePair, float totalCo
   if (kneserNeyFlag)
     phraseTableFile << " " << distinctCount;
 
-  // nt lengths
-  if (outputNTLengths) {
-    phraseTableFile << " ||| ";
-
-    if (!inverseFlag) {
-      map<size_t, map<size_t, float> > sourceProb, targetProb;
-      // 1st sourcePos, 2nd = length, 3rd = prob
-
-      calcNTLengthProb(phrasePair, sourceProb, targetProb);
-
-      outputNTLengthProbs(phraseTableFile, sourceProb, "S");
-      outputNTLengthProbs(phraseTableFile, targetProb, "T");
-    }
-  }
-
   phraseTableFile << endl;
 }
author	Phil Williams <philip.williams@mac.com>	2013-09-14 01:16:42 +0400
committer	Phil Williams <philip.williams@mac.com>	2013-09-14 01:16:42 +0400
commit	cdd9df19d26723645454f8ddef467489d1ed341b (patch)
tree	1fd033efb62e16fc5fbb37077c7723cf7255bde3 /phrase-extract
parent	3b03d803d98194772e4b0ac6df7041c8c98c0119 (diff)