Vocabulary pad

author: Kenneth Heafield <github@kheafield.com> 2014-02-06 06:38:35 +0400
committer: Kenneth Heafield <github@kheafield.com> 2014-02-06 06:38:35 +0400
commit: 70d48aebdf062249d2398597b78bdbd072c27916 (patch)
tree: 86031a277a4585b73855bbf39b3175dcdc83a7a9
parent: 99203ab01a2dcb270c7a258874510ce5d2741cd8 (diff)
5 files changed, 27 insertions, 6 deletions
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 5002680..1514d46 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -52,8 +52,9 @@ class Callback {
 };
 } // namespace
 
-Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs) 
-  : uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
+Interpolate::Interpolate(uint64_t vocab_size, const ChainPositions &backoffs)
+  : uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>.
+    backoffs_(backoffs) {}
 
 // perform order-wise interpolation
 void Interpolate::Run(const ChainPositions &positions) {
diff --git a/lm/builder/interpolate.hh b/lm/builder/interpolate.hh
index 9268d40..70199af 100644
--- a/lm/builder/interpolate.hh
+++ b/lm/builder/interpolate.hh
@@ -14,7 +14,9 @@ namespace lm { namespace builder {
  */
 class Interpolate {
   public:
-    explicit Interpolate(uint64_t unigram_count, const ChainPositions &backoffs);
+    // Normally the unigram count-1 (since p(<s>) = 0) but might be larger to
+    // set a consistent vocabulary size.
+    explicit Interpolate(uint64_t vocab_size, const ChainPositions &backoffs);
 
     void Run(const ChainPositions &positions);
 
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index 74f59fc..cebbc00 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -47,9 +47,10 @@ int main(int argc, char *argv[]) {
       ("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
       ("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
       ("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
-      ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
       ("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
-      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
+      ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
+      ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
+      ("vocab_pad", po::value<uint64_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams") // uint64_t matches the declared type of PipelineConfig::vocab_size_for_unk.
       ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
       ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
       ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
@@ -95,6 +96,11 @@ int main(int argc, char *argv[]) {
     }
 #endif
 
+    if (pipeline.vocab_size_for_unk && !pipeline.initial_probs.interpolate_unigrams) { // Test the parsed value (0 = no padding): default_value(0) makes vm.count("vocab_pad") nonzero even when the flag is absent.
+      std::cerr << "--vocab_pad requires --interpolate_unigrams" << std::endl;
+      return 1;
+    }
+
     util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
 
     lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
diff --git a/lm/builder/pipeline.cc b/lm/builder/pipeline.cc
index 44a2313..0788335 100644
--- a/lm/builder/pipeline.cc
+++ b/lm/builder/pipeline.cc
@@ -269,7 +269,7 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
     gamma_chains.push_back(read_backoffs);
     gamma_chains.back() >> gammas[i].Source();
   }
-  master >> Interpolate(counts[0], ChainPositions(gamma_chains));
+  master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), ChainPositions(gamma_chains));
   gamma_chains >> util::stream::kRecycle;
   master.BufferFinal(counts);
 }
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index 845e548..4f3211e 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -30,6 +30,18 @@ struct PipelineConfig {
   // Number of blocks to use.  This will be overridden to 1 if everything fits.
   std::size_t block_count;
 
+  /* Computing the perplexity of LMs with different vocabularies is hard.  For
+   * example, the lowest perplexity is attained by a unigram model that
+   * predicts p(<unk>) = 1 and has no other vocabulary.  Also, linearly
+   * interpolated models will sum to more than 1 because <unk> is duplicated
+   * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
+   * 1 but comes with its own problems).  This option will make the vocabulary
+   * a particular size by replicating <unk> multiple times for purposes of
+   * computing vocabulary size.  It has no effect if the actual vocabulary is
+   * larger.  This parameter serves the same purpose as IRSTLM's "dub".
+   */
+  uint64_t vocab_size_for_unk;
+
   const std::string &TempPrefix() const { return sort.temp_prefix; }
   std::size_t TotalMemory() const { return sort.total_memory; }
 };
author	Kenneth Heafield <github@kheafield.com>	2014-02-06 06:38:35 +0400
committer	Kenneth Heafield <github@kheafield.com>	2014-02-06 06:38:35 +0400
commit	70d48aebdf062249d2398597b78bdbd072c27916 (patch)
tree	86031a277a4585b73855bbf39b3175dcdc83a7a9
parent	99203ab01a2dcb270c7a258874510ce5d2741cd8 (diff)