Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/kpu/kenlm.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2014-02-06 06:38:35 +0400
committerKenneth Heafield <github@kheafield.com>2014-02-06 06:38:35 +0400
commit70d48aebdf062249d2398597b78bdbd072c27916 (patch)
tree86031a277a4585b73855bbf39b3175dcdc83a7a9
parent99203ab01a2dcb270c7a258874510ce5d2741cd8 (diff)
Vocabulary pad
-rw-r--r--lm/builder/interpolate.cc5
-rw-r--r--lm/builder/interpolate.hh4
-rw-r--r--lm/builder/lmplz_main.cc10
-rw-r--r--lm/builder/pipeline.cc2
-rw-r--r--lm/builder/pipeline.hh12
5 files changed, 27 insertions, 6 deletions
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 5002680..1514d46 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -52,8 +52,9 @@ class Callback {
};
} // namespace
-Interpolate::Interpolate(uint64_t unigram_count, const ChainPositions &backoffs)
- : uniform_prob_(1.0 / static_cast<float>(unigram_count - 1)), backoffs_(backoffs) {}
+Interpolate::Interpolate(uint64_t vocab_size, const ChainPositions &backoffs)
+ : uniform_prob_(1.0 / static_cast<float>(vocab_size)), // Includes <unk> but excludes <s>.
+ backoffs_(backoffs) {}
// perform order-wise interpolation
void Interpolate::Run(const ChainPositions &positions) {
diff --git a/lm/builder/interpolate.hh b/lm/builder/interpolate.hh
index 9268d40..70199af 100644
--- a/lm/builder/interpolate.hh
+++ b/lm/builder/interpolate.hh
@@ -14,7 +14,9 @@ namespace lm { namespace builder {
*/
class Interpolate {
public:
- explicit Interpolate(uint64_t unigram_count, const ChainPositions &backoffs);
+ // Normally the unigram count-1 (since p(<s>) = 0) but might be larger to
+ // set a consistent vocabulary size.
+ explicit Interpolate(uint64_t vocab_size, const ChainPositions &backoffs);
void Run(const ChainPositions &positions);
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index 74f59fc..cebbc00 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -47,9 +47,10 @@ int main(int argc, char *argv[]) {
("memory,S", SizeOption(pipeline.sort.total_memory, util::GuessPhysicalMemory() ? "80%" : "1G"), "Sorting memory")
("minimum_block", SizeOption(pipeline.minimum_block, "8K"), "Minimum block size to allow")
("sort_block", SizeOption(pipeline.sort.buffer_size, "64M"), "Size of IO operations for sort (determines arity)")
- ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
- ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
+ ("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
+ ("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write a file containing the unique vocabulary strings delimited by null bytes")
+ ("vocab_pad", po::value<std::size_t>(&pipeline.vocab_size_for_unk)->default_value(0), "If the vocabulary is smaller than this value, pad with <unk> to reach this size. Requires --interpolate_unigrams")
("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
@@ -95,6 +96,11 @@ int main(int argc, char *argv[]) {
}
#endif
+ if (vm.count("vocab_pad") && !pipeline.initial_probs.interpolate_unigrams) {
+ std::cerr << "--vocab_pad requires --interpolate_unigrams" << std::endl;
+ return 1;
+ }
+
util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
diff --git a/lm/builder/pipeline.cc b/lm/builder/pipeline.cc
index 44a2313..0788335 100644
--- a/lm/builder/pipeline.cc
+++ b/lm/builder/pipeline.cc
@@ -269,7 +269,7 @@ void InterpolateProbabilities(const std::vector<uint64_t> &counts, Master &maste
gamma_chains.push_back(read_backoffs);
gamma_chains.back() >> gammas[i].Source();
}
- master >> Interpolate(counts[0], ChainPositions(gamma_chains));
+ master >> Interpolate(std::max(master.Config().vocab_size_for_unk, counts[0] - 1 /* <s> is not included */), ChainPositions(gamma_chains));
gamma_chains >> util::stream::kRecycle;
master.BufferFinal(counts);
}
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index 845e548..4f3211e 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -30,6 +30,18 @@ struct PipelineConfig {
// Number of blocks to use. This will be overridden to 1 if everything fits.
std::size_t block_count;
+ /* Computing the perplexity of LMs with different vocabularies is hard. For
+ * example, the lowest perplexity is attained by a unigram model that
+ * predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly
+ * interpolated models will sum to more than 1 because <unk> is duplicated
+ * (SRI just pretends p(<unk>) = 0 for these purposes, which makes it sum to
+ * 1 but comes with its own problems). This option will make the vocabulary
+ * a particular size by replicating <unk> multiple times for purposes of
+ * computing vocabulary size. It has no effect if the actual vocabulary is
+ * larger. This parameter serves the same purpose as IRSTLM's "dub".
+ */
+ uint64_t vocab_size_for_unk;
+
const std::string &TempPrefix() const { return sort.temp_prefix; }
std::size_t TotalMemory() const { return sort.total_memory; }
};