diff options
author | Kenneth Heafield <github@kheafield.com> | 2014-04-08 19:54:20 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2014-04-08 19:54:20 +0400 |
commit | 0952f0027a58e5f4cf26aeaa2cb5d97e10118231 (patch) | |
tree | 471094088eb2d0d4699d979bebb6d2b198247985 | |
parent | 22ec022ce949f1438da61b895cbe909e290ceca2 (diff) | |
parent | 5512e96185c4f3894efab7c49b834509bb16b529 (diff) |
Merge branch 'pruning2' of github.com:kpu/kenlm into pruning2 (pruning2)
-rw-r--r-- | lm/builder/initial_probabilities.cc | 20 |
1 file changed, 15 insertions, 5 deletions
diff --git a/lm/builder/initial_probabilities.cc b/lm/builder/initial_probabilities.cc index ebdbdb2..b29a971 100644 --- a/lm/builder/initial_probabilities.cc +++ b/lm/builder/initial_probabilities.cc @@ -150,16 +150,23 @@ class AddRight { for(; in; ++out) { memcpy(&previous[0], in->begin(), size); uint64_t denominator = 0; - float discountSum = 0.0; + uint64_t normalizer = 0; uint64_t counts[4]; memset(counts, 0, sizeof(counts)); do { denominator += in->UnmarkedCount(); - discountSum += discount_.Apply(in->UnmarkedCount() - in->CutoffCount()); - //mjd: Verify this! According to Chen&Goodman based on counts not on cutoffs. - ++counts[std::min(in->UnmarkedCount(), static_cast<uint64_t>(3))]; + // Collect unused probability mass from pruning. + // Becomes 0 for unpruned ngrams. + normalizer += in->UnmarkedCount() - in->CutoffCount(); + + // Chen&Goodman do not mention counting based on cutoffs, but + // backoff becomes larger than 1 otherwise, so probably needs + // to count cutoffs. Counts normally without pruning. + if(in->CutoffCount() > 0) + ++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))]; + } while (++in && !memcmp(&previous[0], in->begin(), size)); BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get()); @@ -169,7 +176,10 @@ class AddRight { entry.gamma += discount_.Get(i) * static_cast<float>(counts[i]); } - entry.gamma += discountSum; + + // Makes model sum to 1 with pruning (I hope). + entry.gamma += normalizer; + entry.gamma /= entry.denominator; if(pruning_) { |