Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/kpu/kenlm.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Kenneth Heafield <github@kheafield.com> 2014-04-08 19:54:20 +0400
committer: Kenneth Heafield <github@kheafield.com> 2014-04-08 19:54:20 +0400
commit: 0952f0027a58e5f4cf26aeaa2cb5d97e10118231 (patch)
tree: 471094088eb2d0d4699d979bebb6d2b198247985
parent: 22ec022ce949f1438da61b895cbe909e290ceca2 (diff)
parent: 5512e96185c4f3894efab7c49b834509bb16b529 (diff)
Merge branch 'pruning2' of github.com:kpu/kenlm into pruning2 [branch: pruning2]
-rw-r--r-- lm/builder/initial_probabilities.cc | 20
1 file changed, 15 insertions, 5 deletions
diff --git a/lm/builder/initial_probabilities.cc b/lm/builder/initial_probabilities.cc
index ebdbdb2..b29a971 100644
--- a/lm/builder/initial_probabilities.cc
+++ b/lm/builder/initial_probabilities.cc
@@ -150,16 +150,23 @@ class AddRight {
for(; in; ++out) {
memcpy(&previous[0], in->begin(), size);
uint64_t denominator = 0;
- float discountSum = 0.0;
+ uint64_t normalizer = 0;
uint64_t counts[4];
memset(counts, 0, sizeof(counts));
do {
denominator += in->UnmarkedCount();
- discountSum += discount_.Apply(in->UnmarkedCount() - in->CutoffCount());
- //mjd: Verify this! According to Chen&Goodman based on counts not on cutoffs.
- ++counts[std::min(in->UnmarkedCount(), static_cast<uint64_t>(3))];
+ // Collect unused probability mass from pruning.
+ // Becomes 0 for unpruned ngrams.
+ normalizer += in->UnmarkedCount() - in->CutoffCount();
+
+ // Chen&Goodman do not mention counting based on cutoffs, but
+ // backoff becomes larger than 1 otherwise, so probably needs
+ // to count cutoffs. Counts normally without pruning.
+ if(in->CutoffCount() > 0)
+ ++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))];
+
} while (++in && !memcmp(&previous[0], in->begin(), size));
BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
@@ -169,7 +176,10 @@ class AddRight {
entry.gamma += discount_.Get(i) * static_cast<float>(counts[i]);
}
- entry.gamma += discountSum;
+
+ // Makes model sum to 1 with pruning (I hope).
+ entry.gamma += normalizer;
+
entry.gamma /= entry.denominator;
if(pruning_) {