diff options
author | Kenneth Heafield <github@kheafield.com> | 2012-11-05 00:36:42 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2012-11-05 00:36:42 +0400 |
commit | 14392acc8f41e3ed9e834573cfab433a54d4b68b (patch) | |
tree | d698c74d68505c021d6c37a7c75bca4b69e7aac5 /lm | |
parent | 96f7b42eb9944cc6fb5173c320562ca2dc5a2229 (diff) |
KenLM 36b746 including -w mmap <unk> offset fix
Diffstat (limited to 'lm')
-rw-r--r-- | lm/model.cc | 2 | +- |
-rw-r--r-- | lm/vocab.cc | 7 | +++++-- |
-rw-r--r-- | lm/vocab.hh | 5 | ++++- |
3 files changed, 10 insertions, 4 deletions
diff --git a/lm/model.cc b/lm/model.cc index 2fd204815..d045954e8 100644 --- a/lm/model.cc +++ b/lm/model.cc @@ -87,7 +87,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT WriteWordsWrapper wrap(config.enumerate_vocab); vocab_.ConfigureEnumerate(&wrap, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); - wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + backing_.search.size()); + wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config)); } else { vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]); search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_); diff --git a/lm/vocab.cc b/lm/vocab.cc index 11c27518d..fd7f96dc4 100644 --- a/lm/vocab.cc +++ b/lm/vocab.cc @@ -116,7 +116,9 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) { } *end_ = hashed; if (enumerate_) { - strings_to_enumerate_[end_ - begin_].assign(str.data(), str.size()); + void *copied = string_backing_.Allocate(str.size()); + memcpy(copied, str.data(), str.size()); + strings_to_enumerate_[end_ - begin_] = StringPiece(static_cast<const char*>(copied), str.size()); } ++end_; // This is 1 + the offset where it was inserted to make room for unk. 
@@ -126,7 +128,7 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) { void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { if (enumerate_) { if (!strings_to_enumerate_.empty()) { - util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); + util::PairedIterator<ProbBackoff*, StringPiece*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin()); util::JointSort(begin_, end_, values); } for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) { @@ -134,6 +136,7 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) { enumerate_->Add(i + 1, strings_to_enumerate_[i]); } strings_to_enumerate_.clear(); + string_backing_.FreeAll(); } else { util::JointSort(begin_, end_, reorder_vocab + 1); } diff --git a/lm/vocab.hh b/lm/vocab.hh index de54eb064..3902f1174 100644 --- a/lm/vocab.hh +++ b/lm/vocab.hh @@ -4,6 +4,7 @@ #include "lm/enumerate_vocab.hh" #include "lm/lm_exception.hh" #include "lm/virtual_interface.hh" +#include "util/pool.hh" #include "util/probing_hash_table.hh" #include "util/sorted_uniform.hh" #include "util/string_piece.hh" @@ -96,7 +97,9 @@ class SortedVocabulary : public base::Vocabulary { EnumerateVocab *enumerate_; // Actual strings. Used only when loading from ARPA and enumerate_ != NULL - std::vector<std::string> strings_to_enumerate_; + util::Pool string_backing_; + + std::vector<StringPiece> strings_to_enumerate_; }; #pragma pack(push) |