KenLM 36b746 including -w mmap <unk> offset fix

author: Kenneth Heafield <github@kheafield.com> 2012-11-05 00:36:42 +0400
committer: Kenneth Heafield <github@kheafield.com> 2012-11-05 00:36:42 +0400
commit: 14392acc8f41e3ed9e834573cfab433a54d4b68b (patch)
tree: d698c74d68505c021d6c37a7c75bca4b69e7aac5 /lm
parent: 96f7b42eb9944cc6fb5173c320562ca2dc5a2229 (diff)
3 files changed, 10 insertions, 4 deletions
diff --git a/lm/model.cc b/lm/model.cc
index 2fd204815..d045954e8 100644
--- a/lm/model.cc
+++ b/lm/model.cc
@@ -87,7 +87,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
       WriteWordsWrapper wrap(config.enumerate_vocab);
       vocab_.ConfigureEnumerate(&wrap, counts[0]);
       search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
-      wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + backing_.search.size());
+      wrap.Write(backing_.file.get(), backing_.vocab.size() + vocab_.UnkCountChangePadding() + Search::Size(counts, config));
     } else {
       vocab_.ConfigureEnumerate(config.enumerate_vocab, counts[0]);
       search_.InitializeFromARPA(file, f, counts, config, vocab_, backing_);
diff --git a/lm/vocab.cc b/lm/vocab.cc
index 11c27518d..fd7f96dc4 100644
--- a/lm/vocab.cc
+++ b/lm/vocab.cc
@@ -116,7 +116,9 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) {
   }
   *end_ = hashed;
   if (enumerate_) {
-    strings_to_enumerate_[end_ - begin_].assign(str.data(), str.size());
+    void *copied = string_backing_.Allocate(str.size());
+    memcpy(copied, str.data(), str.size());
+    strings_to_enumerate_[end_ - begin_] = StringPiece(static_cast<const char*>(copied), str.size());
   }
   ++end_;
   // This is 1 + the offset where it was inserted to make room for unk.  
@@ -126,7 +128,7 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) {
 void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
   if (enumerate_) {
     if (!strings_to_enumerate_.empty()) {
-      util::PairedIterator<ProbBackoff*, std::string*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
+      util::PairedIterator<ProbBackoff*, StringPiece*> values(reorder_vocab + 1, &*strings_to_enumerate_.begin());
       util::JointSort(begin_, end_, values);
     }
     for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) {
@@ -134,6 +136,7 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
       enumerate_->Add(i + 1, strings_to_enumerate_[i]);
     }
     strings_to_enumerate_.clear();
+    string_backing_.FreeAll();
   } else {
     util::JointSort(begin_, end_, reorder_vocab + 1);
   }
diff --git a/lm/vocab.hh b/lm/vocab.hh
index de54eb064..3902f1174 100644
--- a/lm/vocab.hh
+++ b/lm/vocab.hh
@@ -4,6 +4,7 @@
 #include "lm/enumerate_vocab.hh"
 #include "lm/lm_exception.hh"
 #include "lm/virtual_interface.hh"
+#include "util/pool.hh"
 #include "util/probing_hash_table.hh"
 #include "util/sorted_uniform.hh"
 #include "util/string_piece.hh"
@@ -96,7 +97,9 @@ class SortedVocabulary : public base::Vocabulary {
     EnumerateVocab *enumerate_;
 
     // Actual strings.  Used only when loading from ARPA and enumerate_ != NULL 
-    std::vector<std::string> strings_to_enumerate_;
+    util::Pool string_backing_;
+
+    std::vector<StringPiece> strings_to_enumerate_;
 };
 
 #pragma pack(push)
author	Kenneth Heafield <github@kheafield.com>	2012-11-05 00:36:42 +0400
committer	Kenneth Heafield <github@kheafield.com>	2012-11-05 00:36:42 +0400
commit	14392acc8f41e3ed9e834573cfab433a54d4b68b (patch)
tree	d698c74d68505c021d6c37a7c75bca4b69e7aac5 /lm
parent	96f7b42eb9944cc6fb5173c320562ca2dc5a2229 (diff)