github.com/moses-smt/mosesdecoder.git

path: root/moses/LM
author     Kenneth Heafield <github@kheafield.com>   2016-02-20 03:07:48 +0300
committer  Kenneth Heafield <github@kheafield.com>   2016-02-20 03:07:48 +0300
commit     7a1baeecda90456532ef54a3c4995082213fc6d0 (patch)
tree       893f3f95a4bb4ad755c9d40fe9fd6af7aedd3193 /moses/LM
parent     5f06e3310f52923c48326b78ef181eff61ef22f8 (diff)
load= option to KenLM exposing more load_method options
Diffstat (limited to 'moses/LM')
-rw-r--r--  moses/LM/Backward.cpp |   3
-rw-r--r--  moses/LM/Ken.cpp      | 102
-rw-r--r--  moses/LM/Ken.h        |   7
-rw-r--r--  moses/LM/Reloading.h  |   7
4 files changed, 40 insertions(+), 79 deletions(-)
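The new option is parsed from the KENLM feature line in moses.ini. A minimal sketch of how the key might be used (arguments other than load= are illustrative, not taken from this commit):

  # hypothetical moses.ini feature lines; values accepted by the parser in this diff:
  #   lazy, populate_or_lazy, populate_or_read (alias: populate), read, parallel_read
  KENLM name=LM0 factor=0 order=5 path=/path/to/lm.binary load=lazy
  # the old boolean switch is still parsed but deprecated in favour of load=
  KENLM name=LM0 factor=0 order=5 path=/path/to/lm.binary lazyken=1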
diff --git a/moses/LM/Backward.cpp b/moses/LM/Backward.cpp
index 2fb7451b5..411f559ca 100644
--- a/moses/LM/Backward.cpp
+++ b/moses/LM/Backward.cpp
@@ -40,7 +40,8 @@ namespace Moses
{
/** Constructs a new backward language model. */
-template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType,lazy)
+// TODO(lane): load_method instead of lazy bool
+template <class Model> BackwardLanguageModel<Model>::BackwardLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line,file,factorType, lazy ? util::LAZY : util::POPULATE_OR_READ)
{
//
// This space intentionally left blank
diff --git a/moses/LM/Ken.cpp b/moses/LM/Ken.cpp
index c81f3b859..a27940e72 100644
--- a/moses/LM/Ken.cpp
+++ b/moses/LM/Ken.cpp
@@ -69,63 +69,6 @@ struct KenLMState : public FFState {
};
-///*
-// * An implementation of single factor LM using Ken's code.
-// */
-//template <class Model> class LanguageModelKen : public LanguageModel
-//{
-//public:
-// LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
-//
-// const FFState *EmptyHypothesisState(const InputType &/*input*/) const {
-// KenLMState *ret = new KenLMState();
-// ret->state = m_ngram->BeginSentenceState();
-// return ret;
-// }
-//
-// void CalcScore(const Phrase &phrase, float &fullScore, float &ngramScore, size_t &oovCount) const;
-//
-// FFState *Evaluate(const Hypothesis &hypo, const FFState *ps, ScoreComponentCollection *out) const;
-//
-// FFState *EvaluateWhenApplied(const ChartHypothesis& cur_hypo, int featureID, ScoreComponentCollection *accumulator) const;
-//
-// void IncrementalCallback(Incremental::Manager &manager) const {
-// manager.LMCallback(*m_ngram, m_lmIdLookup);
-// }
-//
-// bool IsUseable(const FactorMask &mask) const;
-//private:
-// LanguageModelKen(const LanguageModelKen<Model> &copy_from);
-//
-// lm::WordIndex TranslateID(const Word &word) const {
-// std::size_t factor = word.GetFactor(m_factorType)->GetId();
-// return (factor >= m_lmIdLookup.size() ? 0 : m_lmIdLookup[factor]);
-// }
-//
-// // Convert last words of hypothesis into vocab ids, returning an end pointer.
-// lm::WordIndex *LastIDs(const Hypothesis &hypo, lm::WordIndex *indices) const {
-// lm::WordIndex *index = indices;
-// lm::WordIndex *end = indices + m_ngram->Order() - 1;
-// int position = hypo.GetCurrTargetWordsRange().GetEndPos();
-// for (; ; ++index, --position) {
-// if (index == end) return index;
-// if (position == -1) {
-// *index = m_ngram->GetVocabulary().BeginSentence();
-// return index + 1;
-// }
-// *index = TranslateID(hypo.GetWord(position));
-// }
-// }
-//
-// boost::shared_ptr<Model> m_ngram;
-//
-// std::vector<lm::WordIndex> m_lmIdLookup;
-//
-// FactorType m_factorType;
-//
-// const Factor *m_beginSentenceFactor;
-//};
-
class MappingBuilder : public lm::EnumerateVocab
{
public:
@@ -148,7 +91,7 @@ private:
} // namespace
-template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, bool lazy)
+template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string &file, util::LoadMethod load_method)
{
lm::ngram::Config config;
if(this->m_verbosity >= 1) {
@@ -159,19 +102,19 @@ template <class Model> void LanguageModelKen<Model>::LoadModel(const std::string
FactorCollection &collection = FactorCollection::Instance();
MappingBuilder builder(collection, m_lmIdLookup);
config.enumerate_vocab = &builder;
- config.load_method = lazy ? util::LAZY : util::POPULATE_OR_READ;
+ config.load_method = load_method;
m_ngram.reset(new Model(file.c_str(), config));
m_beginSentenceFactor = collection.AddFactor(BOS_);
}
-template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
+template <class Model> LanguageModelKen<Model>::LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
:LanguageModel(line)
,m_factorType(factorType)
{
ReadParameters();
- LoadModel(file, lazy);
+ LoadModel(file, load_method);
}
template <class Model> LanguageModelKen<Model>::LanguageModelKen(const LanguageModelKen<Model> &copy_from)
@@ -479,7 +422,7 @@ LanguageModel *ConstructKenLM(const std::string &lineOrig)
{
FactorType factorType = 0;
string filePath;
- bool lazy = false;
+ util::LoadMethod load_method = util::POPULATE_OR_READ;
util::TokenIter<util::SingleCharacter, true> argument(lineOrig, ' ');
++argument; // KENLM
@@ -500,38 +443,53 @@ LanguageModel *ConstructKenLM(const std::string &lineOrig)
} else if (name == "path") {
filePath.assign(value.data(), value.size());
} else if (name == "lazyken") {
- lazy = boost::lexical_cast<bool>(value);
+ // deprecated: use load instead.
+ load_method = boost::lexical_cast<bool>(value) ? util::LAZY : util::POPULATE_OR_READ;
+ } else if (name == "load") {
+ if (value == "lazy") {
+ load_method = util::LAZY;
+ } else if (value == "populate_or_lazy") {
+ load_method = util::POPULATE_OR_LAZY;
+ } else if (value == "populate_or_read" || value == "populate") {
+ load_method = util::POPULATE_OR_READ;
+ } else if (value == "read") {
+ load_method = util::READ;
+ } else if (value == "parallel_read") {
+ load_method = util::PARALLEL_READ;
+ } else {
+ UTIL_THROW2("Unknown KenLM load method " << value);
+ }
} else {
// pass to base class to interpret
line << " " << name << "=" << value;
}
}
- return ConstructKenLM(line.str(), filePath, factorType, lazy);
+ return ConstructKenLM(line.str(), filePath, factorType, load_method);
}
-LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy)
+LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method)
{
lm::ngram::ModelType model_type;
if (lm::ngram::RecognizeBinary(file.c_str(), model_type)) {
switch(model_type) {
case lm::ngram::PROBING:
- return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
+ return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
case lm::ngram::REST_PROBING:
- return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, lazy);
+ return new LanguageModelKen<lm::ngram::RestProbingModel>(line, file, factorType, load_method);
case lm::ngram::TRIE:
- return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, lazy);
+ return new LanguageModelKen<lm::ngram::TrieModel>(line, file, factorType, load_method);
case lm::ngram::QUANT_TRIE:
- return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, lazy);
+ return new LanguageModelKen<lm::ngram::QuantTrieModel>(line, file, factorType, load_method);
case lm::ngram::ARRAY_TRIE:
- return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, lazy);
+ return new LanguageModelKen<lm::ngram::ArrayTrieModel>(line, file, factorType, load_method);
case lm::ngram::QUANT_ARRAY_TRIE:
- return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, lazy);
+ return new LanguageModelKen<lm::ngram::QuantArrayTrieModel>(line, file, factorType, load_method);
default:
UTIL_THROW2("Unrecognized kenlm model type " << model_type);
}
} else {
- return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, lazy);
+ return new LanguageModelKen<lm::ngram::ProbingModel>(line, file, factorType, load_method);
}
}
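For code that builds the feature directly, the second ConstructKenLM overload now takes util::LoadMethod instead of a bool. A minimal C++ sketch under that assumption (namespace, path, and order are placeholders; only the signature comes from this commit):

  // Sketch only: construct a KenLM feature with an explicit load method,
  // using the overload declared in Ken.h (diff below). Placeholders throughout.
  #include <string>
  #include "moses/LM/Ken.h"
  #include "util/mmap.hh"

  Moses::LanguageModel *BuildLM(const std::string &path)
  {
    const std::string line = "KENLM factor=0 order=5 path=" + path;
    return Moses::ConstructKenLM(line, path, /*factorType=*/0, util::PARALLEL_READ);
  }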
diff --git a/moses/LM/Ken.h b/moses/LM/Ken.h
index 3a94e4c0b..4934228c2 100644
--- a/moses/LM/Ken.h
+++ b/moses/LM/Ken.h
@@ -26,6 +26,7 @@ Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
#include <boost/shared_ptr.hpp>
#include "lm/word_index.hh"
+#include "util/mmap.hh"
#include "moses/LM/Base.h"
#include "moses/Hypothesis.h"
@@ -41,7 +42,7 @@ class FFState;
LanguageModel *ConstructKenLM(const std::string &line);
//! This will also load. Returns a templated KenLM class
-LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
+LanguageModel *ConstructKenLM(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method);
/*
* An implementation of single factor LM using Kenneth's code.
@@ -49,7 +50,7 @@ LanguageModel *ConstructKenLM(const std::string &line, const std::string &file,
template <class Model> class LanguageModelKen : public LanguageModel
{
public:
- LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, bool lazy);
+ LanguageModelKen(const std::string &line, const std::string &file, FactorType factorType, util::LoadMethod load_method);
virtual const FFState *EmptyHypothesisState(const InputType &/*input*/) const;
@@ -73,7 +74,7 @@ protected:
FactorType m_factorType;
- void LoadModel(const std::string &file, bool lazy);
+ void LoadModel(const std::string &file, util::LoadMethod load_method);
lm::WordIndex TranslateID(const Word &word) const {
std::size_t factor = word.GetFactor(m_factorType)->GetId();
diff --git a/moses/LM/Reloading.h b/moses/LM/Reloading.h
index 3993fe9d7..d5ae83d17 100644
--- a/moses/LM/Reloading.h
+++ b/moses/LM/Reloading.h
@@ -64,8 +64,8 @@ private:
template <class Model> class ReloadingLanguageModel : public LanguageModelKen<Model>
{
public:
-
- ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line, file, factorType, lazy), m_file(file), m_lazy(lazy) {
+ // TODO(Lane) copy less code, update to load_method
+ ReloadingLanguageModel(const std::string &line, const std::string &file, FactorType factorType, bool lazy) : LanguageModelKen<Model>(line, file, factorType, lazy ? util::LAZY : util::POPULATE_OR_READ), m_file(file), m_lazy(lazy) {
std::cerr << "ReloadingLM constructor: " << m_file << std::endl;
// std::cerr << std::string(line).replace(0,11,"KENLM") << std::endl;
@@ -74,7 +74,8 @@ public:
virtual void InitializeForInput(ttasksptr const& ttask) {
std::cerr << "ReloadingLM InitializeForInput" << std::endl;
- LanguageModelKen<Model>::LoadModel(m_file, m_lazy);
+ // TODO(lane): load_method
+ LanguageModelKen<Model>::LoadModel(m_file, m_lazy ? util::LAZY : util::POPULATE_OR_READ);
/*
lm::ngram::Config config;
if(this->m_verbosity >= 1) {