KenLM maximum n-gram order can now be set via a compile-time flag

author: Lane Schwartz <dowobeha@gmail.com> 2012-08-09 00:22:13 +0400
committer: Lane Schwartz <dowobeha@gmail.com> 2012-08-09 00:22:13 +0400
commit: da5429318b4c8b4a6d06109fcbdc100ea230202f (patch)
tree: da3a41787d009d30b871de11f777d3fef625dc10
parent: c55931f3524e9e31cb8f76983f150da76b42680d (diff)
10 files changed, 32 insertions, 41 deletions
diff --git a/Jamroot b/Jamroot
index 900f5dc37..bf983d1a5 100644
--- a/Jamroot
+++ b/Jamroot
@@ -56,6 +56,7 @@
 #
 # --without-libsegfault          does not link with libSegFault
 #
+# --max-kenlm-order=N            maximum n-gram order that kenlm can process (default 6)
 #
 #CONTROLLING THE BUILD
 #-a to build from scratch
@@ -83,6 +84,16 @@ if [ option.get "with-cmph" ] {
   requirements += <define>HAVE_CMPH ;
 }
 
+# If you need a higher order, pass --max-kenlm-order=N to bjam and recompile.
+# Having this limit means that State can be
+# (KENLM_MAX_ORDER - 1) * sizeof(float) bytes instead of
+# sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead
+max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
+requirements += <define>KENLM_MAX_ORDER=$(max-order) ;
+if ( $(max-order) != 6 ) {
+   echo "Setting KenLM maximum n-gram order to $(max-order)" ;
+}
+
 project : default-build
   <threading>multi
   <warnings>on
diff --git a/lm/left.hh b/lm/left.hh
index c00af88a3..751984c5e 100644
--- a/lm/left.hh
+++ b/lm/left.hh
@@ -38,7 +38,6 @@
 #ifndef LM_LEFT__
 #define LM_LEFT__
 
-#include "lm/max_order.hh"
 #include "lm/state.hh"
 #include "lm/return.hh"
 
@@ -111,7 +110,7 @@ template <class M> class RuleScore {
         return;
       }
 
-      float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1];
+      float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1];
       float *back = backoffs, *back2 = backoffs2;
       unsigned char next_use = out_.right.length;
 
diff --git a/lm/max_order.hh b/lm/max_order.hh
deleted file mode 100644
index 71cd23dd2..000000000
--- a/lm/max_order.hh
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef LM_MAX_ORDER__
-#define LM_MAX_ORDER__
-namespace lm {
-namespace ngram {
-// If you need higher order, change this and recompile.  
-// Having this limit means that State can be
-// (kMaxOrder - 1) * sizeof(float) bytes instead of
-// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
-const unsigned char kMaxOrder = 6;
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_MAX_ORDER__
diff --git a/lm/model.cc b/lm/model.cc
index a2d31ce0f..6547ba03d 100644
--- a/lm/model.cc
+++ b/lm/model.cc
@@ -61,7 +61,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
     // File counts do not include pruned trigrams that extend to quadgrams etc.   These will be fixed by search_.
     ReadARPACounts(f, counts);
 
-    if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ".  Edit lm/max_order.hh, set kMaxOrder to at least this value, and recompile.");
+    if (counts.size() > KENLM_MAX_ORDER) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ".  Re-compile, passing a number at least this large to bjam's --max-kenlm-order flag.");
     if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model.");
     if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
 
diff --git a/lm/model.hh b/lm/model.hh
index be8721788..6dee94196 100644
--- a/lm/model.hh
+++ b/lm/model.hh
@@ -5,7 +5,6 @@
 #include "lm/binary_format.hh"
 #include "lm/config.hh"
 #include "lm/facade.hh"
-#include "lm/max_order.hh"
 #include "lm/quantize.hh"
 #include "lm/search_hashed.hh"
 #include "lm/search_trie.hh"
diff --git a/lm/quantize.hh b/lm/quantize.hh
index cd7e8f2f0..36c427272 100644
--- a/lm/quantize.hh
+++ b/lm/quantize.hh
@@ -3,7 +3,6 @@
 
 #include "lm/blank.hh"
 #include "lm/config.hh"
-#include "lm/max_order.hh"
 #include "lm/model_type.hh"
 #include "util/bit_packing.hh"
 
@@ -217,7 +216,7 @@ class SeparatelyQuantize {
     const Bins &LongestTable() const { return longest_; }
 
   private:
-    Bins tables_[kMaxOrder - 1][2];
+    Bins tables_[KENLM_MAX_ORDER - 1][2];
 
     Bins longest_;
 
diff --git a/lm/search_trie.cc b/lm/search_trie.cc
index 18e80d5a6..9a3e96916 100644
--- a/lm/search_trie.cc
+++ b/lm/search_trie.cc
@@ -5,7 +5,6 @@
 #include "lm/binary_format.hh"
 #include "lm/blank.hh"
 #include "lm/lm_exception.hh"
-#include "lm/max_order.hh"
 #include "lm/quantize.hh"
 #include "lm/trie.hh"
 #include "lm/trie_sort.hh"
@@ -180,7 +179,7 @@ const float kBadProb = std::numeric_limits<float>::infinity();
 class SRISucks {
   public:
     SRISucks() {
-      for (BackoffMessages *i = messages_; i != messages_ + kMaxOrder - 1; ++i)
+      for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i)
         i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1));
     }
 
@@ -196,7 +195,7 @@ class SRISucks {
     }
 
     void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
-      for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
+      for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) {
         it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
       }
       messages_[0].Apply(it_, unigram_file);
@@ -221,10 +220,10 @@ class SRISucks {
 
   private:
     // This used to be one array.  Then I needed to separate it by order for quantization to work.  
-    std::vector<float> values_[kMaxOrder - 1];
-    BackoffMessages messages_[kMaxOrder - 1];
+    std::vector<float> values_[KENLM_MAX_ORDER - 1];
+    BackoffMessages messages_[KENLM_MAX_ORDER - 1];
 
-    float *it_[kMaxOrder - 1];
+    float *it_[KENLM_MAX_ORDER - 1];
 };
 
 class FindBlanks {
@@ -337,7 +336,7 @@ struct Gram {
 template <class Doing> class BlankManager {
   public:
     BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) {
-      for (float *i = basis_; i != basis_ + kMaxOrder - 1; ++i) *i = kBadProb;
+      for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb;
     }
 
     void Visit(const WordIndex *to, unsigned char length, float prob) {
@@ -373,10 +372,10 @@ template <class Doing> class BlankManager {
   private:
     const unsigned char total_order_;
 
-    WordIndex been_[kMaxOrder];
+    WordIndex been_[KENLM_MAX_ORDER];
     unsigned char been_length_;
 
-    float basis_[kMaxOrder];
+    float basis_[KENLM_MAX_ORDER];
     
     Doing &doing_;
 };
@@ -470,8 +469,8 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
 } // namespace
 
 template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
-  RecordReader inputs[kMaxOrder - 1];
-  RecordReader contexts[kMaxOrder - 1];
+  RecordReader inputs[KENLM_MAX_ORDER - 1];
+  RecordReader contexts[KENLM_MAX_ORDER - 1];
 
   for (unsigned char i = 2; i <= counts.size(); ++i) {
     inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
diff --git a/lm/state.hh b/lm/state.hh
index c74384143..3dbf617bf 100644
--- a/lm/state.hh
+++ b/lm/state.hh
@@ -1,7 +1,6 @@
 #ifndef LM_STATE__
 #define LM_STATE__
 
-#include "lm/max_order.hh"
 #include "lm/word_index.hh"
 #include "util/murmur_hash.hh"
 
@@ -32,7 +31,7 @@ class State {
 
     // Call this before using raw memcmp.  
     void ZeroRemaining() {
-      for (unsigned char i = length; i < kMaxOrder - 1; ++i) {
+      for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) {
         words[i] = 0;
         backoff[i] = 0.0;
       }
@@ -42,8 +41,8 @@ class State {
 
     // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.  
     // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.  
-    WordIndex words[kMaxOrder - 1];
-    float backoff[kMaxOrder - 1];
+    WordIndex words[KENLM_MAX_ORDER - 1];
+    float backoff[KENLM_MAX_ORDER - 1];
     unsigned char length;
 };
 
@@ -72,11 +71,11 @@ struct Left {
   }
 
   void ZeroRemaining() {
-    for (uint64_t * i = pointers + length; i < pointers + kMaxOrder - 1; ++i)
+    for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i)
       *i = 0;
   }
 
-  uint64_t pointers[kMaxOrder - 1];
+  uint64_t pointers[KENLM_MAX_ORDER - 1];
   unsigned char length;
   bool full;
 };
diff --git a/lm/trie_sort.hh b/lm/trie_sort.hh
index 6ef17eb9f..c1be9bfc4 100644
--- a/lm/trie_sort.hh
+++ b/lm/trie_sort.hh
@@ -3,7 +3,6 @@
 #ifndef LM_TRIE_SORT__
 #define LM_TRIE_SORT__
 
-#include "lm/max_order.hh"
 #include "lm/word_index.hh"
 
 #include "util/file.hh"
@@ -107,7 +106,7 @@ class SortedFiles {
     
     util::scoped_fd unigram_;
 
-    util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1];
+    util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1];
 };
 
 } // namespace trie
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 45f689736..3ad263027 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -208,7 +208,7 @@ binarize
 	rerun-on-change: lm
 	default-name: lm/binlm
 	template: $lm-binarizer IN OUT
-  error: set kMaxOrder to at least this value
+  error: Re-compile, passing a number at least this large
 
 [INTERPOLATED-LM] single
 tuning-from-sgm
@@ -275,7 +275,7 @@ binarize
 	ignore-unless: script
 	rerun-on-change: lm
 	default-name: lm/interpolated-binlm
-  error: set kMaxOrder to at least this value
+  error: Re-compile, passing a number at least this large
 
 [TRAINING] single
 consolidate
author	Lane Schwartz <dowobeha@gmail.com>	2012-08-09 00:22:13 +0400
committer	Lane Schwartz <dowobeha@gmail.com>	2012-08-09 00:22:13 +0400
commit	da5429318b4c8b4a6d06109fcbdc100ea230202f (patch)
tree	da3a41787d009d30b871de11f777d3fef625dc10
parent	c55931f3524e9e31cb8f76983f150da76b42680d (diff)