diff options
author | Lane Schwartz <dowobeha@gmail.com> | 2012-08-09 00:22:13 +0400 |
---|---|---|
committer | Lane Schwartz <dowobeha@gmail.com> | 2012-08-09 00:22:13 +0400 |
commit | da5429318b4c8b4a6d06109fcbdc100ea230202f (patch) | |
tree | da3a41787d009d30b871de11f777d3fef625dc10 | |
parent | c55931f3524e9e31cb8f76983f150da76b42680d (diff) |
KenLM maximum n-gram order can now be set via a compile-time flag
-rw-r--r-- | Jamroot | 11 |
-rw-r--r-- | lm/left.hh | 3 |
-rw-r--r-- | lm/max_order.hh | 14 |
-rw-r--r-- | lm/model.cc | 2 |
-rw-r--r-- | lm/model.hh | 1 |
-rw-r--r-- | lm/quantize.hh | 3 |
-rw-r--r-- | lm/search_trie.cc | 21 |
-rw-r--r-- | lm/state.hh | 11 |
-rw-r--r-- | lm/trie_sort.hh | 3 |
-rw-r--r-- | scripts/ems/experiment.meta | 4 |
10 files changed, 32 insertions, 41 deletions
@@ -56,6 +56,7 @@ # # --without-libsegfault does not link with libSegFault # +# --max-kenlm-order maximum ngram order that kenlm can process (default 6) # #CONTROLLING THE BUILD #-a to build from scratch @@ -83,6 +84,16 @@ if [ option.get "with-cmph" ] { requirements += <define>HAVE_CMPH ; } +# If you need higher order, change this option +# Having this limit means that State can be +# (KENLM_MAX_ORDER - 1) * sizeof(float) bytes instead of +# sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead +max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ; +requirements += <define>KENLM_MAX_ORDER=$(max-order) ; +if ( $(max-order) != 6 ) { + echo "Setting KenLM maximum n-gram order to $(max-order)" ; +} + project : default-build <threading>multi <warnings>on diff --git a/lm/left.hh b/lm/left.hh index c00af88a3..751984c5e 100644 --- a/lm/left.hh +++ b/lm/left.hh @@ -38,7 +38,6 @@ #ifndef LM_LEFT__ #define LM_LEFT__ -#include "lm/max_order.hh" #include "lm/state.hh" #include "lm/return.hh" @@ -111,7 +110,7 @@ template <class M> class RuleScore { return; } - float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1]; + float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1]; float *back = backoffs, *back2 = backoffs2; unsigned char next_use = out_.right.length; diff --git a/lm/max_order.hh b/lm/max_order.hh deleted file mode 100644 index 71cd23dd2..000000000 --- a/lm/max_order.hh +++ /dev/null @@ -1,14 +0,0 @@ -#ifndef LM_MAX_ORDER__ -#define LM_MAX_ORDER__ -namespace lm { -namespace ngram { -// If you need higher order, change this and recompile. 
-// Having this limit means that State can be -// (kMaxOrder - 1) * sizeof(float) bytes instead of -// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead -const unsigned char kMaxOrder = 6; - -} // namespace ngram -} // namespace lm - -#endif // LM_MAX_ORDER__ diff --git a/lm/model.cc b/lm/model.cc index a2d31ce0f..6547ba03d 100644 --- a/lm/model.cc +++ b/lm/model.cc @@ -61,7 +61,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT // File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_. ReadARPACounts(f, counts); - if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Edit lm/max_order.hh, set kMaxOrder to at least this value, and recompile."); + if (counts.size() > KENLM_MAX_ORDER) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Re-compile, passing a number at least this large to bjam's --max-kenlm-order flag."); if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model."); if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0"); diff --git a/lm/model.hh b/lm/model.hh index be8721788..6dee94196 100644 --- a/lm/model.hh +++ b/lm/model.hh @@ -5,7 +5,6 @@ #include "lm/binary_format.hh" #include "lm/config.hh" #include "lm/facade.hh" -#include "lm/max_order.hh" #include "lm/quantize.hh" #include "lm/search_hashed.hh" #include "lm/search_trie.hh" diff --git a/lm/quantize.hh b/lm/quantize.hh index cd7e8f2f0..36c427272 100644 --- a/lm/quantize.hh +++ b/lm/quantize.hh @@ -3,7 +3,6 @@ #include "lm/blank.hh" #include "lm/config.hh" -#include "lm/max_order.hh" #include "lm/model_type.hh" #include "util/bit_packing.hh" @@ -217,7 +216,7 @@ class SeparatelyQuantize { const Bins &LongestTable() const { return longest_; } private: - Bins tables_[kMaxOrder - 1][2]; + Bins tables_[KENLM_MAX_ORDER 
- 1][2]; Bins longest_; diff --git a/lm/search_trie.cc b/lm/search_trie.cc index 18e80d5a6..9a3e96916 100644 --- a/lm/search_trie.cc +++ b/lm/search_trie.cc @@ -5,7 +5,6 @@ #include "lm/binary_format.hh" #include "lm/blank.hh" #include "lm/lm_exception.hh" -#include "lm/max_order.hh" #include "lm/quantize.hh" #include "lm/trie.hh" #include "lm/trie_sort.hh" @@ -180,7 +179,7 @@ const float kBadProb = std::numeric_limits<float>::infinity(); class SRISucks { public: SRISucks() { - for (BackoffMessages *i = messages_; i != messages_ + kMaxOrder - 1; ++i) + for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i) i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1)); } @@ -196,7 +195,7 @@ class SRISucks { } void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) { - for (unsigned char i = 0; i < kMaxOrder - 1; ++i) { + for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) { it_[i] = values_[i].empty() ? NULL : &*values_[i].begin(); } messages_[0].Apply(it_, unigram_file); @@ -221,10 +220,10 @@ class SRISucks { private: // This used to be one array. Then I needed to separate it by order for quantization to work. 
- std::vector<float> values_[kMaxOrder - 1]; - BackoffMessages messages_[kMaxOrder - 1]; + std::vector<float> values_[KENLM_MAX_ORDER - 1]; + BackoffMessages messages_[KENLM_MAX_ORDER - 1]; - float *it_[kMaxOrder - 1]; + float *it_[KENLM_MAX_ORDER - 1]; }; class FindBlanks { @@ -337,7 +336,7 @@ struct Gram { template <class Doing> class BlankManager { public: BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) { - for (float *i = basis_; i != basis_ + kMaxOrder - 1; ++i) *i = kBadProb; + for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb; } void Visit(const WordIndex *to, unsigned char length, float prob) { @@ -373,10 +372,10 @@ template <class Doing> class BlankManager { private: const unsigned char total_order_; - WordIndex been_[kMaxOrder]; + WordIndex been_[KENLM_MAX_ORDER]; unsigned char been_length_; - float basis_[kMaxOrder]; + float basis_[KENLM_MAX_ORDER]; Doing &doing_; }; @@ -470,8 +469,8 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c } // namespace template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) { - RecordReader inputs[kMaxOrder - 1]; - RecordReader contexts[kMaxOrder - 1]; + RecordReader inputs[KENLM_MAX_ORDER - 1]; + RecordReader contexts[KENLM_MAX_ORDER - 1]; for (unsigned char i = 2; i <= counts.size(); ++i) { inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff))); diff --git a/lm/state.hh b/lm/state.hh index c74384143..3dbf617bf 100644 --- a/lm/state.hh +++ b/lm/state.hh @@ -1,7 +1,6 @@ #ifndef LM_STATE__ #define LM_STATE__ -#include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/murmur_hash.hh" @@ -32,7 +31,7 @@ class State { // Call this before using raw memcmp. 
void ZeroRemaining() { - for (unsigned char i = length; i < kMaxOrder - 1; ++i) { + for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) { words[i] = 0; backoff[i] = 0.0; } @@ -42,8 +41,8 @@ class State { // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD. // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit. - WordIndex words[kMaxOrder - 1]; - float backoff[kMaxOrder - 1]; + WordIndex words[KENLM_MAX_ORDER - 1]; + float backoff[KENLM_MAX_ORDER - 1]; unsigned char length; }; @@ -72,11 +71,11 @@ struct Left { } void ZeroRemaining() { - for (uint64_t * i = pointers + length; i < pointers + kMaxOrder - 1; ++i) + for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i) *i = 0; } - uint64_t pointers[kMaxOrder - 1]; + uint64_t pointers[KENLM_MAX_ORDER - 1]; unsigned char length; bool full; }; diff --git a/lm/trie_sort.hh b/lm/trie_sort.hh index 6ef17eb9f..c1be9bfc4 100644 --- a/lm/trie_sort.hh +++ b/lm/trie_sort.hh @@ -3,7 +3,6 @@ #ifndef LM_TRIE_SORT__ #define LM_TRIE_SORT__ -#include "lm/max_order.hh" #include "lm/word_index.hh" #include "util/file.hh" @@ -107,7 +106,7 @@ class SortedFiles { util::scoped_fd unigram_; - util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1]; + util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1]; }; } // namespace trie diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta index 45f689736..3ad263027 100644 --- a/scripts/ems/experiment.meta +++ b/scripts/ems/experiment.meta @@ -208,7 +208,7 @@ binarize rerun-on-change: lm default-name: lm/binlm template: $lm-binarizer IN OUT - error: set kMaxOrder to at least this value + error: set KENLM_MAX_ORDER to at least this value [INTERPOLATED-LM] single tuning-from-sgm @@ -275,7 +275,7 @@ binarize ignore-unless: script rerun-on-change: lm default-name: 
lm/interpolated-binlm - error: set kMaxOrder to at least this value + error: set KENLM_MAX_ORDER to at least this value [TRAINING] single consolidate |