Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- Jamroot 11
-rw-r--r-- lm/left.hh 3
-rw-r--r-- lm/max_order.hh 14
-rw-r--r-- lm/model.cc 2
-rw-r--r-- lm/model.hh 1
-rw-r--r-- lm/quantize.hh 3
-rw-r--r-- lm/search_trie.cc 21
-rw-r--r-- lm/state.hh 11
-rw-r--r-- lm/trie_sort.hh 3
-rw-r--r-- scripts/ems/experiment.meta 4
10 files changed, 32 insertions, 41 deletions
diff --git a/Jamroot b/Jamroot
index 900f5dc37..bf983d1a5 100644
--- a/Jamroot
+++ b/Jamroot
@@ -56,6 +56,7 @@
#
# --without-libsegfault does not link with libSegFault
#
+# --max-kenlm-order maximum ngram order that kenlm can process (default 6)
#
#CONTROLLING THE BUILD
#-a to build from scratch
@@ -83,6 +84,16 @@ if [ option.get "with-cmph" ] {
requirements += <define>HAVE_CMPH ;
}
+# If you need higher order, change this option
+# Having this limit means that State can be
+# (KENLM_MAX_ORDER - 1) * sizeof(float) bytes instead of
+# sizeof(float*) + (KENLM_MAX_ORDER - 1) * sizeof(float) + malloc overhead
+max-order = [ option.get "max-kenlm-order" : 6 : 6 ] ;
+requirements += <define>KENLM_MAX_ORDER=$(max-order) ;
+if ( $(max-order) != 6 ) {
+ echo "Setting KenLM maximum n-gram order to $(max-order)" ;
+}
+
project : default-build
<threading>multi
<warnings>on
diff --git a/lm/left.hh b/lm/left.hh
index c00af88a3..751984c5e 100644
--- a/lm/left.hh
+++ b/lm/left.hh
@@ -38,7 +38,6 @@
#ifndef LM_LEFT__
#define LM_LEFT__
-#include "lm/max_order.hh"
#include "lm/state.hh"
#include "lm/return.hh"
@@ -111,7 +110,7 @@ template <class M> class RuleScore {
return;
}
- float backoffs[kMaxOrder - 1], backoffs2[kMaxOrder - 1];
+ float backoffs[KENLM_MAX_ORDER - 1], backoffs2[KENLM_MAX_ORDER - 1];
float *back = backoffs, *back2 = backoffs2;
unsigned char next_use = out_.right.length;
diff --git a/lm/max_order.hh b/lm/max_order.hh
deleted file mode 100644
index 71cd23dd2..000000000
--- a/lm/max_order.hh
+++ /dev/null
@@ -1,14 +0,0 @@
-#ifndef LM_MAX_ORDER__
-#define LM_MAX_ORDER__
-namespace lm {
-namespace ngram {
-// If you need higher order, change this and recompile.
-// Having this limit means that State can be
-// (kMaxOrder - 1) * sizeof(float) bytes instead of
-// sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
-const unsigned char kMaxOrder = 6;
-
-} // namespace ngram
-} // namespace lm
-
-#endif // LM_MAX_ORDER__
diff --git a/lm/model.cc b/lm/model.cc
index a2d31ce0f..6547ba03d 100644
--- a/lm/model.cc
+++ b/lm/model.cc
@@ -61,7 +61,7 @@ template <class Search, class VocabularyT> void GenericModel<Search, VocabularyT
// File counts do not include pruned trigrams that extend to quadgrams etc. These will be fixed by search_.
ReadARPACounts(f, counts);
- if (counts.size() > kMaxOrder) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Edit lm/max_order.hh, set kMaxOrder to at least this value, and recompile.");
+ if (counts.size() > KENLM_MAX_ORDER) UTIL_THROW(FormatLoadException, "This model has order " << counts.size() << ". Re-compile, passing a number at least this large to bjam's --max-kenlm-order flag.");
if (counts.size() < 2) UTIL_THROW(FormatLoadException, "This ngram implementation assumes at least a bigram model.");
if (config.probing_multiplier <= 1.0) UTIL_THROW(ConfigException, "probing multiplier must be > 1.0");
diff --git a/lm/model.hh b/lm/model.hh
index be8721788..6dee94196 100644
--- a/lm/model.hh
+++ b/lm/model.hh
@@ -5,7 +5,6 @@
#include "lm/binary_format.hh"
#include "lm/config.hh"
#include "lm/facade.hh"
-#include "lm/max_order.hh"
#include "lm/quantize.hh"
#include "lm/search_hashed.hh"
#include "lm/search_trie.hh"
diff --git a/lm/quantize.hh b/lm/quantize.hh
index cd7e8f2f0..36c427272 100644
--- a/lm/quantize.hh
+++ b/lm/quantize.hh
@@ -3,7 +3,6 @@
#include "lm/blank.hh"
#include "lm/config.hh"
-#include "lm/max_order.hh"
#include "lm/model_type.hh"
#include "util/bit_packing.hh"
@@ -217,7 +216,7 @@ class SeparatelyQuantize {
const Bins &LongestTable() const { return longest_; }
private:
- Bins tables_[kMaxOrder - 1][2];
+ Bins tables_[KENLM_MAX_ORDER - 1][2];
Bins longest_;
diff --git a/lm/search_trie.cc b/lm/search_trie.cc
index 18e80d5a6..9a3e96916 100644
--- a/lm/search_trie.cc
+++ b/lm/search_trie.cc
@@ -5,7 +5,6 @@
#include "lm/binary_format.hh"
#include "lm/blank.hh"
#include "lm/lm_exception.hh"
-#include "lm/max_order.hh"
#include "lm/quantize.hh"
#include "lm/trie.hh"
#include "lm/trie_sort.hh"
@@ -180,7 +179,7 @@ const float kBadProb = std::numeric_limits<float>::infinity();
class SRISucks {
public:
SRISucks() {
- for (BackoffMessages *i = messages_; i != messages_ + kMaxOrder - 1; ++i)
+ for (BackoffMessages *i = messages_; i != messages_ + KENLM_MAX_ORDER - 1; ++i)
i->Init(sizeof(ProbPointer) + sizeof(WordIndex) * (i - messages_ + 1));
}
@@ -196,7 +195,7 @@ class SRISucks {
}
void ObtainBackoffs(unsigned char total_order, FILE *unigram_file, RecordReader *reader) {
- for (unsigned char i = 0; i < kMaxOrder - 1; ++i) {
+ for (unsigned char i = 0; i < KENLM_MAX_ORDER - 1; ++i) {
it_[i] = values_[i].empty() ? NULL : &*values_[i].begin();
}
messages_[0].Apply(it_, unigram_file);
@@ -221,10 +220,10 @@ class SRISucks {
private:
// This used to be one array. Then I needed to separate it by order for quantization to work.
- std::vector<float> values_[kMaxOrder - 1];
- BackoffMessages messages_[kMaxOrder - 1];
+ std::vector<float> values_[KENLM_MAX_ORDER - 1];
+ BackoffMessages messages_[KENLM_MAX_ORDER - 1];
- float *it_[kMaxOrder - 1];
+ float *it_[KENLM_MAX_ORDER - 1];
};
class FindBlanks {
@@ -337,7 +336,7 @@ struct Gram {
template <class Doing> class BlankManager {
public:
BlankManager(unsigned char total_order, Doing &doing) : total_order_(total_order), been_length_(0), doing_(doing) {
- for (float *i = basis_; i != basis_ + kMaxOrder - 1; ++i) *i = kBadProb;
+ for (float *i = basis_; i != basis_ + KENLM_MAX_ORDER - 1; ++i) *i = kBadProb;
}
void Visit(const WordIndex *to, unsigned char length, float prob) {
@@ -373,10 +372,10 @@ template <class Doing> class BlankManager {
private:
const unsigned char total_order_;
- WordIndex been_[kMaxOrder];
+ WordIndex been_[KENLM_MAX_ORDER];
unsigned char been_length_;
- float basis_[kMaxOrder];
+ float basis_[KENLM_MAX_ORDER];
Doing &doing_;
};
@@ -470,8 +469,8 @@ void PopulateUnigramWeights(FILE *file, WordIndex unigram_count, RecordReader &c
} // namespace
template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::vector<uint64_t> &counts, const Config &config, TrieSearch<Quant, Bhiksha> &out, Quant &quant, const SortedVocabulary &vocab, Backing &backing) {
- RecordReader inputs[kMaxOrder - 1];
- RecordReader contexts[kMaxOrder - 1];
+ RecordReader inputs[KENLM_MAX_ORDER - 1];
+ RecordReader contexts[KENLM_MAX_ORDER - 1];
for (unsigned char i = 2; i <= counts.size(); ++i) {
inputs[i-2].Init(files.Full(i), i * sizeof(WordIndex) + (i == counts.size() ? sizeof(Prob) : sizeof(ProbBackoff)));
diff --git a/lm/state.hh b/lm/state.hh
index c74384143..3dbf617bf 100644
--- a/lm/state.hh
+++ b/lm/state.hh
@@ -1,7 +1,6 @@
#ifndef LM_STATE__
#define LM_STATE__
-#include "lm/max_order.hh"
#include "lm/word_index.hh"
#include "util/murmur_hash.hh"
@@ -32,7 +31,7 @@ class State {
// Call this before using raw memcmp.
void ZeroRemaining() {
- for (unsigned char i = length; i < kMaxOrder - 1; ++i) {
+ for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) {
words[i] = 0;
backoff[i] = 0.0;
}
@@ -42,8 +41,8 @@ class State {
// You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
// This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
- WordIndex words[kMaxOrder - 1];
- float backoff[kMaxOrder - 1];
+ WordIndex words[KENLM_MAX_ORDER - 1];
+ float backoff[KENLM_MAX_ORDER - 1];
unsigned char length;
};
@@ -72,11 +71,11 @@ struct Left {
}
void ZeroRemaining() {
- for (uint64_t * i = pointers + length; i < pointers + kMaxOrder - 1; ++i)
+ for (uint64_t * i = pointers + length; i < pointers + KENLM_MAX_ORDER - 1; ++i)
*i = 0;
}
- uint64_t pointers[kMaxOrder - 1];
+ uint64_t pointers[KENLM_MAX_ORDER - 1];
unsigned char length;
bool full;
};
diff --git a/lm/trie_sort.hh b/lm/trie_sort.hh
index 6ef17eb9f..c1be9bfc4 100644
--- a/lm/trie_sort.hh
+++ b/lm/trie_sort.hh
@@ -3,7 +3,6 @@
#ifndef LM_TRIE_SORT__
#define LM_TRIE_SORT__
-#include "lm/max_order.hh"
#include "lm/word_index.hh"
#include "util/file.hh"
@@ -107,7 +106,7 @@ class SortedFiles {
util::scoped_fd unigram_;
- util::scoped_FILE full_[kMaxOrder - 1], context_[kMaxOrder - 1];
+ util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1];
};
} // namespace trie
diff --git a/scripts/ems/experiment.meta b/scripts/ems/experiment.meta
index 45f689736..3ad263027 100644
--- a/scripts/ems/experiment.meta
+++ b/scripts/ems/experiment.meta
@@ -208,7 +208,7 @@ binarize
rerun-on-change: lm
default-name: lm/binlm
template: $lm-binarizer IN OUT
- error: set kMaxOrder to at least this value
+ error: set KENLM_MAX_ORDER to at least this value
[INTERPOLATED-LM] single
tuning-from-sgm
@@ -275,7 +275,7 @@ binarize
ignore-unless: script
rerun-on-change: lm
default-name: lm/interpolated-binlm
- error: set kMaxOrder to at least this value
+ error: set KENLM_MAX_ORDER to at least this value
[TRAINING] single
consolidate