github.com/moses-smt/mosesdecoder.git

author     Jeroen Vermeulen <jtv@precisiontranslationtools.com>   2015-04-30 08:05:11 +0300
committer  Jeroen Vermeulen <jtv@precisiontranslationtools.com>   2015-04-30 08:05:11 +0300
commit     eca582410006443d0b101a9ae188e302f34f8a03 (patch)
tree       35212762fbe666330205e2a9ef09d16a918d077c /lm
parent     85acdc62b1548863a6db18bebb538406cfcfa038 (diff)
Remove trailing whitespace in C++ files.
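
This commit is purely mechanical: every modified line below differs from its original only by trailing spaces or tabs. As a rough sketch of the kind of cleanup involved (a hypothetical standalone helper, not the tool actually used for this commit), trailing whitespace can be stripped from a file in place like this:

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>

// Strip trailing spaces and tabs from every line of the file named on the
// command line, rewriting the file in place.
int main(int argc, char *argv[]) {
  if (argc != 2) {
    std::cerr << "Usage: " << argv[0] << " <file>" << std::endl;
    return 1;
  }
  std::ifstream in(argv[1]);
  if (!in) {
    std::cerr << "Cannot open " << argv[1] << std::endl;
    return 1;
  }
  std::ostringstream cleaned;
  std::string line;
  while (std::getline(in, line)) {
    std::string::size_type end = line.find_last_not_of(" \t");
    if (end == std::string::npos) {
      cleaned << '\n';  // Line was empty or all whitespace; keep it empty.
    } else {
      cleaned << line.substr(0, end + 1) << '\n';
    }
  }
  in.close();
  std::ofstream out(argv[1]);  // Truncates and rewrites the same file.
  out << cleaned.str();
  return 0;
}

Run over each .cc and .hh file under lm/, this produces exactly the kind of one-for-one line replacements recorded in the diffstat below.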
Diffstat (limited to 'lm')
-rw-r--r--  lm/bhiksha.cc                         |  6
-rw-r--r--  lm/bhiksha.hh                         |  6
-rw-r--r--  lm/binary_format.cc                   |  2
-rw-r--r--  lm/binary_format.hh                   | 12
-rw-r--r--  lm/blank.hh                           |  6
-rw-r--r--  lm/build_binary_main.cc               |  2
-rw-r--r--  lm/builder/adjust_counts.cc           | 56
-rw-r--r--  lm/builder/adjust_counts.hh           |  8
-rw-r--r--  lm/builder/adjust_counts_test.cc      |  2
-rw-r--r--  lm/builder/corpus_count.cc            | 28
-rw-r--r--  lm/builder/corpus_count.hh            |  2
-rw-r--r--  lm/builder/initial_probabilities.cc   | 42
-rw-r--r--  lm/builder/initial_probabilities.hh   |  6
-rw-r--r--  lm/builder/interpolate.cc             |  6
-rw-r--r--  lm/builder/interpolate.hh             |  4
-rw-r--r--  lm/builder/joint_order.hh             | 10
-rw-r--r--  lm/builder/lmplz_main.cc              | 10
-rw-r--r--  lm/builder/ngram.hh                   | 10
-rw-r--r--  lm/builder/pipeline.cc                | 18
-rw-r--r--  lm/builder/pipeline.hh                |  2
-rw-r--r--  lm/builder/print.cc                   |  2
-rw-r--r--  lm/builder/print.hh                   |  6
-rw-r--r--  lm/builder/sort.hh                    | 40
-rw-r--r--  lm/enumerate_vocab.hh                 |  2
-rw-r--r--  lm/facade.hh                          |  8
-rw-r--r--  lm/filter/count_io.hh                 |  4
-rw-r--r--  lm/filter/filter_main.cc              |  2
-rw-r--r--  lm/filter/format.hh                   | 12
-rw-r--r--  lm/filter/phrase.cc                   | 26
-rw-r--r--  lm/filter/phrase.hh                   | 14
-rw-r--r--  lm/filter/phrase_table_vocab_main.cc  |  4
-rw-r--r--  lm/filter/thread.hh                   | 14
-rw-r--r--  lm/filter/vocab.cc                    |  2
-rw-r--r--  lm/filter/vocab.hh                    |  2
-rw-r--r--  lm/filter/wrapper.hh                  |  2
-rw-r--r--  lm/left.hh                            | 44
-rw-r--r--  lm/left_test.cc                       |  4
-rw-r--r--  lm/lm_exception.hh                    |  2
-rw-r--r--  lm/max_order.hh                       |  2
-rw-r--r--  lm/model.hh                           | 40
-rw-r--r--  lm/model_test.cc                      | 10
-rw-r--r--  lm/model_type.hh                      |  2
-rw-r--r--  lm/ngram_query.hh                     |  4
-rw-r--r--  lm/partial.hh                         | 10
-rw-r--r--  lm/partial_test.cc                    |  4
-rw-r--r--  lm/quantize.cc                        |  4
-rw-r--r--  lm/quantize.hh                        | 10
-rw-r--r--  lm/return.hh                          | 10
-rw-r--r--  lm/search_trie.cc                     |  2
-rw-r--r--  lm/sizes.cc                           |  2
-rw-r--r--  lm/state.hh                           | 12
-rw-r--r--  lm/trie.cc                            |  8
-rw-r--r--  lm/trie.hh                            | 18
-rw-r--r--  lm/trie_sort.cc                       | 24
-rw-r--r--  lm/trie_sort.hh                       |  4
-rw-r--r--  lm/value.hh                           |  4
-rw-r--r--  lm/value_build.cc                     |  8
-rw-r--r--  lm/value_build.hh                     |  2
-rw-r--r--  lm/virtual_interface.hh               | 32
-rw-r--r--  lm/vocab.cc                           | 24
-rw-r--r--  lm/vocab.hh                           | 14
-rw-r--r--  lm/weights.hh                         |  4
-rw-r--r--  lm/wrappers/nplm.cc                   |  4
-rw-r--r--  lm/wrappers/nplm.hh                   |  2
64 files changed, 344 insertions(+), 344 deletions(-)
diff --git a/lm/bhiksha.cc b/lm/bhiksha.cc
index c8a18dfda..4262b615e 100644
--- a/lm/bhiksha.cc
+++ b/lm/bhiksha.cc
@@ -11,12 +11,12 @@ namespace lm {
namespace ngram {
namespace trie {
-DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
+DontBhiksha::DontBhiksha(const void * /*base*/, uint64_t /*max_offset*/, uint64_t max_next, const Config &/*config*/) :
next_(util::BitsMask::ByMax(max_next)) {}
const uint8_t kArrayBhikshaVersion = 0;
-// TODO: put this in binary file header instead when I change the binary file format again.
+// TODO: put this in binary file header instead when I change the binary file format again.
void ArrayBhiksha::UpdateConfigFromBinary(const BinaryFormat &file, uint64_t offset, Config &config) {
uint8_t buffer[2];
file.ReadForConfig(buffer, 2, offset);
@@ -33,7 +33,7 @@ uint8_t ChopBits(uint64_t max_offset, uint64_t max_next, const Config &config) {
uint8_t required = util::RequiredBits(max_next);
uint8_t best_chop = 0;
int64_t lowest_change = std::numeric_limits<int64_t>::max();
- // There are probably faster ways but I don't care because this is only done once per order at construction time.
+ // There are probably faster ways but I don't care because this is only done once per order at construction time.
for (uint8_t chop = 0; chop <= std::min(required, config.pointer_bhiksha_bits); ++chop) {
int64_t change = (max_next >> (required - chop)) * 64 /* table cost in bits */
- max_offset * static_cast<int64_t>(chop); /* savings in bits*/
diff --git a/lm/bhiksha.hh b/lm/bhiksha.hh
index 8ec8989c7..36438f1d2 100644
--- a/lm/bhiksha.hh
+++ b/lm/bhiksha.hh
@@ -7,7 +7,7 @@
* pages={388--391},
* }
*
- * Currently only used for next pointers.
+ * Currently only used for next pointers.
*/
#ifndef LM_BHIKSHA_H
@@ -86,9 +86,9 @@ class ArrayBhiksha {
// assert(end_it == std::upper_bound(offset_begin_, offset_end_, index + 1));
--end_it;
// assert(end_it >= begin_it);
- out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
+ out.begin = ((begin_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset, next_inline_.bits, next_inline_.mask);
- out.end = ((end_it - offset_begin_) << next_inline_.bits) |
+ out.end = ((end_it - offset_begin_) << next_inline_.bits) |
util::ReadInt57(base, bit_offset + total_bits, next_inline_.bits, next_inline_.mask);
// If this fails, consider rebuilding your model using KenLM after 1e333d786b748555e8f368d2bbba29a016c98052
assert(out.end >= out.begin);
diff --git a/lm/binary_format.cc b/lm/binary_format.cc
index 481174047..4ad893d44 100644
--- a/lm/binary_format.cc
+++ b/lm/binary_format.cc
@@ -135,7 +135,7 @@ void MatchCheck(ModelType model_type, unsigned int search_version, const Paramet
const std::size_t kInvalidSize = static_cast<std::size_t>(-1);
-BinaryFormat::BinaryFormat(const Config &config)
+BinaryFormat::BinaryFormat(const Config &config)
: write_method_(config.write_method), write_mmap_(config.write_mmap), load_method_(config.load_method),
header_size_(kInvalidSize), vocab_size_(kInvalidSize), vocab_string_offset_(kInvalidOffset) {}
diff --git a/lm/binary_format.hh b/lm/binary_format.hh
index 136d6b1aa..ff99b9574 100644
--- a/lm/binary_format.hh
+++ b/lm/binary_format.hh
@@ -19,18 +19,18 @@ namespace ngram {
extern const char *kModelNames[6];
-/*Inspect a file to determine if it is a binary lm. If not, return false.
+/*Inspect a file to determine if it is a binary lm. If not, return false.
* If so, return true and set recognized to the type. This is the only API in
- * this header designed for use by decoder authors.
+ * this header designed for use by decoder authors.
*/
bool RecognizeBinary(const char *file, ModelType &recognized);
struct FixedWidthParameters {
unsigned char order;
float probing_multiplier;
- // What type of model is this?
+ // What type of model is this?
ModelType model_type;
- // Does the end of the file have the actual strings in the vocabulary?
+ // Does the end of the file have the actual strings in the vocabulary?
bool has_vocabulary;
unsigned int search_version;
};
@@ -38,7 +38,7 @@ struct FixedWidthParameters {
// This is a macro instead of an inline function so constants can be assigned using it.
#define ALIGN8(a) ((std::ptrdiff_t(((a)-1)/8)+1)*8)
-// Parameters stored in the header of a binary file.
+// Parameters stored in the header of a binary file.
struct Parameters {
FixedWidthParameters fixed;
std::vector<uint64_t> counts;
@@ -79,7 +79,7 @@ class BinaryFormat {
const char *write_mmap_;
util::LoadMethod load_method_;
- // File behind memory, if any.
+ // File behind memory, if any.
util::scoped_fd file_;
// If there is a file involved, a single mapping.
diff --git a/lm/blank.hh b/lm/blank.hh
index 2107e1cb6..e09054c9b 100644
--- a/lm/blank.hh
+++ b/lm/blank.hh
@@ -15,9 +15,9 @@ namespace ngram {
* kNoExtensionBackoff. If the n-gram might be extended, then out_state must
* contain the full n-gram, in which case kExtensionBackoff is set. In any
* case, if an n-gram has non-zero backoff, the full state is returned so
- * backoff can be properly charged.
+ * backoff can be properly charged.
* These differ only in sign bit because the backoff is in fact zero in either
- * case.
+ * case.
*/
const float kNoExtensionBackoff = -0.0;
const float kExtensionBackoff = 0.0;
@@ -28,7 +28,7 @@ inline void SetExtension(float &backoff) {
if (backoff == kNoExtensionBackoff) backoff = kExtensionBackoff;
}
-// This compiles down nicely.
+// This compiles down nicely.
inline bool HasExtension(const float &backoff) {
typedef union { float f; uint32_t i; } UnionValue;
UnionValue compare, interpret;
diff --git a/lm/build_binary_main.cc b/lm/build_binary_main.cc
index 6d88a398d..35206e60b 100644
--- a/lm/build_binary_main.cc
+++ b/lm/build_binary_main.cc
@@ -56,7 +56,7 @@ void Usage(const char *name, const char *default_mem) {
exit(1);
}
-// I could really use boost::lexical_cast right about now.
+// I could really use boost::lexical_cast right about now.
float ParseFloat(const char *from) {
char *end;
float ret = strtod(from, &end);
diff --git a/lm/builder/adjust_counts.cc b/lm/builder/adjust_counts.cc
index 2dd3cef1b..bcaa71998 100644
--- a/lm/builder/adjust_counts.cc
+++ b/lm/builder/adjust_counts.cc
@@ -114,7 +114,7 @@ class CollapseStream {
current_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
prune_threshold_(prune_threshold),
prune_words_(prune_words),
- block_(position) {
+ block_(position) {
StartBlock();
}
@@ -125,27 +125,27 @@ class CollapseStream {
CollapseStream &operator++() {
assert(block_);
-
+
if (current_.begin()[1] == kBOS && current_.Base() < copy_from_) {
memcpy(current_.Base(), copy_from_, current_.TotalSize());
UpdateCopyFrom();
-
+
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
- current_.Mark();
+ current_.Mark();
}
-
+
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
- current_.Mark();
+ current_.Mark();
break;
}
}
}
-
+
}
-
+
current_.NextInMemory();
uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
@@ -153,21 +153,21 @@ class CollapseStream {
++block_;
StartBlock();
}
-
+
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
- current_.Mark();
+ current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
- current_.Mark();
+ current_.Mark();
break;
}
}
}
-
+
return *this;
}
@@ -180,21 +180,21 @@ class CollapseStream {
current_.ReBase(block_->Get());
copy_from_ = static_cast<uint8_t*>(block_->Get()) + block_->ValidSize();
UpdateCopyFrom();
-
+
// Mark highest order n-grams for later pruning
if(current_.Count() <= prune_threshold_) {
- current_.Mark();
+ current_.Mark();
}
if(!prune_words_.empty()) {
for(WordIndex* i = current_.begin(); i != current_.end(); i++) {
if(prune_words_[*i]) {
- current_.Mark();
+ current_.Mark();
break;
}
}
}
-
+
}
// Find last without bos.
@@ -222,18 +222,18 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
StatCollector stats(order, counts_, counts_pruned_, discounts_);
if (order == 1) {
- // Only unigrams. Just collect stats.
+ // Only unigrams. Just collect stats.
for (NGramStream full(positions[0]); full; ++full) {
-
+
// Do not prune <s> </s> <unk>
if(*full->begin() > 2) {
if(full->Count() <= prune_thresholds_[0])
full->Mark();
-
+
if(!prune_words_.empty() && prune_words_[*full->begin()])
full->Mark();
}
-
+
stats.AddFull(full->UnmarkedCount(), full->IsMarked());
}
@@ -243,7 +243,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
NGramStreams streams;
streams.Init(positions, positions.size() - 1);
-
+
CollapseStream full(positions[positions.size() - 1], prune_thresholds_.back(), prune_words_);
// Initialization: <unk> has count 0 and so does <s>.
@@ -261,7 +261,7 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
std::vector<uint64_t> actual_counts(positions.size(), 0);
// Something of a hack: don't prune <s>.
actual_counts[0] = std::numeric_limits<uint64_t>::max();
-
+
// Iterate over full (the stream of the highest order ngrams)
for (; full; ++full) {
const WordIndex *different = FindDifference(*full, **lower_valid);
@@ -272,16 +272,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t order_minus_1 = lower_valid - streams_begin;
if(actual_counts[order_minus_1] <= prune_thresholds_[order_minus_1])
(*lower_valid)->Mark();
-
+
if(!prune_words_.empty()) {
for(WordIndex* i = (*lower_valid)->begin(); i != (*lower_valid)->end(); i++) {
if(prune_words_[*i]) {
- (*lower_valid)->Mark();
+ (*lower_valid)->Mark();
break;
}
}
}
-
+
stats.Add(order_minus_1, (*lower_valid)->UnmarkedCount(), (*lower_valid)->IsMarked());
++*lower_valid;
}
@@ -327,16 +327,16 @@ void AdjustCounts::Run(const util::stream::ChainPositions &positions) {
uint64_t lower_count = actual_counts[(*s)->Order() - 1];
if(lower_count <= prune_thresholds_[(*s)->Order() - 1])
(*s)->Mark();
-
+
if(!prune_words_.empty()) {
for(WordIndex* i = (*s)->begin(); i != (*s)->end(); i++) {
if(prune_words_[*i]) {
- (*s)->Mark();
+ (*s)->Mark();
break;
}
}
}
-
+
stats.Add(s - streams.begin(), lower_count, (*s)->IsMarked());
++*s;
}
diff --git a/lm/builder/adjust_counts.hh b/lm/builder/adjust_counts.hh
index b169950e9..29319ba06 100644
--- a/lm/builder/adjust_counts.hh
+++ b/lm/builder/adjust_counts.hh
@@ -30,9 +30,9 @@ struct DiscountConfig {
WarningAction bad_action;
};
-/* Compute adjusted counts.
+/* Compute adjusted counts.
* Input: unique suffix sorted N-grams (and just the N-grams) with raw counts.
- * Output: [1,N]-grams with adjusted counts.
+ * Output: [1,N]-grams with adjusted counts.
* [1,N)-grams are in suffix order
* N-grams are in undefined order (they're going to be sorted anyway).
*/
@@ -50,13 +50,13 @@ class AdjustCounts {
const DiscountConfig &discount_config,
std::vector<Discount> &discounts)
: prune_thresholds_(prune_thresholds), counts_(counts), counts_pruned_(counts_pruned),
- prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
+ prune_words_(prune_words), discount_config_(discount_config), discounts_(discounts)
{}
void Run(const util::stream::ChainPositions &positions);
private:
- const std::vector<uint64_t> &prune_thresholds_;
+ const std::vector<uint64_t> &prune_thresholds_;
std::vector<uint64_t> &counts_;
std::vector<uint64_t> &counts_pruned_;
const std::vector<bool> &prune_words_;
diff --git a/lm/builder/adjust_counts_test.cc b/lm/builder/adjust_counts_test.cc
index 353e3dd35..2a9d78ae0 100644
--- a/lm/builder/adjust_counts_test.cc
+++ b/lm/builder/adjust_counts_test.cc
@@ -82,7 +82,7 @@ BOOST_AUTO_TEST_CASE(Simple) {
}
BOOST_REQUIRE_EQUAL(4UL, counts.size());
BOOST_CHECK_EQUAL(4UL, counts[0]);
- // These are no longer set because the discounts are bad.
+ // These are no longer set because the discounts are bad.
/* BOOST_CHECK_EQUAL(4UL, counts[1]);
BOOST_CHECK_EQUAL(3UL, counts[2]);
BOOST_CHECK_EQUAL(3UL, counts[3]);*/
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index 7f3dafa27..889eeb7a9 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -45,7 +45,7 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
std::size_t operator()(const WordIndex *start) const {
return util::MurmurHashNative(start, size_);
}
-
+
private:
const std::size_t size_;
};
@@ -53,11 +53,11 @@ class DedupeHash : public std::unary_function<const WordIndex *, bool> {
class DedupeEquals : public std::binary_function<const WordIndex *, const WordIndex *, bool> {
public:
explicit DedupeEquals(std::size_t order) : size_(order * sizeof(WordIndex)) {}
-
+
bool operator()(const WordIndex *first, const WordIndex *second) const {
return !memcmp(first, second, size_);
- }
-
+ }
+
private:
const std::size_t size_;
};
@@ -82,7 +82,7 @@ typedef util::ProbingHashTable<DedupeEntry, DedupeHash, DedupeEquals> Dedupe;
class Writer {
public:
- Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
+ Writer(std::size_t order, const util::stream::ChainPosition &position, void *dedupe_mem, std::size_t dedupe_mem_size)
: block_(position), gram_(block_->Get(), order),
dedupe_invalid_(order, std::numeric_limits<WordIndex>::max()),
dedupe_(dedupe_mem, dedupe_mem_size, &dedupe_invalid_[0], DedupeHash(order), DedupeEquals(order)),
@@ -91,7 +91,7 @@ class Writer {
dedupe_.Clear();
assert(Dedupe::Size(position.GetChain().BlockSize() / position.GetChain().EntrySize(), kProbingMultiplier) == dedupe_mem_size);
if (order == 1) {
- // Add special words. AdjustCounts is responsible if order != 1.
+ // Add special words. AdjustCounts is responsible if order != 1.
AddUnigramWord(kUNK);
AddUnigramWord(kBOS);
}
@@ -121,16 +121,16 @@ class Writer {
memmove(gram_.begin(), gram_.begin() + 1, sizeof(WordIndex) * (gram_.Order() - 1));
return;
}
- // Complete the write.
+ // Complete the write.
gram_.Count() = 1;
- // Prepare the next n-gram.
+ // Prepare the next n-gram.
if (reinterpret_cast<uint8_t*>(gram_.begin()) + gram_.TotalSize() != static_cast<uint8_t*>(block_->Get()) + block_size_) {
NGram last(gram_);
gram_.NextInMemory();
std::copy(last.begin() + 1, last.end(), gram_.begin());
return;
}
- // Block end. Need to store the context in a temporary buffer.
+ // Block end. Need to store the context in a temporary buffer.
std::copy(gram_.begin() + 1, gram_.end(), buffer_.get());
dedupe_.Clear();
block_->SetValidSize(block_size_);
@@ -158,7 +158,7 @@ class Writer {
// Hash table combiner implementation.
Dedupe dedupe_;
- // Small buffer to hold existing ngrams when shifting across a block boundary.
+ // Small buffer to hold existing ngrams when shifting across a block boundary.
boost::scoped_array<WordIndex> buffer_;
const std::size_t block_size_;
@@ -224,12 +224,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
} catch (const util::EndOfFileException &e) {}
token_count_ = count;
type_count_ = vocab.Size();
-
+
// Create list of unigrams that are supposed to be pruned
if (!prune_vocab_filename_.empty()) {
try {
util::FilePiece prune_vocab_file(prune_vocab_filename_.c_str());
-
+
prune_words_.resize(vocab.Size(), true);
try {
while (true) {
@@ -238,12 +238,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
prune_words_[vocab.Index(*w)] = false;
}
} catch (const util::EndOfFileException &e) {}
-
+
// Never prune <unk>, <s>, </s>
prune_words_[kUNK] = false;
prune_words_[kBOS] = false;
prune_words_[kEOS] = false;
-
+
} catch (const util::Exception &e) {
std::cerr << e.what() << std::endl;
abort();
diff --git a/lm/builder/corpus_count.hh b/lm/builder/corpus_count.hh
index d3121ca45..165505c4a 100644
--- a/lm/builder/corpus_count.hh
+++ b/lm/builder/corpus_count.hh
@@ -40,7 +40,7 @@ class CorpusCount {
uint64_t &token_count_;
WordIndex &type_count_;
std::vector<bool>& prune_words_;
- const std::string& prune_vocab_filename_;
+ const std::string& prune_vocab_filename_;
std::size_t dedupe_mem_size_;
util::scoped_malloc dedupe_mem_;
diff --git a/lm/builder/initial_probabilities.cc b/lm/builder/initial_probabilities.cc
index b1dd96f31..80063eb2e 100644
--- a/lm/builder/initial_probabilities.cc
+++ b/lm/builder/initial_probabilities.cc
@@ -27,9 +27,9 @@ struct HashBufferEntry : public BufferEntry {
uint64_t hash_value;
};
-// Reads all entries in order like NGramStream does.
+// Reads all entries in order like NGramStream does.
// But deletes any entries that have CutoffCount below or equal to pruning
-// threshold.
+// threshold.
class PruneNGramStream {
public:
PruneNGramStream(const util::stream::ChainPosition &position) :
@@ -37,7 +37,7 @@ class PruneNGramStream {
dest_(NULL, NGram::OrderFromSize(position.GetChain().EntrySize())),
currentCount_(0),
block_(position)
- {
+ {
StartBlock();
}
@@ -50,7 +50,7 @@ class PruneNGramStream {
PruneNGramStream &operator++() {
assert(block_);
-
+
if(current_.Order() == 1 && *current_.begin() <= 2)
dest_.NextInMemory();
else if(currentCount_ > 0) {
@@ -59,9 +59,9 @@ class PruneNGramStream {
}
dest_.NextInMemory();
}
-
+
current_.NextInMemory();
-
+
uint8_t *block_base = static_cast<uint8_t*>(block_->Get());
if (current_.Base() == block_base + block_->ValidSize()) {
block_->SetValidSize(dest_.Base() - block_base);
@@ -70,13 +70,13 @@ class PruneNGramStream {
if (block_) {
currentCount_ = current_.CutoffCount();
}
- } else {
+ } else {
currentCount_ = current_.CutoffCount();
}
-
+
return *this;
}
-
+
private:
void StartBlock() {
for (; ; ++block_) {
@@ -85,13 +85,13 @@ class PruneNGramStream {
}
current_.ReBase(block_->Get());
currentCount_ = current_.CutoffCount();
-
+
dest_.ReBase(block_->Get());
}
NGram current_; // input iterator
NGram dest_; // output iterator
-
+
uint64_t currentCount_;
util::stream::Link block_;
@@ -155,24 +155,24 @@ class AddRight {
memcpy(previous_raw, in->begin(), size);
uint64_t denominator = 0;
uint64_t normalizer = 0;
-
+
uint64_t counts[4];
memset(counts, 0, sizeof(counts));
do {
denominator += in->UnmarkedCount();
-
+
// Collect unused probability mass from pruning.
// Becomes 0 for unpruned ngrams.
normalizer += in->UnmarkedCount() - in->CutoffCount();
-
+
// Chen&Goodman do not mention counting based on cutoffs, but
// backoff becomes larger than 1 otherwise, so probably needs
// to count cutoffs. Counts normally without pruning.
if(in->CutoffCount() > 0)
++counts[std::min(in->CutoffCount(), static_cast<uint64_t>(3))];
-
+
} while (++in && !memcmp(previous_raw, in->begin(), size));
-
+
BufferEntry &entry = *reinterpret_cast<BufferEntry*>(out.Get());
entry.denominator = static_cast<float>(denominator);
entry.gamma = 0.0;
@@ -182,9 +182,9 @@ class AddRight {
// Makes model sum to 1 with pruning (I hope).
entry.gamma += normalizer;
-
+
entry.gamma /= entry.denominator;
-
+
if(pruning_) {
// If pruning is enabled the stream actually contains HashBufferEntry, see InitialProbabilities(...),
// so add a hash value that identifies the current ngram.
@@ -244,13 +244,13 @@ class MergeRight {
++summed;
return;
}
-
+
std::vector<WordIndex> previous(grams->Order() - 1);
const std::size_t size = sizeof(WordIndex) * previous.size();
for (; grams; ++summed) {
memcpy(&previous[0], grams->begin(), size);
const BufferEntry &sums = *static_cast<const BufferEntry*>(summed.Get());
-
+
do {
Payload &pay = grams->Value();
pay.uninterp.prob = discount_.Apply(grams->UnmarkedCount()) / sums.denominator;
@@ -288,7 +288,7 @@ void InitialProbabilities(
gamma_out[i] >> AddRight(discounts[i], second, prune_vocab || prune_thresholds[i] > 0);
primary[i] >> MergeRight(config.interpolate_unigrams, gamma_out[i].Add(), discounts[i]);
-
+
// Don't bother with the OnlyGamma thread for something to discard.
if (i) gamma_out[i] >> OnlyGamma(prune_vocab || prune_thresholds[i] > 0);
}
diff --git a/lm/builder/initial_probabilities.hh b/lm/builder/initial_probabilities.hh
index 57e09cd51..a8ecf4dc2 100644
--- a/lm/builder/initial_probabilities.hh
+++ b/lm/builder/initial_probabilities.hh
@@ -15,17 +15,17 @@ struct InitialProbabilitiesConfig {
// These should be small buffers to keep the adder from getting too far ahead
util::stream::ChainConfig adder_in;
util::stream::ChainConfig adder_out;
- // SRILM doesn't normally interpolate unigrams.
+ // SRILM doesn't normally interpolate unigrams.
bool interpolate_unigrams;
};
/* Compute initial (uninterpolated) probabilities
* primary: the normal chain of n-grams. Incoming is context sorted adjusted
* counts. Outgoing has uninterpolated probabilities for use by Interpolate.
- * second_in: a second copy of the primary input. Discard the output.
+ * second_in: a second copy of the primary input. Discard the output.
* gamma_out: Computed gamma values are output on these chains in suffix order.
* The values are bare floats and should be buffered for interpolation to
- * use.
+ * use.
*/
void InitialProbabilities(
const InitialProbabilitiesConfig &config,
diff --git a/lm/builder/interpolate.cc b/lm/builder/interpolate.cc
index 0f9b98162..5b04cb3ff 100644
--- a/lm/builder/interpolate.cc
+++ b/lm/builder/interpolate.cc
@@ -47,7 +47,7 @@ class OutputQ {
private:
// Product of backoffs in the numerator divided by backoffs in the
- // denominator. Does not include
+ // denominator. Does not include
std::vector<float> q_delta_;
};
@@ -81,7 +81,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[i + 1] > 0)
while(backoffs_[i])
++backoffs_[i];
-
+
if (backoffs_[i]) {
std::cerr << "Backoffs do not match for order " << (i + 1) << std::endl;
abort();
@@ -99,7 +99,7 @@ template <class Output> class Callback {
if(prune_vocab_ || prune_thresholds_[order_minus_1 + 1] > 0) {
//Compute hash value for current context
uint64_t current_hash = util::MurmurHashNative(gram.begin(), gram.Order() * sizeof(WordIndex));
-
+
const HashGamma *hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
while(current_hash != hashed_backoff->hash_value && ++backoffs_[order_minus_1])
hashed_backoff = static_cast<const HashGamma*>(backoffs_[order_minus_1].Get());
diff --git a/lm/builder/interpolate.hh b/lm/builder/interpolate.hh
index adfd9198f..207a16dfd 100644
--- a/lm/builder/interpolate.hh
+++ b/lm/builder/interpolate.hh
@@ -8,8 +8,8 @@
#include <stdint.h>
namespace lm { namespace builder {
-
-/* Interpolate step.
+
+/* Interpolate step.
* Input: suffix sorted n-grams with (p_uninterpolated, gamma) from
* InitialProbabilities.
* Output: suffix sorted n-grams with complete probability
diff --git a/lm/builder/joint_order.hh b/lm/builder/joint_order.hh
index 1728706dd..b05ef67fd 100644
--- a/lm/builder/joint_order.hh
+++ b/lm/builder/joint_order.hh
@@ -35,7 +35,7 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
// Does the context match the lower one?
if (!memcmp(streams[static_cast<int>(current) - 1]->begin(), streams[current]->begin() + Compare::kMatchOffset, sizeof(WordIndex) * current)) {
callback.Enter(current, *streams[current]);
- // Transition to looking for extensions.
+ // Transition to looking for extensions.
if (++current < order) continue;
}
#ifdef DEBUG
@@ -46,16 +46,16 @@ template <class Callback, class Compare> void JointOrder(const util::stream::Cha
abort();
}
#endif // DEBUG
- // No extension left.
+ // No extension left.
while(true) {
assert(current > 0);
--current;
callback.Exit(current, *streams[current]);
-
+
if (++streams[current]) break;
-
+
UTIL_THROW_IF(order != current + 1, FormatLoadException, "Detected n-gram without matching suffix");
-
+
order = current;
if (!order) return;
}
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index 65ec55729..5c9d86deb 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -53,7 +53,7 @@ std::vector<uint64_t> ParsePruning(const std::vector<std::string> &param, std::s
// throw if each n-gram order has not threshold specified
UTIL_THROW_IF(prune_thresholds.size() > order, util::Exception, "You specified pruning thresholds for orders 1 through " << prune_thresholds.size() << " but the model only has order " << order);
// threshold for unigram can only be 0 (no pruning)
-
+
// check if threshold are not in decreasing order
uint64_t lower_threshold = 0;
for (std::vector<uint64_t>::iterator it = prune_thresholds.begin(); it != prune_thresholds.end(); ++it) {
@@ -124,7 +124,7 @@ int main(int argc, char *argv[]) {
po::store(po::parse_command_line(argc, argv, options), vm);
if (argc == 1 || vm["help"].as<bool>()) {
- std::cerr <<
+ std::cerr <<
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
"Please cite:\n"
"@inproceedings{Heafield-estimate,\n"
@@ -147,7 +147,7 @@ int main(int argc, char *argv[]) {
std::cerr << "This machine has " << mem << " bytes of memory.\n\n";
} else {
std::cerr << "Unable to determine the amount of memory on this machine.\n\n";
- }
+ }
std::cerr << options << std::endl;
return 1;
}
@@ -191,11 +191,11 @@ int main(int argc, char *argv[]) {
else {
pipeline.prune_vocab = false;
}
-
+
util::NormalizeTempPrefix(pipeline.sort.temp_prefix);
lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
- // TODO: evaluate options for these.
+ // TODO: evaluate options for these.
initial.adder_in.total_memory = 32768;
initial.adder_in.block_count = 2;
initial.adder_out.total_memory = 32768;
diff --git a/lm/builder/ngram.hh b/lm/builder/ngram.hh
index 4525b3421..d0033206c 100644
--- a/lm/builder/ngram.hh
+++ b/lm/builder/ngram.hh
@@ -68,26 +68,26 @@ class NGram {
assert(size == TotalSize(ret));
return ret;
}
-
+
// manipulate msb to signal that ngram can be pruned
/*mjd**********************************************************************/
bool IsMarked() const {
return Value().count >> (sizeof(Value().count) * 8 - 1);
}
-
+
void Mark() {
Value().count |= (1ul << (sizeof(Value().count) * 8 - 1));
}
-
+
void Unmark() {
Value().count &= ~(1ul << (sizeof(Value().count) * 8 - 1));
}
-
+
uint64_t UnmarkedCount() const {
return Value().count & ~(1ul << (sizeof(Value().count) * 8 - 1));
}
-
+
uint64_t CutoffCount() const {
return IsMarked() ? 0 : UnmarkedCount();
}
diff --git a/lm/builder/pipeline.cc b/lm/builder/pipeline.cc
index fced0e3bd..1ca2e26f5 100644
--- a/lm/builder/pipeline.cc
+++ b/lm/builder/pipeline.cc
@@ -37,7 +37,7 @@ void PrintStatistics(const std::vector<uint64_t> &counts, const std::vector<uint
class Master {
public:
- explicit Master(PipelineConfig &config)
+ explicit Master(PipelineConfig &config)
: config_(config), chains_(config.order), files_(config.order) {
config_.minimum_block = std::max(NGram::TotalSize(config_.order), config_.minimum_block);
}
@@ -64,7 +64,7 @@ class Master {
CreateChains(config_.TotalMemory() - merge_using, count_bounds);
ngrams.Output(chains_.back(), merge_using);
- // Setup unigram file.
+ // Setup unigram file.
files_.push_back(util::MakeTemp(config_.TempPrefix()));
}
@@ -204,7 +204,7 @@ class Master {
PipelineConfig &config_;
util::stream::Chains chains_;
- // Often only unigrams, but sometimes all orders.
+ // Often only unigrams, but sometimes all orders.
util::FixedArray<util::stream::FileBuffer> files_;
};
@@ -214,7 +214,7 @@ void CountText(int text_file /* input */, int vocab_file /* output */, Master &m
const std::size_t vocab_usage = CorpusCount::VocabUsage(config.vocab_estimate);
UTIL_THROW_IF(config.TotalMemory() < vocab_usage, util::Exception, "Vocab hash size estimate " << vocab_usage << " exceeds total memory " << config.TotalMemory());
- std::size_t memory_for_chain =
+ std::size_t memory_for_chain =
// This much memory to work with after vocab hash table.
static_cast<float>(config.TotalMemory() - vocab_usage) /
// Solve for block size including the dedupe multiplier for one block.
@@ -252,7 +252,7 @@ void InitialProbabilities(const std::vector<uint64_t> &counts, const std::vector
util::stream::Chains gamma_chains(config.order);
InitialProbabilities(config.initial_probs, discounts, master.MutableChains(), second, gamma_chains, prune_thresholds, prune_vocab);
- // Don't care about gamma for 0.
+ // Don't care about gamma for 0.
gamma_chains[0] >> util::stream::kRecycle;
gammas.Init(config.order - 1);
for (std::size_t i = 1; i < config.order; ++i) {
@@ -307,16 +307,16 @@ void Pipeline(PipelineConfig &config, int text_file, Output &output) {
// master's destructor will wait for chains. But they might be deadlocked if
// this thread dies because e.g. it ran out of memory.
try {
- util::scoped_fd vocab_file(config.vocab_file.empty() ?
- util::MakeTemp(config.TempPrefix()) :
+ util::scoped_fd vocab_file(config.vocab_file.empty() ?
+ util::MakeTemp(config.TempPrefix()) :
util::CreateOrThrow(config.vocab_file.c_str()));
output.SetVocabFD(vocab_file.get());
uint64_t token_count;
std::string text_file_name;
-
+
std::vector<bool> prune_words;
CountText(text_file, vocab_file.get(), master, token_count, text_file_name, prune_words);
-
+
std::vector<uint64_t> counts;
std::vector<uint64_t> counts_pruned;
std::vector<Discount> discounts;
diff --git a/lm/builder/pipeline.hh b/lm/builder/pipeline.hh
index 8f4d82103..1987daff1 100644
--- a/lm/builder/pipeline.hh
+++ b/lm/builder/pipeline.hh
@@ -44,7 +44,7 @@ struct PipelineConfig {
// Compute collapsed q values instead of probability and backoff
bool output_q;
-
+
/* Computing the perplexity of LMs with different vocabularies is hard. For
* example, the lowest perplexity is attained by a unigram model that
* predicts p(<unk>) = 1 and has no other vocabulary. Also, linearly
diff --git a/lm/builder/print.cc b/lm/builder/print.cc
index 2c8c7276c..56a3134d8 100644
--- a/lm/builder/print.cc
+++ b/lm/builder/print.cc
@@ -55,7 +55,7 @@ void PrintARPA::Run(const util::stream::ChainPositions &positions) {
if (order != positions.size())
out << '\t' << stream->Value().complete.backoff;
out << '\n';
-
+
}
out << '\n';
}
diff --git a/lm/builder/print.hh b/lm/builder/print.hh
index ad282ea85..093a35697 100644
--- a/lm/builder/print.hh
+++ b/lm/builder/print.hh
@@ -14,7 +14,7 @@
// Warning: print routines read all unigrams before all bigrams before all
// trigrams etc. So if other parts of the chain move jointly, you'll have to
-// buffer.
+// buffer.
namespace lm { namespace builder {
@@ -42,7 +42,7 @@ class VocabReconstitute {
std::vector<const char*> map_;
};
-// Not defined, only specialized.
+// Not defined, only specialized.
template <class T> void PrintPayload(util::FakeOFStream &to, const Payload &payload);
template <> inline void PrintPayload<uint64_t>(util::FakeOFStream &to, const Payload &payload) {
// TODO slow
@@ -55,7 +55,7 @@ template <> inline void PrintPayload<ProbBackoff>(util::FakeOFStream &to, const
to << payload.complete.prob << ' ' << payload.complete.backoff;
}
-// template parameter is the type stored.
+// template parameter is the type stored.
template <class V> class Print {
public:
static void DumpSeparateFiles(const VocabReconstitute &vocab, const std::string &file_base, util::stream::Chains &chains) {
diff --git a/lm/builder/sort.hh b/lm/builder/sort.hh
index 712bb8e35..ed20b4b79 100644
--- a/lm/builder/sort.hh
+++ b/lm/builder/sort.hh
@@ -19,7 +19,7 @@ namespace builder {
*/
template <class Child> class Comparator : public std::binary_function<const void *, const void *, bool> {
public:
-
+
/**
* Constructs a comparator capable of comparing two n-grams.
*
@@ -51,8 +51,8 @@ template <class Child> class Comparator : public std::binary_function<const void
/**
* N-gram comparator that compares n-grams according to their reverse (suffix) order.
*
- * This comparator compares n-grams lexicographically, one word at a time,
- * beginning with the last word of each n-gram and ending with the first word of each n-gram.
+ * This comparator compares n-grams lexicographically, one word at a time,
+ * beginning with the last word of each n-gram and ending with the first word of each n-gram.
*
* Some examples of n-gram comparisons as defined by this comparator:
* - a b c == a b c
@@ -64,8 +64,8 @@ template <class Child> class Comparator : public std::binary_function<const void
*/
class SuffixOrder : public Comparator<SuffixOrder> {
public:
-
- /**
+
+ /**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@@ -73,7 +73,7 @@ class SuffixOrder : public Comparator<SuffixOrder> {
explicit SuffixOrder(std::size_t order) : Comparator<SuffixOrder>(order) {}
/**
- * Compares two n-grams lexicographically, one word at a time,
+ * Compares two n-grams lexicographically, one word at a time,
* beginning with the last word of each n-gram and ending with the first word of each n-gram.
*
* @param lhs A pointer to the n-gram on the left-hand side of the comparison
@@ -90,11 +90,11 @@ class SuffixOrder : public Comparator<SuffixOrder> {
static const unsigned kMatchOffset = 1;
};
-
+
/**
* N-gram comparator that compares n-grams according to the reverse (suffix) order of the n-gram context.
*
- * This comparator compares n-grams lexicographically, one word at a time,
+ * This comparator compares n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram.
*
@@ -108,8 +108,8 @@ class SuffixOrder : public Comparator<SuffixOrder> {
*/
class ContextOrder : public Comparator<ContextOrder> {
public:
-
- /**
+
+ /**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@@ -117,7 +117,7 @@ class ContextOrder : public Comparator<ContextOrder> {
explicit ContextOrder(std::size_t order) : Comparator<ContextOrder>(order) {}
/**
- * Compares two n-grams lexicographically, one word at a time,
+ * Compares two n-grams lexicographically, one word at a time,
* beginning with the penultimate word of each n-gram and ending with the first word of each n-gram;
* finally, this comparator compares the last word of each n-gram.
*
@@ -136,7 +136,7 @@ class ContextOrder : public Comparator<ContextOrder> {
/**
* N-gram comparator that compares n-grams according to their natural (prefix) order.
*
- * This comparator compares n-grams lexicographically, one word at a time,
+ * This comparator compares n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram.
*
* Some examples of n-gram comparisons as defined by this comparator:
@@ -149,8 +149,8 @@ class ContextOrder : public Comparator<ContextOrder> {
*/
class PrefixOrder : public Comparator<PrefixOrder> {
public:
-
- /**
+
+ /**
* Constructs a comparator capable of comparing two n-grams.
*
* @param order Number of words in each n-gram
@@ -158,7 +158,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
explicit PrefixOrder(std::size_t order) : Comparator<PrefixOrder>(order) {}
/**
- * Compares two n-grams lexicographically, one word at a time,
+ * Compares two n-grams lexicographically, one word at a time,
* beginning with the first word of each n-gram and ending with the last word of each n-gram.
*
* @param lhs A pointer to the n-gram on the left-hand side of the comparison
@@ -171,7 +171,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
}
return false;
}
-
+
static const unsigned kMatchOffset = 0;
};
@@ -179,7 +179,7 @@ class PrefixOrder : public Comparator<PrefixOrder> {
struct AddCombiner {
bool operator()(void *first_void, const void *second_void, const SuffixOrder &compare) const {
NGram first(first_void, compare.Order());
- // There isn't a const version of NGram.
+ // There isn't a const version of NGram.
NGram second(const_cast<void*>(second_void), compare.Order());
if (memcmp(first.begin(), second.begin(), sizeof(WordIndex) * compare.Order())) return false;
first.Count() += second.Count();
@@ -204,10 +204,10 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
typedef util::FixedArray<S> P;
public:
-
+
/**
* Constructs, but does not initialize.
- *
+ *
* @ref util::FixedArray::Init() "Init" must be called before use.
*
* @see util::FixedArray::Init()
@@ -222,7 +222,7 @@ template <class Compare> class Sorts : public util::FixedArray<util::stream::Sor
*/
explicit Sorts(std::size_t number) : util::FixedArray<util::stream::Sort<Compare> >(number) {}
- /**
+ /**
* Constructs a new @ref util::stream::Sort "Sort" object which is stored in this @ref util::FixedArray "array".
*
* The new @ref util::stream::Sort "Sort" object is constructed using the provided @ref util::stream::SortConfig "SortConfig" and @ref Comparator "ngram comparator";
diff --git a/lm/enumerate_vocab.hh b/lm/enumerate_vocab.hh
index f5ce78985..f4c94cd26 100644
--- a/lm/enumerate_vocab.hh
+++ b/lm/enumerate_vocab.hh
@@ -10,7 +10,7 @@ namespace lm {
* and implement Add. Then put a pointer in Config.enumerate_vocab; it does
* not take ownership. Add is called once per vocab word. index starts at 0
* and increases by 1 each time. This is only used by the Model constructor;
- * the pointer is not retained by the class.
+ * the pointer is not retained by the class.
*/
class EnumerateVocab {
public:
diff --git a/lm/facade.hh b/lm/facade.hh
index 8e12b62ee..325ef159a 100644
--- a/lm/facade.hh
+++ b/lm/facade.hh
@@ -9,8 +9,8 @@
namespace lm {
namespace base {
-// Common model interface that depends on knowing the specific classes.
-// Curiously recurring template pattern.
+// Common model interface that depends on knowing the specific classes.
+// Curiously recurring template pattern.
template <class Child, class StateT, class VocabularyT> class ModelFacade : public Model {
public:
typedef StateT State;
@@ -32,7 +32,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
*reinterpret_cast<State*>(out_state));
}
- // Default Score function calls FullScore. Model can override this.
+ // Default Score function calls FullScore. Model can override this.
float Score(const State &in_state, const WordIndex new_word, State &out_state) const {
return static_cast<const Child*>(this)->FullScore(in_state, new_word, out_state).prob;
}
@@ -53,7 +53,7 @@ template <class Child, class StateT, class VocabularyT> class ModelFacade : publ
virtual ~ModelFacade() {}
- // begin_sentence and null_context can disappear after. vocab should stay.
+ // begin_sentence and null_context can disappear after. vocab should stay.
void Init(const State &begin_sentence, const State &null_context, const Vocabulary &vocab, unsigned char order) {
begin_sentence_ = begin_sentence;
null_context_ = null_context;
diff --git a/lm/filter/count_io.hh b/lm/filter/count_io.hh
index de894baf8..02eb78baa 100644
--- a/lm/filter/count_io.hh
+++ b/lm/filter/count_io.hh
@@ -33,7 +33,7 @@ class CountOutput : boost::noncopyable {
class CountBatch {
public:
- explicit CountBatch(std::streamsize initial_read)
+ explicit CountBatch(std::streamsize initial_read)
: initial_read_(initial_read) {
buffer_.reserve(initial_read);
}
@@ -66,7 +66,7 @@ class CountBatch {
private:
std::streamsize initial_read_;
- // This could have been a std::string but that's less happy with raw writes.
+ // This could have been a std::string but that's less happy with raw writes.
std::vector<char> buffer_;
};
diff --git a/lm/filter/filter_main.cc b/lm/filter/filter_main.cc
index 82fdc1ef7..6e89d1fa3 100644
--- a/lm/filter/filter_main.cc
+++ b/lm/filter/filter_main.cc
@@ -58,7 +58,7 @@ typedef enum {MODE_COPY, MODE_SINGLE, MODE_MULTIPLE, MODE_UNION, MODE_UNSET} Fil
typedef enum {FORMAT_ARPA, FORMAT_COUNT} Format;
struct Config {
- Config() :
+ Config() :
#ifndef NTHREAD
batch_size(25000),
threads(boost::thread::hardware_concurrency()),
diff --git a/lm/filter/format.hh b/lm/filter/format.hh
index 5a2e2db3c..d453f05b8 100644
--- a/lm/filter/format.hh
+++ b/lm/filter/format.hh
@@ -134,12 +134,12 @@ struct CountFormat {
/* For multithreading, the buffer classes hold batches of filter inputs and
* outputs in memory. The strings get reused a lot, so keep them around
- * instead of clearing each time.
+ * instead of clearing each time.
*/
class InputBuffer {
public:
InputBuffer() : actual_(0) {}
-
+
void Reserve(size_t size) { lines_.reserve(size); }
template <class Output> void AddNGram(const StringPiece &ngram, const StringPiece &line, Output &output) {
@@ -179,18 +179,18 @@ class BinaryOutputBuffer {
void Reserve(size_t size) {
lines_.reserve(size);
}
-
+
void AddNGram(const StringPiece &line) {
lines_.push_back(line);
}
-
+
template <class Output> void Flush(Output &output) {
for (std::vector<StringPiece>::const_iterator i = lines_.begin(); i != lines_.end(); ++i) {
output.AddNGram(*i);
}
lines_.clear();
}
-
+
private:
std::vector<StringPiece> lines_;
};
@@ -234,7 +234,7 @@ class MultipleOutputBuffer {
private:
struct Annotated {
- // If this is empty, send to all systems.
+ // If this is empty, send to all systems.
// A filter should never send to all systems and send to a single one.
std::vector<size_t> systems;
StringPiece line;
diff --git a/lm/filter/phrase.cc b/lm/filter/phrase.cc
index 345900ffa..d8260d54e 100644
--- a/lm/filter/phrase.cc
+++ b/lm/filter/phrase.cc
@@ -31,14 +31,14 @@ unsigned int ReadMultiple(std::istream &in, Substrings &out) {
word.clear();
}
if (c == ' ') continue;
- // It's more than just a space. Close out the phrase.
+ // It's more than just a space. Close out the phrase.
if (!phrase.empty()) {
sentence_content = true;
out.AddPhrase(sentence_id, phrase.begin(), phrase.end());
phrase.clear();
}
if (c == '\t' || c == '\v') continue;
- // It's more than a space or tab: a newline.
+ // It's more than a space or tab: a newline.
if (sentence_content) {
++sentence_id;
sentence_content = false;
@@ -53,7 +53,7 @@ typedef unsigned int Sentence;
typedef std::vector<Sentence> Sentences;
} // namespace
-namespace detail {
+namespace detail {
const StringPiece kEndSentence("</s>");
@@ -61,7 +61,7 @@ class Arc {
public:
Arc() {}
- // For arcs from one vertex to another.
+ // For arcs from one vertex to another.
void SetPhrase(detail::Vertex &from, detail::Vertex &to, const Sentences &intersect) {
Set(to, intersect);
from_ = &from;
@@ -69,7 +69,7 @@ class Arc {
/* For arcs from before the n-gram begins to somewhere in the n-gram (right
* aligned). These have no from_ vertex; it implictly matches every
- * sentence. This also handles when the n-gram is a substring of a phrase.
+ * sentence. This also handles when the n-gram is a substring of a phrase.
*/
void SetRight(detail::Vertex &to, const Sentences &complete) {
Set(to, complete);
@@ -87,12 +87,12 @@ class Arc {
/* When this function returns:
* If Empty() then there's nothing left from this intersection.
*
- * If Current() == to then to is part of the intersection.
+ * If Current() == to then to is part of the intersection.
*
* Otherwise, Current() > to. In this case, to is not part of the
* intersection and neither is anything < Current(). To determine if
* any value >= Current() is in the intersection, call LowerBound again
- * with the value.
+ * with the value.
*/
void LowerBound(const Sentence to);
@@ -160,15 +160,15 @@ void Arc::Set(Vertex &to, const Sentences &sentences) {
void Vertex::LowerBound(const Sentence to) {
if (Empty()) return;
- // Union lower bound.
+ // Union lower bound.
while (true) {
Arc *top = incoming_.top();
if (top->Current() > to) {
current_ = top->Current();
return;
}
- // If top->Current() == to, we still need to verify that's an actual
- // element and not just a bound.
+ // If top->Current() == to, we still need to verify that's an actual
+ // element and not just a bound.
incoming_.pop();
top->LowerBound(to);
if (!top->Empty()) {
@@ -213,13 +213,13 @@ void BuildGraph(const Substrings &phrase, const std::vector<Hash> &hashes, detai
}
}
- // Phrases starting at the second or later word in the n-gram.
+ // Phrases starting at the second or later word in the n-gram.
Vertex *vertex_from = vertices;
for (const Hash *word_from = first_word + 1; word_from != &*hashes.end(); ++word_from, ++vertex_from) {
hash = 0;
Vertex *vertex_to = vertex_from + 1;
for (const Hash *word_to = word_from; ; ++word_to, ++vertex_to) {
- // Notice that word_to and vertex_to have the same index.
+ // Notice that word_to and vertex_to have the same index.
hash = util::MurmurHashNative(&hash, sizeof(uint64_t), *word_to);
// Now hash covers [word_from, word_to].
if (word_to == last_word) {
@@ -250,7 +250,7 @@ detail::Vertex &ConditionCommon::MakeGraph() {
vertices_.clear();
vertices_.resize(hashes_.size());
arcs_.clear();
- // One for every substring.
+ // One for every substring.
arcs_.resize(((hashes_.size() + 1) * hashes_.size()) / 2);
BuildGraph(substrings_, hashes_, &*vertices_.begin(), &*arcs_.begin());
return vertices_[hashes_.size() - 1];
diff --git a/lm/filter/phrase.hh b/lm/filter/phrase.hh
index e5898c9ae..5227ab246 100644
--- a/lm/filter/phrase.hh
+++ b/lm/filter/phrase.hh
@@ -27,7 +27,7 @@ class Substrings {
private:
/* This is the value in a hash table where the key is a string. It indicates
* four sets of sentences:
- * substring is sentences with a phrase containing the key as a substring.
+ * substring is sentences with a phrase containing the key as a substring.
* left is sentencess with a phrase that begins with the key (left aligned).
* right is sentences with a phrase that ends with the key (right aligned).
* phrase is sentences where the key is a phrase.
@@ -39,8 +39,8 @@ class Substrings {
/* Most of the CPU is hash table lookups, so let's not complicate it with
* vector equality comparisons. If a collision happens, the SentenceRelation
* structure will contain the union of sentence ids over the colliding strings.
- * In that case, the filter will be slightly more permissive.
- * The key here is the same as boost's hash of std::vector<std::string>.
+ * In that case, the filter will be slightly more permissive.
+ * The key here is the same as boost's hash of std::vector<std::string>.
*/
typedef boost::unordered_map<Hash, SentenceRelation> Table;
@@ -58,9 +58,9 @@ class Substrings {
LM_FILTER_PHRASE_METHOD(Phrase, phrase)
#pragma GCC diagnostic ignored "-Wuninitialized" // end != finish so there's always an initialization
- // sentence_id must be non-decreasing. Iterators are over words in the phrase.
+ // sentence_id must be non-decreasing. Iterators are over words in the phrase.
template <class Iterator> void AddPhrase(unsigned int sentence_id, const Iterator &begin, const Iterator &end) {
- // Iterate over all substrings.
+ // Iterate over all substrings.
for (Iterator start = begin; start != end; ++start) {
Hash hash = 0;
SentenceRelation *relation;
@@ -85,7 +85,7 @@ class Substrings {
};
// Read a file with one sentence per line containing tab-delimited phrases of
-// space-separated words.
+// space-separated words.
unsigned int ReadMultiple(std::istream &in, Substrings &out);
namespace detail {
@@ -94,7 +94,7 @@ extern const StringPiece kEndSentence;
template <class Iterator> void MakeHashes(Iterator i, const Iterator &end, std::vector<Hash> &hashes) {
hashes.clear();
if (i == end) return;
- // TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
+ // TODO: check strict phrase boundaries after <s> and before </s>. For now, just skip tags.
if ((i->data()[0] == '<') && (i->data()[i->size() - 1] == '>')) {
++i;
}
diff --git a/lm/filter/phrase_table_vocab_main.cc b/lm/filter/phrase_table_vocab_main.cc
index e0f47d894..e8a8d0265 100644
--- a/lm/filter/phrase_table_vocab_main.cc
+++ b/lm/filter/phrase_table_vocab_main.cc
@@ -88,7 +88,7 @@ class TargetWords {
class Input {
public:
- explicit Input(std::size_t max_length)
+ explicit Input(std::size_t max_length)
: max_length_(max_length), sentence_id_(0), empty_() {}
void AddSentence(StringPiece sentence, TargetWords &targets) {
@@ -125,7 +125,7 @@ class Input {
Map map_;
std::size_t sentence_id_;
-
+
// Temporaries in AddSentence.
std::string canonical_;
std::vector<std::size_t> starts_;
diff --git a/lm/filter/thread.hh b/lm/filter/thread.hh
index 6a6523f90..88e069cb1 100644
--- a/lm/filter/thread.hh
+++ b/lm/filter/thread.hh
@@ -13,29 +13,29 @@ namespace lm {
template <class OutputBuffer> class ThreadBatch {
public:
ThreadBatch() {}
-
+
void Reserve(size_t size) {
input_.Reserve(size);
output_.Reserve(size);
}
- // File reading thread.
+ // File reading thread.
InputBuffer &Fill(uint64_t sequence) {
sequence_ = sequence;
// Why wait until now to clear instead of after output? free in the same
- // thread as allocated.
+ // thread as allocated.
input_.Clear();
return input_;
}
- // Filter worker thread.
+ // Filter worker thread.
template <class Filter> void CallFilter(Filter &filter) {
input_.CallFilter(filter, output_);
}
uint64_t Sequence() const { return sequence_; }
- // File writing thread.
+ // File writing thread.
template <class RealOutput> void Flush(RealOutput &output) {
output_.Flush(output);
}
@@ -73,7 +73,7 @@ template <class Batch, class Output> class OutputWorker {
void operator()(Request request) {
assert(request->Sequence() >= base_sequence_);
- // Assemble the output in order.
+ // Assemble the output in order.
uint64_t pos = request->Sequence() - base_sequence_;
if (pos >= ordering_.size()) {
ordering_.resize(pos + 1, NULL);
@@ -102,7 +102,7 @@ template <class Filter, class OutputBuffer, class RealOutput> class Controller :
typedef ThreadBatch<OutputBuffer> Batch;
public:
- Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
+ Controller(size_t batch_size, size_t queue, size_t workers, const Filter &filter, RealOutput &output)
: batch_size_(batch_size), queue_size_(queue),
batches_(queue),
to_read_(queue),
diff --git a/lm/filter/vocab.cc b/lm/filter/vocab.cc
index 0a5585580..2aca4fc60 100644
--- a/lm/filter/vocab.cc
+++ b/lm/filter/vocab.cc
@@ -30,7 +30,7 @@ bool IsLineEnd(std::istream &in) {
}// namespace
// Read space separated words in enter separated lines. These lines can be
-// very long, so don't read an entire line at a time.
+// very long, so don't read an entire line at a time.
unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, std::vector<unsigned int> > &out) {
in.exceptions(std::istream::badbit);
unsigned int sentence = 0;
diff --git a/lm/filter/vocab.hh b/lm/filter/vocab.hh
index 2ee6e1f8a..397a93237 100644
--- a/lm/filter/vocab.hh
+++ b/lm/filter/vocab.hh
@@ -26,7 +26,7 @@ unsigned int ReadMultiple(std::istream &in, boost::unordered_map<std::string, st
/* Is this a special tag like <s> or <UNK>? This actually includes anything
* surrounded with < and >, which most tokenizers separate for real words, so
- * this should not catch real words as it looks at a single token.
+ * this should not catch real words as it looks at a single token.
*/
inline bool IsTag(const StringPiece &value) {
// The parser should never give an empty string.
diff --git a/lm/filter/wrapper.hh b/lm/filter/wrapper.hh
index 822c5c27d..227ec8e45 100644
--- a/lm/filter/wrapper.hh
+++ b/lm/filter/wrapper.hh
@@ -13,7 +13,7 @@ namespace lm {
// multiple-output filter so clients code against one interface.
template <class Binary> class BinaryFilter {
public:
- // Binary modes are just references (and a set) and it makes the API cleaner to copy them.
+ // Binary modes are just references (and a set) and it makes the API cleaner to copy them.
explicit BinaryFilter(Binary binary) : binary_(binary) {}
template <class Iterator, class Output> void AddNGram(const Iterator &begin, const Iterator &end, const StringPiece &line, Output &output) {
diff --git a/lm/left.hh b/lm/left.hh
index 36d613697..4d496863c 100644
--- a/lm/left.hh
+++ b/lm/left.hh
@@ -1,22 +1,22 @@
/* Efficient left and right language model state for sentence fragments.
* Intended usage:
- * Store ChartState with every chart entry.
+ * Store ChartState with every chart entry.
* To do a rule application:
- * 1. Make a ChartState object for your new entry.
- * 2. Construct RuleScore.
- * 3. Going from left to right, call Terminal or NonTerminal.
- * For terminals, just pass the vocab id.
+ * 1. Make a ChartState object for your new entry.
+ * 2. Construct RuleScore.
+ * 3. Going from left to right, call Terminal or NonTerminal.
+ * For terminals, just pass the vocab id.
* For non-terminals, pass that non-terminal's ChartState.
* If your decoder expects scores inclusive of subtree scores (i.e. you
* label entries with the highest-scoring path), pass the non-terminal's
- * score as prob.
+ * score as prob.
* If your decoder expects relative scores and will walk the chart later,
- * pass prob = 0.0.
+ * pass prob = 0.0.
* In other words, the only effect of prob is that it gets added to the
- * returned log probability.
- * 4. Call Finish. It returns the log probability.
+ * returned log probability.
+ * 4. Call Finish. It returns the log probability.
*
- * There's a couple more details:
+ * There's a couple more details:
* Do not pass <s> to Terminal as it is formally not a word in the sentence,
* only context. Instead, call BeginSentence. If called, it should be the
* first call after RuleScore is constructed (since <s> is always the
@@ -27,12 +27,12 @@
* Hashing and sorting comparison operators are provided. All state objects
* are POD. If you intend to use memcmp on raw state objects, you must call
* ZeroRemaining first, as the value of array entries beyond length is
- * otherwise undefined.
+ * otherwise undefined.
*
* Usage is of course not limited to chart decoding. Anything that generates
* sentence fragments missing left context could benefit. For example, a
* phrase-based decoder could pre-score phrases, storing ChartState with each
- * phrase, even if hypotheses are generated left-to-right.
+ * phrase, even if hypotheses are generated left-to-right.
*/
#ifndef LM_LEFT_H
@@ -77,7 +77,7 @@ template <class M> class RuleScore {
left_done_ = true;
}
- // Faster version of NonTerminal for the case where the rule begins with a non-terminal.
+ // Faster version of NonTerminal for the case where the rule begins with a non-terminal.
void BeginNonTerminal(const ChartState &in, float prob = 0.0) {
prob_ = prob;
*out_ = in;
@@ -86,7 +86,7 @@ template <class M> class RuleScore {
void NonTerminal(const ChartState &in, float prob = 0.0) {
prob_ += prob;
-
+
if (!in.left.length) {
if (in.left.full) {
for (const float *i = out_->right.backoff; i < out_->right.backoff + out_->right.length; ++i) prob_ += *i;
@@ -131,26 +131,26 @@ template <class M> class RuleScore {
return;
}
- // Right state was minimized, so it's already independent of the new words to the left.
+ // Right state was minimized, so it's already independent of the new words to the left.
if (in.right.length < in.left.length) {
out_->right = in.right;
return;
}
- // Shift existing words down. 
+ // Shift existing words down.
for (WordIndex *i = out_->right.words + next_use - 1; i >= out_->right.words; --i) {
*(i + in.right.length) = *i;
}
- // Add words from in.right.
+ // Add words from in.right.
std::copy(in.right.words, in.right.words + in.right.length, out_->right.words);
- // Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
+ // Assemble backoff composed on the existing state's backoff followed by the new state's backoff.
std::copy(in.right.backoff, in.right.backoff + in.right.length, out_->right.backoff);
std::copy(back, back + next_use, out_->right.backoff + in.right.length);
out_->right.length = in.right.length + next_use;
}
float Finish() {
- // An N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram. 
+ // An N-1-gram might extend left and right but we should still set full to true because it's an N-1-gram.
out_->left.full = left_done_ || (out_->left.length == model_.Order() - 1);
return prob_;
}
@@ -173,17 +173,17 @@ template <class M> class RuleScore {
back_in, // Backoffs to use
in.left.pointers[extend_length - 1], extend_length, // Words to be extended
back_out, // Backoffs for the next score
- next_use)); // Length of n-gram to use in next scoring.
+ next_use)); // Length of n-gram to use in next scoring.
if (next_use != out_->right.length) {
left_done_ = true;
if (!next_use) {
- // Early exit.
+ // Early exit.
out_->right = in.right;
prob_ += model_.UnRest(in.left.pointers + extend_length, in.left.pointers + in.left.length, extend_length + 1);
return true;
}
}
- // Continue scoring.
+ // Continue scoring.
return false;
}
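For reference, the protocol enumerated in the header comment above can be exercised with a short sketch like the following; the rule contents and helper name are hypothetical, and the model is assumed to be an already loaded lm::ngram::Model.

    #include "lm/left.hh"
    #include "lm/model.hh"

    // Hypothetical helper: score the rule "the X cat", where X is a previously
    // scored chart entry whose state is `child`.
    float ScoreRule(const lm::ngram::Model &model,
                    const lm::ngram::ChartState &child,
                    lm::ngram::ChartState &out) {
      lm::ngram::RuleScore<lm::ngram::Model> score(model, out);  // steps 1 and 2
      score.Terminal(model.GetVocabulary().Index("the"));        // step 3: terminal by vocab id
      score.NonTerminal(child, 0.0);                              // step 3: non-terminal by ChartState
      score.Terminal(model.GetVocabulary().Index("cat"));
      return score.Finish();                                      // step 4: log10 probability
    }

Because prob is passed as 0.0 for the non-terminal, Finish() returns only the rule's own contribution, matching the relative-score convention described in the header comment.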
diff --git a/lm/left_test.cc b/lm/left_test.cc
index b45614613..fdb641627 100644
--- a/lm/left_test.cc
+++ b/lm/left_test.cc
@@ -16,7 +16,7 @@ namespace {
#define Term(word) score.Terminal(m.GetVocabulary().Index(word));
#define VCheck(word, value) BOOST_CHECK_EQUAL(m.GetVocabulary().Index(word), value);
-// Apparently some Boost versions use templates and are pretty strict about types matching.
+// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
template <class M> void Short(const M &m) {
@@ -175,7 +175,7 @@ template <class M> void LookupVocab(const M &m, const StringPiece &str, std::vec
SLOPPY_CHECK_CLOSE(expect, RightToLeft(m, words, rest), 0.001); \
SLOPPY_CHECK_CLOSE(expect, TreeMiddle(m, words, rest), 0.001); \
-// Build sentences, or parts thereof, from right to left.
+// Build sentences, or parts thereof, from right to left.
template <class M> void GrowBig(const M &m, bool rest = false) {
std::vector<WordIndex> words;
float expect;
diff --git a/lm/lm_exception.hh b/lm/lm_exception.hh
index 8bb610812..85a5738eb 100644
--- a/lm/lm_exception.hh
+++ b/lm/lm_exception.hh
@@ -1,7 +1,7 @@
#ifndef LM_LM_EXCEPTION_H
#define LM_LM_EXCEPTION_H
-// Named to avoid conflict with util/exception.hh.
+// Named to avoid conflict with util/exception.hh.
#include "util/exception.hh"
#include "util/string_piece.hh"
diff --git a/lm/max_order.hh b/lm/max_order.hh
index 5f181f3fc..0ad1379e0 100644
--- a/lm/max_order.hh
+++ b/lm/max_order.hh
@@ -1,7 +1,7 @@
#ifndef LM_MAX_ORDER_H
#define LM_MAX_ORDER_H
/* IF YOUR BUILD SYSTEM PASSES -DKENLM_MAX_ORDER, THEN CHANGE THE BUILD SYSTEM.
- * If not, this is the default maximum order.
+ * If not, this is the default maximum order.
* Having this limit means that State can be
* (kMaxOrder - 1) * sizeof(float) bytes instead of
* sizeof(float*) + (kMaxOrder - 1) * sizeof(float) + malloc overhead
diff --git a/lm/model.hh b/lm/model.hh
index c67ae2eed..b2bbe3999 100644
--- a/lm/model.hh
+++ b/lm/model.hh
@@ -25,7 +25,7 @@ namespace lm {
namespace ngram {
namespace detail {
-// Should return the same results as SRI.
+// Should return the same results as SRI.
// ModelFacade typedefs Vocabulary so we use VocabularyT to avoid naming conflicts.
template <class Search, class VocabularyT> class GenericModel : public base::ModelFacade<GenericModel<Search, VocabularyT>, State, VocabularyT> {
private:
@@ -38,7 +38,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Get the size of memory that will be mapped given ngram counts. This
* does not include small non-mapped control structures, such as this class
- * itself.
+ * itself.
*/
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config = Config());
@@ -46,47 +46,47 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
* files must have the format expected by this class or you'll get an
* exception. So TrieModel can only load ARPA or binary created by
* TrieModel. To classify binary files, call RecognizeBinary in
- * lm/binary_format.hh.
+ * lm/binary_format.hh.
*/
explicit GenericModel(const char *file, const Config &config = Config());
/* Score p(new_word | in_state) and incorporate new_word into out_state.
* Note that in_state and out_state must be different references:
- * &in_state != &out_state.
+ * &in_state != &out_state.
*/
FullScoreReturn FullScore(const State &in_state, const WordIndex new_word, State &out_state) const;
/* Slower call without in_state. Try to remember state, but sometimes it
- * would cost too much memory or your decoder isn't set up properly. 
+ * would cost too much memory or your decoder isn't set up properly.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
* [context_rbegin, context_rend). The new_word is not part of the context
- * array unless you intend to repeat words.
+ * array unless you intend to repeat words.
*/
FullScoreReturn FullScoreForgotState(const WordIndex *context_rbegin, const WordIndex *context_rend, const WordIndex new_word, State &out_state) const;
/* Get the state for a context. Don't use this if you can avoid it. Use
* BeginSentenceState or NullContextState and extend from those. If
* you're only going to use this state to call FullScore once, use
- * FullScoreForgotState.
+ * FullScoreForgotState.
* To use this function, make an array of WordIndex containing the context
* vocabulary ids in reverse order. Then, pass the bounds of the array:
- * [context_rbegin, context_rend).
+ * [context_rbegin, context_rend).
*/
void GetState(const WordIndex *context_rbegin, const WordIndex *context_rend, State &out_state) const;
/* More efficient version of FullScore where a partial n-gram has already
- * been scored.
- * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
+ * been scored.
+ * NOTE: THE RETURNED .rest AND .prob ARE RELATIVE TO THE .rest RETURNED BEFORE.
*/
FullScoreReturn ExtendLeft(
- // Additional context in reverse order. This will update add_rend to
+ // Additional context in reverse order. This will update add_rend to
const WordIndex *add_rbegin, const WordIndex *add_rend,
- // Backoff weights to use.
+ // Backoff weights to use.
const float *backoff_in,
// extend_left returned by a previous query.
uint64_t extend_pointer,
- // Length of n-gram that the pointer corresponds to.
+ // Length of n-gram that the pointer corresponds to.
unsigned char extend_length,
// Where to write additional backoffs for [extend_length + 1, min(Order() - 1, return.ngram_length)]
float *backoff_out,
@@ -95,17 +95,17 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
/* Return probabilities minus rest costs for an array of pointers. The
* first length should be the length of the n-gram to which pointers_begin
- * points.
+ * points.
*/
float UnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const {
- // Compiler should optimize this if away.
+ // Compiler should optimize this if away.
return Search::kDifferentRest ? InternalUnRest(pointers_begin, pointers_end, first_length) : 0.0;
}
private:
FullScoreReturn ScoreExceptBackoff(const WordIndex *const context_rbegin, const WordIndex *const context_rend, const WordIndex new_word, State &out_state) const;
- // Score bigrams and above. Do not include backoff.
+ // Score bigrams and above. Do not include backoff.
void ResumeScore(const WordIndex *context_rbegin, const WordIndex *const context_rend, unsigned char starting_order_minus_2, typename Search::Node &node, float *backoff_out, unsigned char &next_use, FullScoreReturn &ret) const;
// Appears after Size in the cc file.
@@ -116,7 +116,7 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
float InternalUnRest(const uint64_t *pointers_begin, const uint64_t *pointers_end, unsigned char first_length) const;
BinaryFormat backing_;
-
+
VocabularyT vocab_;
Search search_;
@@ -124,8 +124,8 @@ template <class Search, class VocabularyT> class GenericModel : public base::Mod
} // namespace detail
-// Instead of typedef, inherit. This allows the Model etc to be forward declared.
-// Oh the joys of C and C++.
+// Instead of typedef, inherit. This allows the Model etc to be forward declared.
+// Oh the joys of C and C++.
#define LM_COMMA() ,
#define LM_NAME_MODEL(name, from)\
class name : public from {\
@@ -140,7 +140,7 @@ LM_NAME_MODEL(ArrayTrieModel, detail::GenericModel<trie::TrieSearch<DontQuantize
LM_NAME_MODEL(QuantTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::DontBhiksha> LM_COMMA() SortedVocabulary>);
LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<SeparatelyQuantize LM_COMMA() trie::ArrayBhiksha> LM_COMMA() SortedVocabulary>);
-// Default implementation. No real reason for it to be the default.
+// Default implementation. No real reason for it to be the default.
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;
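As a concrete illustration of the FullScore interface documented above, a minimal scoring loop might look like the sketch below; the ARPA file name is a placeholder and the words come from standard input.

    #include "lm/model.hh"

    #include <iostream>
    #include <string>

    int main() {
      using namespace lm::ngram;
      Model model("example.arpa");  // placeholder path; a binary built by build_binary also works
      State state(model.BeginSentenceState()), out_state;
      const Vocabulary &vocab = model.GetVocabulary();
      double total = 0.0;
      std::string word;
      while (std::cin >> word) {
        total += model.FullScore(state, vocab.Index(word), out_state).prob;  // log10 probability
        state = out_state;
      }
      std::cout << "Total: " << total << '\n';
      return 0;
    }

Note that state and out_state are distinct objects, as required by the comment on FullScore.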
diff --git a/lm/model_test.cc b/lm/model_test.cc
index 2e4b14fb4..d408d6fe4 100644
--- a/lm/model_test.cc
+++ b/lm/model_test.cc
@@ -7,7 +7,7 @@
#include <boost/test/unit_test.hpp>
#include <boost/test/floating_point_comparison.hpp>
-// Apparently some Boost versions use templates and are pretty strict about types matching.
+// Apparently some Boost versions use templates and are pretty strict about types matching.
#define SLOPPY_CHECK_CLOSE(ref, value, tol) BOOST_CHECK_CLOSE(static_cast<double>(ref), static_cast<double>(value), static_cast<double>(tol));
namespace lm {
@@ -118,7 +118,7 @@ template <class M> void Blanks(const M &model) {
AppendTest("not_found", 1, -1.995635 - 7.0 - 0.30103, true);
state = model.NullContextState();
- // higher looking is a blank.
+ // higher looking is a blank.
AppendTest("higher", 1, -1.509559, false);
AppendTest("looking", 2, -1.285941 - 0.30103, false);
@@ -150,7 +150,7 @@ template <class M> void Unknowns(const M &model) {
State preserve = state;
AppendTest("not_found2", 2, -15.0, true);
AppendTest("not_found3", 2, -15.0 - 2.0, true);
-
+
state = preserve;
AppendTest("however", 2, -4, true);
AppendTest("not_found3", 3, -6, true);
@@ -167,7 +167,7 @@ template <class M> void MinimalState(const M &model) {
AppendTest("foo", 1, -3.141592, true);
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 2, -6.0, true);
- // Has to include the backoff weight.
+ // Has to include the backoff weight.
BOOST_CHECK_EQUAL(1, state.length);
AppendTest("bar", 1, -2.718281 + 3.0, true);
BOOST_CHECK_EQUAL(1, state.length);
@@ -263,7 +263,7 @@ template <class M> void Stateless(const M &model) {
// the
AppendTest("the", 1, -4.04005, true);
StatelessTest(5, 5, 1, -4.04005);
- // No context of the.
+ // No context of the.
StatelessTest(5, 0, 1, -1.687872);
// biarritz
StatelessTest(6, 1, 1, -1.9889);
diff --git a/lm/model_type.hh b/lm/model_type.hh
index fbe1117a5..dcdc6ac7c 100644
--- a/lm/model_type.hh
+++ b/lm/model_type.hh
@@ -8,7 +8,7 @@ namespace ngram {
* and I want to preserve existing binary files. */
typedef enum {PROBING=0, REST_PROBING=1, TRIE=2, QUANT_TRIE=3, ARRAY_TRIE=4, QUANT_ARRAY_TRIE=5} ModelType;
-// Historical names.
+// Historical names.
const ModelType HASH_PROBING = PROBING;
const ModelType TRIE_SORTED = TRIE;
const ModelType QUANT_TRIE_SORTED = QUANT_TRIE;
diff --git a/lm/ngram_query.hh b/lm/ngram_query.hh
index 560853749..937fe2421 100644
--- a/lm/ngram_query.hh
+++ b/lm/ngram_query.hh
@@ -22,7 +22,7 @@ struct BasicPrint {
std::cout << "Total: " << total << " OOV: " << oov << '\n';
}
void Summary(double, double, uint64_t, uint64_t) {}
-
+
};
struct FullPrint : public BasicPrint {
@@ -31,7 +31,7 @@ struct FullPrint : public BasicPrint {
}
void Summary(double ppl_including_oov, double ppl_excluding_oov, uint64_t corpus_oov, uint64_t corpus_tokens) {
- std::cout <<
+ std::cout <<
"Perplexity including OOVs:\t" << ppl_including_oov << "\n"
"Perplexity excluding OOVs:\t" << ppl_excluding_oov << "\n"
"OOVs:\t" << corpus_oov << "\n"
diff --git a/lm/partial.hh b/lm/partial.hh
index 3e67d91c5..9e4e3522e 100644
--- a/lm/partial.hh
+++ b/lm/partial.hh
@@ -35,9 +35,9 @@ template <class Model> ExtendReturn ExtendLoop(
unsigned char i = 0;
unsigned char length = pointers_end - pointers;
- // If pointers_write is NULL, the existing left state is full, so we should use completed probabilities. 
+ // If pointers_write is NULL, the existing left state is full, so we should use completed probabilities.
if (pointers_write) {
- // Using full context, writing to new left state.
+ // Using full context, writing to new left state.
for (; i < length; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@@ -61,7 +61,7 @@ template <class Model> ExtendReturn ExtendLoop(
}
}
}
- // Using some of the new context.
+ // Using some of the new context.
for (; i < length && value.next_use; ++i) {
FullScoreReturn ret(model.ExtendLeft(
add_rbegin, add_rbegin + value.next_use,
@@ -73,7 +73,7 @@ template <class Model> ExtendReturn ExtendLoop(
value.adjust += ret.prob;
}
float unrest = model.UnRest(pointers + i, pointers_end, i + seen + 1);
- // Using none of the new context.
+ // Using none of the new context.
value.adjust += unrest;
std::copy(backoff_in, backoff_in + value.next_use, backoff_write);
@@ -100,7 +100,7 @@ template <class Model> float RevealBefore(const Model &model, const Right &revea
if (left.full) {
for (unsigned char i = 0; i < value.next_use; ++i) value.adjust += backoff_buffer[i];
} else {
- // If left wasn't full when it came in, put words into right state.
+ // If left wasn't full when it came in, put words into right state.
std::copy(reveal.words + seen, reveal.words + seen + value.next_use, right.words + right.length);
right.length += value.next_use;
left.full = value.make_full || (right.length == model.Order() - 1);
diff --git a/lm/partial_test.cc b/lm/partial_test.cc
index 8d309c85a..adb644fa6 100644
--- a/lm/partial_test.cc
+++ b/lm/partial_test.cc
@@ -123,7 +123,7 @@ BOOST_AUTO_TEST_CASE(EndSentence) {
before.words[1] = loin;
before.backoff[0] = -0.845098;
before.backoff[1] = 0.0;
-
+
before.length = 1;
BOOST_CHECK_CLOSE(-0.0410707, RevealBefore(m, before, 0, true, between.left, between.right), 0.001);
BOOST_CHECK_EQUAL(0, between.left.length);
@@ -159,7 +159,7 @@ void CheckAdjustment(const RestProbingModel &model, float expect, const Right &b
if (before_full) {
got += RevealBefore(model, before, before.length, true, between.left, between.right);
}
- // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
+ // Sometimes they're zero and BOOST_CHECK_CLOSE fails for this.
BOOST_CHECK(fabs(expect - got) < 0.001);
}
diff --git a/lm/quantize.cc b/lm/quantize.cc
index 273ea3989..02b5dbc0e 100644
--- a/lm/quantize.cc
+++ b/lm/quantize.cc
@@ -50,12 +50,12 @@ void SeparatelyQuantize::UpdateConfigFromBinary(const BinaryFormat &file, uint64
void SeparatelyQuantize::SetupMemory(void *base, unsigned char order, const Config &config) {
prob_bits_ = config.prob_bits;
backoff_bits_ = config.backoff_bits;
- // We need the reserved values.
+ // We need the reserved values.
if (config.prob_bits == 0) UTIL_THROW(ConfigException, "You can't quantize probability to zero");
if (config.backoff_bits == 0) UTIL_THROW(ConfigException, "You can't quantize backoff to zero");
if (config.prob_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing probability supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.prob_bits) << " bits.");
if (config.backoff_bits > 25) UTIL_THROW(ConfigException, "For efficiency reasons, quantizing backoff supports at most 25 bits. Currently you have requested " << static_cast<unsigned>(config.backoff_bits) << " bits.");
- // Reserve 8 byte header for bit counts.
+ // Reserve 8 byte header for bit counts.
actual_base_ = static_cast<uint8_t*>(base);
float *start = reinterpret_cast<float*>(actual_base_ + 8);
for (unsigned char i = 0; i < order - 2; ++i) {
diff --git a/lm/quantize.hh b/lm/quantize.hh
index 84a30872e..8500aceec 100644
--- a/lm/quantize.hh
+++ b/lm/quantize.hh
@@ -85,7 +85,7 @@ class DontQuantize {
void SetupMemory(void * /*start*/, unsigned char /*order*/, const Config & /*config*/) {}
static const bool kTrain = false;
- // These should never be called because kTrain is false.
+ // These should never be called because kTrain is false.
void Train(uint8_t /*order*/, std::vector<float> &/*prob*/, std::vector<float> &/*backoff*/) {}
void TrainProb(uint8_t, std::vector<float> &/*prob*/) {}
@@ -142,7 +142,7 @@ class SeparatelyQuantize {
static uint64_t Size(uint8_t order, const Config &config) {
uint64_t longest_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.prob_bits)) * sizeof(float);
uint64_t middle_table = (static_cast<uint64_t>(1) << static_cast<uint64_t>(config.backoff_bits)) * sizeof(float) + longest_table;
- // unigrams are currently not quantized so no need for a table.
+ // unigrams are currently not quantized so no need for a table.
    return (order - 2) * middle_table + longest_table + /* for the bit counts and alignment padding */ 8;
}
@@ -168,7 +168,7 @@ class SeparatelyQuantize {
float Rest() const { return Prob(); }
void Write(float prob, float backoff) const {
- util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
+ util::WriteInt57(address_.base, address_.offset, ProbBins().Bits() + BackoffBins().Bits(),
(ProbBins().EncodeProb(prob) << BackoffBins().Bits()) | BackoffBins().EncodeBackoff(backoff));
}
@@ -183,7 +183,7 @@ class SeparatelyQuantize {
class LongestPointer {
public:
LongestPointer(const SeparatelyQuantize &quant, const util::BitAddress &address) : table_(&quant.LongestTable()), address_(address) {}
-
+
LongestPointer() : address_(NULL, 0) {}
bool Found() const { return address_.base != NULL; }
@@ -206,7 +206,7 @@ class SeparatelyQuantize {
void SetupMemory(void *start, unsigned char order, const Config &config);
static const bool kTrain = true;
- // Assumes 0.0 is removed from backoff.
+ // Assumes 0.0 is removed from backoff.
void Train(uint8_t order, std::vector<float> &prob, std::vector<float> &backoff);
// Train just probabilities (for longest order).
void TrainProb(uint8_t order, std::vector<float> &prob);
diff --git a/lm/return.hh b/lm/return.hh
index 982ffd66a..ee1f25e94 100644
--- a/lm/return.hh
+++ b/lm/return.hh
@@ -9,7 +9,7 @@ struct FullScoreReturn {
// log10 probability
float prob;
- /* The length of n-gram matched. Do not use this for recombination.
+ /* The length of n-gram matched. Do not use this for recombination.
* Consider a model containing only the following n-grams:
* -1 foo
* -3.14 bar
@@ -18,9 +18,9 @@ struct FullScoreReturn {
*
* If you score ``bar'' then ngram_length is 1 and recombination state is the
* empty string because bar has zero backoff and does not extend to the
- * right.
- * If you score ``foo'' then ngram_length is 1 and recombination state is
- * ``foo''.
+ * right.
+ * If you score ``foo'' then ngram_length is 1 and recombination state is
+ * ``foo''.
*
* Ideally, keep output states around and compare them. Failing that,
* get out_state.ValidLength() and use that length for recombination.
@@ -29,7 +29,7 @@ struct FullScoreReturn {
/* Left extension information. If independent_left is set, then prob is
* independent of words to the left (up to additional backoff). Otherwise,
- * extend_left indicates how to efficiently extend further to the left.
+ * extend_left indicates how to efficiently extend further to the left.
*/
bool independent_left;
uint64_t extend_left; // Defined only if independent_left
diff --git a/lm/search_trie.cc b/lm/search_trie.cc
index 5b0f55fc8..a63985af6 100644
--- a/lm/search_trie.cc
+++ b/lm/search_trie.cc
@@ -517,7 +517,7 @@ template <class Quant, class Bhiksha> void BuildTrie(SortedFiles &files, std::ve
{
WriteEntries<Quant, Bhiksha> writer(contexts, quant, unigrams, out.middle_begin_, out.longest_, counts.size(), sri);
RecursiveInsert(counts.size(), counts[0], inputs, config.ProgressMessages(), "Writing trie", writer);
- // Write the last unigram entry, which is the end pointer for the bigrams.
+ // Write the last unigram entry, which is the end pointer for the bigrams.
writer.Unigram(counts[0]);
}
diff --git a/lm/sizes.cc b/lm/sizes.cc
index 55ad586c4..dd831c505 100644
--- a/lm/sizes.cc
+++ b/lm/sizes.cc
@@ -36,7 +36,7 @@ void ShowSizes(const std::vector<uint64_t> &counts, const lm::ngram::Config &con
long int length = std::max<long int>(2, static_cast<long int>(ceil(log10((double) max_length / divide))));
std::cerr << "Memory estimate for binary LM:\ntype ";
- // right align bytes.
+ // right align bytes.
for (long int i = 0; i < length - 2; ++i) std::cerr << ' ';
std::cerr << prefix << "B\n"
diff --git a/lm/state.hh b/lm/state.hh
index d9ba596ad..2195dee73 100644
--- a/lm/state.hh
+++ b/lm/state.hh
@@ -11,7 +11,7 @@ namespace lm {
namespace ngram {
// This is a POD but if you want memcmp to return the same as operator==, call
-// ZeroRemaining first.
+// ZeroRemaining first.
class State {
public:
bool operator==(const State &other) const {
@@ -19,7 +19,7 @@ class State {
return !memcmp(words, other.words, length * sizeof(WordIndex));
}
- // Three way comparison function.
+ // Three way comparison function.
int Compare(const State &other) const {
if (length != other.length) return length < other.length ? -1 : 1;
return memcmp(words, other.words, length * sizeof(WordIndex));
@@ -30,7 +30,7 @@ class State {
return memcmp(words, other.words, length * sizeof(WordIndex)) < 0;
}
- // Call this before using raw memcmp.
+ // Call this before using raw memcmp.
void ZeroRemaining() {
for (unsigned char i = length; i < KENLM_MAX_ORDER - 1; ++i) {
words[i] = 0;
@@ -40,8 +40,8 @@ class State {
unsigned char Length() const { return length; }
- // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
- // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
+ // You shouldn't need to touch anything below this line, but the members are public so FullState will qualify as a POD.
+ // This order minimizes total size of the struct if WordIndex is 64 bit, float is 32 bit, and alignment of 64 bit integers is 64 bit.
WordIndex words[KENLM_MAX_ORDER - 1];
float backoff[KENLM_MAX_ORDER - 1];
unsigned char length;
@@ -55,7 +55,7 @@ inline uint64_t hash_value(const State &state, uint64_t seed = 0) {
struct Left {
bool operator==(const Left &other) const {
- return
+ return
length == other.length &&
(!length || (pointers[length - 1] == other.pointers[length - 1] && full == other.full));
}
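A small sketch of the memcmp discipline described above; the helper name is hypothetical and the states are taken by value so the originals are untouched.

    #include "lm/state.hh"

    #include <cstring>

    // Raw-memory equality check: call ZeroRemaining first, because entries
    // beyond length are otherwise undefined.
    bool SameState(lm::ngram::State a, lm::ngram::State b) {
      a.ZeroRemaining();
      b.ZeroRemaining();
      return a.Length() == b.Length() &&
             std::memcmp(a.words, b.words, sizeof(a.words)) == 0;
    }

After ZeroRemaining, comparing the full words array agrees with operator==, which only inspects the first length entries.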
diff --git a/lm/trie.cc b/lm/trie.cc
index 93320a332..72ad54484 100644
--- a/lm/trie.cc
+++ b/lm/trie.cc
@@ -14,7 +14,7 @@ namespace {
class KeyAccessor {
public:
- KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits)
+ KeyAccessor(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_t total_bits)
: base_(reinterpret_cast<const uint8_t*>(base)), key_mask_(key_mask), key_bits_(key_bits), total_bits_(total_bits) {}
typedef uint64_t Key;
@@ -38,9 +38,9 @@ bool FindBitPacked(const void *base, uint64_t key_mask, uint8_t key_bits, uint8_
uint64_t BitPacked::BaseSize(uint64_t entries, uint64_t max_vocab, uint8_t remaining_bits) {
uint8_t total_bits = util::RequiredBits(max_vocab) + remaining_bits;
- // Extra entry for next pointer at the end.
+ // Extra entry for next pointer at the end.
// +7 then / 8 to round up bits and convert to bytes
- // +sizeof(uint64_t) so that ReadInt57 etc. don't segfault. 
+ // +sizeof(uint64_t) so that ReadInt57 etc. don't segfault.
// Note that this waste is O(order), not O(number of ngrams).
return ((1 + entries) * total_bits + 7) / 8 + sizeof(uint64_t);
}
@@ -100,7 +100,7 @@ template <class Bhiksha> util::BitAddress BitPackedMiddle<Bhiksha>::Find(WordInd
template <class Bhiksha> void BitPackedMiddle<Bhiksha>::FinishedLoading(uint64_t next_end, const Config &config) {
// Write at insert_index. . .
- uint64_t last_next_write = insert_index_ * total_bits_ +
+ uint64_t last_next_write = insert_index_ * total_bits_ +
// at the offset where the next pointers are stored.
(total_bits_ - bhiksha_.InlineBits());
bhiksha_.WriteNext(base_, last_next_write, insert_index_, next_end);
diff --git a/lm/trie.hh b/lm/trie.hh
index cd39298b5..b7f0458bf 100644
--- a/lm/trie.hh
+++ b/lm/trie.hh
@@ -18,7 +18,7 @@ struct NodeRange {
uint64_t begin, end;
};
-// TODO: if the number of unigrams is a concern, also bit pack these records.
+// TODO: if the number of unigrams is a concern, also bit pack these records.
struct UnigramValue {
ProbBackoff weights;
uint64_t next;
@@ -44,24 +44,24 @@ class UnigramPointer {
class Unigram {
public:
Unigram() {}
-
+
void Init(void *start) {
unigram_ = static_cast<UnigramValue*>(start);
}
-
+
static uint64_t Size(uint64_t count) {
- // +1 in case unknown doesn't appear. +1 for the final next.
+ // +1 in case unknown doesn't appear. +1 for the final next.
return (count + 2) * sizeof(UnigramValue);
}
-
+
const ProbBackoff &Lookup(WordIndex index) const { return unigram_[index].weights; }
-
+
ProbBackoff &Unknown() { return unigram_[0].weights; }
UnigramValue *Raw() {
return unigram_;
}
-
+
UnigramPointer Find(WordIndex word, NodeRange &next) const {
UnigramValue *val = unigram_ + word;
next.begin = val->next;
@@ -71,7 +71,7 @@ class Unigram {
private:
UnigramValue *unigram_;
-};
+};
class BitPacked {
public:
@@ -99,7 +99,7 @@ template <class Bhiksha> class BitPackedMiddle : public BitPacked {
public:
static uint64_t Size(uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const Config &config);
- // next_source need not be initialized.
+ // next_source need not be initialized.
BitPackedMiddle(void *base, uint8_t quant_bits, uint64_t entries, uint64_t max_vocab, uint64_t max_next, const BitPacked &next_source, const Config &config);
util::BitAddress Insert(WordIndex word);
diff --git a/lm/trie_sort.cc b/lm/trie_sort.cc
index c3f468746..33a2f96b5 100644
--- a/lm/trie_sort.cc
+++ b/lm/trie_sort.cc
@@ -27,7 +27,7 @@ namespace {
typedef util::SizedIterator NGramIter;
-// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams.
+// Proxy for an entry except there is some extra cruft between the entries. This is used to sort (n-1)-grams using the same memory as the sorted n-grams.
class PartialViewProxy {
public:
PartialViewProxy() : attention_size_(0), inner_() {}
@@ -64,7 +64,7 @@ class PartialViewProxy {
typedef util::SizedInnerIterator InnerIterator;
InnerIterator &Inner() { return inner_; }
- const InnerIterator &Inner() const { return inner_; }
+ const InnerIterator &Inner() const { return inner_; }
InnerIterator inner_;
};
@@ -78,7 +78,7 @@ FILE *DiskFlush(const void *mem_begin, const void *mem_end, const std::string &t
FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_prefix, std::size_t entry_size, unsigned char order) {
const size_t context_size = sizeof(WordIndex) * (order - 1);
- // Sort just the contexts using the same memory.
+ // Sort just the contexts using the same memory.
PartialIter context_begin(PartialViewProxy(begin + sizeof(WordIndex), entry_size, context_size));
PartialIter context_end(PartialViewProxy(end + sizeof(WordIndex), entry_size, context_size));
@@ -91,7 +91,7 @@ FILE *WriteContextFile(uint8_t *begin, uint8_t *end, const std::string &temp_pre
util::scoped_FILE out(util::FMakeTemp(temp_prefix));
- // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
+ // Write out to file and uniqueify at the same time. Could have used unique_copy if there was an appropriate OutputIterator.
if (context_begin == context_end) return out.release();
PartialIter i(context_begin);
util::WriteOrThrow(out.get(), i->Data(), context_size);
@@ -118,7 +118,7 @@ struct ThrowCombine {
}
};
-// Useful for context files that just contain records with no value.
+// Useful for context files that just contain records with no value.
struct FirstCombine {
void operator()(std::size_t entry_size, unsigned char /*order*/, const void *first, const void * /*second*/, FILE *out) const {
util::WriteOrThrow(out, first, entry_size);
@@ -172,7 +172,7 @@ void RecordReader::Overwrite(const void *start, std::size_t amount) {
util::WriteOrThrow(file_, start, amount);
long forward = entry_size_ - internal - amount;
#if !defined(_WIN32) && !defined(_WIN64)
- if (forward)
+ if (forward)
#endif
UTIL_THROW_IF(fseek(file_, forward, SEEK_CUR), util::ErrnoException, "Couldn't seek forwards past revision");
}
@@ -191,7 +191,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<u
PositiveProbWarn warn(config.positive_log_probability);
unigram_.reset(util::MakeTemp(file_prefix));
{
- // In case <unk> appears.
+ // In case <unk> appears.
size_t size_out = (counts[0] + 1) * sizeof(ProbBackoff);
util::scoped_mmap unigram_mmap(util::MapZeroedWrite(unigram_.get(), size_out), size_out);
Read1Grams(f, counts[0], vocab, reinterpret_cast<ProbBackoff*>(unigram_mmap.get()), warn);
@@ -199,7 +199,7 @@ SortedFiles::SortedFiles(const Config &config, util::FilePiece &f, std::vector<u
if (!vocab.SawUnk()) ++counts[0];
}
- // Only use as much buffer as we need.
+ // Only use as much buffer as we need.
size_t buffer_use = 0;
for (unsigned int order = 2; order < counts.size(); ++order) {
buffer_use = std::max<size_t>(buffer_use, static_cast<size_t>((sizeof(WordIndex) * order + 2 * sizeof(float)) * counts[order - 1]));
@@ -240,7 +240,7 @@ class Closer {
void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &file_prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size) {
ReadNGramHeader(f, order);
const size_t count = counts[order - 1];
- // Size of weights. Does it include backoff?
+ // Size of weights. Does it include backoff?
const size_t words_size = sizeof(WordIndex) * order;
const size_t weights_size = sizeof(float) + ((order == counts.size()) ? 0 : sizeof(float));
const size_t entry_size = words_size + weights_size;
@@ -264,9 +264,9 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
ReadNGram(f, order, vocab, it, *reinterpret_cast<ProbBackoff*>(out + words_size), warn);
}
}
- // Sort full records by full n-gram.
+ // Sort full records by full n-gram.
util::SizedProxy proxy_begin(begin, entry_size), proxy_end(out_end, entry_size);
- // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies.
+ // parallel_sort uses too much RAM. TODO: figure out why windows sort doesn't like my proxies.
#if defined(_WIN32) || defined(_WIN64)
std::stable_sort
#else
@@ -279,7 +279,7 @@ void SortedFiles::ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vo
done += (out_end - begin) / entry_size;
}
- // All individual files created. Merge them.
+ // All individual files created. Merge them.
while (files.size() > 1) {
files.push_back(MergeSortedFiles(files[0], files[1], file_prefix, weights_size, order, ThrowCombine()));
diff --git a/lm/trie_sort.hh b/lm/trie_sort.hh
index e5406d9b6..594efee51 100644
--- a/lm/trie_sort.hh
+++ b/lm/trie_sort.hh
@@ -1,4 +1,4 @@
-// Step of trie builder: create sorted files.
+// Step of trie builder: create sorted files.
#ifndef LM_TRIE_SORT_H
#define LM_TRIE_SORT_H
@@ -101,7 +101,7 @@ class SortedFiles {
private:
void ConvertToSorted(util::FilePiece &f, const SortedVocabulary &vocab, const std::vector<uint64_t> &counts, const std::string &prefix, unsigned char order, PositiveProbWarn &warn, void *mem, std::size_t mem_size);
-
+
util::scoped_fd unigram_;
util::scoped_FILE full_[KENLM_MAX_ORDER - 1], context_[KENLM_MAX_ORDER - 1];
diff --git a/lm/value.hh b/lm/value.hh
index 36e870848..d017d59fc 100644
--- a/lm/value.hh
+++ b/lm/value.hh
@@ -39,7 +39,7 @@ template <class Weights> class GenericProbingProxy {
const Weights *to_;
};
-// Basic proxy for trie unigrams.
+// Basic proxy for trie unigrams.
template <class Weights> class GenericTrieUnigramProxy {
public:
explicit GenericTrieUnigramProxy(const Weights &to) : to_(&to) {}
@@ -113,7 +113,7 @@ struct RestValue {
float Rest() const { return to_->rest; }
};
-// gcc 4.1 doesn't properly pack dependent types :-(. 
+// gcc 4.1 doesn't properly pack dependent types :-(.
#pragma pack(push)
#pragma pack(4)
struct ProbingEntry {
diff --git a/lm/value_build.cc b/lm/value_build.cc
index 3ec3dce2a..ac623a6d9 100644
--- a/lm/value_build.cc
+++ b/lm/value_build.cc
@@ -3,7 +3,7 @@
#include "lm/model.hh"
#include "lm/read_arpa.hh"
-namespace lm {
+namespace lm {
namespace ngram {
template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &config, unsigned int order, const typename Model::Vocabulary &vocab) {
@@ -12,8 +12,8 @@ template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &confi
for_lower.write_mmap = NULL;
for_lower.rest_lower_files.clear();
- // Unigram models aren't supported, so this is a custom loader.
- // TODO: optimize the unigram loading?
+ // Unigram models aren't supported, so this is a custom loader.
+ // TODO: optimize the unigram loading?
{
util::FilePiece uni(config.rest_lower_files[0].c_str());
std::vector<uint64_t> number;
@@ -44,7 +44,7 @@ template <class Model> LowerRestBuild<Model>::LowerRestBuild(const Config &confi
throw;
}
- // TODO: force/check same vocab.
+ // TODO: force/check same vocab.
}
template <class Model> LowerRestBuild<Model>::~LowerRestBuild() {
diff --git a/lm/value_build.hh b/lm/value_build.hh
index 6fd26ef8f..49989ab42 100644
--- a/lm/value_build.hh
+++ b/lm/value_build.hh
@@ -57,7 +57,7 @@ class MaxRestBuild {
return true;
}
- // Probing does need to go back to unigram.
+ // Probing does need to go back to unigram.
const static bool kMarkEvenLower = true;
};
diff --git a/lm/virtual_interface.hh b/lm/virtual_interface.hh
index e138ac14e..ea491fbf7 100644
--- a/lm/virtual_interface.hh
+++ b/lm/virtual_interface.hh
@@ -15,16 +15,16 @@ template <class T, class U, class V> class ModelFacade;
/* Vocabulary interface. Call Index(string) and get a word index for use in
* calling Model. It provides faster convenience functions for <s>, </s>, and
- * <unk> although you can also find these using Index.
+ * <unk> although you can also find these using Index.
*
* Some models do not load the mapping from index to string. If you need this,
* check if the model Vocabulary class implements such a function and access it
- * directly.
+ * directly.
*
* The Vocabulary object is always owned by the Model and can be retrieved from
* the Model using BaseVocabulary() for this abstract interface or
* GetVocabulary() for the actual implementation (in which case you'll need the
- * actual implementation of the Model too).
+ * actual implementation of the Model too).
*/
class Vocabulary {
public:
@@ -36,7 +36,7 @@ class Vocabulary {
/* Most implementations allow StringPiece lookups and need only override
* Index(StringPiece). SRI requires null termination and overrides all
- * three methods.
+ * three methods.
*/
virtual WordIndex Index(const StringPiece &str) const = 0;
virtual WordIndex Index(const std::string &str) const {
@@ -47,7 +47,7 @@ class Vocabulary {
}
protected:
- // Call SetSpecial afterward.
+ // Call SetSpecial afterward.
Vocabulary() {}
Vocabulary(WordIndex begin_sentence, WordIndex end_sentence, WordIndex not_found) {
@@ -59,13 +59,13 @@ class Vocabulary {
WordIndex begin_sentence_, end_sentence_, not_found_;
private:
- // Disable copy constructors. They're private and undefined.
+ // Disable copy constructors. They're private and undefined.
// Ersatz boost::noncopyable.
Vocabulary(const Vocabulary &);
Vocabulary &operator=(const Vocabulary &);
};
-/* There are two ways to access a Model.
+/* There are two ways to access a Model.
*
*
* OPTION 1: Access the Model directly (e.g. lm::ngram::Model in model.hh).
@@ -90,29 +90,29 @@ class Vocabulary {
* unsigned int Order() const;
*
* NB: In case you're wondering why the model implementation looks like it's
- * missing these methods, see facade.hh.
+ * missing these methods, see facade.hh.
*
* This is the fastest way to use a model and presents a normal State class to
- * be included in a hypothesis state structure.
+ * be included in a hypothesis state structure.
*
*
- * OPTION 2: Use the virtual interface below.
+ * OPTION 2: Use the virtual interface below.
*
- * The virtual interface allows you to decide which Model to use at runtime 
+ * The virtual interface allows you to decide which Model to use at runtime
* without templatizing everything on the Model type. However, each Model has
* its own State class, so a single State cannot be efficiently provided (it
* would require using the maximum memory of any Model's State or memory
* allocation with each lookup). This means you become responsible for
- * allocating memory with size StateSize() and passing it to the Score or
- * FullScore functions provided here.
+ * allocating memory with size StateSize() and passing it to the Score or
+ * FullScore functions provided here.
*
* For example, cdec has a std::string containing the entire state of a
* hypothesis. It can reserve StateSize bytes in this string for the model
- * state.
+ * state.
*
* All the State objects are POD, so it's ok to use raw memory for storing
* State.
- * in_state and out_state must not have the same address.
+ * in_state and out_state must not have the same address.
*/
class Model {
public:
@@ -148,7 +148,7 @@ class Model {
unsigned char order_;
- // Disable copy constructors. They're private and undefined.
+ // Disable copy constructors. They're private and undefined.
// Ersatz boost::noncopyable.
Model(const Model &);
Model &operator=(const Model &);
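To make OPTION 2 concrete, a minimal sketch follows; it assumes lm::ngram::LoadVirtual from lm/model.hh is used to pick the concrete Model at runtime, and the file name and word are placeholders.

    #include "lm/model.hh"              // for lm::ngram::LoadVirtual
    #include "lm/virtual_interface.hh"

    #include <string>

    #include <boost/scoped_ptr.hpp>

    float ScoreOneWord(const char *file) {
      boost::scoped_ptr<lm::base::Model> model(lm::ngram::LoadVirtual(file));
      // The caller owns the state memory: StateSize() bytes per state.
      std::string in_state(model->StateSize(), '\0'), out_state(model->StateSize(), '\0');
      model->BeginSentenceWrite(&in_state[0]);
      return model->BaseScore(&in_state[0],
                              model->BaseVocabulary().Index("example"),
                              &out_state[0]);
    }

Here a std::string serves as the raw state buffer, mirroring the cdec usage mentioned above.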
diff --git a/lm/vocab.cc b/lm/vocab.cc
index 4fad78964..f6d834323 100644
--- a/lm/vocab.cc
+++ b/lm/vocab.cc
@@ -20,15 +20,15 @@ namespace ngram {
namespace detail {
uint64_t HashForVocab(const char *str, std::size_t len) {
// This proved faster than Boost's hash in speed trials: total load time Murmur 67090000, Boost 72210000
- // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit.
+ // Chose to use 64A instead of native so binary format will be portable across 64 and 32 bit.
return util::MurmurHash64A(str, len, 0);
}
} // namespace detail
namespace {
-// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok.
+// Normally static initialization is a bad idea but MurmurHash is pure arithmetic, so this is ok.
const uint64_t kUnknownHash = detail::HashForVocab("<unk>", 5);
-// Sadly some LMs have <UNK>.
+// Sadly some LMs have <UNK>.
const uint64_t kUnknownCapHash = detail::HashForVocab("<UNK>", 5);
void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint64_t offset) {
@@ -38,7 +38,7 @@ void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint
util::ReadOrThrow(fd, check_unk, 6);
UTIL_THROW_IF(
memcmp(check_unk, "<unk>", 6),
- FormatLoadException,
+ FormatLoadException,
"Vocabulary words are in the wrong place. This could be because the binary file was built with stale gcc and old kenlm. Stale gcc, including the gcc distributed with RedHat and OS X, has a bug that ignores pragma pack for template-dependent types. New kenlm works around this, so you'll save memory but have to rebuild any binary files using the probing data structure.");
if (!enumerate) return;
enumerate->Add(0, "<unk>");
@@ -58,7 +58,7 @@ void ReadWords(int fd, EnumerateVocab *enumerate, WordIndex expected_count, uint
util::ReadOrThrow(fd, &next_char, 1);
buf.push_back(next_char);
}
- // Ok now we have null terminated strings.
+ // Ok now we have null terminated strings.
for (const char *i = buf.data(); i != buf.data() + buf.size();) {
std::size_t length = strlen(i);
enumerate->Add(index++, StringPiece(i, length));
@@ -83,13 +83,13 @@ void WriteWordsWrapper::Add(WordIndex index, const StringPiece &str) {
SortedVocabulary::SortedVocabulary() : begin_(NULL), end_(NULL), enumerate_(NULL) {}
uint64_t SortedVocabulary::Size(uint64_t entries, const Config &/*config*/) {
- // Lead with the number of entries.
+ // Lead with the number of entries.
return sizeof(uint64_t) + sizeof(uint64_t) * entries;
}
void SortedVocabulary::SetupMemory(void *start, std::size_t allocated, std::size_t entries, const Config &config) {
assert(allocated >= Size(entries, config));
- // Leave space for number of entries.
+ // Leave space for number of entries.
begin_ = reinterpret_cast<uint64_t*>(start) + 1;
end_ = begin_;
saw_unk_ = false;
@@ -122,7 +122,7 @@ WordIndex SortedVocabulary::Insert(const StringPiece &str) {
strings_to_enumerate_[end_ - begin_] = StringPiece(static_cast<const char*>(copied), str.size());
}
++end_;
- // This is 1 + the offset where it was inserted to make room for unk.
+ // This is 1 + the offset where it was inserted to make room for unk.
return end_ - begin_;
}
@@ -133,7 +133,7 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
util::JointSort(begin_, end_, values);
}
for (WordIndex i = 0; i < static_cast<WordIndex>(end_ - begin_); ++i) {
- // <unk> strikes again: +1 here.
+ // <unk> strikes again: +1 here.
enumerate_->Add(i + 1, strings_to_enumerate_[i]);
}
strings_to_enumerate_.clear();
@@ -142,7 +142,7 @@ void SortedVocabulary::FinishedLoading(ProbBackoff *reorder_vocab) {
util::JointSort(begin_, end_, reorder_vocab + 1);
}
SetSpecial(Index("<s>"), Index("</s>"), 0);
- // Save size. Excludes UNK.
+ // Save size. Excludes UNK.
*(reinterpret_cast<uint64_t*>(begin_) - 1) = end_ - begin_;
// Includes UNK.
bound_ = end_ - begin_ + 1;
@@ -161,7 +161,7 @@ const unsigned int kProbingVocabularyVersion = 0;
namespace detail {
struct ProbingVocabularyHeader {
- // Lowest unused vocab id. This is also the number of words, including <unk>.
+ // Lowest unused vocab id. This is also the number of words, including <unk>.
unsigned int version;
WordIndex bound;
};
@@ -198,7 +198,7 @@ void ProbingVocabulary::ConfigureEnumerate(EnumerateVocab *to, std::size_t /*max
WordIndex ProbingVocabulary::Insert(const StringPiece &str) {
uint64_t hashed = detail::HashForVocab(str);
- // Prevent unknown from going into the table.
+ // Prevent unknown from going into the table.
if (hashed == kUnknownHash || hashed == kUnknownCapHash) {
saw_unk_ = true;
return 0;
diff --git a/lm/vocab.hh b/lm/vocab.hh
index d6ae07b83..2659b9ba8 100644
--- a/lm/vocab.hh
+++ b/lm/vocab.hh
@@ -35,7 +35,7 @@ class WriteWordsWrapper : public EnumerateVocab {
WriteWordsWrapper(EnumerateVocab *inner);
~WriteWordsWrapper();
-
+
void Add(WordIndex index, const StringPiece &str);
const std::string &Buffer() const { return buffer_; }
@@ -46,7 +46,7 @@ class WriteWordsWrapper : public EnumerateVocab {
std::string buffer_;
};
-// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices.
+// Vocabulary based on sorted uniform find storing only uint64_t values and using their offsets as indices.
class SortedVocabulary : public base::Vocabulary {
public:
SortedVocabulary();
@@ -67,7 +67,7 @@ class SortedVocabulary : public base::Vocabulary {
// Size for purposes of file writing
static uint64_t Size(uint64_t entries, const Config &config);
- // Vocab words are [0, Bound()). Only valid after FinishedLoading/LoadedBinary. 
+ // Vocab words are [0, Bound()). Only valid after FinishedLoading/LoadedBinary.
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
@@ -79,7 +79,7 @@ class SortedVocabulary : public base::Vocabulary {
WordIndex Insert(const StringPiece &str);
- // Reorders reorder_vocab so that the IDs are sorted.
+ // Reorders reorder_vocab so that the IDs are sorted.
void FinishedLoading(ProbBackoff *reorder_vocab);
  // Trie stores the correct counts including <unk> in the header. If this was previously sized based on a count excluding <unk>, padding with 8 bytes will make it the correct size based on a count including <unk>.
@@ -98,7 +98,7 @@ class SortedVocabulary : public base::Vocabulary {
EnumerateVocab *enumerate_;
- // Actual strings. Used only when loading from ARPA and enumerate_ != NULL
+ // Actual strings. Used only when loading from ARPA and enumerate_ != NULL
util::Pool string_backing_;
std::vector<StringPiece> strings_to_enumerate_;
@@ -123,7 +123,7 @@ struct ProbingVocabularyEntry {
};
#pragma pack(pop)
-// Vocabulary storing a map from uint64_t to WordIndex.
+// Vocabulary storing a map from uint64_t to WordIndex.
class ProbingVocabulary : public base::Vocabulary {
public:
ProbingVocabulary();
@@ -137,7 +137,7 @@ class ProbingVocabulary : public base::Vocabulary {
// This just unwraps Config to get the probing_multiplier.
static uint64_t Size(uint64_t entries, const Config &config);
- // Vocab words are [0, Bound()).
+ // Vocab words are [0, Bound()).
WordIndex Bound() const { return bound_; }
// Everything else is for populating. I'm too lazy to hide and friend these, but you'll only get a const reference anyway.
diff --git a/lm/weights.hh b/lm/weights.hh
index da1963d83..f14312753 100644
--- a/lm/weights.hh
+++ b/lm/weights.hh
@@ -1,13 +1,13 @@
#ifndef LM_WEIGHTS_H
#define LM_WEIGHTS_H
-// Weights for n-grams. Probability and possibly a backoff.
+// Weights for n-grams. Probability and possibly a backoff.
namespace lm {
struct Prob {
float prob;
};
-// No inheritance so this will be a POD.
+// No inheritance so this will be a POD.
struct ProbBackoff {
float prob;
float backoff;
diff --git a/lm/wrappers/nplm.cc b/lm/wrappers/nplm.cc
index edc7b5b72..9bd7c1ed8 100644
--- a/lm/wrappers/nplm.cc
+++ b/lm/wrappers/nplm.cc
@@ -10,7 +10,7 @@
namespace lm {
namespace np {
-Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
+Vocabulary::Vocabulary(const nplm::vocabulary &vocab)
: base::Vocabulary(vocab.lookup_word("<s>"), vocab.lookup_word("</s>"), vocab.lookup_word("<unk>")),
vocab_(vocab), null_word_(vocab.lookup_word("<null>")) {}
@@ -60,7 +60,7 @@ nplm::neuralLM *LoadNPLM(const std::string &file) {
}
} // namespace
-Model::Model(const std::string &file, std::size_t cache)
+Model::Model(const std::string &file, std::size_t cache)
: base_instance_(LoadNPLM(file)), vocab_(base_instance_->get_vocabulary()), cache_size_(cache) {
  UTIL_THROW_IF(base_instance_->get_order() > NPLM_MAX_ORDER, util::Exception, "This NPLM has order " << (unsigned int)base_instance_->get_order() << " but the KenLM wrapper was compiled with " << NPLM_MAX_ORDER << ". Change the definition of NPLM_MAX_ORDER and recompile.");
// log10 compatible with backoff models.
diff --git a/lm/wrappers/nplm.hh b/lm/wrappers/nplm.hh
index 416281de2..82b38fdd7 100644
--- a/lm/wrappers/nplm.hh
+++ b/lm/wrappers/nplm.hh
@@ -9,7 +9,7 @@
#include <boost/scoped_ptr.hpp>
/* Wrapper to NPLM "by Ashish Vaswani, with contributions from David Chiang
- * and Victoria Fossum."
+ * and Victoria Fossum."
* http://nlg.isi.edu/software/nplm/
*/