Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
-rw-r--r-- lm/builder/lmplz_main.cc | 31
-rw-r--r-- lm/builder/ngram.hh | 2
-rw-r--r-- lm/model.cc | 21
-rw-r--r-- lm/model.hh | 5
-rw-r--r-- lm/search_hashed.cc | 29
-rw-r--r-- lm/search_hashed.hh | 19
-rw-r--r-- lm/virtual_interface.hh | 3
-rw-r--r-- util/double-conversion/bignum-dtoa.h | 3
-rw-r--r-- util/double-conversion/bignum.h | 18
-rw-r--r-- util/double-conversion/cached-powers.h | 18
-rw-r--r-- util/double-conversion/diy-fp.h | 26
-rw-r--r-- util/double-conversion/double-conversion.h | 49
-rw-r--r-- util/double-conversion/fast-dtoa.h | 3
-rw-r--r-- util/double-conversion/fixed-dtoa.h | 3
-rw-r--r-- util/double-conversion/ieee.h | 59
-rw-r--r-- util/double-conversion/strtod.h | 3
-rw-r--r-- util/double-conversion/utils.h | 76
-rw-r--r-- util/file.cc | 14
-rw-r--r-- util/probing_hash_table.hh | 23
-rw-r--r-- util/proxy_iterator.hh | 25
-rw-r--r-- util/sized_iterator.hh | 27
-rw-r--r-- util/stream/chain.hh | 2
-rw-r--r-- util/string_piece_hash.hh | 2
23 files changed, 235 insertions, 226 deletions
diff --git a/lm/builder/lmplz_main.cc b/lm/builder/lmplz_main.cc
index 1e086dcce..2e3002d12 100644
--- a/lm/builder/lmplz_main.cc
+++ b/lm/builder/lmplz_main.cc
@@ -33,6 +33,8 @@ int main(int argc, char *argv[]) {
po::options_description options("Language model building options");
lm::builder::PipelineConfig pipeline;
+ std::string text, arpa;
+
options.add_options()
("order,o", po::value<std::size_t>(&pipeline.order)
#if BOOST_VERSION >= 104200
@@ -47,18 +49,21 @@ int main(int argc, char *argv[]) {
("vocab_estimate", po::value<lm::WordIndex>(&pipeline.vocab_estimate)->default_value(1000000), "Assume this vocabulary size for purposes of calculating memory in step 1 (corpus count) and pre-sizing the hash table")
("block_count", po::value<std::size_t>(&pipeline.block_count)->default_value(2), "Block count (per order)")
("vocab_file", po::value<std::string>(&pipeline.vocab_file)->default_value(""), "Location to write vocabulary file")
- ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.");
+ ("verbose_header", po::bool_switch(&pipeline.verbose_header), "Add a verbose header to the ARPA file that includes information such as token count, smoothing type, etc.")
+ ("text", po::value<std::string>(&text), "Read text from a file instead of stdin")
+ ("arpa", po::value<std::string>(&arpa), "Write ARPA to a file instead of stdout");
if (argc == 1) {
std::cerr <<
"Builds unpruned language models with modified Kneser-Ney smoothing.\n\n"
"Please cite:\n"
- "@inproceedings{kenlm,\n"
- "author = {Kenneth Heafield},\n"
- "title = {{KenLM}: Faster and Smaller Language Model Queries},\n"
- "booktitle = {Proceedings of the Sixth Workshop on Statistical Machine Translation},\n"
- "month = {July}, year={2011},\n"
- "address = {Edinburgh, UK},\n"
- "publisher = {Association for Computational Linguistics},\n"
+ "@inproceedings{Heafield-estimate,\n"
+ " author = {Kenneth Heafield and Ivan Pouzyrevsky and Jonathan H. Clark and Philipp Koehn},\n"
+ " title = {Scalable Modified {Kneser-Ney} Language Model Estimation},\n"
+ " year = {2013},\n"
+ " month = {8},\n"
+ " booktitle = {Proceedings of the 51st Annual Meeting of the Association for Computational Linguistics},\n"
+ " address = {Sofia, Bulgaria},\n"
+ " url = {http://kheafield.com/professional/edinburgh/estimate\\_paper.pdf},\n"
"}\n\n"
"Provide the corpus on stdin. The ARPA file will be written to stdout. Order of\n"
"the model (-o) is the only mandatory option. As this is an on-disk program,\n"
@@ -91,9 +96,17 @@ int main(int argc, char *argv[]) {
initial.adder_out.block_count = 2;
pipeline.read_backoffs = initial.adder_out;
+ util::scoped_fd in(0), out(1);
+ if (vm.count("text")) {
+ in.reset(util::OpenReadOrThrow(text.c_str()));
+ }
+ if (vm.count("arpa")) {
+ out.reset(util::CreateOrThrow(arpa.c_str()));
+ }
+
// Read from stdin
try {
- lm::builder::Pipeline(pipeline, 0, 1);
+ lm::builder::Pipeline(pipeline, in.release(), out.release());
} catch (const util::MallocException &e) {
std::cerr << e.what() << std::endl;
std::cerr << "Try rerunning with a more conservative -S setting than " << vm["memory"].as<std::string>() << std::endl;
diff --git a/lm/builder/ngram.hh b/lm/builder/ngram.hh
index 2984ed0b6..f5681516a 100644
--- a/lm/builder/ngram.hh
+++ b/lm/builder/ngram.hh
@@ -53,7 +53,7 @@ class NGram {
Payload &Value() { return *reinterpret_cast<Payload *>(end_); }
uint64_t &Count() { return Value().count; }
- const uint64_t Count() const { return Value().count; }
+ uint64_t Count() const { return Value().count; }
std::size_t Order() const { return end_ - begin_; }
diff --git a/lm/model.cc b/lm/model.cc
index a40fd2fb0..a26654a6f 100644
--- a/lm/model.cc
+++ b/lm/model.cc
@@ -304,5 +304,26 @@ template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::DontBhiks
template class GenericModel<trie::TrieSearch<SeparatelyQuantize, trie::ArrayBhiksha>, SortedVocabulary>;
} // namespace detail
+
+base::Model *LoadVirtual(const char *file_name, const Config &config, ModelType model_type) {
+ RecognizeBinary(file_name, model_type);
+ switch (model_type) {
+ case PROBING:
+ return new ProbingModel(file_name, config);
+ case REST_PROBING:
+ return new RestProbingModel(file_name, config);
+ case TRIE:
+ return new TrieModel(file_name, config);
+ case QUANT_TRIE:
+ return new QuantTrieModel(file_name, config);
+ case ARRAY_TRIE:
+ return new ArrayTrieModel(file_name, config);
+ case QUANT_ARRAY_TRIE:
+ return new QuantArrayTrieModel(file_name, config);
+ default:
+ UTIL_THROW(FormatLoadException, "Confused by model type " << model_type);
+ }
+}
+
} // namespace ngram
} // namespace lm
diff --git a/lm/model.hh b/lm/model.hh
index 13ff864e1..60f55110b 100644
--- a/lm/model.hh
+++ b/lm/model.hh
@@ -153,6 +153,11 @@ LM_NAME_MODEL(QuantArrayTrieModel, detail::GenericModel<trie::TrieSearch<Separat
typedef ::lm::ngram::ProbingVocabulary Vocabulary;
typedef ProbingModel Model;
+/* Autorecognize the file type, load, and return the virtual base class. Don't
+ * use the virtual base class if you can avoid it. Instead, use the above
+ * classes as template arguments to your own virtual feature function.*/
+base::Model *LoadVirtual(const char *file_name, const Config &config = Config(), ModelType if_arpa = PROBING);
+
} // namespace ngram
} // namespace lm
diff --git a/lm/search_hashed.cc b/lm/search_hashed.cc
index 2d6f15b23..62275d277 100644
--- a/lm/search_hashed.cc
+++ b/lm/search_hashed.cc
@@ -54,7 +54,7 @@ template <class Weights> class ActivateUnigram {
Weights *modify_;
};
-// Find the lower order entry, inserting blanks along the way as necessary.
+// Find the lower order entry, inserting blanks along the way as necessary.
template <class Value> void FindLower(
const std::vector<uint64_t> &keys,
typename Value::Weights &unigram,
@@ -64,7 +64,7 @@ template <class Value> void FindLower(
typename Value::ProbingEntry entry;
// Backoff will always be 0.0. We'll get the probability and rest in another pass.
entry.value.backoff = kNoExtensionBackoff;
- // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
+ // Go back and find the longest right-aligned entry, informing it that it extends left. Normally this will match immediately, but sometimes SRI is dumb.
for (int lower = keys.size() - 2; ; --lower) {
if (lower == -1) {
between.push_back(&unigram);
@@ -77,11 +77,11 @@ template <class Value> void FindLower(
}
}
-// Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here.
+// Between usually has single entry, the value to adjust. But sometimes SRI stupidly pruned entries so it has unitialized blank values to be set here.
template <class Added, class Build> void AdjustLower(
const Added &added,
const Build &build,
- std::vector<typename Build::Value::Weights *> &between,
+ std::vector<typename Build::Value::Weights *> &between,
const unsigned int n,
const std::vector<WordIndex> &vocab_ids,
typename Build::Value::Weights *unigrams,
@@ -93,14 +93,14 @@ template <class Added, class Build> void AdjustLower(
}
typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
float prob = -fabs(between.back()->prob);
- // Order of the n-gram on which probabilities are based.
+ // Order of the n-gram on which probabilities are based.
unsigned char basis = n - between.size();
assert(basis != 0);
typename Build::Value::Weights **change = &between.back();
// Skip the basis.
--change;
if (basis == 1) {
- // Hallucinate a bigram based on a unigram's backoff and a unigram probability.
+ // Hallucinate a bigram based on a unigram's backoff and a unigram probability.
float &backoff = unigrams[vocab_ids[1]].backoff;
SetExtension(backoff);
prob += backoff;
@@ -128,14 +128,14 @@ template <class Added, class Build> void AdjustLower(
typename std::vector<typename Value::Weights *>::const_iterator i(between.begin());
build.MarkExtends(**i, added);
const typename Value::Weights *longer = *i;
- // Everything has probability but is not marked as extending.
+ // Everything has probability but is not marked as extending.
for (++i; i != between.end(); ++i) {
build.MarkExtends(**i, *longer);
longer = *i;
}
}
-// Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds.
+// Continue marking lower entries even they know that they extend left. This is used for upper/lower bounds.
template <class Build> void MarkLower(
const std::vector<uint64_t> &keys,
const Build &build,
@@ -144,15 +144,15 @@ template <class Build> void MarkLower(
int start_order,
const typename Build::Value::Weights &longer) {
if (start_order == 0) return;
- typename util::ProbingHashTable<typename Build::Value::ProbingEntry, util::IdentityHash>::MutableIterator iter;
- // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
+ // Hopefully the compiler will realize that if MarkExtends always returns false, it can simplify this code.
for (int even_lower = start_order - 2 /* index in middle */; ; --even_lower) {
if (even_lower == -1) {
build.MarkExtends(unigram, longer);
return;
}
- middle[even_lower].UnsafeMutableFind(keys[even_lower], iter);
- if (!build.MarkExtends(iter->value, longer)) return;
+ if (!build.MarkExtends(
+ middle[even_lower].UnsafeMutableMustFind(keys[even_lower])->value,
+ longer)) return;
}
}
@@ -168,7 +168,6 @@ template <class Build, class Activate, class Store> void ReadNGrams(
Store &store,
PositiveProbWarn &warn) {
typedef typename Build::Value Value;
- typedef util::ProbingHashTable<typename Value::ProbingEntry, util::IdentityHash> Middle;
assert(n >= 2);
ReadNGramHeader(f, n);
@@ -186,7 +185,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
for (unsigned int h = 1; h < n - 1; ++h) {
keys[h] = detail::CombineWordHash(keys[h-1], vocab_ids[h+1]);
}
- // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0.
+ // Initially the sign bit is on, indicating it does not extend left. Most already have this but there might +0.0.
util::SetSign(entry.value.prob);
entry.key = keys[n-2];
@@ -203,7 +202,7 @@ template <class Build, class Activate, class Store> void ReadNGrams(
} // namespace
namespace detail {
-
+
template <class Value> uint8_t *HashedSearch<Value>::SetupMemory(uint8_t *start, const std::vector<uint64_t> &counts, const Config &config) {
std::size_t allocated = Unigram::Size(counts[0]);
unigram_ = Unigram(start, counts[0], allocated);
diff --git a/lm/search_hashed.hh b/lm/search_hashed.hh
index 005957967..9d067bc2e 100644
--- a/lm/search_hashed.hh
+++ b/lm/search_hashed.hh
@@ -71,7 +71,7 @@ template <class Value> class HashedSearch {
static const bool kDifferentRest = Value::kDifferentRest;
static const unsigned int kVersion = 0;
- // TODO: move probing_multiplier here with next binary file format update.
+ // TODO: move probing_multiplier here with next binary file format update.
static void UpdateConfigFromBinary(int, const std::vector<uint64_t> &, Config &) {}
static uint64_t Size(const std::vector<uint64_t> &counts, const Config &config) {
@@ -102,14 +102,9 @@ template <class Value> class HashedSearch {
return ret;
}
-#pragma GCC diagnostic ignored "-Wuninitialized"
MiddlePointer Unpack(uint64_t extend_pointer, unsigned char extend_length, Node &node) const {
node = extend_pointer;
- typename Middle::ConstIterator found;
- bool got = middle_[extend_length - 2].Find(extend_pointer, found);
- assert(got);
- (void)got;
- return MiddlePointer(found->value);
+ return MiddlePointer(middle_[extend_length - 2].MustFind(extend_pointer)->value);
}
MiddlePointer LookupMiddle(unsigned char order_minus_2, WordIndex word, Node &node, bool &independent_left, uint64_t &extend_pointer) const {
@@ -126,14 +121,14 @@ template <class Value> class HashedSearch {
}
LongestPointer LookupLongest(WordIndex word, const Node &node) const {
- // Sign bit is always on because longest n-grams do not extend left.
+ // Sign bit is always on because longest n-grams do not extend left.
typename Longest::ConstIterator found;
if (!longest_.Find(CombineWordHash(node, word), found)) return LongestPointer();
return LongestPointer(found->value.prob);
}
- // Generate a node without necessarily checking that it actually exists.
- // Optionally return false if it's know to not exist.
+ // Generate a node without necessarily checking that it actually exists.
+ // Optionally return false if it's know to not exist.
bool FastMakeNode(const WordIndex *begin, const WordIndex *end, Node &node) const {
assert(begin != end);
node = static_cast<Node>(*begin);
@@ -144,7 +139,7 @@ template <class Value> class HashedSearch {
}
private:
- // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
+ // Interpret config's rest cost build policy and pass the right template argument to ApplyBuild.
void DispatchBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const Config &config, const ProbingVocabulary &vocab, PositiveProbWarn &warn);
template <class Build> void ApplyBuild(util::FilePiece &f, const std::vector<uint64_t> &counts, const ProbingVocabulary &vocab, PositiveProbWarn &warn, const Build &build);
@@ -153,7 +148,7 @@ template <class Value> class HashedSearch {
public:
Unigram() {}
- Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
+ Unigram(void *start, uint64_t count, std::size_t /*allocated*/) :
unigram_(static_cast<typename Value::Weights*>(start))
#ifdef DEBUG
, count_(count)
diff --git a/lm/virtual_interface.hh b/lm/virtual_interface.hh
index 6a5a0196f..17f064b2c 100644
--- a/lm/virtual_interface.hh
+++ b/lm/virtual_interface.hh
@@ -6,6 +6,7 @@
#include "util/string_piece.hh"
#include <string>
+#include <string.h>
namespace lm {
namespace base {
@@ -119,7 +120,9 @@ class Model {
size_t StateSize() const { return state_size_; }
const void *BeginSentenceMemory() const { return begin_sentence_memory_; }
+ void BeginSentenceWrite(void *to) const { memcpy(to, begin_sentence_memory_, StateSize()); }
const void *NullContextMemory() const { return null_context_memory_; }
+ void NullContextWrite(void *to) const { memcpy(to, null_context_memory_, StateSize()); }
// Requires in_state != out_state
virtual float Score(const void *in_state, const WordIndex new_word, void *out_state) const = 0;
diff --git a/util/double-conversion/bignum-dtoa.h b/util/double-conversion/bignum-dtoa.h
index 652a4db9a..34b961992 100644
--- a/util/double-conversion/bignum-dtoa.h
+++ b/util/double-conversion/bignum-dtoa.h
@@ -30,8 +30,7 @@
#include "utils.h"
-namespace double_conversion
-{
+namespace double_conversion {
enum BignumDtoaMode {
// Return the shortest correct representation.
diff --git a/util/double-conversion/bignum.h b/util/double-conversion/bignum.h
index 5deadbfbe..5ec3544f5 100644
--- a/util/double-conversion/bignum.h
+++ b/util/double-conversion/bignum.h
@@ -30,12 +30,10 @@
#include "utils.h"
-namespace double_conversion
-{
+namespace double_conversion {
-class Bignum
-{
-public:
+class Bignum {
+ public:
// 3584 = 128 * 28. We can represent 2^3584 > 10^1000 accurately.
// This bignum can encode much bigger numbers, since it contains an
// exponent.
@@ -62,9 +60,7 @@ public:
void MultiplyByUInt32(uint32_t factor);
void MultiplyByUInt64(uint64_t factor);
void MultiplyByPowerOfTen(int exponent);
- void Times10() {
- return MultiplyByUInt32(10);
- }
+ void Times10() { return MultiplyByUInt32(10); }
// Pseudocode:
// int result = this / other;
// this = this % other;
@@ -101,7 +97,7 @@ public:
static bool PlusLess(const Bignum& a, const Bignum& b, const Bignum& c) {
return PlusCompare(a, b, c) < 0;
}
-private:
+ private:
typedef uint32_t Chunk;
typedef uint64_t DoubleChunk;
@@ -129,9 +125,7 @@ private:
// shift_amount must be < kBigitSize.
void BigitsShiftLeft(int shift_amount);
// BigitLength includes the "hidden" digits encoded in the exponent.
- int BigitLength() const {
- return used_digits_ + exponent_;
- }
+ int BigitLength() const { return used_digits_ + exponent_; }
Chunk BigitAt(int index) const;
void SubtractTimes(const Bignum& other, int factor);
diff --git a/util/double-conversion/cached-powers.h b/util/double-conversion/cached-powers.h
index 3daf52d51..61a50614c 100644
--- a/util/double-conversion/cached-powers.h
+++ b/util/double-conversion/cached-powers.h
@@ -30,12 +30,10 @@
#include "diy-fp.h"
-namespace double_conversion
-{
+namespace double_conversion {
-class PowersOfTenCache
-{
-public:
+class PowersOfTenCache {
+ public:
// Not all powers of ten are cached. The decimal exponent of two neighboring
// cached numbers will differ by kDecimalExponentDistance.
@@ -47,9 +45,9 @@ public:
// Returns a cached power-of-ten with a binary exponent in the range
// [min_exponent; max_exponent] (boundaries included).
static void GetCachedPowerForBinaryExponentRange(int min_exponent,
- int max_exponent,
- DiyFp* power,
- int* decimal_exponent);
+ int max_exponent,
+ DiyFp* power,
+ int* decimal_exponent);
// Returns a cached power of ten x ~= 10^k such that
// k <= decimal_exponent < k + kCachedPowersDecimalDistance.
@@ -57,8 +55,8 @@ public:
// kMinDecimalExponent <= requested_exponent, and
// requested_exponent < kMaxDecimalExponent + kDecimalExponentDistance.
static void GetCachedPowerForDecimalExponent(int requested_exponent,
- DiyFp* power,
- int* found_exponent);
+ DiyFp* power,
+ int* found_exponent);
};
} // namespace double_conversion
diff --git a/util/double-conversion/diy-fp.h b/util/double-conversion/diy-fp.h
index 39a6bd7dd..9dcf8fbdb 100644
--- a/util/double-conversion/diy-fp.h
+++ b/util/double-conversion/diy-fp.h
@@ -30,17 +30,15 @@
#include "utils.h"
-namespace double_conversion
-{
+namespace double_conversion {
// This "Do It Yourself Floating Point" class implements a floating-point number
// with a uint64 significand and an int exponent. Normalized DiyFp numbers will
// have the most significant bit of the significand set.
// Multiplication and Subtraction do not normalize their results.
// DiyFp are not designed to contain special doubles (NaN and Infinity).
-class DiyFp
-{
-public:
+class DiyFp {
+ public:
static const int kSignificandSize = 64;
DiyFp() : f_(0), e_(0) {}
@@ -102,21 +100,13 @@ public:
return result;
}
- uint64_t f() const {
- return f_;
- }
- int e() const {
- return e_;
- }
+ uint64_t f() const { return f_; }
+ int e() const { return e_; }
- void set_f(uint64_t new_value) {
- f_ = new_value;
- }
- void set_e(int new_value) {
- e_ = new_value;
- }
+ void set_f(uint64_t new_value) { f_ = new_value; }
+ void set_e(int new_value) { e_ = new_value; }
-private:
+ private:
static const uint64_t kUint64MSB = UINT64_2PART_C(0x80000000, 00000000);
uint64_t f_;
diff --git a/util/double-conversion/double-conversion.h b/util/double-conversion/double-conversion.h
index b3e51bae8..1c3387d4f 100644
--- a/util/double-conversion/double-conversion.h
+++ b/util/double-conversion/double-conversion.h
@@ -30,12 +30,10 @@
#include "utils.h"
-namespace double_conversion
-{
+namespace double_conversion {
-class DoubleToStringConverter
-{
-public:
+class DoubleToStringConverter {
+ public:
// When calling ToFixed with a double > 10^kMaxFixedDigitsBeforePoint
// or a requested_digits parameter > kMaxFixedDigitsAfterPoint then the
// function returns false.
@@ -114,20 +112,20 @@ public:
int decimal_in_shortest_high,
int max_leading_padding_zeroes_in_precision_mode,
int max_trailing_padding_zeroes_in_precision_mode)
- : flags_(flags),
- infinity_symbol_(infinity_symbol),
- nan_symbol_(nan_symbol),
- exponent_character_(exponent_character),
- decimal_in_shortest_low_(decimal_in_shortest_low),
- decimal_in_shortest_high_(decimal_in_shortest_high),
- max_leading_padding_zeroes_in_precision_mode_(
- max_leading_padding_zeroes_in_precision_mode),
- max_trailing_padding_zeroes_in_precision_mode_(
- max_trailing_padding_zeroes_in_precision_mode) {
+ : flags_(flags),
+ infinity_symbol_(infinity_symbol),
+ nan_symbol_(nan_symbol),
+ exponent_character_(exponent_character),
+ decimal_in_shortest_low_(decimal_in_shortest_low),
+ decimal_in_shortest_high_(decimal_in_shortest_high),
+ max_leading_padding_zeroes_in_precision_mode_(
+ max_leading_padding_zeroes_in_precision_mode),
+ max_trailing_padding_zeroes_in_precision_mode_(
+ max_trailing_padding_zeroes_in_precision_mode) {
// When 'trailing zero after the point' is set, then 'trailing point'
// must be set too.
ASSERT(((flags & EMIT_TRAILING_DECIMAL_POINT) != 0) ||
- !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
+ !((flags & EMIT_TRAILING_ZERO_AFTER_POINT) != 0));
}
// Returns a converter following the EcmaScript specification.
@@ -343,7 +341,7 @@ public:
int* length,
int* point);
-private:
+ private:
// Implementation for ToShortest and ToShortestSingle.
bool ToShortestIeeeNumber(double value,
StringBuilder* result_builder,
@@ -380,9 +378,8 @@ private:
};
-class StringToDoubleConverter
-{
-public:
+class StringToDoubleConverter {
+ public:
// Enumeration for allowing octals and ignoring junk when converting
// strings to numbers.
enum Flags {
@@ -491,11 +488,11 @@ public:
double junk_string_value,
const char* infinity_symbol,
const char* nan_symbol)
- : flags_(flags),
- empty_string_value_(empty_string_value),
- junk_string_value_(junk_string_value),
- infinity_symbol_(infinity_symbol),
- nan_symbol_(nan_symbol) {
+ : flags_(flags),
+ empty_string_value_(empty_string_value),
+ junk_string_value_(junk_string_value),
+ infinity_symbol_(infinity_symbol),
+ nan_symbol_(nan_symbol) {
}
// Performs the conversion.
@@ -519,7 +516,7 @@ public:
processed_characters_count, false));
}
-private:
+ private:
const int flags_;
const double empty_string_value_;
const double junk_string_value_;
diff --git a/util/double-conversion/fast-dtoa.h b/util/double-conversion/fast-dtoa.h
index 184f9cade..5f1e8eee5 100644
--- a/util/double-conversion/fast-dtoa.h
+++ b/util/double-conversion/fast-dtoa.h
@@ -30,8 +30,7 @@
#include "utils.h"
-namespace double_conversion
-{
+namespace double_conversion {
enum FastDtoaMode {
// Computes the shortest representation of the given input. The returned
diff --git a/util/double-conversion/fixed-dtoa.h b/util/double-conversion/fixed-dtoa.h
index 9383cb936..3bdd08e21 100644
--- a/util/double-conversion/fixed-dtoa.h
+++ b/util/double-conversion/fixed-dtoa.h
@@ -30,8 +30,7 @@
#include "utils.h"
-namespace double_conversion
-{
+namespace double_conversion {
// Produces digits necessary to print a given number with
// 'fractional_count' digits after the decimal point.
diff --git a/util/double-conversion/ieee.h b/util/double-conversion/ieee.h
index 0922129d5..839dc47d4 100644
--- a/util/double-conversion/ieee.h
+++ b/util/double-conversion/ieee.h
@@ -30,31 +30,17 @@
#include "diy-fp.h"
-namespace double_conversion
-{
+namespace double_conversion {
// We assume that doubles and uint64_t have the same endianness.
-static uint64_t double_to_uint64(double d)
-{
- return BitCast<uint64_t>(d);
-}
-static double uint64_to_double(uint64_t d64)
-{
- return BitCast<double>(d64);
-}
-static uint32_t float_to_uint32(float f)
-{
- return BitCast<uint32_t>(f);
-}
-static float uint32_to_float(uint32_t d32)
-{
- return BitCast<float>(d32);
-}
+static uint64_t double_to_uint64(double d) { return BitCast<uint64_t>(d); }
+static double uint64_to_double(uint64_t d64) { return BitCast<double>(d64); }
+static uint32_t float_to_uint32(float f) { return BitCast<uint32_t>(f); }
+static float uint32_to_float(uint32_t d32) { return BitCast<float>(d32); }
// Helper functions for doubles.
-class Double
-{
-public:
+class Double {
+ public:
static const uint64_t kSignMask = UINT64_2PART_C(0x80000000, 00000000);
static const uint64_t kExponentMask = UINT64_2PART_C(0x7FF00000, 00000000);
static const uint64_t kSignificandMask = UINT64_2PART_C(0x000FFFFF, FFFFFFFF);
@@ -127,7 +113,7 @@ public:
uint64_t d64 = AsUint64();
int biased_e =
- static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
+ static_cast<int>((d64 & kExponentMask) >> kPhysicalSignificandSize);
return biased_e - kExponentBias;
}
@@ -157,13 +143,13 @@ public:
bool IsNan() const {
uint64_t d64 = AsUint64();
return ((d64 & kExponentMask) == kExponentMask) &&
- ((d64 & kSignificandMask) != 0);
+ ((d64 & kSignificandMask) != 0);
}
bool IsInfinite() const {
uint64_t d64 = AsUint64();
return ((d64 & kExponentMask) == kExponentMask) &&
- ((d64 & kSignificandMask) == 0);
+ ((d64 & kSignificandMask) == 0);
}
int Sign() const {
@@ -211,9 +197,7 @@ public:
return physical_significand_is_zero && (Exponent() != kDenormalExponent);
}
- double value() const {
- return uint64_to_double(d64_);
- }
+ double value() const { return uint64_to_double(d64_); }
// Returns the significand size for a given order of magnitude.
// If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude.
@@ -237,7 +221,7 @@ public:
return Double(kNaN).value();
}
-private:
+ private:
static const int kExponentBias = 0x3FF + kPhysicalSignificandSize;
static const int kDenormalExponent = -kExponentBias + 1;
static const int kMaxExponent = 0x7FF - kExponentBias;
@@ -270,13 +254,12 @@ private:
biased_exponent = static_cast<uint64_t>(exponent + kExponentBias);
}
return (significand & kSignificandMask) |
- (biased_exponent << kPhysicalSignificandSize);
+ (biased_exponent << kPhysicalSignificandSize);
}
};
-class Single
-{
-public:
+class Single {
+ public:
static const uint32_t kSignMask = 0x80000000;
static const uint32_t kExponentMask = 0x7F800000;
static const uint32_t kSignificandMask = 0x007FFFFF;
@@ -306,7 +289,7 @@ public:
uint32_t d32 = AsUint32();
int biased_e =
- static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
+ static_cast<int>((d32 & kExponentMask) >> kPhysicalSignificandSize);
return biased_e - kExponentBias;
}
@@ -336,13 +319,13 @@ public:
bool IsNan() const {
uint32_t d32 = AsUint32();
return ((d32 & kExponentMask) == kExponentMask) &&
- ((d32 & kSignificandMask) != 0);
+ ((d32 & kSignificandMask) != 0);
}
bool IsInfinite() const {
uint32_t d32 = AsUint32();
return ((d32 & kExponentMask) == kExponentMask) &&
- ((d32 & kSignificandMask) == 0);
+ ((d32 & kSignificandMask) == 0);
}
int Sign() const {
@@ -390,9 +373,7 @@ public:
return physical_significand_is_zero && (Exponent() != kDenormalExponent);
}
- float value() const {
- return uint32_to_float(d32_);
- }
+ float value() const { return uint32_to_float(d32_); }
static float Infinity() {
return Single(kInfinity).value();
@@ -402,7 +383,7 @@ public:
return Single(kNaN).value();
}
-private:
+ private:
static const int kExponentBias = 0x7F + kPhysicalSignificandSize;
static const int kDenormalExponent = -kExponentBias + 1;
static const int kMaxExponent = 0xFF - kExponentBias;
diff --git a/util/double-conversion/strtod.h b/util/double-conversion/strtod.h
index 1d81078d2..ed0293b8f 100644
--- a/util/double-conversion/strtod.h
+++ b/util/double-conversion/strtod.h
@@ -30,8 +30,7 @@
#include "utils.h"
-namespace double_conversion
-{
+namespace double_conversion {
// The buffer must only contain digits in the range [0-9]. It must not
// contain a dot or a sign. It must not start with '0', and must not be empty.
diff --git a/util/double-conversion/utils.h b/util/double-conversion/utils.h
index 91f1e6c48..9ccb3b653 100644
--- a/util/double-conversion/utils.h
+++ b/util/double-conversion/utils.h
@@ -126,29 +126,25 @@ typedef unsigned __int64 uint64_t;
DISALLOW_COPY_AND_ASSIGN(TypeName)
#endif
-namespace double_conversion
-{
+namespace double_conversion {
static const int kCharSize = sizeof(char);
// Returns the maximum of the two parameters.
template <typename T>
-static T Max(T a, T b)
-{
+static T Max(T a, T b) {
return a < b ? b : a;
}
// Returns the minimum of the two parameters.
template <typename T>
-static T Min(T a, T b)
-{
+static T Min(T a, T b) {
return a < b ? a : b;
}
-inline int StrLength(const char* string)
-{
+inline int StrLength(const char* string) {
size_t length = strlen(string);
ASSERT(length == static_cast<size_t>(static_cast<int>(length)));
return static_cast<int>(length);
@@ -156,9 +152,8 @@ inline int StrLength(const char* string)
// This is a simplified version of V8's Vector class.
template <typename T>
-class Vector
-{
-public:
+class Vector {
+ public:
Vector() : start_(NULL), length_(0) {}
Vector(T* data, int length) : start_(data), length_(length) {
ASSERT(length == 0 || (length > 0 && data != NULL));
@@ -174,19 +169,13 @@ public:
}
// Returns the length of the vector.
- int length() const {
- return length_;
- }
+ int length() const { return length_; }
// Returns whether or not the vector is empty.
- bool is_empty() const {
- return length_ == 0;
- }
+ bool is_empty() const { return length_ == 0; }
// Returns the pointer to the start of the data in the vector.
- T* start() const {
- return start_;
- }
+ T* start() const { return start_; }
// Access individual vector elements - checks bounds in debug mode.
T& operator[](int index) const {
@@ -194,15 +183,11 @@ public:
return start_[index];
}
- T& first() {
- return start_[0];
- }
+ T& first() { return start_[0]; }
- T& last() {
- return start_[length_ - 1];
- }
+ T& last() { return start_[length_ - 1]; }
-private:
+ private:
T* start_;
int length_;
};
@@ -211,19 +196,14 @@ private:
// Helper class for building result strings in a character buffer. The
// purpose of the class is to use safe operations that checks the
// buffer bounds on all operations in debug mode.
-class StringBuilder
-{
-public:
+class StringBuilder {
+ public:
StringBuilder(char* buffer, int size)
- : buffer_(buffer, size), position_(0) { }
+ : buffer_(buffer, size), position_(0) { }
- ~StringBuilder() {
- if (!is_finalized()) Finalize();
- }
+ ~StringBuilder() { if (!is_finalized()) Finalize(); }
- int size() const {
- return buffer_.length();
- }
+ int size() const { return buffer_.length(); }
// Get the current position in the builder.
int position() const {
@@ -232,9 +212,7 @@ public:
}
// Reset the position.
- void Reset() {
- position_ = 0;
- }
+ void Reset() { position_ = 0; }
// Add a single character to the builder. It is not allowed to add
// 0-characters; use the Finalize() method to terminate the string
@@ -284,13 +262,11 @@ public:
return buffer_.start();
}
-private:
+ private:
Vector<char> buffer_;
int position_;
- bool is_finalized() const {
- return position_ < 0;
- }
+ bool is_finalized() const { return position_ < 0; }
DISALLOW_IMPLICIT_CONSTRUCTORS(StringBuilder);
};
@@ -320,11 +296,14 @@ private:
// enough that it can no longer see that you have cast one pointer type to
// another thus avoiding the warning.
template <class Dest, class Source>
-inline Dest BitCast(const Source& source)
-{
+inline Dest BitCast(const Source& source) {
// Compile time assertion: sizeof(Dest) == sizeof(Source)
// A compile error here means your Dest and Source have different sizes.
- typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1];
+ typedef char VerifySizesAreEqual[sizeof(Dest) == sizeof(Source) ? 1 : -1]
+#if __GNUC__ > 4 || __GNUC__ == 4 && __GNUC_MINOR__ >= 8
+ __attribute__((unused))
+#endif
+ ;
Dest dest;
memmove(&dest, &source, sizeof(dest));
@@ -332,8 +311,7 @@ inline Dest BitCast(const Source& source)
}
template <class Dest, class Source>
-inline Dest BitCast(Source* source)
-{
+inline Dest BitCast(Source* source) {
return BitCast<Dest>(reinterpret_cast<uintptr_t>(source));
}
diff --git a/util/file.cc b/util/file.cc
index c7d8e23b2..bef04cb1c 100644
--- a/util/file.cc
+++ b/util/file.cc
@@ -116,7 +116,7 @@ std::size_t GuardLarge(std::size_t size) {
// The following operating systems have broken read/write/pread/pwrite that
// only support up to 2^31.
#if defined(_WIN32) || defined(_WIN64) || defined(__APPLE__) || defined(OS_ANDROID)
- return std::min(static_cast<std::size_t>(INT_MAX), size);
+ return std::min(static_cast<std::size_t>(static_cast<unsigned>(-1)), size);
#else
return size;
#endif
@@ -209,7 +209,7 @@ void WriteOrThrow(int fd, const void *data_void, std::size_t size) {
#endif
errno = 0;
do {
- ret =
+ ret =
#if defined(_WIN32) || defined(_WIN64)
_write
#else
@@ -229,7 +229,7 @@ void WriteOrThrow(FILE *to, const void *data, std::size_t size) {
}
void FSyncOrThrow(int fd) {
-// Apparently windows doesn't have fsync?
+// Apparently windows doesn't have fsync?
#if !defined(_WIN32) && !defined(_WIN64)
UTIL_THROW_IF_ARG(-1 == fsync(fd), FDException, (fd), "while syncing");
#endif
@@ -248,7 +248,7 @@ template <> struct CheckOffT<8> {
typedef CheckOffT<sizeof(off_t)>::True IgnoredType;
#endif
-// Can't we all just get along?
+// Can't we all just get along?
void InternalSeek(int fd, int64_t off, int whence) {
if (
#if defined(_WIN32) || defined(_WIN64)
@@ -457,9 +457,9 @@ bool TryName(int fd, std::string &out) {
std::ostringstream convert;
convert << fd;
name += convert.str();
-
+
struct stat sb;
- if (-1 == lstat(name.c_str(), &sb))
+ if (-1 == lstat(name.c_str(), &sb))
return false;
out.resize(sb.st_size + 1);
ssize_t ret = readlink(name.c_str(), &out[0], sb.st_size + 1);
@@ -471,7 +471,7 @@ bool TryName(int fd, std::string &out) {
}
out.resize(ret);
// Don't use the non-file names.
- if (!out.empty() && out[0] != '/')
+ if (!out.empty() && out[0] != '/')
return false;
return true;
#endif
diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh
index 57866ff93..51a2944d9 100644
--- a/util/probing_hash_table.hh
+++ b/util/probing_hash_table.hh
@@ -109,9 +109,20 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, invalid_)) return false;
if (++i == end_) i = begin_;
- }
+ }
+ }
+
+ // Like UnsafeMutableFind, but the key must be there.
+ template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
+ for (MutableIterator i(begin_ + (hash_(key) % buckets_));;) {
+ Key got(i->GetKey());
+ if (equal_(got, key)) { return i; }
+ assert(!equal_(got, invalid_));
+ if (++i == end_) i = begin_;
+ }
}
+
template <class Key> bool Find(const Key key, ConstIterator &out) const {
#ifdef DEBUG
assert(initialized_);
@@ -124,6 +135,16 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
}
}
+ // Like Find but we're sure it must be there.
+ template <class Key> ConstIterator MustFind(const Key key) const {
+ for (ConstIterator i(begin_ + (hash_(key) % buckets_));;) {
+ Key got(i->GetKey());
+ if (equal_(got, key)) { return i; }
+ assert(!equal_(got, invalid_));
+ if (++i == end_) i = begin_;
+ }
+ }
+
void Clear() {
Entry invalid;
invalid.SetKey(invalid_);
diff --git a/util/proxy_iterator.hh b/util/proxy_iterator.hh
index 121a45fa3..0ee1716f4 100644
--- a/util/proxy_iterator.hh
+++ b/util/proxy_iterator.hh
@@ -6,11 +6,11 @@
/* This is a RandomAccessIterator that uses a proxy to access the underlying
* data. Useful for packing data at bit offsets but still using STL
- * algorithms.
+ * algorithms.
*
* Normally I would use boost::iterator_facade but some people are too lazy to
* install boost and still want to use my language model. It's amazing how
- * many operators an iterator has.
+ * many operators an iterator has.
*
* The Proxy needs to provide:
* class InnerIterator;
@@ -22,15 +22,15 @@
* operator<(InnerIterator)
* operator+=(std::ptrdiff_t)
* operator-(InnerIterator)
- * and of course whatever Proxy needs to dereference it.
+ * and of course whatever Proxy needs to dereference it.
*
- * It's also a good idea to specialize std::swap for Proxy.
+ * It's also a good idea to specialize std::swap for Proxy.
*/
namespace util {
template <class Proxy> class ProxyIterator {
private:
- // Self.
+ // Self.
typedef ProxyIterator<Proxy> S;
typedef typename Proxy::InnerIterator InnerIterator;
@@ -38,16 +38,21 @@ template <class Proxy> class ProxyIterator {
typedef std::random_access_iterator_tag iterator_category;
typedef typename Proxy::value_type value_type;
typedef std::ptrdiff_t difference_type;
- typedef Proxy reference;
+ typedef Proxy & reference;
typedef Proxy * pointer;
ProxyIterator() {}
- // For cast from non const to const.
+ // For cast from non const to const.
template <class AlternateProxy> ProxyIterator(const ProxyIterator<AlternateProxy> &in) : p_(*in) {}
explicit ProxyIterator(const Proxy &p) : p_(p) {}
- // p_'s operator= does value copying, but here we want iterator copying.
+ // p_'s swap does value swapping, but here we want iterator swapping
+ friend inline void swap(ProxyIterator<Proxy> &first, ProxyIterator<Proxy> &second) {
+ swap(first.I(), second.I());
+ }
+
+ // p_'s operator= does value copying, but here we want iterator copying.
S &operator=(const S &other) {
I() = other.I();
return *this;
@@ -72,8 +77,8 @@ template <class Proxy> class ProxyIterator {
std::ptrdiff_t operator-(const S &other) const { return I() - other.I(); }
- Proxy operator*() { return p_; }
- const Proxy operator*() const { return p_; }
+ Proxy &operator*() { return p_; }
+ const Proxy &operator*() const { return p_; }
Proxy *operator->() { return &p_; }
const Proxy *operator->() const { return &p_; }
Proxy operator[](std::ptrdiff_t amount) const { return *(*this + amount); }
diff --git a/util/sized_iterator.hh b/util/sized_iterator.hh
index eb2016b90..dce8f229a 100644
--- a/util/sized_iterator.hh
+++ b/util/sized_iterator.hh
@@ -36,6 +36,11 @@ class SizedInnerIterator {
void *Data() { return ptr_; }
std::size_t EntrySize() const { return size_; }
+ friend inline void swap(SizedInnerIterator &first, SizedInnerIterator &second) {
+ std::swap(first.ptr_, second.ptr_);
+ std::swap(first.size_, second.size_);
+ }
+
private:
uint8_t *ptr_;
std::size_t size_;
@@ -63,12 +68,22 @@ class SizedProxy {
const void *Data() const { return inner_.Data(); }
void *Data() { return inner_.Data(); }
-
+
+ /**
+ // TODO: this (deep) swap was recently added. Why? If any std heap sort etc.
+ // algs are using swap, that's going to be worse performance than using
+ // =. I'm not sure why we *want* a deep swap. If C++11 compilers are
+ // choosing between move constructor and swap, then we'd better implement a
+ // (deep) move constructor. It may also be that this is moot since I made
+ // ProxyIterator a reference and added a shallow ProxyIterator swap. (I
+ // need Ken or someone competent to judge whether that's correct also -
+ // let me know at graehl@gmail.com.)
+ */
friend void swap(SizedProxy &first, SizedProxy &second) {
std::swap_ranges(
- static_cast<char*>(first.inner_.Data()),
- static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
- static_cast<char*>(second.inner_.Data()));
+ static_cast<char*>(first.inner_.Data()),
+ static_cast<char*>(first.inner_.Data()) + first.inner_.EntrySize(),
+ static_cast<char*>(second.inner_.Data()));
}
private:
@@ -87,7 +102,7 @@ typedef ProxyIterator<SizedProxy> SizedIterator;
inline SizedIterator SizedIt(void *ptr, std::size_t size) { return SizedIterator(SizedProxy(ptr, size)); }
-// Useful wrapper for a comparison function i.e. sort.
+// Useful wrapper for a comparison function i.e. sort.
template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public std::binary_function<const Proxy &, const Proxy &, bool> {
public:
explicit SizedCompare(const Delegate &delegate = Delegate()) : delegate_(delegate) {}
@@ -106,7 +121,7 @@ template <class Delegate, class Proxy = SizedProxy> class SizedCompare : public
}
const Delegate &GetDelegate() const { return delegate_; }
-
+
private:
const Delegate delegate_;
};
diff --git a/util/stream/chain.hh b/util/stream/chain.hh
index 154b9b334..0cc83a852 100644
--- a/util/stream/chain.hh
+++ b/util/stream/chain.hh
@@ -122,7 +122,7 @@ class Chain {
threads_.push_back(new Thread(Complete(), kRecycle));
}
- Chain &operator>>(const Recycler &recycle) {
+ Chain &operator>>(const Recycler &) {
CompleteLoop();
return *this;
}
diff --git a/util/string_piece_hash.hh b/util/string_piece_hash.hh
index aa3e3dd45..f206b1d87 100644
--- a/util/string_piece_hash.hh
+++ b/util/string_piece_hash.hh
@@ -3,8 +3,6 @@
#include "util/string_piece.hh"
-#include <set>
-
#include <boost/functional/hash.hpp>
#include <boost/version.hpp>