Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/util
diff options
context:
space:
mode:
authorKenneth Heafield <github@kheafield.com>2015-09-10 18:04:09 +0300
committerKenneth Heafield <github@kheafield.com>2015-09-10 18:04:09 +0300
commit5732129bd45bb70be70eb38ad36949216c07d3bc (patch)
treed4c8703343b42682ad9ca7124371e6308cfe7682 /util
parent37f7326057d7e7af95548684ef997b9fee1a55ca (diff)
Update kenlm
Diffstat (limited to 'util')
-rw-r--r--util/CMakeLists.txt2
-rw-r--r--util/fixed_array.hh6
-rw-r--r--util/probing_hash_table.hh125
-rw-r--r--util/probing_hash_table_benchmark_main.cc141
-rw-r--r--util/stream/CMakeLists.txt2
-rw-r--r--util/stream/rewindable_stream.cc2
6 files changed, 225 insertions, 53 deletions
diff --git a/util/CMakeLists.txt b/util/CMakeLists.txt
index c52cdbc06..6f7f5e99b 100644
--- a/util/CMakeLists.txt
+++ b/util/CMakeLists.txt
@@ -80,7 +80,7 @@ if(BUILD_TESTING)
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
- target_link_libraries(${test} ${Boost_LIBRARIES})
+ target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# file_piece_test requires an extra command line parameter
if ("${test}" STREQUAL "file_piece_test")
diff --git a/util/fixed_array.hh b/util/fixed_array.hh
index 9083d39ee..c67e8ed95 100644
--- a/util/fixed_array.hh
+++ b/util/fixed_array.hh
@@ -11,10 +11,10 @@
namespace util {
/**
- * Defines a fixed-size collection.
+ * Defines an array with fixed maximum size.
*
- * Ever want an array of things by they don't have a default constructor or are
- * non-copyable? FixedArray allows constructing one at a time.
+ * Ever want an array of things but they don't have a default constructor or
+ * are non-copyable? FixedArray allows constructing one at a time.
*/
template <class T> class FixedArray {
public:
diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh
index f32b64ea3..53fbbe996 100644
--- a/util/probing_hash_table.hh
+++ b/util/probing_hash_table.hh
@@ -26,6 +26,65 @@ struct IdentityHash {
template <class T> T operator()(T arg) const { return arg; }
};
+class DivMod {
+ public:
+ explicit DivMod(std::size_t buckets) : buckets_(buckets) {}
+
+ static std::size_t RoundBuckets(std::size_t from) {
+ return from;
+ }
+
+ template <class It> It Ideal(It begin, uint64_t hash) const {
+ return begin + (hash % buckets_);
+ }
+
+ template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt end, OutIt &it) const {
+ if (++it == end) it = begin;
+ }
+
+ void Double() {
+ buckets_ *= 2;
+ }
+
+ private:
+ std::size_t buckets_;
+};
+
+class Power2Mod {
+ public:
+ explicit Power2Mod(std::size_t buckets) {
+ UTIL_THROW_IF(!buckets || (((buckets - 1) & buckets)), ProbingSizeException, "Size " << buckets << " is not a power of 2.");
+ mask_ = buckets - 1;
+ }
+
+ // Round up to next power of 2.
+ static std::size_t RoundBuckets(std::size_t from) {
+ --from;
+ from |= from >> 1;
+ from |= from >> 2;
+ from |= from >> 4;
+ from |= from >> 8;
+ from |= from >> 16;
+ from |= from >> 32;
+ return from + 1;
+ }
+
+ template <class It> It Ideal(It begin, uint64_t hash) const {
+ return begin + (hash & mask_);
+ }
+
+ template <class BaseIt, class OutIt> void Next(BaseIt begin, BaseIt /*end*/, OutIt &it) const {
+ it = begin + ((it - begin + 1) & mask_);
+ }
+
+ void Double() {
+ mask_ = (mask_ << 1) | 1;
+ }
+
+ private:
+ std::size_t mask_;
+};
+
template <class EntryT, class HashT, class EqualT> class AutoProbing;
/* Non-standard hash table
@@ -36,7 +95,7 @@ template <class EntryT, class HashT, class EqualT> class AutoProbing;
* Uses linear probing to find value.
* Only insert and lookup operations.
*/
-template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class ProbingHashTable {
+template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key>, class ModT = DivMod> class ProbingHashTable {
public:
typedef EntryT Entry;
typedef typename Entry::Key Key;
@@ -44,14 +103,15 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
typedef Entry *MutableIterator;
typedef HashT Hash;
typedef EqualT Equal;
+ typedef ModT Mod;
static uint64_t Size(uint64_t entries, float multiplier) {
- uint64_t buckets = std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries)));
+ uint64_t buckets = Mod::RoundBuckets(std::max(entries + 1, static_cast<uint64_t>(multiplier * static_cast<float>(entries))));
return buckets * sizeof(Entry);
}
// Must be assigned to later.
- ProbingHashTable() : entries_(0)
+ ProbingHashTable() : mod_(1), entries_(0)
#ifdef DEBUG
, initialized_(false)
#endif
@@ -59,11 +119,12 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
ProbingHashTable(void *start, std::size_t allocated, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal())
: begin_(reinterpret_cast<MutableIterator>(start)),
- buckets_(allocated / sizeof(Entry)),
- end_(begin_ + buckets_),
+ end_(begin_ + allocated / sizeof(Entry)),
+ buckets_(end_ - begin_),
invalid_(invalid),
hash_(hash_func),
equal_(equal_func),
+ mod_(end_ - begin_),
entries_(0)
#ifdef DEBUG
, initialized_(true)
@@ -75,6 +136,13 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
end_ = begin_ + buckets_;
}
+ MutableIterator Ideal(const Key key) {
+ return mod_.Ideal(begin_, hash_(key));
+ }
+ ConstIterator Ideal(const Key key) const {
+ return mod_.Ideal(begin_, hash_(key));
+ }
+
template <class T> MutableIterator Insert(const T &t) {
#ifdef DEBUG
assert(initialized_);
@@ -83,12 +151,12 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
return UncheckedInsert(t);
}
- // Return true if the value was found (and not inserted). This is consistent with Find but the opposite if hash_map!
+ // Return true if the value was found (and not inserted). This is consistent with Find but the opposite of hash_map!
template <class T> bool FindOrInsert(const T &t, MutableIterator &out) {
#ifdef DEBUG
assert(initialized_);
#endif
- for (MutableIterator i = Ideal(t.GetKey());;) {
+ for (MutableIterator i = Ideal(t.GetKey());;mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, t.GetKey())) { out = i; return true; }
if (equal_(got, invalid_)) {
@@ -97,7 +165,6 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
out = i;
return false;
}
- if (++i == end_) i = begin_;
}
}
@@ -108,44 +175,45 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
#ifdef DEBUG
assert(initialized_);
#endif
- for (MutableIterator i(Ideal(key));;) {
+ for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { out = i; return true; }
if (equal_(got, invalid_)) return false;
- if (++i == end_) i = begin_;
}
}
// Like UnsafeMutableFind, but the key must be there.
template <class Key> MutableIterator UnsafeMutableMustFind(const Key key) {
- for (MutableIterator i(Ideal(key));;) {
+ for (MutableIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { return i; }
assert(!equal_(got, invalid_));
- if (++i == end_) i = begin_;
}
}
-
- template <class Key> bool Find(const Key key, ConstIterator &out) const {
+ // Iterator is both input and output.
+ template <class Key> bool FindFromIdeal(const Key key, ConstIterator &i) const {
#ifdef DEBUG
assert(initialized_);
#endif
- for (ConstIterator i(Ideal(key));;) {
+ for (;; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
- if (equal_(got, key)) { out = i; return true; }
+ if (equal_(got, key)) return true;
if (equal_(got, invalid_)) return false;
- if (++i == end_) i = begin_;
}
}
+ template <class Key> bool Find(const Key key, ConstIterator &out) const {
+ out = Ideal(key);
+ return FindFromIdeal(key, out);
+ }
+
// Like Find but we're sure it must be there.
template <class Key> ConstIterator MustFind(const Key key) const {
- for (ConstIterator i(Ideal(key));;) {
+ for (ConstIterator i(Ideal(key));; mod_.Next(begin_, end_, i)) {
Key got(i->GetKey());
if (equal_(got, key)) { return i; }
assert(!equal_(got, invalid_));
- if (++i == end_) i = begin_;
}
}
@@ -174,6 +242,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
MutableIterator old_end = begin_ + buckets_;
buckets_ *= 2;
end_ = begin_ + buckets_;
+ mod_.Double();
if (clear_new) {
Entry invalid;
invalid.SetKey(invalid_);
@@ -230,26 +299,20 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
private:
friend class AutoProbing<Entry, Hash, Equal>;
- MutableIterator Ideal(const Key key) {
- return begin_ + (hash_(key) % buckets_);
- }
- ConstIterator Ideal(const Key key) const {
- return begin_ + (hash_(key) % buckets_);
- }
-
template <class T> MutableIterator UncheckedInsert(const T &t) {
- for (MutableIterator i(Ideal(t.GetKey()));;) {
+ for (MutableIterator i(Ideal(t.GetKey()));; mod_.Next(begin_, end_, i)) {
if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
- if (++i == end_) { i = begin_; }
}
}
MutableIterator begin_;
- std::size_t buckets_;
MutableIterator end_;
+ std::size_t buckets_;
Key invalid_;
Hash hash_;
Equal equal_;
+ Mod mod_;
+
std::size_t entries_;
#ifdef DEBUG
bool initialized_;
@@ -259,7 +322,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
// Resizable linear probing hash table. This owns the memory.
template <class EntryT, class HashT, class EqualT = std::equal_to<typename EntryT::Key> > class AutoProbing {
private:
- typedef ProbingHashTable<EntryT, HashT, EqualT> Backend;
+ typedef ProbingHashTable<EntryT, HashT, EqualT, Power2Mod> Backend;
public:
static std::size_t MemUsage(std::size_t size, float multiplier = 1.5) {
return Backend::Size(size, multiplier);
@@ -272,7 +335,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
typedef HashT Hash;
typedef EqualT Equal;
- AutoProbing(std::size_t initial_size = 10, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
+ AutoProbing(std::size_t initial_size = 5, const Key &invalid = Key(), const Hash &hash_func = Hash(), const Equal &equal_func = Equal()) :
allocated_(Backend::Size(initial_size, 1.5)), mem_(util::MallocOrThrow(allocated_)), backend_(mem_.get(), allocated_, invalid, hash_func, equal_func) {
threshold_ = initial_size * 1.2;
Clear();
diff --git a/util/probing_hash_table_benchmark_main.cc b/util/probing_hash_table_benchmark_main.cc
index 3e12290cf..c5129480f 100644
--- a/util/probing_hash_table_benchmark_main.cc
+++ b/util/probing_hash_table_benchmark_main.cc
@@ -1,10 +1,8 @@
+#include "util/file.hh"
#include "util/probing_hash_table.hh"
#include "util/scoped.hh"
#include "util/usage.hh"
-#include <boost/random/mersenne_twister.hpp>
-#include <boost/random/uniform_int_distribution.hpp>
-
#include <iostream>
namespace util {
@@ -16,34 +14,143 @@ struct Entry {
Key GetKey() const { return key; }
};
-typedef util::ProbingHashTable<Entry, util::IdentityHash> Table;
+// I don't care if this doesn't run on Windows. Empirically /dev/urandom was faster than boost::random's Mersenne Twister.
+class URandom {
+ public:
+ URandom() :
+ it_(buf_ + 1024), end_(buf_ + 1024),
+ file_(util::OpenReadOrThrow("/dev/urandom")) {}
+
+ uint64_t Get() {
+ if (it_ == end_) {
+ it_ = buf_;
+ util::ReadOrThrow(file_.get(), buf_, sizeof(buf_));
+ it_ = buf_;
+ }
+ return *it_++;
+ }
+
+ void Batch(uint64_t *begin, uint64_t *end) {
+ util::ReadOrThrow(file_.get(), begin, (end - begin) * sizeof(uint64_t));
+ }
+
+ private:
+ uint64_t buf_[1024];
+ uint64_t *it_, *end_;
+
+ util::scoped_fd file_;
+};
+
+struct PrefetchEntry {
+ uint64_t key;
+ const Entry *pointer;
+};
+
+const std::size_t kPrefetchSize = 4;
+template <class Table> class PrefetchQueue {
+ public:
+ explicit PrefetchQueue(Table &table) : table_(table), cur_(0), twiddle_(false) {
+ for (PrefetchEntry *i = entries_; i != entries_ + kPrefetchSize; ++i)
+ i->pointer = NULL;
+ }
+
+ void Add(uint64_t key) {
+ if (Cur().pointer) {
+ twiddle_ ^= table_.FindFromIdeal(Cur().key, Cur().pointer);
+ }
+ Cur().key = key;
+ Cur().pointer = table_.Ideal(key);
+ __builtin_prefetch(Cur().pointer, 0, 0);
+ Next();
+ }
+
+ bool Drain() {
+ if (Cur().pointer) {
+ for (PrefetchEntry *i = &Cur(); i < entries_ + kPrefetchSize; ++i) {
+ twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
+ }
+ }
+ for (PrefetchEntry *i = entries_; i < &Cur(); ++i) {
+ twiddle_ ^= table_.FindFromIdeal(i->key, i->pointer);
+ }
+ return twiddle_;
+ }
+
+ private:
+ PrefetchEntry &Cur() { return entries_[cur_]; }
+ void Next() {
+ ++cur_;
+ cur_ = cur_ % kPrefetchSize;
+ }
+
+ Table &table_;
+ PrefetchEntry entries_[kPrefetchSize];
+ std::size_t cur_;
-void Test(uint64_t entries, uint64_t lookups, float multiplier = 1.5) {
- std::size_t size = Table::Size(entries, multiplier);
+ bool twiddle_;
+
+ PrefetchQueue(const PrefetchQueue&);
+ void operator=(const PrefetchQueue&);
+};
+
+/*template <class Table> class Immediate {
+ public:
+
+ private:
+ Table &table_;
+};*/
+
+std::size_t Size(uint64_t entries, float multiplier = 1.5) {
+ typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Power2Mod> Table;
+ // Always round up to power of 2 for fair comparison.
+ return Power2Mod::RoundBuckets(Table::Size(entries, multiplier) / sizeof(Entry)) * sizeof(Entry);
+}
+
+template <class Mod> bool Test(URandom &rn, uint64_t entries, const uint64_t *const queries_begin, const uint64_t *const queries_end, float multiplier = 1.5) {
+ typedef util::ProbingHashTable<Entry, util::IdentityHash, std::equal_to<Entry::Key>, Mod> Table;
+ std::size_t size = Size(entries, multiplier);
scoped_malloc backing(util::CallocOrThrow(size));
Table table(backing.get(), size);
- boost::random::mt19937 gen;
- boost::random::uniform_int_distribution<> dist(std::numeric_limits<uint64_t>::min(), std::numeric_limits<uint64_t>::max());
+
double start = UserTime();
for (uint64_t i = 0; i < entries; ++i) {
Entry entry;
- entry.key = dist(gen);
+ entry.key = rn.Get();
table.Insert(entry);
}
- double inserted = UserTime();
+ double inserted = UserTime() - start;
+ double before_lookup = UserTime();
+ PrefetchQueue<Table> queue(table);
+ for (const uint64_t *i = queries_begin; i != queries_end; ++i) {
+ queue.Add(*i);
+/* typename Table::ConstIterator it;
+ meaningless ^= table.Find(*i, it);*/
+ }
+ bool meaningless = queue.Drain();
+ std::cout << entries << ' ' << size << ' ' << (inserted / static_cast<double>(entries)) << ' ' << (UserTime() - before_lookup) / static_cast<double>(queries_end - queries_begin) << '\n';
+ return meaningless;
+}
+
+template <class Mod> bool TestRun(uint64_t lookups = 20000000, float multiplier = 1.5) {
+ URandom rn;
+ util::scoped_malloc queries(util::CallocOrThrow(lookups * sizeof(uint64_t)));
+ rn.Batch(static_cast<uint64_t*>(queries.get()), static_cast<uint64_t*>(queries.get()) + lookups);
+ uint64_t physical_mem_limit = util::GuessPhysicalMemory() / 2;
bool meaningless = true;
- for (uint64_t i = 0; i < lookups; ++i) {
- Table::ConstIterator it;
- meaningless ^= table.Find(dist(gen), it);
+ for (uint64_t i = 4; Size(i / multiplier) < physical_mem_limit; i *= 4) {
+ meaningless ^= util::Test<Mod>(rn, i / multiplier, static_cast<const uint64_t*>(queries.get()), static_cast<const uint64_t*>(queries.get()) + lookups, multiplier);
}
- std::cout << meaningless << ' ' << entries << ' ' << multiplier << ' ' << (inserted - start) << ' ' << (UserTime() - inserted) / static_cast<double>(lookups) << std::endl;
+ return meaningless;
}
} // namespace
} // namespace util
int main() {
- for (uint64_t i = 1; i <= 10000000ULL; i *= 10) {
- util::Test(i, 10000000);
- }
+ bool meaningless = false;
+ std::cout << "#Integer division\n";
+ meaningless ^= util::TestRun<util::DivMod>();
+ std::cout << "#Masking\n";
+ meaningless ^= util::TestRun<util::Power2Mod>();
+ std::cerr << "Meaningless: " << meaningless << '\n';
}
diff --git a/util/stream/CMakeLists.txt b/util/stream/CMakeLists.txt
index d3eddbe5c..3e47f73e6 100644
--- a/util/stream/CMakeLists.txt
+++ b/util/stream/CMakeLists.txt
@@ -55,7 +55,7 @@ if(BUILD_TESTING)
set_target_properties(${test} PROPERTIES COMPILE_FLAGS -DBOOST_TEST_DYN_LINK)
# Link the executable against boost
- target_link_libraries(${test} ${Boost_LIBRARIES})
+ target_link_libraries(${test} ${Boost_LIBRARIES} pthread)
# Specify command arguments for how to run each unit test
#
diff --git a/util/stream/rewindable_stream.cc b/util/stream/rewindable_stream.cc
index 2867bf8ab..726e2a72d 100644
--- a/util/stream/rewindable_stream.cc
+++ b/util/stream/rewindable_stream.cc
@@ -1,6 +1,8 @@
#include "util/stream/rewindable_stream.hh"
#include "util/pcqueue.hh"
+#include <iostream>
+
namespace util {
namespace stream {