diff options
author | Kenneth Heafield <github@kheafield.com> | 2013-01-24 16:07:46 +0400 |
---|---|---|
committer | Kenneth Heafield <github@kheafield.com> | 2013-01-24 16:07:46 +0400 |
commit | 03b077364a39b367a125092418278e9f4240c35f (patch) | |
tree | 4b410417dfec95f5bb1bf3f8ad1c3aad870bb979 /util | |
parent | 22bf1c77e9866f5010708b288a33957a24627481 (diff) |
KenLM 31a6644 resizable probing hash table, build fixes
Diffstat (limited to 'util')
-rw-r--r-- | util/file_piece.cc | 2 | ||||
-rw-r--r-- | util/probing_hash_table.hh | 92 | ||||
-rw-r--r-- | util/probing_hash_table_test.cc | 52 | ||||
-rw-r--r-- | util/scoped.cc | 6 | ||||
-rw-r--r-- | util/scoped.hh | 1 |
5 files changed, 147 insertions, 6 deletions
diff --git a/util/file_piece.cc b/util/file_piece.cc index 4d1438571..bed5f85af 100644 --- a/util/file_piece.cc +++ b/util/file_piece.cc @@ -120,7 +120,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s } Shift(); // gzip detect. - if ((position_end_ - position_) >= ReadCompressed::kMagicSize && ReadCompressed::DetectCompressedMagic(position_)) { + if ((position_end_ >= position_ + ReadCompressed::kMagicSize) && ReadCompressed::DetectCompressedMagic(position_)) { if (!fallback_to_read_) { at_end_ = false; TransitionToRead(); diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh index 6780489df..57866ff93 100644 --- a/util/probing_hash_table.hh +++ b/util/probing_hash_table.hh @@ -6,6 +6,7 @@ #include <algorithm> #include <cstddef> #include <functional> +#include <vector> #include <assert.h> #include <stdint.h> @@ -73,10 +74,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry assert(initialized_); #endif UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full."); - for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) { - if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } - if (++i == end_) { i = begin_; } - } + return UncheckedInsert(t); } // Return true if the value was found (and not inserted). This is consistent with Find but the opposite if hash_map! @@ -126,12 +124,96 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry } } - void Clear(Entry invalid) { + void Clear() { + Entry invalid; + invalid.SetKey(invalid_); std::fill(begin_, end_, invalid); entries_ = 0; } + // Return number of entries assuming no serialization went on. + std::size_t SizeNoSerialization() const { + return entries_; + } + + // Return memory size expected by Double. + std::size_t DoubleTo() const { + return buckets_ * 2 * sizeof(Entry); + } + + // Inform the table that it has double the amount of memory. + // Pass clear_new = false if you are sure the new memory is initialized + // properly (to invalid_) i.e. by mremap. + void Double(void *new_base, bool clear_new = true) { + begin_ = static_cast<MutableIterator>(new_base); + MutableIterator old_end = begin_ + buckets_; + buckets_ *= 2; + end_ = begin_ + buckets_; + if (clear_new) { + Entry invalid; + invalid.SetKey(invalid_); + std::fill(old_end, end_, invalid); + } + std::vector<Entry> rolled_over; + // Move roll-over entries to a buffer because they might not roll over anymore. This should be small. + for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) { + rolled_over.push_back(*i); + i->SetKey(invalid_); + } + /* Re-insert everything. Entries might go backwards to take over a + * recently opened gap, stay, move to new territory, or wrap around. If + * an entry wraps around, it might go to a pointer greater than i (which + * can happen at the beginning) and it will be revisited to possibly fill + * in a gap created later. + */ + Entry temp; + for (MutableIterator i = begin_; i != old_end; ++i) { + if (!equal_(i->GetKey(), invalid_)) { + temp = *i; + i->SetKey(invalid_); + UncheckedInsert(temp); + } + } + // Put the roll-over entries back in. + for (typename std::vector<Entry>::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) { + UncheckedInsert(*i); + } + } + + // Mostly for tests, check consistency of every entry. + void CheckConsistency() { + MutableIterator last; + for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {} + UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full"); + MutableIterator i; + // Beginning can be wrap-arounds. + for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) { + MutableIterator ideal = Ideal(*i); + UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_)); + } + MutableIterator pre_gap = i; + for (; i != end_; ++i) { + if (equal_(i->GetKey(), invalid_)) { + pre_gap = i; + continue; + } + MutableIterator ideal = Ideal(*i); + UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_)); + } + } + private: + template <class T> MutableIterator Ideal(const T &t) { + return begin_ + (hash_(t.GetKey()) % buckets_); + } + + template <class T> MutableIterator UncheckedInsert(const T &t) { + for (MutableIterator i(Ideal(t));;) { + if (equal_(i->GetKey(), invalid_)) { *i = t; return i; } + if (++i == end_) { i = begin_; } + } + } + MutableIterator begin_; std::size_t buckets_; MutableIterator end_; diff --git a/util/probing_hash_table_test.cc b/util/probing_hash_table_test.cc index be0fa8597..9f7948ce6 100644 --- a/util/probing_hash_table_test.cc +++ b/util/probing_hash_table_test.cc @@ -1,10 +1,14 @@ #include "util/probing_hash_table.hh" +#include "util/murmur_hash.hh" +#include "util/scoped.hh" + #define BOOST_TEST_MODULE ProbingHashTableTest #include <boost/test/unit_test.hpp> #include <boost/scoped_array.hpp> #include <boost/functional/hash.hpp> #include <stdio.h> +#include <stdlib.h> #include <string.h> #include <stdint.h> @@ -19,6 +23,10 @@ struct Entry { return key; } + void SetKey(unsigned char to) { + key = to; + } + uint64_t GetValue() const { return value; } @@ -46,5 +54,49 @@ BOOST_AUTO_TEST_CASE(simple) { BOOST_CHECK(!table.Find(2, i)); } +struct Entry64 { + uint64_t key; + typedef uint64_t Key; + + Entry64() {} + + explicit Entry64(uint64_t key_in) { + key = key_in; + } + + Key GetKey() const { return key; } + void SetKey(uint64_t to) { key = to; } +}; + +struct MurmurHashEntry64 { + std::size_t operator()(uint64_t value) const { + return util::MurmurHash64A(&value, 8); + } +}; + +typedef ProbingHashTable<Entry64, MurmurHashEntry64> Table64; + +BOOST_AUTO_TEST_CASE(Double) { + for (std::size_t initial = 19; initial < 30; ++initial) { + size_t size = Table64::Size(initial, 1.2); + scoped_malloc mem(MallocOrThrow(size)); + Table64 table(mem.get(), size, std::numeric_limits<uint64_t>::max()); + table.Clear(); + for (uint64_t i = 0; i < 19; ++i) { + table.Insert(Entry64(i)); + } + table.CheckConsistency(); + mem.call_realloc(table.DoubleTo()); + table.Double(mem.get()); + table.CheckConsistency(); + for (uint64_t i = 20; i < 40 ; ++i) { + table.Insert(Entry64(i)); + } + mem.call_realloc(table.DoubleTo()); + table.Double(mem.get()); + table.CheckConsistency(); + } +} + } // namespace } // namespace util diff --git a/util/scoped.cc b/util/scoped.cc index e7066ee47..b6972f14b 100644 --- a/util/scoped.cc +++ b/util/scoped.cc @@ -16,6 +16,12 @@ void *MallocOrThrow(std::size_t requested) { return ret; } +void *CallocOrThrow(std::size_t requested) { + void *ret; + UTIL_THROW_IF_ARG(!(ret = std::calloc(1, requested)), MallocException, (requested), "in calloc"); + return ret; +} + scoped_malloc::~scoped_malloc() { std::free(p_); } diff --git a/util/scoped.hh b/util/scoped.hh index d0a5aabd0..b642d0649 100644 --- a/util/scoped.hh +++ b/util/scoped.hh @@ -14,6 +14,7 @@ class MallocException : public ErrnoException { }; void *MallocOrThrow(std::size_t requested); +void *CallocOrThrow(std::size_t requested); class scoped_malloc { public: |