
github.com/moses-smt/mosesdecoder.git
author     Kenneth Heafield <github@kheafield.com>   2013-01-24 16:07:46 +0400
committer  Kenneth Heafield <github@kheafield.com>   2013-01-24 16:07:46 +0400
commit     03b077364a39b367a125092418278e9f4240c35f (patch)
tree       4b410417dfec95f5bb1bf3f8ad1c3aad870bb979 /util
parent     22bf1c77e9866f5010708b288a33957a24627481 (diff)
KenLM 31a6644 resizable probing hash table, build fixes
Diffstat (limited to 'util')
-rw-r--r--  util/file_piece.cc               |  2
-rw-r--r--  util/probing_hash_table.hh       | 92
-rw-r--r--  util/probing_hash_table_test.cc  | 52
-rw-r--r--  util/scoped.cc                   |  6
-rw-r--r--  util/scoped.hh                   |  1
5 files changed, 147 insertions, 6 deletions
diff --git a/util/file_piece.cc b/util/file_piece.cc
index 4d1438571..bed5f85af 100644
--- a/util/file_piece.cc
+++ b/util/file_piece.cc
@@ -120,7 +120,7 @@ void FilePiece::Initialize(const char *name, std::ostream *show_progress, std::s
}
Shift();
// gzip detect.
- if ((position_end_ - position_) >= ReadCompressed::kMagicSize && ReadCompressed::DetectCompressedMagic(position_)) {
+ if ((position_end_ >= position_ + ReadCompressed::kMagicSize) && ReadCompressed::DetectCompressedMagic(position_)) {
if (!fallback_to_read_) {
at_end_ = false;
TransitionToRead();
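
The file_piece.cc change is a one-liner, but the motivation is easy to miss: position_end_ - position_ yields a signed ptrdiff_t, and comparing it against the unsigned kMagicSize converts the difference to std::size_t, so a negative difference would wrap and pass the check. The rewritten form is a plain pointer comparison with no signedness conversion. A minimal standalone sketch of the pitfall (hypothetical buffer and values; whether FilePiece can actually reach a negative difference is not shown in this diff):

#include <cstddef>
#include <iostream>

int main() {
  char buf[16] = {};
  const char *position = buf + 8;      // read cursor
  const char *position_end = buf + 4;  // suppose the end marker lags the cursor
  const std::size_t kMagicSize = 6;

  // Old form: the signed difference (-4) is converted to std::size_t for
  // the comparison and wraps to a huge value, so the check passes even
  // though no bytes remain (-Wsign-compare flags this).
  bool old_form = (position_end - position) >= kMagicSize;
  // New form: a pure pointer comparison, no signedness conversion at all.
  bool new_form = position_end >= position + kMagicSize;
  std::cout << old_form << ' ' << new_form << '\n';  // prints "1 0"
}
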
diff --git a/util/probing_hash_table.hh b/util/probing_hash_table.hh
index 6780489df..57866ff93 100644
--- a/util/probing_hash_table.hh
+++ b/util/probing_hash_table.hh
@@ -6,6 +6,7 @@
#include <algorithm>
#include <cstddef>
#include <functional>
+#include <vector>
#include <assert.h>
#include <stdint.h>
@@ -73,10 +74,7 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
assert(initialized_);
#endif
UTIL_THROW_IF(++entries_ >= buckets_, ProbingSizeException, "Hash table with " << buckets_ << " buckets is full.");
- for (MutableIterator i(begin_ + (hash_(t.GetKey()) % buckets_));;) {
- if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
- if (++i == end_) { i = begin_; }
- }
+ return UncheckedInsert(t);
}
// Return true if the value was found (and not inserted). This is consistent with Find but the opposite of hash_map!
@@ -126,12 +124,96 @@ template <class EntryT, class HashT, class EqualT = std::equal_to<typename Entry
}
}
- void Clear(Entry invalid) {
+ void Clear() {
+ Entry invalid;
+ invalid.SetKey(invalid_);
std::fill(begin_, end_, invalid);
entries_ = 0;
}
+ // Return number of entries assuming no serialization went on.
+ std::size_t SizeNoSerialization() const {
+ return entries_;
+ }
+
+ // Return memory size expected by Double.
+ std::size_t DoubleTo() const {
+ return buckets_ * 2 * sizeof(Entry);
+ }
+
+ // Inform the table that it has double the amount of memory.
+ // Pass clear_new = false if you are sure the new memory is initialized
+ // properly (to invalid_), e.g. by mremap.
+ void Double(void *new_base, bool clear_new = true) {
+ begin_ = static_cast<MutableIterator>(new_base);
+ MutableIterator old_end = begin_ + buckets_;
+ buckets_ *= 2;
+ end_ = begin_ + buckets_;
+ if (clear_new) {
+ Entry invalid;
+ invalid.SetKey(invalid_);
+ std::fill(old_end, end_, invalid);
+ }
+ std::vector<Entry> rolled_over;
+ // Move roll-over entries to a buffer because they might not roll over anymore. This should be small.
+ for (MutableIterator i = begin_; i != old_end && !equal_(i->GetKey(), invalid_); ++i) {
+ rolled_over.push_back(*i);
+ i->SetKey(invalid_);
+ }
+ /* Re-insert everything. Entries might go backwards to take over a
+ * recently opened gap, stay, move to new territory, or wrap around. If
+ * an entry wraps around, it might go to a pointer greater than i (which
+ * can happen at the beginning) and it will be revisited to possibly fill
+ * in a gap created later.
+ */
+ Entry temp;
+ for (MutableIterator i = begin_; i != old_end; ++i) {
+ if (!equal_(i->GetKey(), invalid_)) {
+ temp = *i;
+ i->SetKey(invalid_);
+ UncheckedInsert(temp);
+ }
+ }
+ // Put the roll-over entries back in.
+ for (typename std::vector<Entry>::const_iterator i(rolled_over.begin()); i != rolled_over.end(); ++i) {
+ UncheckedInsert(*i);
+ }
+ }
+
+ // Mostly for tests, check consistency of every entry.
+ void CheckConsistency() {
+ MutableIterator last;
+ for (last = end_ - 1; last >= begin_ && !equal_(last->GetKey(), invalid_); --last) {}
+ UTIL_THROW_IF(last == begin_, ProbingSizeException, "Completely full");
+ MutableIterator i;
+ // Entries at the beginning can be wrap-arounds.
+ for (i = begin_; !equal_(i->GetKey(), invalid_); ++i) {
+ MutableIterator ideal = Ideal(*i);
+ UTIL_THROW_IF(ideal > i && ideal <= last, Exception, "Inconsistency at position " << (i - begin_) << " should be at " << (ideal - begin_));
+ }
+ MutableIterator pre_gap = i;
+ for (; i != end_; ++i) {
+ if (equal_(i->GetKey(), invalid_)) {
+ pre_gap = i;
+ continue;
+ }
+ MutableIterator ideal = Ideal(*i);
+ UTIL_THROW_IF(ideal > i || ideal <= pre_gap, Exception, "Inconsistency at position " << (i - begin_) << " with ideal " << (ideal - begin_));
+ }
+ }
+
private:
+ template <class T> MutableIterator Ideal(const T &t) {
+ return begin_ + (hash_(t.GetKey()) % buckets_);
+ }
+
+ template <class T> MutableIterator UncheckedInsert(const T &t) {
+ for (MutableIterator i(Ideal(t));;) {
+ if (equal_(i->GetKey(), invalid_)) { *i = t; return i; }
+ if (++i == end_) { i = begin_; }
+ }
+ }
+
MutableIterator begin_;
std::size_t buckets_;
MutableIterator end_;
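
Taken together, the new methods define a grow-in-place protocol: size the buffer with the static Size(), fill the table, then grow the backing store to DoubleTo() bytes and hand the new base to Double(), which rehashes every entry and fixes up probe chains that wrapped around the old end. A condensed sketch of that flow, modeled on the Entry64 and MurmurHashEntry64 helpers the new unit test defines below:

#include <cstddef>
#include <cstdint>
#include <limits>
#include "util/murmur_hash.hh"
#include "util/probing_hash_table.hh"
#include "util/scoped.hh"

// Entry and hash modeled on the new unit test below.
struct Entry64 {
  typedef uint64_t Key;
  uint64_t key;
  Entry64() {}
  explicit Entry64(uint64_t key_in) : key(key_in) {}
  Key GetKey() const { return key; }
  void SetKey(uint64_t to) { key = to; }
};

struct MurmurHashEntry64 {
  std::size_t operator()(uint64_t value) const {
    return util::MurmurHash64A(&value, 8);
  }
};

typedef util::ProbingHashTable<Entry64, MurmurHashEntry64> Table64;

int main() {
  // Size the buffer for 19 entries at a 1.2 space multiplier.
  std::size_t size = Table64::Size(19, 1.2);
  util::scoped_malloc mem(util::MallocOrThrow(size));
  Table64 table(mem.get(), size, std::numeric_limits<uint64_t>::max());
  table.Clear();  // the new no-argument Clear() fills buckets with invalid_
  for (uint64_t i = 0; i < 19; ++i) table.Insert(Entry64(i));

  // Resize protocol: grow the backing store to DoubleTo() bytes, then
  // tell the table about its new memory so it can rehash in place.
  mem.call_realloc(table.DoubleTo());
  table.Double(mem.get());
  table.CheckConsistency();  // mostly a test hook, but cheap insurance here
  return 0;
}
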
diff --git a/util/probing_hash_table_test.cc b/util/probing_hash_table_test.cc
index be0fa8597..9f7948ce6 100644
--- a/util/probing_hash_table_test.cc
+++ b/util/probing_hash_table_test.cc
@@ -1,10 +1,14 @@
#include "util/probing_hash_table.hh"
+#include "util/murmur_hash.hh"
+#include "util/scoped.hh"
+
#define BOOST_TEST_MODULE ProbingHashTableTest
#include <boost/test/unit_test.hpp>
#include <boost/scoped_array.hpp>
#include <boost/functional/hash.hpp>
#include <stdio.h>
+#include <stdlib.h>
#include <string.h>
#include <stdint.h>
@@ -19,6 +23,10 @@ struct Entry {
return key;
}
+ void SetKey(unsigned char to) {
+ key = to;
+ }
+
uint64_t GetValue() const {
return value;
}
@@ -46,5 +54,49 @@ BOOST_AUTO_TEST_CASE(simple) {
BOOST_CHECK(!table.Find(2, i));
}
+struct Entry64 {
+ uint64_t key;
+ typedef uint64_t Key;
+
+ Entry64() {}
+
+ explicit Entry64(uint64_t key_in) {
+ key = key_in;
+ }
+
+ Key GetKey() const { return key; }
+ void SetKey(uint64_t to) { key = to; }
+};
+
+struct MurmurHashEntry64 {
+ std::size_t operator()(uint64_t value) const {
+ return util::MurmurHash64A(&value, 8);
+ }
+};
+
+typedef ProbingHashTable<Entry64, MurmurHashEntry64> Table64;
+
+BOOST_AUTO_TEST_CASE(Double) {
+ for (std::size_t initial = 19; initial < 30; ++initial) {
+ size_t size = Table64::Size(initial, 1.2);
+ scoped_malloc mem(MallocOrThrow(size));
+ Table64 table(mem.get(), size, std::numeric_limits<uint64_t>::max());
+ table.Clear();
+ for (uint64_t i = 0; i < 19; ++i) {
+ table.Insert(Entry64(i));
+ }
+ table.CheckConsistency();
+ mem.call_realloc(table.DoubleTo());
+ table.Double(mem.get());
+ table.CheckConsistency();
+ for (uint64_t i = 20; i < 40; ++i) {
+ table.Insert(Entry64(i));
+ }
+ mem.call_realloc(table.DoubleTo());
+ table.Double(mem.get());
+ table.CheckConsistency();
+ }
+}
+
} // namespace
} // namespace util
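
One detail the test does not exercise is the clear_new = false path of Double. Fresh pages from an anonymous mapping come back zero-filled, so skipping the fill is only sound when the table's invalid key is 0. A hypothetical, Linux-only sketch reusing the test's Table64 typedef (mremap and MREMAP_MAYMOVE are real glibc/kernel interfaces; the surrounding setup is assumed):

// Hypothetical helper, not part of the diff. Requires _GNU_SOURCE for
// mremap; assumes base came from an anonymous mmap and that the table was
// constructed with 0 as its invalid key, so the kernel's zero-filled
// pages already read as empty buckets.
#include <sys/mman.h>
#include <cstddef>
#include <stdexcept>

void GrowMapped(void *&base, std::size_t old_size, Table64 &table) {
  void *grown = mremap(base, old_size, table.DoubleTo(), MREMAP_MAYMOVE);
  if (grown == MAP_FAILED) throw std::runtime_error("mremap failed");
  base = grown;
  table.Double(base, false);  // clear_new = false: trust the zero-fill
}
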
diff --git a/util/scoped.cc b/util/scoped.cc
index e7066ee47..b6972f14b 100644
--- a/util/scoped.cc
+++ b/util/scoped.cc
@@ -16,6 +16,12 @@ void *MallocOrThrow(std::size_t requested) {
return ret;
}
+void *CallocOrThrow(std::size_t requested) {
+ void *ret;
+ UTIL_THROW_IF_ARG(!(ret = std::calloc(1, requested)), MallocException, (requested), "in calloc");
+ return ret;
+}
+
scoped_malloc::~scoped_malloc() {
std::free(p_);
}
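
CallocOrThrow is the zero-initializing counterpart of MallocOrThrow; the diff adds it without a caller. One plausible pairing, assumed here rather than shown in the commit: a probing table whose invalid key is 0 can start from calloc'd memory and skip the explicit Clear().

// Assumed pairing, not shown in the diff: with 0 as the invalid key,
// calloc's zero-fill makes the initial Clear() redundant.
std::size_t size = Table64::Size(19, 1.2);
util::scoped_malloc mem(util::CallocOrThrow(size));
Table64 table(mem.get(), size, 0 /* invalid key */);
table.Insert(Entry64(42));  // buckets already hold key 0 == invalid_
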
diff --git a/util/scoped.hh b/util/scoped.hh
index d0a5aabd0..b642d0649 100644
--- a/util/scoped.hh
+++ b/util/scoped.hh
@@ -14,6 +14,7 @@ class MallocException : public ErrnoException {
};
void *MallocOrThrow(std::size_t requested);
+void *CallocOrThrow(std::size_t requested);
class scoped_malloc {
public: