diff options
author | Maxim Pimenov <m@maps.me> | 2015-09-11 16:38:18 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:02:16 +0300 |
commit | d6800004d763a905c056bb3ca86e639f476377bc (patch) | |
tree | 593c88959f02ecdbaeda78a7add3eec77ea0704a /coding/compressed_bit_vector.hpp | |
parent | e8acd6f4596ebf011822e0c86fbca556faf3b724 (diff) |
Review fixes.
Diffstat (limited to 'coding/compressed_bit_vector.hpp')
-rw-r--r-- | coding/compressed_bit_vector.hpp | 138 |
1 files changed, 77 insertions, 61 deletions
diff --git a/coding/compressed_bit_vector.hpp b/coding/compressed_bit_vector.hpp index b856f9fc72..26a2182f77 100644 --- a/coding/compressed_bit_vector.hpp +++ b/coding/compressed_bit_vector.hpp @@ -1,7 +1,6 @@ #include "std/vector.hpp" #include "base/assert.hpp" -#include "base/bits.hpp" #include "coding/reader.hpp" #include "coding/writer.hpp" @@ -24,14 +23,20 @@ public: virtual ~CompressedBitVector() = default; - // Executes f for each bit that is set to one using - // the bit's 0-based position as argument. - template <typename F> - void ForEach(F && f) const; - // Intersects two bit vectors. - static unique_ptr<CompressedBitVector> Intersect(CompressedBitVector const &, - CompressedBitVector const &); + // todo(@pimenov) We expect the common use case to be as follows. + // A CBV is created in memory and several CBVs are read and intersected + // with it one by one. The in-memory CBV may initially contain a bit + // for every feature in an mwm and the intersected CBVs are read from + // the leaves of a search trie. + // Therefore an optimization of Intersect comes to mind: make a wrapper + // around TReader that will read a representation of a CBV from disk + // and intersect it bit by bit with the global in-memory CBV bypassing such + // routines as allocating memory and choosing strategy. They all can be called only + // once, namely in the end, when it is needed to pack the in-memory CBV into + // a suitable representation and pass it to the caller. + static unique_ptr<CompressedBitVector> Intersect(CompressedBitVector const & lhs, + CompressedBitVector const & rhs); // Returns the number of set bits (population count). virtual uint32_t PopCount() const = 0; @@ -60,80 +65,63 @@ string DebugPrint(CompressedBitVector::StorageStrategy strat); class DenseCBV : public CompressedBitVector { public: + DenseCBV() = default; + // Builds a dense CBV from a list of positions of set bits. DenseCBV(vector<uint64_t> const & setBits); - // Builds a dense CBV from a packed bitmap of set bits. - // todo(@pimenov) This behaviour of & and && constructors is extremely error-prone. - DenseCBV(vector<uint64_t> && bitMasks) : m_bits(move(bitMasks)) - { - m_popCount = 0; - for (size_t i = 0; i < m_bits.size(); ++i) - m_popCount += bits::PopCount(m_bits[i]); - } + // Not to be confused with the constructor: the semantics + // of the array of integers is completely different. + static unique_ptr<DenseCBV> BuildFromBitGroups(vector<uint64_t> && bitGroups); - ~DenseCBV() = default; + size_t NumBitGroups() const { return m_bitGroups.size(); } - size_t NumBitGroups() const { return m_bits.size(); } + static uint32_t const kBlockSize = 64; template <typename F> - void ForEach(F && f) const; - - uint64_t GetBitGroup(size_t i) const + void ForEach(F && f) const { - if (i < m_bits.size()) - return m_bits[i]; - return 0; + for (size_t i = 0; i < m_bitGroups.size(); ++i) + for (size_t j = 0; j < kBlockSize; ++j) + if (((m_bitGroups[i] >> j) & 1) > 0) + f(kBlockSize * i + j); } - // CompressedBitVector overrides: + // Returns 0 if the group number is too large to be contained in m_bits. + uint64_t GetBitGroup(size_t i) const; + // CompressedBitVector overrides: uint32_t PopCount() const override; - bool GetBit(uint32_t pos) const override; - StorageStrategy GetStorageStrategy() const override; - void Serialize(Writer & writer) const override; private: - vector<uint64_t> m_bits; - uint32_t m_popCount; + vector<uint64_t> m_bitGroups; + uint32_t m_popCount = 0; }; class SparseCBV : public CompressedBitVector { public: - SparseCBV(vector<uint64_t> const & setBits) : m_positions(setBits) - { - ASSERT(is_sorted(m_positions.begin(), m_positions.end()), ()); - } - - SparseCBV(vector<uint64_t> && setBits) : m_positions(move(setBits)) - { - ASSERT(is_sorted(m_positions.begin(), m_positions.end()), ()); - } + SparseCBV(vector<uint64_t> const & setBits); - ~SparseCBV() = default; + SparseCBV(vector<uint64_t> && setBits); // Returns the position of the i'th set bit. - uint64_t Select(size_t i) const - { - ASSERT_LESS(i, m_positions.size(), ()); - return m_positions[i]; - } + uint64_t Select(size_t i) const; template <typename F> - void ForEach(F && f) const; + void ForEach(F && f) const + { + for (auto const & position : m_positions) + f(position); + } // CompressedBitVector overrides: - uint32_t PopCount() const override; - bool GetBit(uint32_t pos) const override; - StorageStrategy GetStorageStrategy() const override; - void Serialize(Writer & writer) const override; private: @@ -146,7 +134,11 @@ class CompressedBitVectorBuilder public: // Chooses a strategy to store the bit vector with bits from setBits set to one // and returns a pointer to a class that fits best. - static unique_ptr<CompressedBitVector> Build(vector<uint64_t> const & setBits); + static unique_ptr<CompressedBitVector> FromBitPositions(vector<uint64_t> const & setBits); + + // Chooses a strategy to store the bit vector with bits from a bitmap obtained + // by concatenating the elements of bitGroups. + static unique_ptr<CompressedBitVector> FromBitGroups(vector<uint64_t> && bitGroups); // Reads a bit vector from reader which must contain a valid // bit vector representation (see CompressedBitVector::Serialize for the format). @@ -154,29 +146,53 @@ public: static unique_ptr<CompressedBitVector> Deserialize(TReader & reader) { ReaderSource<TReader> src(reader); - uint8_t header = ReadPrimitiveFromSource<uint8_t>(reader); + uint8_t header = ReadPrimitiveFromSource<uint8_t>(src); CompressedBitVector::StorageStrategy strat = static_cast<CompressedBitVector::StorageStrategy>(header); switch (strat) { case CompressedBitVector::StorageStrategy::Dense: { - uint32_t numBitGroups = ReadPrimitiveFromSource<uint32_t>(reader); - vector<uint64_t> bitGroups(numBitGroups); - for (size_t i = 0; i < numBitGroups; ++i) - bitGroups[i] = ReadPrimitiveFromSource<uint64_t>(reader); - return make_unique<DenseCBV>(move(bitGroups)); + vector<uint64_t> bitGroups; + ReadPrimitiveVectorFromSource(src, bitGroups); + return DenseCBV::BuildFromBitGroups(move(bitGroups)); } case CompressedBitVector::StorageStrategy::Sparse: { - uint32_t numBits = ReadPrimitiveFromSource<uint32_t>(reader); - vector<uint64_t> setBits(numBits); - for (size_t i = 0; i < numBits; ++i) - setBits[i] = ReadPrimitiveFromSource<uint64_t>(reader); + vector<uint64_t> setBits; + ReadPrimitiveVectorFromSource(src, setBits); return make_unique<SparseCBV>(setBits); } } return nullptr; } }; + +// ForEach is generic and therefore cannot be virtual: a helper class is needed. +class CompressedBitVectorEnumerator +{ +public: + // Executes f for each bit that is set to one using + // the bit's 0-based position as argument. + template <typename F> + static void ForEach(CompressedBitVector const & cbv, F && f) + { + CompressedBitVector::StorageStrategy strat = cbv.GetStorageStrategy(); + switch (strat) + { + case CompressedBitVector::StorageStrategy::Dense: + { + DenseCBV const & denseCBV = static_cast<DenseCBV const &>(cbv); + denseCBV.ForEach(f); + return; + } + case CompressedBitVector::StorageStrategy::Sparse: + { + SparseCBV const & sparseCBV = static_cast<SparseCBV const &>(cbv); + sparseCBV.ForEach(f); + return; + } + } + } +}; } // namespace coding |