diff options
author | Maxim Pimenov <m@maps.me> | 2015-09-11 16:38:18 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:02:16 +0300 |
commit | d6800004d763a905c056bb3ca86e639f476377bc (patch) | |
tree | 593c88959f02ecdbaeda78a7add3eec77ea0704a /coding/compressed_bit_vector.cpp | |
parent | e8acd6f4596ebf011822e0c86fbca556faf3b724 (diff) |
Review fixes.
Diffstat (limited to 'coding/compressed_bit_vector.cpp')
-rw-r--r-- | coding/compressed_bit_vector.cpp | 134 |
1 files changed, 93 insertions, 41 deletions
diff --git a/coding/compressed_bit_vector.cpp b/coding/compressed_bit_vector.cpp index 5f4336b324..0b444b42ac 100644 --- a/coding/compressed_bit_vector.cpp +++ b/coding/compressed_bit_vector.cpp @@ -2,24 +2,23 @@ #include "coding/writer.hpp" #include "coding/write_to_sink.hpp" +#include "base/bits.hpp" + #include "std/algorithm.hpp" namespace { +uint64_t const kBlockSize = coding::DenseCBV::kBlockSize; + unique_ptr<coding::CompressedBitVector> IntersectImpl(coding::DenseCBV const & a, coding::DenseCBV const & b) { size_t sizeA = a.NumBitGroups(); size_t sizeB = b.NumBitGroups(); - vector<uint64_t> resBits; - for (size_t i = 0; i < min(sizeA, sizeB); ++i) - { - uint64_t bitGroup = a.GetBitGroup(i) & b.GetBitGroup(i); - for (size_t j = 0; j < 64; j++) - if (((bitGroup >> j) & 1) > 0) - resBits.push_back(64 * i + j); - } - return coding::CompressedBitVectorBuilder::Build(resBits); + vector<uint64_t> resGroups(min(sizeA, sizeB)); + for (size_t i = 0; i < resGroups.size(); ++i) + resGroups[i] = a.GetBitGroup(i) & b.GetBitGroup(i); + return coding::CompressedBitVectorBuilder::FromBitGroups(move(resGroups)); } // The intersection of dense and sparse is always sparse. @@ -71,6 +70,17 @@ unique_ptr<coding::CompressedBitVector> IntersectImpl(coding::SparseCBV const & } return make_unique<coding::SparseCBV>(move(resPos)); } + +// Returns true if a bit vector with popCount bits set out of totalBits +// is fit to be represented as a DenseCBV. Note that we do not +// account for possible irregularities in the distribution of bits. +// In particular, we do not break the bit vector into blocks that are +// stored separately although this might turn out to be a good idea. +bool DenseEnough(uint64_t popCount, uint64_t totalBits) +{ + // Settle at 30% for now. + return popCount * 10 >= totalBits * 3; +} } // namespace namespace coding @@ -79,18 +89,39 @@ DenseCBV::DenseCBV(vector<uint64_t> const & setBits) { if (setBits.empty()) { - m_bits.resize(0); + m_bitGroups.resize(0); m_popCount = 0; return; } uint64_t maxBit = setBits[0]; for (size_t i = 1; i < setBits.size(); ++i) maxBit = max(maxBit, setBits[i]); - size_t sz = (maxBit + 64 - 1) / 64; - m_bits.resize(sz); + size_t sz = (maxBit + kBlockSize - 1) / kBlockSize; + m_bitGroups.resize(sz); m_popCount = static_cast<uint32_t>(setBits.size()); for (uint64_t pos : setBits) - m_bits[pos / 64] |= static_cast<uint64_t>(1) << (pos % 64); + m_bitGroups[pos / kBlockSize] |= static_cast<uint64_t>(1) << (pos % kBlockSize); +} + +// static +unique_ptr<DenseCBV> DenseCBV::BuildFromBitGroups(vector<uint64_t> && bitGroups) +{ + unique_ptr<DenseCBV> cbv(new DenseCBV()); + cbv->m_popCount = 0; + for (size_t i = 0; i < bitGroups.size(); ++i) + cbv->m_popCount += bits::PopCount(bitGroups[i]); + cbv->m_bitGroups = move(bitGroups); + return cbv; +} + +SparseCBV::SparseCBV(vector<uint64_t> const & setBits) : m_positions(setBits) +{ + ASSERT(is_sorted(m_positions.begin(), m_positions.end()), ()); +} + +SparseCBV::SparseCBV(vector<uint64_t> && setBits) : m_positions(move(setBits)) +{ + ASSERT(is_sorted(m_positions.begin(), m_positions.end()), ()); } uint32_t DenseCBV::PopCount() const { return m_popCount; } @@ -99,40 +130,35 @@ uint32_t SparseCBV::PopCount() const { return m_positions.size(); } bool DenseCBV::GetBit(uint32_t pos) const { - uint64_t bitGroup = GetBitGroup(pos / 64); - return ((bitGroup >> (pos % 64)) & 1) > 0; + uint64_t bitGroup = GetBitGroup(pos / kBlockSize); + return ((bitGroup >> (pos % kBlockSize)) & 1) > 0; } bool SparseCBV::GetBit(uint32_t pos) const { - auto it = lower_bound(m_positions.begin(), m_positions.end(), pos); + auto const it = lower_bound(m_positions.begin(), m_positions.end(), pos); return it != m_positions.end() && *it == pos; } -CompressedBitVector::StorageStrategy DenseCBV::GetStorageStrategy() const +uint64_t DenseCBV::GetBitGroup(size_t i) const { - return CompressedBitVector::StorageStrategy::Dense; + return i < m_bitGroups.size() ? m_bitGroups[i] : 0; } -CompressedBitVector::StorageStrategy SparseCBV::GetStorageStrategy() const +uint64_t SparseCBV::Select(size_t i) const { - return CompressedBitVector::StorageStrategy::Sparse; + ASSERT_LESS(i, m_positions.size(), ()); + return m_positions[i]; } -template <typename F> -void DenseCBV::ForEach(F && f) const +CompressedBitVector::StorageStrategy DenseCBV::GetStorageStrategy() const { - for (size_t i = 0; i < m_bits.size(); ++i) - for (size_t j = 0; j < 64; ++j) - if (((m_bits[i] >> j) & 1) > 0) - f(64 * i + j); + return CompressedBitVector::StorageStrategy::Dense; } -template <typename F> -void SparseCBV::ForEach(F && f) const +CompressedBitVector::StorageStrategy SparseCBV::GetStorageStrategy() const { - for (size_t i = 0; i < m_positions.size(); ++i) - f(m_positions[i]); + return CompressedBitVector::StorageStrategy::Sparse; } string DebugPrint(CompressedBitVector::StorageStrategy strat) @@ -167,16 +193,43 @@ void SparseCBV::Serialize(Writer & writer) const } // static -unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::Build(vector<uint64_t> const & setBits) +unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitPositions( + vector<uint64_t> const & setBits) { if (setBits.empty()) return make_unique<SparseCBV>(setBits); uint64_t maxBit = setBits[0]; for (size_t i = 1; i < setBits.size(); ++i) maxBit = max(maxBit, setBits[i]); - // 30% occupied is dense enough - if (10 * setBits.size() >= 3 * maxBit) + + if (DenseEnough(setBits.size(), maxBit)) return make_unique<DenseCBV>(setBits); + + return make_unique<SparseCBV>(setBits); +} + +// static +unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitGroups( + vector<uint64_t> && bitGroups) +{ + while (!bitGroups.empty() && bitGroups.back() == 0) + bitGroups.pop_back(); + if (bitGroups.empty()) + return make_unique<SparseCBV>(bitGroups); + + uint64_t maxBit = kBlockSize * bitGroups.size() - 1; + uint64_t popCount = 0; + for (size_t i = 0; i < bitGroups.size(); ++i) + popCount += bits::PopCount(bitGroups[i]); + + if (DenseEnough(popCount, maxBit)) + return DenseCBV::BuildFromBitGroups(move(bitGroups)); + + vector<uint64_t> setBits; + for (size_t i = 0; i < bitGroups.size(); ++i) + for (size_t j = 0; j < kBlockSize; ++j) + if (((bitGroups[i] >> j) & 1) > 0) + setBits.push_back(kBlockSize * i + j); return make_unique<SparseCBV>(setBits); } @@ -184,29 +237,28 @@ unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::Build(vector<uint64_ unique_ptr<CompressedBitVector> CompressedBitVector::Intersect(CompressedBitVector const & lhs, CompressedBitVector const & rhs) { - auto stratA = lhs.GetStorageStrategy(); - auto stratB = rhs.GetStorageStrategy(); - auto stratDense = CompressedBitVector::StorageStrategy::Dense; - auto stratSparse = CompressedBitVector::StorageStrategy::Sparse; - if (stratA == stratDense && stratB == stratDense) + using strat = CompressedBitVector::StorageStrategy; + auto const stratA = lhs.GetStorageStrategy(); + auto const stratB = rhs.GetStorageStrategy(); + if (stratA == strat::Dense && stratB == strat::Dense) { DenseCBV const & a = static_cast<DenseCBV const &>(lhs); DenseCBV const & b = static_cast<DenseCBV const &>(rhs); return IntersectImpl(a, b); } - if (stratA == stratDense && stratB == stratSparse) + if (stratA == strat::Dense && stratB == strat::Sparse) { DenseCBV const & a = static_cast<DenseCBV const &>(lhs); SparseCBV const & b = static_cast<SparseCBV const &>(rhs); return IntersectImpl(a, b); } - if (stratA == stratSparse && stratB == stratDense) + if (stratA == strat::Sparse && stratB == strat::Dense) { SparseCBV const & a = static_cast<SparseCBV const &>(lhs); DenseCBV const & b = static_cast<DenseCBV const &>(rhs); return IntersectImpl(a, b); } - if (stratA == stratSparse && stratB == stratSparse) + if (stratA == strat::Sparse && stratB == strat::Sparse) { SparseCBV const & a = static_cast<SparseCBV const &>(lhs); SparseCBV const & b = static_cast<SparseCBV const &>(rhs); |