diff options
author | Yuri Gorshenin <y@maps.me> | 2015-10-21 12:07:53 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:02:26 +0300 |
commit | 46435d0987a839b31442f76cd6bf0275b2beb8e3 (patch) | |
tree | 221b382d507ad982b5dbe15940e3913e3c3c7299 /coding/compressed_bit_vector.cpp | |
parent | 60cf672e88bbcaf0830ed69eea178b360cd45d0b (diff) |
[coding, search] Added Subtract() method to BitVectors.
Diffstat (limited to 'coding/compressed_bit_vector.cpp')
-rw-r--r-- | coding/compressed_bit_vector.cpp | 285 |
1 files changed, 174 insertions, 111 deletions
diff --git a/coding/compressed_bit_vector.cpp b/coding/compressed_bit_vector.cpp index 876b0135f4..a5fc3ebee7 100644 --- a/coding/compressed_bit_vector.cpp +++ b/coding/compressed_bit_vector.cpp @@ -1,80 +1,156 @@ #include "coding/compressed_bit_vector.hpp" -#include "coding/writer.hpp" + #include "coding/write_to_sink.hpp" +#include "base/assert.hpp" #include "base/bits.hpp" #include "std/algorithm.hpp" namespace coding { -// static -uint32_t const DenseCBV::kBlockSize; -} // namespace coding - namespace { -uint64_t const kBlockSize = coding::DenseCBV::kBlockSize; - -unique_ptr<coding::CompressedBitVector> IntersectImpl(coding::DenseCBV const & a, - coding::DenseCBV const & b) +struct IntersectOp { - size_t sizeA = a.NumBitGroups(); - size_t sizeB = b.NumBitGroups(); - vector<uint64_t> resGroups(min(sizeA, sizeB)); - for (size_t i = 0; i < resGroups.size(); ++i) - resGroups[i] = a.GetBitGroup(i) & b.GetBitGroup(i); - return coding::CompressedBitVectorBuilder::FromBitGroups(move(resGroups)); -} + IntersectOp() {} -// The intersection of dense and sparse is always sparse. -unique_ptr<coding::CompressedBitVector> IntersectImpl(coding::DenseCBV const & a, - coding::SparseCBV const & b) -{ - vector<uint64_t> resPos; - for (size_t i = 0; i < b.PopCount(); ++i) + unique_ptr<coding::CompressedBitVector> operator()(coding::DenseCBV const & a, + coding::DenseCBV const & b) const { - auto pos = b.Select(i); - if (a.GetBit(pos)) - resPos.push_back(pos); + size_t sizeA = a.NumBitGroups(); + size_t sizeB = b.NumBitGroups(); + vector<uint64_t> resGroups(min(sizeA, sizeB)); + for (size_t i = 0; i < resGroups.size(); ++i) + resGroups[i] = a.GetBitGroup(i) & b.GetBitGroup(i); + return coding::CompressedBitVectorBuilder::FromBitGroups(move(resGroups)); } - return make_unique<coding::SparseCBV>(move(resPos)); -} -unique_ptr<coding::CompressedBitVector> IntersectImpl(coding::SparseCBV const & a, - coding::DenseCBV const & b) -{ - return IntersectImpl(b, a); -} - -unique_ptr<coding::CompressedBitVector> IntersectImpl(coding::SparseCBV const & a, - coding::SparseCBV const & b) -{ - size_t sizeA = a.PopCount(); - size_t sizeB = b.PopCount(); - vector<uint64_t> resPos; - size_t i = 0; - size_t j = 0; - while (i < sizeA && j < sizeB) + // The intersection of dense and sparse is always sparse. + unique_ptr<coding::CompressedBitVector> operator()(coding::DenseCBV const & a, + coding::SparseCBV const & b) const { - auto posA = a.Select(i); - auto posB = b.Select(j); - if (posA == posB) + vector<uint64_t> resPos; + for (size_t i = 0; i < b.PopCount(); ++i) { - resPos.push_back(posA); - ++i; - ++j; + auto pos = b.Select(i); + if (a.GetBit(pos)) + resPos.push_back(pos); } - else if (posA < posB) - { - ++i; - } - else + return make_unique<coding::SparseCBV>(move(resPos)); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::SparseCBV const & a, + coding::DenseCBV const & b) const + { + return operator()(b, a); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::SparseCBV const & a, + coding::SparseCBV const & b) const + { + vector<uint64_t> resPos; + set_intersection(a.Begin(), a.End(), b.Begin(), b.End(), back_inserter(resPos)); + return make_unique<coding::SparseCBV>(move(resPos)); + } +}; + +struct SubtractOp +{ + SubtractOp() {} + + unique_ptr<coding::CompressedBitVector> operator()(coding::DenseCBV const & a, + coding::DenseCBV const & b) const + { + size_t sizeA = a.NumBitGroups(); + size_t sizeB = b.NumBitGroups(); + vector<uint64_t> resGroups(min(sizeA, sizeB)); + for (size_t i = 0; i < resGroups.size(); ++i) + resGroups[i] = a.GetBitGroup(i) & ~b.GetBitGroup(i); + return CompressedBitVectorBuilder::FromBitGroups(move(resGroups)); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::DenseCBV const & a, + coding::SparseCBV const & b) const + { + vector<uint64_t> resGroups(a.NumBitGroups()); + + size_t i = 0; + auto j = b.Begin(); + for (; i < resGroups.size() && j < b.End(); ++i) { - ++j; + uint64_t const kBitsBegin = i * DenseCBV::kBlockSize; + uint64_t const kBitsEnd = (i + 1) * DenseCBV::kBlockSize; + + uint64_t mask = 0; + for (; j < b.End() && *j < kBitsEnd; ++j) + { + ASSERT_GREATER_OR_EQUAL(*j, kBitsBegin, ()); + mask |= static_cast<uint64_t>(1) << (*j - kBitsBegin); + } + + resGroups[i] = a.GetBitGroup(i) & ~mask; } + + for (; i < resGroups.size(); ++i) + resGroups[i] = a.GetBitGroup(i); + + return CompressedBitVectorBuilder::FromBitGroups(move(resGroups)); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::SparseCBV const & a, + coding::DenseCBV const & b) const + { + vector<uint64_t> resPos; + copy_if(a.Begin(), a.End(), back_inserter(resPos), [&](uint64_t bit) + { + return !b.GetBit(bit); + }); + return CompressedBitVectorBuilder::FromBitPositions(move(resPos)); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::SparseCBV const & a, + coding::SparseCBV const & b) const + { + vector<uint64_t> resPos; + set_difference(a.Begin(), a.End(), b.Begin(), b.End(), back_inserter(resPos)); + return CompressedBitVectorBuilder::FromBitPositions(move(resPos)); + } +}; + +template <typename TBinaryOp> +unique_ptr<coding::CompressedBitVector> Apply(TBinaryOp const & op, CompressedBitVector const & lhs, + CompressedBitVector const & rhs) +{ + using strat = CompressedBitVector::StorageStrategy; + auto const stratA = lhs.GetStorageStrategy(); + auto const stratB = rhs.GetStorageStrategy(); + if (stratA == strat::Dense && stratB == strat::Dense) + { + DenseCBV const & a = static_cast<DenseCBV const &>(lhs); + DenseCBV const & b = static_cast<DenseCBV const &>(rhs); + return op(a, b); + } + if (stratA == strat::Dense && stratB == strat::Sparse) + { + DenseCBV const & a = static_cast<DenseCBV const &>(lhs); + SparseCBV const & b = static_cast<SparseCBV const &>(rhs); + return op(a, b); } - return make_unique<coding::SparseCBV>(move(resPos)); + if (stratA == strat::Sparse && stratB == strat::Dense) + { + SparseCBV const & a = static_cast<SparseCBV const &>(lhs); + DenseCBV const & b = static_cast<DenseCBV const &>(rhs); + return op(a, b); + } + if (stratA == strat::Sparse && stratB == strat::Sparse) + { + SparseCBV const & a = static_cast<SparseCBV const &>(lhs); + SparseCBV const & b = static_cast<SparseCBV const &>(rhs); + return op(a, b); + } + + return nullptr; } // Returns true if a bit vector with popCount bits set out of totalBits @@ -87,22 +163,34 @@ bool DenseEnough(uint64_t popCount, uint64_t totalBits) // Settle at 30% for now. return popCount * 10 >= totalBits * 3; } -} // namespace -namespace coding +template <typename TBitPositions> +unique_ptr<CompressedBitVector> BuildFromBitPositions(TBitPositions && setBits) { + if (setBits.empty()) + return make_unique<SparseCBV>(forward<TBitPositions>(setBits)); + uint64_t const maxBit = *max_element(setBits.begin(), setBits.end()); + + if (DenseEnough(setBits.size(), maxBit)) + return make_unique<DenseCBV>(forward<TBitPositions>(setBits)); + + return make_unique<SparseCBV>(forward<TBitPositions>(setBits)); +} +} // namespace + +// static +uint64_t const DenseCBV::kBlockSize; + DenseCBV::DenseCBV(vector<uint64_t> const & setBits) { if (setBits.empty()) { return; } - uint64_t maxBit = setBits[0]; - for (size_t i = 1; i < setBits.size(); ++i) - maxBit = max(maxBit, setBits[i]); + uint64_t const maxBit = *max_element(setBits.begin(), setBits.end()); size_t const sz = 1 + maxBit / kBlockSize; m_bitGroups.resize(sz); - m_popCount = static_cast<uint32_t>(setBits.size()); + m_popCount = static_cast<uint64_t>(setBits.size()); for (uint64_t pos : setBits) m_bitGroups[pos / kBlockSize] |= static_cast<uint64_t>(1) << (pos % kBlockSize); } @@ -123,9 +211,9 @@ uint64_t DenseCBV::GetBitGroup(size_t i) const return i < m_bitGroups.size() ? m_bitGroups[i] : 0; } -uint32_t DenseCBV::PopCount() const { return m_popCount; } +uint64_t DenseCBV::PopCount() const { return m_popCount; } -bool DenseCBV::GetBit(uint32_t pos) const +bool DenseCBV::GetBit(uint64_t pos) const { uint64_t bitGroup = GetBitGroup(pos / kBlockSize); return ((bitGroup >> (pos % kBlockSize)) & 1) > 0; @@ -140,9 +228,7 @@ void DenseCBV::Serialize(Writer & writer) const { uint8_t header = static_cast<uint8_t>(GetStorageStrategy()); WriteToSink(writer, header); - WriteToSink(writer, static_cast<uint32_t>(NumBitGroups())); - for (size_t i = 0; i < NumBitGroups(); ++i) - WriteToSink(writer, GetBitGroup(i)); + rw::WriteVectorOfPOD(writer, m_bitGroups); } SparseCBV::SparseCBV(vector<uint64_t> const & setBits) : m_positions(setBits) @@ -161,9 +247,9 @@ uint64_t SparseCBV::Select(size_t i) const return m_positions[i]; } -uint32_t SparseCBV::PopCount() const { return m_positions.size(); } +uint64_t SparseCBV::PopCount() const { return m_positions.size(); } -bool SparseCBV::GetBit(uint32_t pos) const +bool SparseCBV::GetBit(uint64_t pos) const { auto const it = lower_bound(m_positions.begin(), m_positions.end(), pos); return it != m_positions.end() && *it == pos; @@ -178,39 +264,35 @@ void SparseCBV::Serialize(Writer & writer) const { uint8_t header = static_cast<uint8_t>(GetStorageStrategy()); WriteToSink(writer, header); - WriteToSink(writer, PopCount()); - ForEach([&](uint64_t bitPos) - { - WriteToSink(writer, bitPos); - }); + rw::WriteVectorOfPOD(writer, m_positions); } // static unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitPositions( vector<uint64_t> const & setBits) { - if (setBits.empty()) - return make_unique<SparseCBV>(setBits); - uint64_t maxBit = setBits[0]; - for (size_t i = 1; i < setBits.size(); ++i) - maxBit = max(maxBit, setBits[i]); - - if (DenseEnough(setBits.size(), maxBit)) - return make_unique<DenseCBV>(setBits); + return BuildFromBitPositions(setBits); +} - return make_unique<SparseCBV>(setBits); +// static +unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitPositions( + vector<uint64_t> && setBits) +{ + return BuildFromBitPositions(move(setBits)); } // static unique_ptr<CompressedBitVector> CompressedBitVectorBuilder::FromBitGroups( vector<uint64_t> && bitGroups) { + static uint64_t const kBlockSize = DenseCBV::kBlockSize; + while (!bitGroups.empty() && bitGroups.back() == 0) bitGroups.pop_back(); if (bitGroups.empty()) return make_unique<SparseCBV>(bitGroups); - uint64_t const maxBit = kBlockSize * bitGroups.size() - 1; + uint64_t const maxBit = kBlockSize * (bitGroups.size() - 1) + bits::CeilLog(bitGroups.back()); uint64_t popCount = 0; for (size_t i = 0; i < bitGroups.size(); ++i) popCount += bits::PopCount(bitGroups[i]); @@ -245,34 +327,15 @@ string DebugPrint(CompressedBitVector::StorageStrategy strat) unique_ptr<CompressedBitVector> CompressedBitVector::Intersect(CompressedBitVector const & lhs, CompressedBitVector const & rhs) { - using strat = CompressedBitVector::StorageStrategy; - auto const stratA = lhs.GetStorageStrategy(); - auto const stratB = rhs.GetStorageStrategy(); - if (stratA == strat::Dense && stratB == strat::Dense) - { - DenseCBV const & a = static_cast<DenseCBV const &>(lhs); - DenseCBV const & b = static_cast<DenseCBV const &>(rhs); - return IntersectImpl(a, b); - } - if (stratA == strat::Dense && stratB == strat::Sparse) - { - DenseCBV const & a = static_cast<DenseCBV const &>(lhs); - SparseCBV const & b = static_cast<SparseCBV const &>(rhs); - return IntersectImpl(a, b); - } - if (stratA == strat::Sparse && stratB == strat::Dense) - { - SparseCBV const & a = static_cast<SparseCBV const &>(lhs); - DenseCBV const & b = static_cast<DenseCBV const &>(rhs); - return IntersectImpl(a, b); - } - if (stratA == strat::Sparse && stratB == strat::Sparse) - { - SparseCBV const & a = static_cast<SparseCBV const &>(lhs); - SparseCBV const & b = static_cast<SparseCBV const &>(rhs); - return IntersectImpl(a, b); - } + static IntersectOp const intersectOp; + return Apply(intersectOp, lhs, rhs); +} - return nullptr; +// static +unique_ptr<CompressedBitVector> CompressedBitVector::Subtract(CompressedBitVector const & lhs, + CompressedBitVector const & rhs) +{ + static SubtractOp const subtractOp; + return Apply(subtractOp, lhs, rhs); } } // namespace coding |