diff options
author | Yuri Gorshenin <y@maps.me> | 2015-12-23 17:30:47 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:03:15 +0300 |
commit | df0bcdcf07d60c48cc6e84d57e7b42c7821e286f (patch) | |
tree | fb97e5f3d7ca8a7236df0ac9b5cf158abc95364f /coding | |
parent | 79278e235eeaa7698c05ba38aeddde0639c02e7b (diff) |
[search] Implemented search in locality.
Diffstat (limited to 'coding')
-rw-r--r-- | coding/coding_tests/compressed_bit_vector_test.cpp | 119 | ||||
-rw-r--r-- | coding/compressed_bit_vector.cpp | 118 | ||||
-rw-r--r-- | coding/compressed_bit_vector.hpp | 8 |
3 files changed, 234 insertions, 11 deletions
diff --git a/coding/coding_tests/compressed_bit_vector_test.cpp b/coding/coding_tests/compressed_bit_vector_test.cpp index 42515d5720..b0b800ea5e 100644 --- a/coding/coding_tests/compressed_bit_vector_test.cpp +++ b/coding/coding_tests/compressed_bit_vector_test.cpp @@ -25,6 +25,14 @@ void Subtract(vector<uint64_t> & setBits1, vector<uint64_t> & setBits2, vector<u back_inserter(result)); } +void Union(vector<uint64_t> & setBits1, vector<uint64_t> & setBits2, vector<uint64_t> & result) +{ + sort(setBits1.begin(), setBits1.end()); + sort(setBits2.begin(), setBits2.end()); + set_union(setBits1.begin(), setBits1.end(), setBits2.begin(), setBits2.end(), + back_inserter(result)); +} + template <typename TBinaryOp> void CheckBinaryOp(TBinaryOp op, vector<uint64_t> & setBits1, vector<uint64_t> & setBits2, coding::CompressedBitVector const & cbv) @@ -33,14 +41,8 @@ void CheckBinaryOp(TBinaryOp op, vector<uint64_t> & setBits1, vector<uint64_t> & op(setBits1, setBits2, expected); TEST_EQUAL(expected.size(), cbv.PopCount(), ()); - vector<bool> expectedBitmap; - if (!expected.empty()) - expectedBitmap.resize(expected.back() + 1); - for (size_t i = 0; i < expected.size(); ++i) - expectedBitmap[expected[i]] = true; - for (size_t i = 0; i < expectedBitmap.size(); ++i) - TEST_EQUAL(cbv.GetBit(i), expectedBitmap[i], ()); + TEST(cbv.GetBit(expected[i]), ()); } void CheckIntersection(vector<uint64_t> & setBits1, vector<uint64_t> & setBits2, @@ -54,6 +56,12 @@ void CheckSubtraction(vector<uint64_t> & setBits1, vector<uint64_t> & setBits2, { CheckBinaryOp(&Subtract, setBits1, setBits2, cbv); } + +void CheckUnion(vector<uint64_t> & setBits1, vector<uint64_t> & setBits2, + coding::CompressedBitVector const & cbv) +{ + CheckBinaryOp(&Union, setBits1, setBits2, cbv); +} } // namespace UNIT_TEST(CompressedBitVector_Intersect1) @@ -214,6 +222,103 @@ UNIT_TEST(CompressedBitVector_Subtract4) CheckSubtraction(setBits1, setBits2, *cbv3); } +UNIT_TEST(CompressedBitVector_Union_Smoke) +{ + vector<uint64_t> setBits1 = {}; + vector<uint64_t> setBits2 = {}; + + auto cbv1 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv1->GetStorageStrategy(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv2->GetStorageStrategy(), ()); + + auto cbv3 = coding::CompressedBitVector::Union(*cbv1, *cbv2); + TEST(cbv3.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv3->GetStorageStrategy(), ()); + CheckUnion(setBits1, setBits2, *cbv3); +} + +UNIT_TEST(CompressedBitVector_Union1) +{ + vector<uint64_t> setBits1 = {}; + vector<uint64_t> setBits2 = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}; + + auto cbv1 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv1->GetStorageStrategy(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv2->GetStorageStrategy(), ()); + + auto cbv3 = coding::CompressedBitVector::Union(*cbv1, *cbv2); + TEST(cbv3.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv3->GetStorageStrategy(), ()); + CheckUnion(setBits1, setBits2, *cbv3); +} + +UNIT_TEST(CompressedBitVector_Union2) +{ + vector<uint64_t> setBits1 = {256, 1024}; + vector<uint64_t> setBits2 = {0, 32, 64}; + + auto cbv1 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv1->GetStorageStrategy(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv2->GetStorageStrategy(), ()); + + auto cbv3 = coding::CompressedBitVector::Union(*cbv1, *cbv2); + TEST(cbv3.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv3->GetStorageStrategy(), ()); + CheckUnion(setBits1, setBits2, *cbv3); +} + +UNIT_TEST(CompressedBitVector_Union3) +{ + vector<uint64_t> setBits1 = {0, 1, 2, 3, 4, 5, 6}; + + vector<uint64_t> setBits2; + for (int i = 0; i < 256; ++i) + setBits2.push_back(i); + + auto cbv1 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv1->GetStorageStrategy(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv2->GetStorageStrategy(), ()); + + auto cbv3 = coding::CompressedBitVector::Union(*cbv1, *cbv2); + TEST(cbv3.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv3->GetStorageStrategy(), ()); + CheckUnion(setBits1, setBits2, *cbv3); +} + +UNIT_TEST(CompressedBitVector_Union4) +{ + vector<uint64_t> setBits1; + for (int i = 0; i < coding::DenseCBV::kBlockSize; ++i) + setBits1.push_back(i); + + vector<uint64_t> setBits2 = {1000000000}; + + auto cbv1 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits1); + auto cbv2 = coding::CompressedBitVectorBuilder::FromBitPositions(setBits2); + TEST(cbv1.get(), ()); + TEST(cbv2.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Dense, cbv1->GetStorageStrategy(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv2->GetStorageStrategy(), ()); + + auto cbv3 = coding::CompressedBitVector::Union(*cbv1, *cbv2); + TEST(cbv3.get(), ()); + TEST_EQUAL(coding::CompressedBitVector::StorageStrategy::Sparse, cbv3->GetStorageStrategy(), ()); + + CheckUnion(setBits1, setBits2, *cbv3); +} + UNIT_TEST(CompressedBitVector_SerializationDense) { int const kNumBits = 100; diff --git a/coding/compressed_bit_vector.cpp b/coding/compressed_bit_vector.cpp index 9d9513f11f..78a8c7cfde 100644 --- a/coding/compressed_bit_vector.cpp +++ b/coding/compressed_bit_vector.cpp @@ -118,6 +118,96 @@ struct SubtractOp } }; +struct UnionOp +{ + UnionOp() {} + + unique_ptr<coding::CompressedBitVector> operator()(coding::DenseCBV const & a, + coding::DenseCBV const & b) const + { + size_t sizeA = a.NumBitGroups(); + size_t sizeB = b.NumBitGroups(); + + size_t commonSize = min(sizeA, sizeB); + size_t resultSize = max(sizeA, sizeB); + vector<uint64_t> resGroups(resultSize); + for (size_t i = 0; i < commonSize; ++i) + resGroups[i] = a.GetBitGroup(i) | b.GetBitGroup(i); + if (a.NumBitGroups() == resultSize) + { + for (size_t i = commonSize; i < resultSize; ++i) + resGroups[i] = a.GetBitGroup(i); + } + else + { + for (size_t i = commonSize; i < resultSize; ++i) + resGroups[i] = b.GetBitGroup(i); + } + return CompressedBitVectorBuilder::FromBitGroups(move(resGroups)); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::DenseCBV const & a, + coding::SparseCBV const & b) const + { + size_t sizeA = a.NumBitGroups(); + size_t sizeB = b.PopCount() == 0 ? 0 : (b.Select(b.PopCount() - 1) + DenseCBV::kBlockSize - 1) / + DenseCBV::kBlockSize; + if (sizeB > sizeA) + { + vector<uint64_t> resPos; + auto j = b.Begin(); + auto merge = [&](uint64_t va) + { + while (j < b.End() && *j < va) + { + resPos.push_back(*j); + ++j; + } + resPos.push_back(va); + }; + a.ForEach(merge); + for (; j < b.End(); ++j) + resPos.push_back(*j); + return CompressedBitVectorBuilder::FromBitPositions(move(resPos)); + } + + vector<uint64_t> resGroups(sizeA); + + size_t i = 0; + auto j = b.Begin(); + for (; i < sizeA || j < b.End(); ++i) + { + uint64_t const kBitsBegin = i * DenseCBV::kBlockSize; + uint64_t const kBitsEnd = (i + 1) * DenseCBV::kBlockSize; + + uint64_t mask = i < sizeA ? a.GetBitGroup(i) : 0; + for (; j < b.End() && *j < kBitsEnd; ++j) + { + ASSERT_GREATER_OR_EQUAL(*j, kBitsBegin, ()); + mask |= static_cast<uint64_t>(1) << (*j - kBitsBegin); + } + + resGroups[i] = mask; + } + + return CompressedBitVectorBuilder::FromBitGroups(move(resGroups)); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::SparseCBV const & a, + coding::DenseCBV const & b) const + { + return operator()(b, a); + } + + unique_ptr<coding::CompressedBitVector> operator()(coding::SparseCBV const & a, + coding::SparseCBV const & b) const + { + vector<uint64_t> resPos; + set_union(a.Begin(), a.End(), b.Begin(), b.End(), back_inserter(resPos)); + return CompressedBitVectorBuilder::FromBitPositions(move(resPos)); + } +}; + template <typename TBinaryOp> unique_ptr<coding::CompressedBitVector> Apply(TBinaryOp const & op, CompressedBitVector const & lhs, CompressedBitVector const & rhs) @@ -341,15 +431,35 @@ string DebugPrint(CompressedBitVector::StorageStrategy strat) unique_ptr<CompressedBitVector> CompressedBitVector::Intersect(CompressedBitVector const & lhs, CompressedBitVector const & rhs) { - static IntersectOp const intersectOp; - return Apply(intersectOp, lhs, rhs); + static IntersectOp const op; + return Apply(op, lhs, rhs); } // static unique_ptr<CompressedBitVector> CompressedBitVector::Subtract(CompressedBitVector const & lhs, CompressedBitVector const & rhs) { - static SubtractOp const subtractOp; - return Apply(subtractOp, lhs, rhs); + static SubtractOp const op; + return Apply(op, lhs, rhs); +} + +// static +unique_ptr<CompressedBitVector> CompressedBitVector::Union(CompressedBitVector const & lhs, + CompressedBitVector const & rhs) +{ + static UnionOp const op; + return Apply(op, lhs, rhs); +} + +// static +bool CompressedBitVector::IsEmpty(unique_ptr<CompressedBitVector> const & cbv) +{ + return !cbv || cbv->PopCount() == 0; +} + +// static +bool CompressedBitVector::IsEmpty(CompressedBitVector const * cbv) +{ + return !cbv || cbv->PopCount() == 0; } } // namespace coding diff --git a/coding/compressed_bit_vector.hpp b/coding/compressed_bit_vector.hpp index 837e35015f..23d7b1c04c 100644 --- a/coding/compressed_bit_vector.hpp +++ b/coding/compressed_bit_vector.hpp @@ -43,6 +43,14 @@ public: static unique_ptr<CompressedBitVector> Subtract(CompressedBitVector const & lhs, CompressedBitVector const & rhs); + // Unites two bit vectors. + static unique_ptr<CompressedBitVector> Union(CompressedBitVector const & lhs, + CompressedBitVector const & rhs); + + static bool IsEmpty(unique_ptr<CompressedBitVector> const & cbv); + + static bool IsEmpty(CompressedBitVector const * cbv); + // Returns the number of set bits (population count). virtual uint64_t PopCount() const = 0; |