Welcome to mirror list, hosted at ThFree Co, Russian Federation.

compressed_bit_vector.hpp « coding - github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
blob: 925759d83db5e6d628c60ccd6bf41d26f7a70b6f (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
#pragma once

#include "coding/read_write_utils.hpp"
#include "coding/reader.hpp"
#include "coding/writer.hpp"

#include "base/assert.hpp"
#include "base/ref_counted.hpp"

#include "std/algorithm.hpp"
#include "std/unique_ptr.hpp"
#include "std/utility.hpp"
#include "std/vector.hpp"

namespace coding
{
class CompressedBitVector : public my::RefCounted
{
public:
  enum class StorageStrategy
  {
    Dense,
    Sparse
  };

  virtual ~CompressedBitVector() = default;

  // Intersects two bit vectors.
  // todo(@pimenov) We expect the common use case to be as follows.
  // A CBV is created in memory and several CBVs are read and intersected
  // with it one by one. The in-memory CBV may initially contain a bit
  // for every feature in an mwm and the intersected CBVs are read from
  // the leaves of a search trie.
  // Therefore an optimization of Intersect comes to mind: make a wrapper
  // around TReader that will read a representation of a CBV from disk
  // and intersect it bit by bit with the global in-memory CBV bypassing such
  // routines as allocating memory and choosing strategy. They all can be called only
  // once, namely in the end, when it is needed to pack the in-memory CBV into
  // a suitable representation and pass it to the caller.
  static unique_ptr<CompressedBitVector> Intersect(CompressedBitVector const & lhs,
                                                   CompressedBitVector const & rhs);

  // Subtracts two bit vectors.
  static unique_ptr<CompressedBitVector> Subtract(CompressedBitVector const & lhs,
                                                  CompressedBitVector const & rhs);

  // Unites two bit vectors.
  static unique_ptr<CompressedBitVector> Union(CompressedBitVector const & lhs,
                                               CompressedBitVector const & rhs);

  static bool IsEmpty(unique_ptr<CompressedBitVector> const & cbv);

  static bool IsEmpty(CompressedBitVector const * cbv);

  // Returns the number of set bits (population count).
  virtual uint64_t PopCount() const = 0;

  // todo(@pimenov) How long will 32 bits be enough here?
  // Would operator[] look better?
  virtual bool GetBit(uint64_t pos) const = 0;

  // Returns a subset of the current bit vector with first
  // min(PopCount(), |n|) set bits.
  virtual unique_ptr<CompressedBitVector> LeaveFirstSetNBits(uint64_t n) const = 0;

  // Returns the strategy used when storing this bit vector.
  virtual StorageStrategy GetStorageStrategy() const = 0;

  // Writes the contents of a bit vector to writer.
  // The first byte is always the header that defines the format.
  // Currently the header is 0 or 1 for Dense and Sparse strategies respectively.
  // It is easier to dispatch via virtual method calls and not bother
  // with template TWriters here as we do in similar places in our code.
  // This should not pose too much a problem because commonly
  // used writers are inhereted from Writer anyway.
  // todo(@pimenov). Think about rewriting Serialize and Deserialize to use the
  // code in old_compressed_bit_vector.{c,h}pp.
  virtual void Serialize(Writer & writer) const = 0;

  // Copies a bit vector and returns a pointer to the copy.
  virtual unique_ptr<CompressedBitVector> Clone() const = 0;
};

string DebugPrint(CompressedBitVector::StorageStrategy strat);

class DenseCBV : public CompressedBitVector
{
public:
  friend class CompressedBitVectorBuilder;
  static uint64_t const kBlockSize = 64;

  DenseCBV() = default;

  // Builds a dense CBV from a list of positions of set bits.
  explicit DenseCBV(vector<uint64_t> const & setBits);

  // Not to be confused with the constructor: the semantics
  // of the array of integers is completely different.
  static unique_ptr<DenseCBV> BuildFromBitGroups(vector<uint64_t> && bitGroups);

  size_t NumBitGroups() const { return m_bitGroups.size(); }

  template <typename TFn>
  void ForEach(TFn && f) const
  {
    for (size_t i = 0; i < m_bitGroups.size(); ++i)
    {
      for (size_t j = 0; j < kBlockSize; ++j)
      {
        if (((m_bitGroups[i] >> j) & 1) > 0)
          f(kBlockSize * i + j);
      }
    }
  }

  // Returns 0 if the group number is too large to be contained in m_bits.
  uint64_t GetBitGroup(size_t i) const;

  // CompressedBitVector overrides:
  uint64_t PopCount() const override;
  bool GetBit(uint64_t pos) const override;
  unique_ptr<CompressedBitVector> LeaveFirstSetNBits(uint64_t n) const override;
  StorageStrategy GetStorageStrategy() const override;
  void Serialize(Writer & writer) const override;
  unique_ptr<CompressedBitVector> Clone() const override;

private:
  vector<uint64_t> m_bitGroups;
  uint64_t m_popCount = 0;
};

class SparseCBV : public CompressedBitVector
{
public:
  friend class CompressedBitVectorBuilder;
  using TIterator = vector<uint64_t>::const_iterator;

  SparseCBV() = default;

  explicit SparseCBV(vector<uint64_t> const & setBits);

  explicit SparseCBV(vector<uint64_t> && setBits);

  // Returns the position of the i'th set bit.
  uint64_t Select(size_t i) const;

  template <typename TFn>
  void ForEach(TFn && f) const
  {
    for (auto const & position : m_positions)
      f(position);
  }

  // CompressedBitVector overrides:
  uint64_t PopCount() const override;
  bool GetBit(uint64_t pos) const override;
  unique_ptr<CompressedBitVector> LeaveFirstSetNBits(uint64_t n) const override;
  StorageStrategy GetStorageStrategy() const override;
  void Serialize(Writer & writer) const override;
  unique_ptr<CompressedBitVector> Clone() const override;

  inline TIterator Begin() const { return m_positions.cbegin(); }
  inline TIterator End() const { return m_positions.cend(); }

private:
  // 0-based positions of the set bits.
  vector<uint64_t> m_positions;
};

class CompressedBitVectorBuilder
{
public:
  // Chooses a strategy to store the bit vector with bits from setBits set to one
  // and returns a pointer to a class that fits best.
  static unique_ptr<CompressedBitVector> FromBitPositions(vector<uint64_t> const & setBits);
  static unique_ptr<CompressedBitVector> FromBitPositions(vector<uint64_t> && setBits);

  // Chooses a strategy to store the bit vector with bits from a bitmap obtained
  // by concatenating the elements of bitGroups.
  static unique_ptr<CompressedBitVector> FromBitGroups(vector<uint64_t> & bitGroups);
  static unique_ptr<CompressedBitVector> FromBitGroups(vector<uint64_t> && bitGroups);

  // Reads a bit vector from reader which must contain a valid
  // bit vector representation (see CompressedBitVector::Serialize for the format).
  template <typename TReader>
  static unique_ptr<CompressedBitVector> DeserializeFromReader(TReader & reader)
  {
    ReaderSource<TReader> src(reader);
    return DeserializeFromSource(src);
  }

  // Reads a bit vector from source which must contain a valid
  // bit vector representation (see CompressedBitVector::Serialize for the format).
  template <typename TSource>
  static unique_ptr<CompressedBitVector> DeserializeFromSource(TSource & src)
  {
    uint8_t header = ReadPrimitiveFromSource<uint8_t>(src);
    CompressedBitVector::StorageStrategy strat =
        static_cast<CompressedBitVector::StorageStrategy>(header);
    switch (strat)
    {
    case CompressedBitVector::StorageStrategy::Dense:
    {
      vector<uint64_t> bitGroups;
      rw::ReadVectorOfPOD(src, bitGroups);
      return DenseCBV::BuildFromBitGroups(move(bitGroups));
    }
    case CompressedBitVector::StorageStrategy::Sparse:
    {
      vector<uint64_t> setBits;
      rw::ReadVectorOfPOD(src, setBits);
      return make_unique<SparseCBV>(move(setBits));
    }
    }
    return unique_ptr<CompressedBitVector>();
  }
};

// ForEach is generic and therefore cannot be virtual: a helper class is needed.
class CompressedBitVectorEnumerator
{
public:
  // Executes f for each bit that is set to one using
  // the bit's 0-based position as argument.
  template <typename TFn>
  static void ForEach(CompressedBitVector const & cbv, TFn && f)
  {
    CompressedBitVector::StorageStrategy strat = cbv.GetStorageStrategy();
    switch (strat)
    {
    case CompressedBitVector::StorageStrategy::Dense:
    {
      DenseCBV const & denseCBV = static_cast<DenseCBV const &>(cbv);
      denseCBV.ForEach(f);
      return;
    }
    case CompressedBitVector::StorageStrategy::Sparse:
    {
      SparseCBV const & sparseCBV = static_cast<SparseCBV const &>(cbv);
      sparseCBV.ForEach(f);
      return;
    }
    }
  }
};

class CompressedBitVectorHasher
{
public:
  static uint64_t Hash(CompressedBitVector const & cbv)
  {
    uint64_t const kBase = 127;
    uint64_t hash = 0;
    CompressedBitVectorEnumerator::ForEach(cbv, [&hash](uint64_t i)
                                           {
                                             hash = hash * kBase + i + 1;
                                           });
    return hash;
  }
};
}  // namespace coding