[search] Added FixedBitsDDVector container.

author: vng <viktor.govako@gmail.com> 2015-12-04 20:33:03 +0300
committer: Sergey Yershov <yershov@corp.mail.ru> 2016-03-23 16:02:59 +0300
commit: 83690aa414831f2ea6e1035c67b9734bb0a7a060 (patch)
tree: a2295b000b90e0bbd0e3bc261f099db0f0d6c71f /coding
parent: bae24455806ddc56fbf0187a3d628409c5c58995 (diff)
6 files changed, 279 insertions, 5 deletions
diff --git a/coding/bit_streams.hpp b/coding/bit_streams.hpp
index 5bc00a9419..702e31145d 100644
--- a/coding/bit_streams.hpp
+++ b/coding/bit_streams.hpp
@@ -1,10 +1,12 @@
 #pragma once
 
+#include "base/assert.hpp"
+#include "base/logging.hpp"
+
+#include "std/algorithm.hpp"
 #include "std/cstdint.hpp"
 #include "std/limits.hpp"
 
-#include "base/assert.hpp"
-#include "base/logging.hpp"
 
 namespace
 {
@@ -39,7 +41,7 @@ public:
   // Writes n bits starting with the least significant bit.
   // They are written one byte at a time so endianness is of no concern.
   // All the other bits except for the first n must be set to zero.
-  void Write(uint8_t bits, uint32_t n)
+  void Write(uint8_t bits, uint8_t n)
   {
     if (n == 0)
       return;
@@ -68,6 +70,17 @@ public:
     }
   }
 
+  // Same as Write but accept up to 32 bits to write.
+  void WriteAtMost32Bits(uint32_t bits, uint8_t n)
+  {
+    ASSERT_LESS_OR_EQUAL(n, 32, ());
+
+    uint8_t constexpr kMinBits = CHAR_BIT;
+    Write(static_cast<uint8_t>(bits), min(n, kMinBits));
+    if (n > kMinBits)
+      WriteAtMost32Bits(bits >> kMinBits, n - kMinBits);
+  }
+
 private:
   // Writes up to CHAR_BIT-1 last bits if they have not been written yet
   // and pads them with zeros.
@@ -96,7 +109,7 @@ public:
   // The underlying m_src is supposed to be byte-aligned (which is the
   // case when it reads from the place that was written to using BitWriter).
   // Read may use one lookahead byte.
-  uint8_t Read(uint32_t n)
+  uint8_t Read(uint8_t n)
   {
     if (n == 0)
       return 0;
diff --git a/coding/coding.pro b/coding/coding.pro
index adf2dd5515..44c6581d70 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -67,10 +67,12 @@ HEADERS += \
     file_sort.hpp \
     file_writer.hpp \
     file_writer_stream.hpp \
+    fixed_bits_ddvector.hpp \
     hex.hpp \
     huffman.hpp \
     internal/file64_api.hpp \
     internal/file_data.hpp \
+    internal/xmlparser.hpp \
     matrix_traversal.hpp \
     mmap_reader.hpp \
     multilang_utf8_string.hpp \
@@ -101,4 +103,3 @@ HEADERS += \
     writer.hpp \
     zip_creator.hpp \
     zip_reader.hpp \
-    internal/xmlparser.hpp \
diff --git a/coding/coding_tests/bit_streams_test.cpp b/coding/coding_tests/bit_streams_test.cpp
index b2426e8c73..ac60178ecb 100644
--- a/coding/coding_tests/bit_streams_test.cpp
+++ b/coding/coding_tests/bit_streams_test.cpp
@@ -44,3 +44,21 @@ UNIT_TEST(BitStreams_Smoke)
     TEST_EQUAL(num, nums[i].first, (i));
   }
 }
+
+UNIT_TEST(BitStreams_T1)
+{
+  using TBuffer = vector<uint8_t>;
+  using TWriter = MemWriter<TBuffer>;
+
+  TBuffer buf;
+  {
+    TWriter w(buf);
+    BitWriter<TWriter> bits(w);
+
+    bits.Write(0, 3);
+    bits.Write(3, 3);
+    bits.Write(6, 3);
+  }
+
+  TEST_EQUAL(buf.size(), 2, ());
+}
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index adb3676b54..9e4f3996e2 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -27,6 +27,7 @@ SOURCES += ../../testing/testingmain.cpp \
     file_data_test.cpp \
     file_sort_test.cpp \
     file_utils_test.cpp \
+    fixed_bits_ddvector_test.cpp \
     hex_test.cpp \
     huffman_test.cpp \
     mem_file_reader_test.cpp \
diff --git a/coding/coding_tests/fixed_bits_ddvector_test.cpp b/coding/coding_tests/fixed_bits_ddvector_test.cpp
new file mode 100644
index 0000000000..4a8a998952
--- /dev/null
+++ b/coding/coding_tests/fixed_bits_ddvector_test.cpp
@@ -0,0 +1,74 @@
+#include "testing/testing.hpp"
+
+#include "coding/fixed_bits_ddvector.hpp"
+#include "coding/writer.hpp"
+
+#include "std/initializer_list.hpp"
+#include "std/random.hpp"
+
+
+namespace
+{
+
+template <size_t Bits> void TestWithData(vector<uint32_t> const & lst)
+{
+  using TVector = FixedBitsDDVector<Bits, MemReader>;
+  using TBuffer = vector<uint8_t>;
+  using TWriter = MemWriter<TBuffer>;
+
+  TBuffer buf;
+  {
+    TWriter writer(buf);
+    typename TVector::template Builder<TWriter> builder(writer);
+
+    uint32_t optCount = 0;
+    uint32_t const optBound = (1 << Bits) - 1;
+
+    for (uint32_t v : lst)
+    {
+      if (v < optBound)
+        ++optCount;
+
+      builder.PushBack(v);
+    }
+
+    pair<uint32_t, uint32_t> exp(optCount, lst.size());
+    TEST_EQUAL(builder.GetCount(), exp, ());
+  }
+
+  MemReader reader(buf.data(), buf.size());
+  auto const vec = TVector::Create(reader);
+
+  size_t i = 0;
+  for (uint32_t v : lst)
+    TEST_EQUAL(vec->Get(i++), v, ());
+}
+
+} // namespace
+
+UNIT_TEST(FixedBitsDDVector_Smoke)
+{
+  TestWithData<3>({0, 3, 6});
+  TestWithData<3>({7, 20, 50});
+  TestWithData<3>({1, 0, 4, 30, 5, 3, 6, 7, 2, 8, 0});
+}
+
+UNIT_TEST(FixedBitsDDVector_Rand)
+{
+  vector<uint32_t> v;
+
+  default_random_engine gen;
+  uniform_int_distribution<uint32_t> distribution(0, 1000);
+
+  size_t constexpr kMaxCount = 1000;
+  for (size_t i = 0; i < kMaxCount; ++i)
+    v.push_back(distribution(gen));
+
+  TestWithData<3>(v);
+  TestWithData<4>(v);
+  TestWithData<5>(v);
+  TestWithData<6>(v);
+  TestWithData<7>(v);
+  TestWithData<8>(v);
+  TestWithData<9>(v);
+}
diff --git a/coding/fixed_bits_ddvector.hpp b/coding/fixed_bits_ddvector.hpp
new file mode 100644
index 0000000000..b535be9b73
--- /dev/null
+++ b/coding/fixed_bits_ddvector.hpp
@@ -0,0 +1,167 @@
+#pragma once
+
+#include "bit_streams.hpp"
+#include "byte_stream.hpp"
+#include "dd_vector.hpp"
+#include "reader.hpp"
+#include "write_to_sink.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/unique_ptr.hpp"
+#include "std/vector.hpp"
+
+
+/// Disk driven vector for optimal storing small values with rare big values.
+/// Format:
+/// 4 bytes to store vector's size
+/// Buffer of ceil(Size * Bits / 8) bytes, e.g. vector of Bits-sized elements.
+/// - values in range [0, (1 << Bits) - 2] stored as is
+/// - value (1 << Bits) - 1 tells that actual value is stored in the exceptions table below.
+/// Buffer with exceptions table, e.g. vector of (index, value) pairs till the end of the reader,
+/// sorted by index parameter.
+/// Component works in little endian without any host conversions.
+
+template
+<
+  size_t Bits,                /// number of fixed bits
+  class TReader,              /// reader with random offset read functions
+  typename TSize = uint32_t,  /// vector index type (platform independent)
+  typename TValue = uint32_t  /// vector value type (platform independent)
+>
+class FixedBitsDDVector
+{
+  static_assert(is_unsigned<TSize>::value, "");
+  static_assert(is_unsigned<TValue>::value, "");
+  // 16 - is the maximum bits count to get all needed bits in random access within uint32_t.
+  static_assert(Bits <= 16, "");
+
+  using TSelf = FixedBitsDDVector<Bits, TReader, TSize, TValue>;
+
+  struct IndexValue
+  {
+    TSize m_index;
+    TValue m_value;
+    bool operator<(IndexValue const & rhs) const { return m_index < rhs.m_index; }
+  };
+
+  TReader m_bits;
+  DDVector<IndexValue, TReader, TSize> m_vector;
+
+#ifdef DEBUG
+  TSize const m_size;
+#endif
+
+  using TBlock = uint32_t;
+
+  static uint64_t AlignBytesCount(uint64_t count)
+  {
+    return max(count, static_cast<uint64_t>(sizeof(TBlock)));
+  }
+
+  static TBlock constexpr kMask = (1 << Bits) - 1;
+
+  TValue FindInVector(TSize index) const
+  {
+    auto const it = lower_bound(m_vector.begin(), m_vector.end(), IndexValue{index, 0});
+    ASSERT(it != m_vector.end() && it->m_index == index, ());
+    return it->m_value;
+  }
+
+  FixedBitsDDVector(TReader const & bitsReader, TReader const & vecReader, TSize size)
+    : m_bits(bitsReader)
+    , m_vector(vecReader)
+  #ifdef DEBUG
+    , m_size(size)
+  #endif
+  {
+  }
+
+public:
+  static unique_ptr<TSelf> Create(TReader const & reader)
+  {
+    TSize const size = ReadPrimitiveFromPos<TSize>(reader, 0);
+
+    uint64_t const off1 = sizeof(TSize);
+    uint64_t const off2 = AlignBytesCount((size * Bits + CHAR_BIT - 1) / CHAR_BIT) + off1;
+    return unique_ptr<TSelf>(new TSelf(reader.SubReader(off1, off2 - off1),
+                                       reader.SubReader(off2, reader.Size() - off2),
+                                       size));
+  }
+
+  TValue Get(TSize index) const
+  {
+    ASSERT_LESS(index, m_size, ());
+    uint64_t const bitsOffset = index * Bits;
+
+    uint64_t bytesOffset = bitsOffset / CHAR_BIT;
+    size_t constexpr kBlockSize = sizeof(TBlock);
+    if (bytesOffset + kBlockSize > m_bits.Size())
+      bytesOffset = m_bits.Size() - kBlockSize;
+
+    TBlock v = ReadPrimitiveFromPos<TBlock>(m_bits, bytesOffset);
+    v >>= (bitsOffset - bytesOffset * CHAR_BIT);
+    v &= kMask;
+    return (v == kMask ? FindInVector(index) : v);
+  }
+
+  template <class TWriter> class Builder
+  {
+    using TData = vector<uint8_t>;
+    using TempWriter = PushBackByteSink<TData>;
+    using TBits = BitWriter<TempWriter>;
+
+    TData m_data;
+    TempWriter m_writer;
+    unique_ptr<TBits> m_bits;
+
+    vector<IndexValue> m_excepts;
+    TSize m_count = 0;
+    TSize m_optCount = 0;
+
+    TWriter & m_finalWriter;
+
+  public:
+    explicit Builder(TWriter & writer)
+      : m_writer(m_data), m_bits(new TBits(m_writer)), m_finalWriter(writer)
+    {
+    }
+
+    ~Builder()
+    {
+      // Final serialization is in dtor only.
+      // You can't do any intermediate flushes during building vector.
+
+      // Reset the bit stream first.
+      m_bits.reset();
+
+      // Write size of vector.
+      WriteToSink(m_finalWriter, m_count);
+
+      // Write bits vector, alignes at least to 4 bytes.
+      m_data.resize(AlignBytesCount(m_data.size()));
+      m_finalWriter.Write(m_data.data(), m_data.size());
+
+      // Write exceptions table.
+      m_finalWriter.Write(m_excepts.data(), m_excepts.size() * sizeof(IndexValue));
+    }
+
+    void PushBack(TValue v)
+    {
+      if (v >= kMask)
+      {
+        m_bits->WriteAtMost32Bits(kMask, Bits);
+        m_excepts.push_back({m_count, v});
+      }
+      else
+      {
+        ++m_optCount;
+        m_bits->WriteAtMost32Bits(v, Bits);
+      }
+
+      ++m_count;
+    }
+
+    /// @return (number of stored as-is elements, number of all elements)
+    pair<TSize, TSize> GetCount() const { return make_pair(m_optCount, m_count); }
+  };
+};
author	vng <viktor.govako@gmail.com>	2015-12-04 20:33:03 +0300
committer	Sergey Yershov <yershov@corp.mail.ru>	2016-03-23 16:02:59 +0300
commit	83690aa414831f2ea6e1035c67b9734bb0a7a060 (patch)
tree	a2295b000b90e0bbd0e3bc261f099db0f0d6c71f /coding
parent	bae24455806ddc56fbf0187a3d628409c5c58995 (diff)