[coding] Compressed text storage.

author: Yuri Gorshenin <y@maps.me> 2017-07-06 13:17:39 +0300
committer: Yuri Gorshenin <y@maps.me> 2017-07-06 20:46:32 +0300
commit: de30819018ce5dfb962638a88e39133857398491 (patch)
tree: d342a84b66cbb627257891e63fd878d4a003ced4 /coding
parent: 50d77ee9b1c232759fbeca1eb3a3e8ce333a2d63 (diff)
8 files changed, 443 insertions, 1 deletions
diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt
index 4516ff6375..246bb52086 100644
--- a/coding/CMakeLists.txt
+++ b/coding/CMakeLists.txt
@@ -72,6 +72,7 @@ set(
   streams_common.hpp
   streams_sink.hpp
   succinct_mapper.hpp
+  text_storage.hpp
   traffic.cpp
   traffic.hpp
   transliteration.cpp
diff --git a/coding/bwt_coder.hpp b/coding/bwt_coder.hpp
index 0a9a5e6398..cf0e16c3f4 100644
--- a/coding/bwt_coder.hpp
+++ b/coding/bwt_coder.hpp
@@ -85,7 +85,8 @@ public:
       mtf.Transform(b);
     }
 
-    CHECK_LESS(start, n, ());
+    if (n != 0)
+      CHECK_LESS(start, n, ());
 
     revBuffer.resize(n);
     base::RevBWT(n, static_cast<size_t>(start), bwtBuffer.data(), revBuffer.data());
diff --git a/coding/coding.pro b/coding/coding.pro
index 3b4d1b8733..6acb942504 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -82,6 +82,7 @@ HEADERS += \
     streams_common.hpp \
     streams_sink.hpp \
     succinct_mapper.hpp \
+    text_storage.hpp \
     traffic.hpp \
     transliteration.hpp \
     uri.hpp \
diff --git a/coding/coding_tests/CMakeLists.txt b/coding/coding_tests/CMakeLists.txt
index 3b709659b9..5171fd00cf 100644
--- a/coding/coding_tests/CMakeLists.txt
+++ b/coding/coding_tests/CMakeLists.txt
@@ -32,6 +32,7 @@ set(
   reader_writer_ops_test.cpp
   simple_dense_coding_test.cpp
   succinct_mapper_test.cpp
+  text_storage_tests.cpp
   traffic_test.cpp
   uri_test.cpp
   url_encode_test.cpp
diff --git a/coding/coding_tests/bwt_coder_tests.cpp b/coding/coding_tests/bwt_coder_tests.cpp
index 28965a54e9..5c095ae07e 100644
--- a/coding/coding_tests/bwt_coder_tests.cpp
+++ b/coding/coding_tests/bwt_coder_tests.cpp
@@ -34,6 +34,26 @@ string EncodeDecode(BWTCoder::Params const & params, string const & s)
   return result;
 }
 
+string EncodeDecodeBlock(string const & s)
+{
+  vector<uint8_t> data;
+
+  {
+    MemWriter<decltype(data)> sink(data);
+    BWTCoder::EncodeAndWriteBlock(sink, s.size(), reinterpret_cast<uint8_t const *>(s.data()));
+  }
+
+  string result;
+  {
+    MemReader reader(data.data(), data.size());
+    ReaderSource<MemReader> source(reader);
+
+    BWTCoder::ReadAndDecodeBlock(source, back_inserter(result));
+  }
+
+  return result;
+}
+
 UNIT_TEST(BWTEncoder_Smoke)
 {
   for (size_t blockSize = 1; blockSize < 100; ++blockSize)
@@ -42,12 +62,16 @@ UNIT_TEST(BWTEncoder_Smoke)
 
     params.m_blockSize = blockSize;
     string const s = "abracadabra";
+    TEST_EQUAL(s, EncodeDecodeBlock(s), ());
     TEST_EQUAL(s, EncodeDecode(params, s), (blockSize));
   }
 
   string const strings[] = {"", "mississippi", "again and again and again"};
   for (auto const & s : strings)
+  {
+    TEST_EQUAL(s, EncodeDecodeBlock(s), ());
     TEST_EQUAL(s, EncodeDecode(BWTCoder::Params{}, s), ());
+  }
 }
 
 UNIT_TEST(BWT_Large)
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index cc74f90994..5e4ad89056 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -39,6 +39,7 @@ SOURCES += ../../testing/testingmain.cpp \
     reader_writer_ops_test.cpp \
     simple_dense_coding_test.cpp \
     succinct_mapper_test.cpp \
+    text_storage_tests.cpp \
     traffic_test.cpp \
     uri_test.cpp \
     url_encode_test.cpp \
diff --git a/coding/coding_tests/text_storage_tests.cpp b/coding/coding_tests/text_storage_tests.cpp
new file mode 100644
index 0000000000..8f9f86f343
--- /dev/null
+++ b/coding/coding_tests/text_storage_tests.cpp
@@ -0,0 +1,131 @@
+#include "testing/testing.hpp"
+
+#include "coding/reader.hpp"
+#include "coding/text_storage.hpp"
+#include "coding/writer.hpp"
+
+#include <cstdint>
+#include <random>
+#include <string>
+#include <vector>
+
+using namespace coding;
+using namespace std;
+
+namespace
+{
+template <typename Engine>
+string GenerateRandomString(Engine & engine)
+{
+  int const kMinLength = 0;
+  int const kMaxLength = 400;
+
+  int const kMinByte = 0;
+  int const kMaxByte = 255;
+
+  uniform_int_distribution<int> length(kMinLength, kMaxLength);
+  uniform_int_distribution<int> byte(kMinByte, kMaxByte);
+  string s(length(engine), '\0');
+  for (auto & b : s)
+    b = byte(engine);
+  return s;
+}
+
+void DumpStrings(vector<string> const & strings, uint64_t blockSize, vector<uint8_t> & buffer)
+{
+  MemWriter<vector<uint8_t>> writer(buffer);
+  BlockedTextStorageWriter<decltype(writer)> ts(writer, blockSize);
+  for (auto const & s : strings)
+    ts.Append(s);
+}
+
+UNIT_TEST(TextStorage_Smoke)
+{
+  vector<uint8_t> buffer;
+  DumpStrings({} /* strings */, 10 /* blockSize */, buffer);
+
+  {
+    MemReader reader(buffer.data(), buffer.size());
+    BlockedTextStorageIndex index;
+    index.Read(reader);
+    TEST_EQUAL(index.GetNumStrings(), 0, ());
+    TEST_EQUAL(index.GetNumBlockInfos(), 0, ());
+  }
+
+  {
+    MemReader reader(buffer.data(), buffer.size());
+    BlockedTextStorageReader<decltype(reader)> ts(reader);
+    TEST_EQUAL(ts.GetNumStrings(), 0, ());
+  }
+}
+
+UNIT_TEST(TextStorage_Simple)
+{
+  vector<string> const strings = {{"", "Hello", "Hello, World!", "Hola mundo", "Smoke test"}};
+
+  vector<uint8_t> buffer;
+  DumpStrings(strings, 10 /* blockSize */, buffer);
+
+  {
+    MemReader reader(buffer.data(), buffer.size());
+    BlockedTextStorageIndex index;
+    index.Read(reader);
+    TEST_EQUAL(index.GetNumStrings(), strings.size(), ());
+    TEST_EQUAL(index.GetNumBlockInfos(), 3, ());
+  }
+
+  {
+    MemReader reader(buffer.data(), buffer.size());
+    BlockedTextStorageReader<MemReader> ts(reader);
+    TEST_EQUAL(ts.GetNumStrings(), strings.size(), ());
+    for (size_t i = 0; i < ts.GetNumStrings(); ++i)
+      TEST_EQUAL(ts.ExtractString(i), strings[i], ());
+  }
+}
+
+UNIT_TEST(TextStorage_Empty)
+{
+  vector<string> strings;
+  for (int i = 0; i < 1000; ++i) {
+    strings.emplace_back(string(1 /* size */, i % 256));
+    for (int j = 0; j < 1000; ++j)
+      strings.emplace_back();
+  }
+
+  vector<uint8_t> buffer;
+  DumpStrings(strings, 5 /* blockSize */, buffer);
+
+  {
+    MemReader reader(buffer.data(), buffer.size());
+    BlockedTextStorageReader<MemReader> ts(reader);
+    TEST_EQUAL(ts.GetNumStrings(), strings.size(), ());
+    for (size_t i = 0; i < ts.GetNumStrings(); ++i)
+      TEST_EQUAL(ts.ExtractString(i), strings[i], ());
+  }
+}
+
+UNIT_TEST(TextStorage_Random)
+{
+  int const kSeed = 42;
+  int const kNumStrings = 1000;
+  int const kBlockSize = 100;
+  mt19937 engine(kSeed);
+
+  vector<string> strings;
+  for (int i = 0; i < kNumStrings; ++i)
+    strings.push_back(GenerateRandomString(engine));
+
+  vector<uint8_t> buffer;
+  DumpStrings(strings, kBlockSize, buffer);
+
+  MemReader reader(buffer.data(), buffer.size());
+  BlockedTextStorageReader<MemReader> ts(reader);
+
+  TEST_EQUAL(ts.GetNumStrings(), strings.size(), ());
+  for (size_t i = 0; i < ts.GetNumStrings(); ++i)
+    TEST_EQUAL(ts.ExtractString(i), strings[i], ());
+  ts.ClearCache();
+  for (size_t i = ts.GetNumStrings() - 1; i < ts.GetNumStrings(); --i)
+    TEST_EQUAL(ts.ExtractString(i), strings[i], ());
+}
+}  // namespace
diff --git a/coding/text_storage.hpp b/coding/text_storage.hpp
new file mode 100644
index 0000000000..0f0c50b9d2
--- /dev/null
+++ b/coding/text_storage.hpp
@@ -0,0 +1,282 @@
+#pragma once
+
+#include "coding/bwt_coder.hpp"
+#include "coding/reader.hpp"
+#include "coding/varint.hpp"
+#include "coding/write_to_sink.hpp"
+
+#include "base/assert.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <sstream>
+#include <string>
+
+namespace coding
+{
+// Writes set of strings in a format that allows to access blocks of
+// strings. The size of each block roughly equals to the |blockSize|,
+// because the whole number of strings is packed into a single block.
+//
+// Format description:
+// * first 8 bytes - little endian-encoded offset of the index section
+// * data section - represents a catenated sequence of BWT-compressed blocks with
+//   the sequence of individual string lengths in the block
+// * index section - represents a delta-encoded sequence of
+//   BWT-compressed blocks offsets intermixed with the number of
+//   strings inside each block.
+//
+// All numbers except the first offset are varints.
+template <typename Writer>
+class BlockedTextStorageWriter
+{
+public:
+  BlockedTextStorageWriter(Writer & writer, uint64_t blockSize)
+    : m_writer(writer), m_blockSize(blockSize), m_startOffset(writer.Pos()), m_blocks(1)
+  {
+    CHECK(m_blockSize != 0, ());
+    WriteToSink(m_writer, static_cast<uint64_t>(0));
+    m_dataOffset = m_writer.Pos();
+  }
+
+  ~BlockedTextStorageWriter()
+  {
+    if (!m_lengths.empty())
+      FlushPool(m_lengths, m_pool);
+
+    if (m_blocks.back().IsEmpty())
+      m_blocks.pop_back();
+
+    {
+      auto const currentOffset = m_writer.Pos();
+      ASSERT_GREATER_OR_EQUAL(currentOffset, m_startOffset, ());
+      m_writer.Seek(m_startOffset);
+      WriteToSink(m_writer, static_cast<uint64_t>(currentOffset - m_startOffset));
+      m_writer.Seek(currentOffset);
+    }
+
+    WriteVarUint(m_writer, m_blocks.size());
+
+    uint64_t prevOffset = 0;
+    for (auto const & block : m_blocks)
+    {
+      ASSERT_GREATER_OR_EQUAL(block.m_offset, prevOffset, ());
+      WriteVarUint(m_writer, block.m_offset - prevOffset);
+
+      ASSERT(!block.IsEmpty(), ());
+      WriteVarUint(m_writer, block.m_subs);
+
+      prevOffset = block.m_offset;
+    }
+  }
+
+  void Append(std::string const & s)
+  {
+    ASSERT(!m_blocks.empty(), ());
+
+    ASSERT_LESS(m_pool.size(), m_blockSize, ());
+
+    ++m_blocks.back().m_subs;
+    m_pool.append(s);
+    m_lengths.push_back(s.size());
+
+    if (m_pool.size() >= m_blockSize)
+    {
+      FlushPool(m_lengths, m_pool);
+      m_pool.clear();
+      m_lengths.clear();
+      m_blocks.emplace_back(m_writer.Pos() - m_dataOffset /* offset */, 0 /* subs */);
+    }
+  }
+
+private:
+  struct Block
+  {
+    Block() = default;
+    Block(uint64_t offset, uint64_t subs) : m_offset(offset), m_subs(subs) {}
+
+    bool IsEmpty() const { return m_subs == 0; }
+
+    uint64_t m_offset = 0;  // offset of the block inside the sequence of compressed blocks
+    uint64_t m_subs = 0;    // number of strings inside the block
+  };
+
+  void FlushPool(vector<uint64_t> const & lengths, string const & pool)
+  {
+    for (auto const & length : lengths)
+      WriteVarUint(m_writer, length);
+    BWTCoder::EncodeAndWriteBlock(m_writer, pool.size(),
+                                  reinterpret_cast<uint8_t const *>(pool.c_str()));
+  }
+
+  Writer & m_writer;
+  uint64_t const m_blockSize;
+  uint64_t m_startOffset = 0;
+  uint64_t m_dataOffset = 0;
+
+  vector<Block> m_blocks;
+
+  std::string m_pool;          // concatenated strings
+  vector<uint64_t> m_lengths;  // lengths of strings inside the |m_pool|
+};
+
+class BlockedTextStorageIndex
+{
+public:
+  struct BlockInfo
+  {
+    // Returns the index of the first string belonging to the block.
+    uint64_t From() const { return m_from; }
+
+    // Returns the index of the first string from the next block.
+    uint64_t To() const { return m_from + m_subs; }
+
+    uint64_t m_offset = 0;  // offset of the block from the beginning of the section
+    uint64_t m_from = 0;    // index of the first string in the block
+    uint64_t m_subs = 0;    // number of strings in the block
+  };
+
+  size_t GetNumBlockInfos() const { return m_blocks.size(); }
+  size_t GetNumStrings() const { return m_blocks.empty() ? 0 : m_blocks.back().To(); }
+
+  BlockInfo const & GetBlockInfo(size_t blockIx) const
+  {
+    ASSERT_LESS(blockIx, GetNumBlockInfos(), ());
+    return m_blocks[blockIx];
+  }
+
+  // Returns the index of the block the |stringIx| belongs to.
+  // Returns the number of blocks if there're no such block.
+  size_t GetBlockIx(size_t stringIx) const
+  {
+    if (m_blocks.empty() || stringIx >= m_blocks.back().To())
+      return GetNumBlockInfos();
+    if (stringIx >= m_blocks.back().From())
+      return GetNumBlockInfos() - 1;
+
+    size_t lo = 0, hi = GetNumBlockInfos() - 1;
+    while (lo + 1 != hi)
+    {
+      ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ());
+      ASSERT_LESS(stringIx, m_blocks[hi].From(), ());
+
+      auto const mi = lo + (hi - lo) / 2;
+      if (stringIx >= m_blocks[mi].From())
+        lo = mi;
+      else
+        hi = mi;
+    }
+
+    ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ());
+    ASSERT_LESS(stringIx, m_blocks[hi].From(), ());
+    return lo;
+  }
+
+  template <typename Reader>
+  void Read(Reader & reader)
+  {
+    auto const indexOffset = ReadPrimitiveFromPos<uint64_t, Reader>(reader, 0);
+
+    NonOwningReaderSource source(reader);
+    source.Skip(indexOffset);
+
+    auto const numBlocks = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
+    m_blocks.assign(numBlocks, {});
+
+    uint64_t prevOffset = 8;  // 8 bytes for the offset
+    for (uint64_t i = 0; i < numBlocks; ++i)
+    {
+      auto const delta = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
+      CHECK_GREATER_OR_EQUAL(prevOffset + delta, prevOffset, ());
+      prevOffset += delta;
+
+      auto & block = m_blocks[i];
+      block.m_offset = prevOffset;
+      block.m_from = i == 0 ? 0 : m_blocks[i - 1].To();
+      block.m_subs = ReadVarUint<uint64_t, NonOwningReaderSource>(source);
+      CHECK_GREATER_OR_EQUAL(block.m_from + block.m_subs, block.m_from, ());
+    }
+  }
+
+private:
+  std::vector<BlockInfo> m_blocks;
+};
+
+template <typename Reader>
+class BlockedTextStorageReader
+{
+public:
+  explicit BlockedTextStorageReader(Reader & reader) : m_reader(reader) { m_index.Read(reader); }
+
+  size_t GetNumStrings() const { return m_index.GetNumStrings(); }
+
+  std::string ExtractString(size_t stringIx)
+  {
+    auto const blockIx = m_index.GetBlockIx(stringIx);
+    CHECK_LESS(blockIx, m_index.GetNumBlockInfos(), ());
+
+    if (blockIx >= m_cache.size())
+      m_cache.resize(blockIx + 1);
+    ASSERT_LESS(blockIx, m_cache.size(), ());
+
+    auto const & bi = m_index.GetBlockInfo(blockIx);
+
+    auto & entry = m_cache[blockIx];
+    if (!entry.m_valid)
+    {
+      NonOwningReaderSource source(m_reader);
+      source.Skip(bi.m_offset);
+
+      entry.m_value.clear();
+      entry.m_subs.resize(bi.m_subs);
+
+      uint64_t offset = 0;
+      for (size_t i = 0; i < entry.m_subs.size(); ++i)
+      {
+        auto & sub = entry.m_subs[i];
+        sub.m_offset = offset;
+        sub.m_length = ReadVarUint<uint64_t>(source);
+        CHECK_GREATER_OR_EQUAL(sub.m_offset + sub.m_length, sub.m_offset, ());
+        offset += sub.m_length;
+      }
+      BWTCoder::ReadAndDecodeBlock(source, std::back_inserter(entry.m_value));
+      entry.m_valid = true;
+    }
+    ASSERT(entry.m_valid, ());
+
+    ASSERT_GREATER_OR_EQUAL(stringIx, bi.From(), ());
+    ASSERT_LESS(stringIx, bi.To(), ());
+
+    stringIx -= bi.From();
+    ASSERT_LESS(stringIx, entry.m_subs.size(), ());
+
+    auto const & si = entry.m_subs[stringIx];
+    auto const & value = entry.m_value;
+    ASSERT_LESS_OR_EQUAL(si.m_offset + si.m_length, value.size(), ());
+    return value.substr(si.m_offset, si.m_length);
+  }
+
+  void ClearCache() { m_cache.clear(); }
+
+private:
+  struct StringInfo
+  {
+    StringInfo() = default;
+    StringInfo(uint64_t offset, uint64_t length): m_offset(offset), m_length(length) {}
+
+    uint64_t m_offset = 0;  // offset of the string inside the decompressed block
+    uint64_t m_length = 0;  // length of the string
+  };
+
+  struct CacheEntry
+  {
+    std::string m_value;             // concatenation of the strings
+    std::vector<StringInfo> m_subs;  // indices of individual strings
+    bool m_valid = false;
+  };
+
+  Reader & m_reader;
+  BlockedTextStorageIndex m_index;
+  std::vector<CacheEntry> m_cache;
+};
+}  // namespace coding
author	Yuri Gorshenin <y@maps.me>	2017-07-06 13:17:39 +0300
committer	Yuri Gorshenin <y@maps.me>	2017-07-06 20:46:32 +0300
commit	de30819018ce5dfb962638a88e39133857398491 (patch)
tree	d342a84b66cbb627257891e63fd878d4a003ced4 /coding
parent	50d77ee9b1c232759fbeca1eb3a3e8ce333a2d63 (diff)