From de30819018ce5dfb962638a88e39133857398491 Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Thu, 6 Jul 2017 13:17:39 +0300 Subject: [coding] Compressed text storage. --- coding/CMakeLists.txt | 1 + coding/bwt_coder.hpp | 3 +- coding/coding.pro | 1 + coding/coding_tests/CMakeLists.txt | 1 + coding/coding_tests/bwt_coder_tests.cpp | 24 +++ coding/coding_tests/coding_tests.pro | 1 + coding/coding_tests/text_storage_tests.cpp | 131 ++++++++++++++ coding/text_storage.hpp | 282 +++++++++++++++++++++++++++++ 8 files changed, 443 insertions(+), 1 deletion(-) create mode 100644 coding/coding_tests/text_storage_tests.cpp create mode 100644 coding/text_storage.hpp (limited to 'coding') diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt index 4516ff6375..246bb52086 100644 --- a/coding/CMakeLists.txt +++ b/coding/CMakeLists.txt @@ -72,6 +72,7 @@ set( streams_common.hpp streams_sink.hpp succinct_mapper.hpp + text_storage.hpp traffic.cpp traffic.hpp transliteration.cpp diff --git a/coding/bwt_coder.hpp b/coding/bwt_coder.hpp index 0a9a5e6398..cf0e16c3f4 100644 --- a/coding/bwt_coder.hpp +++ b/coding/bwt_coder.hpp @@ -85,7 +85,8 @@ public: mtf.Transform(b); } - CHECK_LESS(start, n, ()); + if (n != 0) + CHECK_LESS(start, n, ()); revBuffer.resize(n); base::RevBWT(n, static_cast(start), bwtBuffer.data(), revBuffer.data()); diff --git a/coding/coding.pro b/coding/coding.pro index 3b4d1b8733..6acb942504 100644 --- a/coding/coding.pro +++ b/coding/coding.pro @@ -82,6 +82,7 @@ HEADERS += \ streams_common.hpp \ streams_sink.hpp \ succinct_mapper.hpp \ + text_storage.hpp \ traffic.hpp \ transliteration.hpp \ uri.hpp \ diff --git a/coding/coding_tests/CMakeLists.txt b/coding/coding_tests/CMakeLists.txt index 3b709659b9..5171fd00cf 100644 --- a/coding/coding_tests/CMakeLists.txt +++ b/coding/coding_tests/CMakeLists.txt @@ -32,6 +32,7 @@ set( reader_writer_ops_test.cpp simple_dense_coding_test.cpp succinct_mapper_test.cpp + text_storage_tests.cpp traffic_test.cpp uri_test.cpp url_encode_test.cpp diff --git a/coding/coding_tests/bwt_coder_tests.cpp b/coding/coding_tests/bwt_coder_tests.cpp index 28965a54e9..5c095ae07e 100644 --- a/coding/coding_tests/bwt_coder_tests.cpp +++ b/coding/coding_tests/bwt_coder_tests.cpp @@ -34,6 +34,26 @@ string EncodeDecode(BWTCoder::Params const & params, string const & s) return result; } +string EncodeDecodeBlock(string const & s) +{ + vector data; + + { + MemWriter sink(data); + BWTCoder::EncodeAndWriteBlock(sink, s.size(), reinterpret_cast(s.data())); + } + + string result; + { + MemReader reader(data.data(), data.size()); + ReaderSource source(reader); + + BWTCoder::ReadAndDecodeBlock(source, back_inserter(result)); + } + + return result; +} + UNIT_TEST(BWTEncoder_Smoke) { for (size_t blockSize = 1; blockSize < 100; ++blockSize) @@ -42,12 +62,16 @@ UNIT_TEST(BWTEncoder_Smoke) params.m_blockSize = blockSize; string const s = "abracadabra"; + TEST_EQUAL(s, EncodeDecodeBlock(s), ()); TEST_EQUAL(s, EncodeDecode(params, s), (blockSize)); } string const strings[] = {"", "mississippi", "again and again and again"}; for (auto const & s : strings) + { + TEST_EQUAL(s, EncodeDecodeBlock(s), ()); TEST_EQUAL(s, EncodeDecode(BWTCoder::Params{}, s), ()); + } } UNIT_TEST(BWT_Large) diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro index cc74f90994..5e4ad89056 100644 --- a/coding/coding_tests/coding_tests.pro +++ b/coding/coding_tests/coding_tests.pro @@ -39,6 +39,7 @@ SOURCES += ../../testing/testingmain.cpp \ reader_writer_ops_test.cpp \ simple_dense_coding_test.cpp \ succinct_mapper_test.cpp \ + text_storage_tests.cpp \ traffic_test.cpp \ uri_test.cpp \ url_encode_test.cpp \ diff --git a/coding/coding_tests/text_storage_tests.cpp b/coding/coding_tests/text_storage_tests.cpp new file mode 100644 index 0000000000..8f9f86f343 --- /dev/null +++ b/coding/coding_tests/text_storage_tests.cpp @@ -0,0 +1,131 @@ +#include "testing/testing.hpp" + +#include "coding/reader.hpp" +#include "coding/text_storage.hpp" +#include "coding/writer.hpp" + +#include +#include +#include +#include + +using namespace coding; +using namespace std; + +namespace +{ +template +string GenerateRandomString(Engine & engine) +{ + int const kMinLength = 0; + int const kMaxLength = 400; + + int const kMinByte = 0; + int const kMaxByte = 255; + + uniform_int_distribution length(kMinLength, kMaxLength); + uniform_int_distribution byte(kMinByte, kMaxByte); + string s(length(engine), '\0'); + for (auto & b : s) + b = byte(engine); + return s; +} + +void DumpStrings(vector const & strings, uint64_t blockSize, vector & buffer) +{ + MemWriter> writer(buffer); + BlockedTextStorageWriter ts(writer, blockSize); + for (auto const & s : strings) + ts.Append(s); +} + +UNIT_TEST(TextStorage_Smoke) +{ + vector buffer; + DumpStrings({} /* strings */, 10 /* blockSize */, buffer); + + { + MemReader reader(buffer.data(), buffer.size()); + BlockedTextStorageIndex index; + index.Read(reader); + TEST_EQUAL(index.GetNumStrings(), 0, ()); + TEST_EQUAL(index.GetNumBlockInfos(), 0, ()); + } + + { + MemReader reader(buffer.data(), buffer.size()); + BlockedTextStorageReader ts(reader); + TEST_EQUAL(ts.GetNumStrings(), 0, ()); + } +} + +UNIT_TEST(TextStorage_Simple) +{ + vector const strings = {{"", "Hello", "Hello, World!", "Hola mundo", "Smoke test"}}; + + vector buffer; + DumpStrings(strings, 10 /* blockSize */, buffer); + + { + MemReader reader(buffer.data(), buffer.size()); + BlockedTextStorageIndex index; + index.Read(reader); + TEST_EQUAL(index.GetNumStrings(), strings.size(), ()); + TEST_EQUAL(index.GetNumBlockInfos(), 3, ()); + } + + { + MemReader reader(buffer.data(), buffer.size()); + BlockedTextStorageReader ts(reader); + TEST_EQUAL(ts.GetNumStrings(), strings.size(), ()); + for (size_t i = 0; i < ts.GetNumStrings(); ++i) + TEST_EQUAL(ts.ExtractString(i), strings[i], ()); + } +} + +UNIT_TEST(TextStorage_Empty) +{ + vector strings; + for (int i = 0; i < 1000; ++i) { + strings.emplace_back(string(1 /* size */, i % 256)); + for (int j = 0; j < 1000; ++j) + strings.emplace_back(); + } + + vector buffer; + DumpStrings(strings, 5 /* blockSize */, buffer); + + { + MemReader reader(buffer.data(), buffer.size()); + BlockedTextStorageReader ts(reader); + TEST_EQUAL(ts.GetNumStrings(), strings.size(), ()); + for (size_t i = 0; i < ts.GetNumStrings(); ++i) + TEST_EQUAL(ts.ExtractString(i), strings[i], ()); + } +} + +UNIT_TEST(TextStorage_Random) +{ + int const kSeed = 42; + int const kNumStrings = 1000; + int const kBlockSize = 100; + mt19937 engine(kSeed); + + vector strings; + for (int i = 0; i < kNumStrings; ++i) + strings.push_back(GenerateRandomString(engine)); + + vector buffer; + DumpStrings(strings, kBlockSize, buffer); + + MemReader reader(buffer.data(), buffer.size()); + BlockedTextStorageReader ts(reader); + + TEST_EQUAL(ts.GetNumStrings(), strings.size(), ()); + for (size_t i = 0; i < ts.GetNumStrings(); ++i) + TEST_EQUAL(ts.ExtractString(i), strings[i], ()); + ts.ClearCache(); + for (size_t i = ts.GetNumStrings() - 1; i < ts.GetNumStrings(); --i) + TEST_EQUAL(ts.ExtractString(i), strings[i], ()); +} +} // namespace diff --git a/coding/text_storage.hpp b/coding/text_storage.hpp new file mode 100644 index 0000000000..0f0c50b9d2 --- /dev/null +++ b/coding/text_storage.hpp @@ -0,0 +1,282 @@ +#pragma once + +#include "coding/bwt_coder.hpp" +#include "coding/reader.hpp" +#include "coding/varint.hpp" +#include "coding/write_to_sink.hpp" + +#include "base/assert.hpp" + +#include +#include +#include +#include + +namespace coding +{ +// Writes set of strings in a format that allows to access blocks of +// strings. The size of each block roughly equals to the |blockSize|, +// because the whole number of strings is packed into a single block. +// +// Format description: +// * first 8 bytes - little endian-encoded offset of the index section +// * data section - represents a catenated sequence of BWT-compressed blocks with +// the sequence of individual string lengths in the block +// * index section - represents a delta-encoded sequence of +// BWT-compressed blocks offsets intermixed with the number of +// strings inside each block. +// +// All numbers except the first offset are varints. +template +class BlockedTextStorageWriter +{ +public: + BlockedTextStorageWriter(Writer & writer, uint64_t blockSize) + : m_writer(writer), m_blockSize(blockSize), m_startOffset(writer.Pos()), m_blocks(1) + { + CHECK(m_blockSize != 0, ()); + WriteToSink(m_writer, static_cast(0)); + m_dataOffset = m_writer.Pos(); + } + + ~BlockedTextStorageWriter() + { + if (!m_lengths.empty()) + FlushPool(m_lengths, m_pool); + + if (m_blocks.back().IsEmpty()) + m_blocks.pop_back(); + + { + auto const currentOffset = m_writer.Pos(); + ASSERT_GREATER_OR_EQUAL(currentOffset, m_startOffset, ()); + m_writer.Seek(m_startOffset); + WriteToSink(m_writer, static_cast(currentOffset - m_startOffset)); + m_writer.Seek(currentOffset); + } + + WriteVarUint(m_writer, m_blocks.size()); + + uint64_t prevOffset = 0; + for (auto const & block : m_blocks) + { + ASSERT_GREATER_OR_EQUAL(block.m_offset, prevOffset, ()); + WriteVarUint(m_writer, block.m_offset - prevOffset); + + ASSERT(!block.IsEmpty(), ()); + WriteVarUint(m_writer, block.m_subs); + + prevOffset = block.m_offset; + } + } + + void Append(std::string const & s) + { + ASSERT(!m_blocks.empty(), ()); + + ASSERT_LESS(m_pool.size(), m_blockSize, ()); + + ++m_blocks.back().m_subs; + m_pool.append(s); + m_lengths.push_back(s.size()); + + if (m_pool.size() >= m_blockSize) + { + FlushPool(m_lengths, m_pool); + m_pool.clear(); + m_lengths.clear(); + m_blocks.emplace_back(m_writer.Pos() - m_dataOffset /* offset */, 0 /* subs */); + } + } + +private: + struct Block + { + Block() = default; + Block(uint64_t offset, uint64_t subs) : m_offset(offset), m_subs(subs) {} + + bool IsEmpty() const { return m_subs == 0; } + + uint64_t m_offset = 0; // offset of the block inside the sequence of compressed blocks + uint64_t m_subs = 0; // number of strings inside the block + }; + + void FlushPool(vector const & lengths, string const & pool) + { + for (auto const & length : lengths) + WriteVarUint(m_writer, length); + BWTCoder::EncodeAndWriteBlock(m_writer, pool.size(), + reinterpret_cast(pool.c_str())); + } + + Writer & m_writer; + uint64_t const m_blockSize; + uint64_t m_startOffset = 0; + uint64_t m_dataOffset = 0; + + vector m_blocks; + + std::string m_pool; // concatenated strings + vector m_lengths; // lengths of strings inside the |m_pool| +}; + +class BlockedTextStorageIndex +{ +public: + struct BlockInfo + { + // Returns the index of the first string belonging to the block. + uint64_t From() const { return m_from; } + + // Returns the index of the first string from the next block. + uint64_t To() const { return m_from + m_subs; } + + uint64_t m_offset = 0; // offset of the block from the beginning of the section + uint64_t m_from = 0; // index of the first string in the block + uint64_t m_subs = 0; // number of strings in the block + }; + + size_t GetNumBlockInfos() const { return m_blocks.size(); } + size_t GetNumStrings() const { return m_blocks.empty() ? 0 : m_blocks.back().To(); } + + BlockInfo const & GetBlockInfo(size_t blockIx) const + { + ASSERT_LESS(blockIx, GetNumBlockInfos(), ()); + return m_blocks[blockIx]; + } + + // Returns the index of the block the |stringIx| belongs to. + // Returns the number of blocks if there're no such block. + size_t GetBlockIx(size_t stringIx) const + { + if (m_blocks.empty() || stringIx >= m_blocks.back().To()) + return GetNumBlockInfos(); + if (stringIx >= m_blocks.back().From()) + return GetNumBlockInfos() - 1; + + size_t lo = 0, hi = GetNumBlockInfos() - 1; + while (lo + 1 != hi) + { + ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ()); + ASSERT_LESS(stringIx, m_blocks[hi].From(), ()); + + auto const mi = lo + (hi - lo) / 2; + if (stringIx >= m_blocks[mi].From()) + lo = mi; + else + hi = mi; + } + + ASSERT_GREATER_OR_EQUAL(stringIx, m_blocks[lo].From(), ()); + ASSERT_LESS(stringIx, m_blocks[hi].From(), ()); + return lo; + } + + template + void Read(Reader & reader) + { + auto const indexOffset = ReadPrimitiveFromPos(reader, 0); + + NonOwningReaderSource source(reader); + source.Skip(indexOffset); + + auto const numBlocks = ReadVarUint(source); + m_blocks.assign(numBlocks, {}); + + uint64_t prevOffset = 8; // 8 bytes for the offset + for (uint64_t i = 0; i < numBlocks; ++i) + { + auto const delta = ReadVarUint(source); + CHECK_GREATER_OR_EQUAL(prevOffset + delta, prevOffset, ()); + prevOffset += delta; + + auto & block = m_blocks[i]; + block.m_offset = prevOffset; + block.m_from = i == 0 ? 0 : m_blocks[i - 1].To(); + block.m_subs = ReadVarUint(source); + CHECK_GREATER_OR_EQUAL(block.m_from + block.m_subs, block.m_from, ()); + } + } + +private: + std::vector m_blocks; +}; + +template +class BlockedTextStorageReader +{ +public: + explicit BlockedTextStorageReader(Reader & reader) : m_reader(reader) { m_index.Read(reader); } + + size_t GetNumStrings() const { return m_index.GetNumStrings(); } + + std::string ExtractString(size_t stringIx) + { + auto const blockIx = m_index.GetBlockIx(stringIx); + CHECK_LESS(blockIx, m_index.GetNumBlockInfos(), ()); + + if (blockIx >= m_cache.size()) + m_cache.resize(blockIx + 1); + ASSERT_LESS(blockIx, m_cache.size(), ()); + + auto const & bi = m_index.GetBlockInfo(blockIx); + + auto & entry = m_cache[blockIx]; + if (!entry.m_valid) + { + NonOwningReaderSource source(m_reader); + source.Skip(bi.m_offset); + + entry.m_value.clear(); + entry.m_subs.resize(bi.m_subs); + + uint64_t offset = 0; + for (size_t i = 0; i < entry.m_subs.size(); ++i) + { + auto & sub = entry.m_subs[i]; + sub.m_offset = offset; + sub.m_length = ReadVarUint(source); + CHECK_GREATER_OR_EQUAL(sub.m_offset + sub.m_length, sub.m_offset, ()); + offset += sub.m_length; + } + BWTCoder::ReadAndDecodeBlock(source, std::back_inserter(entry.m_value)); + entry.m_valid = true; + } + ASSERT(entry.m_valid, ()); + + ASSERT_GREATER_OR_EQUAL(stringIx, bi.From(), ()); + ASSERT_LESS(stringIx, bi.To(), ()); + + stringIx -= bi.From(); + ASSERT_LESS(stringIx, entry.m_subs.size(), ()); + + auto const & si = entry.m_subs[stringIx]; + auto const & value = entry.m_value; + ASSERT_LESS_OR_EQUAL(si.m_offset + si.m_length, value.size(), ()); + return value.substr(si.m_offset, si.m_length); + } + + void ClearCache() { m_cache.clear(); } + +private: + struct StringInfo + { + StringInfo() = default; + StringInfo(uint64_t offset, uint64_t length): m_offset(offset), m_length(length) {} + + uint64_t m_offset = 0; // offset of the string inside the decompressed block + uint64_t m_length = 0; // length of the string + }; + + struct CacheEntry + { + std::string m_value; // concatenation of the strings + std::vector m_subs; // indices of individual strings + bool m_valid = false; + }; + + Reader & m_reader; + BlockedTextStorageIndex m_index; + std::vector m_cache; +}; +} // namespace coding -- cgit v1.2.3