Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/coding
diff options
context:
space:
mode:
authorYuri Gorshenin <y@maps.me>2017-06-30 13:00:28 +0300
committerArsentiy Milchakov <milcars@mapswithme.com>2017-07-04 14:09:45 +0300
commitf8aeb3711100c875e718b952d8a42442030d02af (patch)
tree7b36bea7a742022b18ef43c7ee4ac3de5f0ab9a0 /coding
parent043cc3623ae4a22557b7c9738d3c7e6d5cabbb0a (diff)
[base][coding] Burrows-Wheeler coder.
Diffstat (limited to 'coding')
-rw-r--r--coding/CMakeLists.txt1
-rw-r--r--coding/bwt_coder.hpp117
-rw-r--r--coding/coding.pro1
-rw-r--r--coding/coding_tests/CMakeLists.txt1
-rw-r--r--coding/coding_tests/bwt_coder_tests.cpp82
-rw-r--r--coding/coding_tests/coding_tests.pro1
-rw-r--r--coding/huffman.cpp56
-rw-r--r--coding/huffman.hpp151
8 files changed, 331 insertions, 79 deletions
diff --git a/coding/CMakeLists.txt b/coding/CMakeLists.txt
index f8d9cd9f6f..4516ff6375 100644
--- a/coding/CMakeLists.txt
+++ b/coding/CMakeLists.txt
@@ -17,6 +17,7 @@ set(
base64.hpp
bit_streams.hpp
buffer_reader.hpp
+ bwt_coder.hpp
byte_stream.hpp
coder.hpp
coder_util.hpp
diff --git a/coding/bwt_coder.hpp b/coding/bwt_coder.hpp
new file mode 100644
index 0000000000..59d66c2d94
--- /dev/null
+++ b/coding/bwt_coder.hpp
@@ -0,0 +1,117 @@
+#pragma once
+
+#include "coding/huffman.hpp"
+#include "coding/varint.hpp"
+
+#include "base/bwt.hpp"
+#include "base/move_to_front.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <iterator>
+#include <vector>
+
+namespace coding
+{
+class BWTCoder
+{
+public:
+ struct Params
+ {
+ size_t m_blockSize = 32000;
+ };
+
+ template <typename Sink>
+ static void EncodeAndWriteBlock(Sink & sink, size_t n, uint8_t const * s,
+ std::vector<uint8_t> & bwtBuffer)
+ {
+ bwtBuffer.resize(n);
+ auto const start = base::BWT(n, s, bwtBuffer.data());
+
+ base::MoveToFront mtf;
+ for (auto & b : bwtBuffer)
+ b = mtf.Transform(b);
+
+ WriteVarUint(sink, start);
+
+ HuffmanCoder huffman;
+ huffman.Init(bwtBuffer.begin(), bwtBuffer.end());
+ huffman.WriteEncoding(sink);
+ huffman.EncodeAndWrite(sink, bwtBuffer.begin(), bwtBuffer.end());
+ }
+
+ template <typename Sink>
+ static void EncodeAndWriteBlock(Sink & sink, size_t n, uint8_t const * s)
+ {
+ vector<uint8_t> bwtBuffer;
+ EncodeAndWriteBlock(sink, n, s, bwtBuffer);
+ }
+
+ template <typename Sink>
+ static void EncodeAndWrite(Params const & params, Sink & sink, size_t n, uint8_t const * s)
+ {
+ CHECK(params.m_blockSize != 0, ());
+ CHECK_GREATER(n + params.m_blockSize, n, ());
+
+ std::vector<uint8_t> bwtBuffer;
+
+ size_t const numBlocks = (n + params.m_blockSize - 1) / params.m_blockSize;
+ WriteVarUint(sink, numBlocks);
+ for (size_t i = 0; i < n; i += params.m_blockSize)
+ {
+ auto const m = min(n - i, params.m_blockSize);
+ EncodeAndWriteBlock(sink, m, s + i, bwtBuffer);
+ }
+ }
+
+ template <typename Source, typename OutIt>
+ static OutIt ReadAndDecodeBlock(Source & source, vector<uint8_t> & bwtBuffer,
+ vector<uint8_t> & revBuffer, OutIt it)
+ {
+ auto const start = ReadVarUint<uint64_t, Source>(source);
+
+ HuffmanCoder huffman;
+ huffman.ReadEncoding(source);
+
+ bwtBuffer.clear();
+ huffman.ReadAndDecode(source, std::back_inserter(bwtBuffer));
+
+ size_t const n = bwtBuffer.size();
+ base::MoveToFront mtf;
+ for (size_t i = 0; i < n; ++i)
+ {
+ auto const b = mtf[bwtBuffer[i]];
+ bwtBuffer[i] = b;
+ mtf.Transform(b);
+ }
+
+ CHECK_LESS(start, n, ());
+
+ revBuffer.resize(n);
+ base::RevBWT(n, static_cast<size_t>(start), bwtBuffer.data(), revBuffer.data());
+ return std::copy(revBuffer.begin(), revBuffer.end(), it);
+ }
+
+ template <typename Source, typename OutIt>
+ static OutIt ReadAndDecodeBlock(Source & source, OutIt it)
+ {
+ vector<uint8_t> bwtBuffer;
+ vector<uint8_t> revBuffer;
+ return ReadAndDecodeBlock(source, bwtBuffer, revBuffer, it);
+ }
+
+ template <typename Source, typename OutIt>
+ static OutIt ReadAndDecode(Source & source, OutIt it)
+ {
+ auto const numBlocks = ReadVarUint<uint64_t, Source>(source);
+ CHECK_LESS(numBlocks, std::numeric_limits<size_t>::max(), ());
+
+ std::vector<uint8_t> bwtBuffer;
+ std::vector<uint8_t> revBuffer;
+
+ for (size_t i = 0; i < static_cast<size_t>(numBlocks); ++i)
+ it = ReadAndDecodeBlock(source, bwtBuffer, revBuffer, it);
+ return it;
+ }
+};
+} // namespace coding
diff --git a/coding/coding.pro b/coding/coding.pro
index a6f03f95c4..3b4d1b8733 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -40,6 +40,7 @@ HEADERS += \
base64.hpp \
bit_streams.hpp \
buffer_reader.hpp \
+ bwt_coder.hpp \
byte_stream.hpp \
coder.hpp \
coder_util.hpp \
diff --git a/coding/coding_tests/CMakeLists.txt b/coding/coding_tests/CMakeLists.txt
index 944730f8c7..3b709659b9 100644
--- a/coding/coding_tests/CMakeLists.txt
+++ b/coding/coding_tests/CMakeLists.txt
@@ -6,6 +6,7 @@ set(
SRC
base64_test.cpp
bit_streams_test.cpp
+ bwt_coder_tests.cpp
coder_test.hpp
coder_util_test.cpp
compressed_bit_vector_test.cpp
diff --git a/coding/coding_tests/bwt_coder_tests.cpp b/coding/coding_tests/bwt_coder_tests.cpp
new file mode 100644
index 0000000000..28965a54e9
--- /dev/null
+++ b/coding/coding_tests/bwt_coder_tests.cpp
@@ -0,0 +1,82 @@
+#include "testing/testing.hpp"
+
+#include "coding/bwt_coder.hpp"
+#include "coding/reader.hpp"
+#include "coding/writer.hpp"
+
+#include <algorithm>
+#include <iterator>
+#include <random>
+#include <string>
+
+using namespace coding;
+using namespace std;
+
+namespace
+{
+string EncodeDecode(BWTCoder::Params const & params, string const & s)
+{
+ vector<uint8_t> data;
+
+ {
+ MemWriter<decltype(data)> sink(data);
+ BWTCoder::EncodeAndWrite(params, sink, s.size(), reinterpret_cast<uint8_t const *>(s.data()));
+ }
+
+ string result;
+ {
+ MemReader reader(data.data(), data.size());
+ ReaderSource<MemReader> source(reader);
+
+ BWTCoder::ReadAndDecode(source, back_inserter(result));
+ }
+
+ return result;
+}
+
+UNIT_TEST(BWTEncoder_Smoke)
+{
+ for (size_t blockSize = 1; blockSize < 100; ++blockSize)
+ {
+ BWTCoder::Params params;
+
+ params.m_blockSize = blockSize;
+ string const s = "abracadabra";
+ TEST_EQUAL(s, EncodeDecode(params, s), (blockSize));
+ }
+
+ string const strings[] = {"", "mississippi", "again and again and again"};
+ for (auto const & s : strings)
+ TEST_EQUAL(s, EncodeDecode(BWTCoder::Params{}, s), ());
+}
+
+UNIT_TEST(BWT_Large)
+{
+ string s;
+ for (size_t i = 0; i < 10000; ++i)
+ s += "mississippi";
+ TEST_EQUAL(s, EncodeDecode(BWTCoder::Params{}, s), ());
+}
+
+UNIT_TEST(BWT_AllBytes)
+{
+ int kSeed = 42;
+ int kMin = 1;
+ int kMax = 1000;
+
+ mt19937 engine(kSeed);
+ uniform_int_distribution<int> uid(kMin, kMax);
+
+ string s;
+ for (size_t i = 0; i < 256; ++i)
+ {
+ auto const count = uid(engine);
+ ASSERT_GREATER_OR_EQUAL(count, kMin, ());
+ ASSERT_LESS_OR_EQUAL(count, kMax, ());
+ for (int j = 0; j < count; ++j)
+ s.push_back(static_cast<uint8_t>(i));
+ }
+ shuffle(s.begin(), s.end(), engine);
+ TEST_EQUAL(s, EncodeDecode(BWTCoder::Params{}, s), ());
+}
+} // namespace
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index 20bf404214..cc74f90994 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -15,6 +15,7 @@ DEFINES += OMIM_UNIT_TEST_DISABLE_PLATFORM_INIT
SOURCES += ../../testing/testingmain.cpp \
base64_test.cpp \
bit_streams_test.cpp \
+ bwt_coder_tests.cpp \
coder_util_test.cpp \
compressed_bit_vector_test.cpp \
dd_vector_test.cpp \
diff --git a/coding/huffman.cpp b/coding/huffman.cpp
index a8d8a1275a..83d60039a3 100644
--- a/coding/huffman.cpp
+++ b/coding/huffman.cpp
@@ -9,13 +9,6 @@ HuffmanCoder::~HuffmanCoder()
DeleteHuffmanTree(m_root);
}
-void HuffmanCoder::Init(vector<strings::UniString> const & data)
-{
- DeleteHuffmanTree(m_root);
- BuildHuffmanTree(data.begin(), data.end());
- BuildTables(m_root, 0);
-}
-
bool HuffmanCoder::Encode(uint32_t symbol, Code & code) const
{
auto it = m_encoderTable.find(symbol);
@@ -49,6 +42,14 @@ void HuffmanCoder::BuildTables(Node * root, uint32_t path)
BuildTables(root->r, path + (static_cast<uint32_t>(1) << root->depth));
}
+void HuffmanCoder::Clear()
+{
+ DeleteHuffmanTree(m_root);
+ m_root = nullptr;
+ m_encoderTable.clear();
+ m_decoderTable.clear();
+}
+
void HuffmanCoder::DeleteHuffmanTree(Node * root)
{
if (!root)
@@ -58,6 +59,47 @@ void HuffmanCoder::DeleteHuffmanTree(Node * root)
delete root;
}
+void HuffmanCoder::BuildHuffmanTree(Freqs const & freqs)
+{
+ // One would need more than 2^32 symbols to build a code that long.
+ // On the other hand, 32 is short enough for our purposes, so do not
+ // try to shrink the trees beyond this threshold.
+
+ uint32_t const kMaxDepth = 32;
+
+ priority_queue<Node *, vector<Node *>, NodeComparator> pq;
+ for (auto const & e : freqs.GetTable())
+ pq.push(new Node(e.first, e.second, true /* isLeaf */));
+
+ if (pq.empty())
+ {
+ m_root = nullptr;
+ return;
+ }
+
+ while (pq.size() > 1)
+ {
+ auto a = pq.top();
+ pq.pop();
+ auto b = pq.top();
+ pq.pop();
+ if (a->symbol > b->symbol)
+ swap(a, b);
+ // Give it the smaller symbol to make the resulting encoding more predictable.
+ auto ab = new Node(a->symbol, a->freq + b->freq, false /* isLeaf */);
+ ab->l = a;
+ ab->r = b;
+ CHECK_LESS_OR_EQUAL(a->depth, kMaxDepth, ());
+ CHECK_LESS_OR_EQUAL(b->depth, kMaxDepth, ());
+ pq.push(ab);
+ }
+
+ m_root = pq.top();
+ pq.pop();
+
+ SetDepths(m_root, 0);
+}
+
void HuffmanCoder::SetDepths(Node * root, uint32_t depth)
{
if (!root)
diff --git a/coding/huffman.hpp b/coding/huffman.hpp
index a59b24a195..5beaef4033 100644
--- a/coding/huffman.hpp
+++ b/coding/huffman.hpp
@@ -7,6 +7,7 @@
#include "base/string_utils.hpp"
#include "std/algorithm.hpp"
+#include "std/iterator.hpp"
#include "std/map.hpp"
#include "std/queue.hpp"
#include "std/vector.hpp"
@@ -16,6 +17,43 @@ namespace coding
class HuffmanCoder
{
public:
+ class Freqs
+ {
+ public:
+ using Table = map<uint32_t, uint32_t>;
+
+ Freqs() = default;
+
+ template <typename... Args>
+ Freqs(Args const &... args)
+ {
+ Add(args...);
+ }
+
+ void Add(strings::UniString const & s) { Add(s.begin(), s.end()); }
+
+ void Add(string const & s) { Add(s.begin(), s.end()); }
+
+ template <typename It>
+ void Add(It begin, It end)
+ {
+ for (; begin != end; ++begin)
+ ++m_table[static_cast<uint32_t>(*begin)];
+ }
+
+ template <typename T>
+ void Add(vector<T> const & v)
+ {
+ for (auto const & e : v)
+ Add(e);
+ }
+
+ Table const & GetTable() const { return m_table; }
+
+ private:
+ Table m_table;
+ };
+
// A Code encodes a path to a leaf. It is read starting from
// the least significant bit.
struct Code
@@ -39,7 +77,15 @@ public:
// Internally builds a Huffman tree and makes
// the EncodeAndWrite and ReadAndDecode methods available.
- void Init(vector<strings::UniString> const & data);
+ template <typename... Args>
+ void Init(Args const &... args)
+ {
+ Clear();
+ BuildHuffmanTree(Freqs(args...));
+ BuildTables(m_root, 0);
+ }
+
+ void Clear();
// One way to store the encoding would be
// -- the succinct representation of the topology of Huffman tree;
@@ -103,32 +149,51 @@ public:
bool Encode(uint32_t symbol, Code & code) const;
bool Decode(Code const & code, uint32_t & symbol) const;
+ template <typename TWriter, typename It>
+ uint32_t EncodeAndWrite(TWriter & writer, It begin, It end) const
+ {
+ size_t const d = distance(begin, end);
+ BitWriter<TWriter> bitWriter(writer);
+ WriteVarUint(writer, d);
+ uint32_t sz = 0;
+ for (; begin != end; ++begin)
+ sz += EncodeAndWrite(bitWriter, static_cast<uint32_t>(*begin));
+ return sz;
+ }
+
+ template <typename TWriter>
+ uint32_t EncodeAndWrite(TWriter & writer, string const & s) const
+ {
+ return EncodeAndWrite(writer, s.begin(), s.end());
+ }
+
// Returns the number of bits written AFTER the size, i.e. the number
// of bits that the encoded string consists of.
template <typename TWriter>
uint32_t EncodeAndWrite(TWriter & writer, strings::UniString const & s) const
{
- BitWriter<TWriter> bitWriter(writer);
- WriteVarUint(writer, s.size());
- uint32_t sz = 0;
- for (size_t i = 0; i < s.size(); ++i)
- sz += EncodeAndWrite(bitWriter, static_cast<uint32_t>(s[i]));
- return sz;
+ return EncodeAndWrite(writer, s.begin(), s.end());
}
- template <typename TSource>
- strings::UniString ReadAndDecode(TSource & src) const
+ template <typename TSource, typename OutIt>
+ void ReadAndDecode(TSource & src, OutIt out) const
{
BitReader<TSource> bitReader(src);
size_t sz = static_cast<size_t>(ReadVarUint<uint32_t, TSource>(src));
vector<strings::UniChar> v(sz);
for (size_t i = 0; i < sz; ++i)
- v[i] = static_cast<strings::UniChar>(ReadAndDecode(bitReader));
- return strings::UniString(v.begin(), v.end());
+ *out++ = ReadAndDecode(bitReader);
}
-private:
+ template <typename TSource>
+ strings::UniString ReadAndDecode(TSource & src) const
+ {
+ strings::UniString result;
+ ReadAndDecode(src, std::back_inserter(result));
+ return result;
+ }
+private:
struct Node
{
Node *l, *r;
@@ -195,72 +260,14 @@ private:
void DeleteHuffmanTree(Node * root);
- // Builds a fixed Huffman tree for a collection of strings::UniStrings.
- // UniString is always UTF-32. Every code point is treated as a symbol for the encoder.
- template <typename TIter>
- void BuildHuffmanTree(TIter const & beg, TIter const & end)
- {
- if (beg == end)
- {
- m_root = nullptr;
- return;
- }
-
- // One would need more than 2^32 symbols to build a code that long.
- // On the other hand, 32 is short enough for our purposes, so do not
- // try to shrink the trees beyond this threshold.
- uint32_t const kMaxDepth = 32;
-
- map<uint32_t, uint32_t> freqs;
- for (auto it = beg; it != end; ++it)
- {
- auto const & e = *it;
- for (size_t i = 0; i < e.size(); ++i)
- {
- ++freqs[static_cast<uint32_t>(e[i])];
- }
- }
-
- priority_queue<Node *, vector<Node *>, NodeComparator> pq;
- for (auto const & e : freqs)
- pq.push(new Node(e.first, e.second, true /* isLeaf */));
-
- if (pq.empty())
- {
- m_root = nullptr;
- return;
- }
-
- while (pq.size() > 1)
- {
- auto a = pq.top();
- pq.pop();
- auto b = pq.top();
- pq.pop();
- if (a->symbol > b->symbol)
- swap(a, b);
- // Give it the smaller symbol to make the resulting encoding more predictable.
- auto ab = new Node(a->symbol, a->freq + b->freq, false /* isLeaf */);
- ab->l = a;
- ab->r = b;
- CHECK_LESS_OR_EQUAL(a->depth, kMaxDepth, ());
- CHECK_LESS_OR_EQUAL(b->depth, kMaxDepth, ());
- pq.push(ab);
- }
-
- m_root = pq.top();
- pq.pop();
-
- SetDepths(m_root, 0);
- }
+ void BuildHuffmanTree(Freqs const & freqs);
// Properly sets the depth field in the subtree rooted at root.
// It is easier to do it after the tree is built.
void SetDepths(Node * root, uint32_t depth);
- Node * m_root; // m_pRoot?
+ Node * m_root;
map<Code, uint32_t> m_decoderTable;
map<uint32_t, Code> m_encoderTable;
};
-
} // namespace coding