Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/coding
diff options
context:
space:
mode:
authorMaxim Pimenov <m@maps.me>2015-07-03 20:25:19 +0300
committerAlex Zolotarev <alex@maps.me>2015-09-23 02:56:57 +0300
commit097618caddc3734e141ef18275f2ab3672f1f8f4 (patch)
tree1c2f8cce36745a1913c92a315156fa83555dcca8 /coding
parent84d1eb88a17e628bedbbbb98f0d275a1eb165605 (diff)
[omim] [coding] A Huffman coding implementation.
Diffstat (limited to 'coding')
-rw-r--r--coding/coding.pro2
-rw-r--r--coding/coding_tests/coding_tests.pro1
-rw-r--r--coding/coding_tests/huffman_test.cpp99
-rw-r--r--coding/huffman.cpp78
-rw-r--r--coding/huffman.hpp254
5 files changed, 434 insertions, 0 deletions
diff --git a/coding/coding.pro b/coding/coding.pro
index 9c48d33449..6fa5ba57f7 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -37,6 +37,7 @@ SOURCES += \
compressed_bit_vector.cpp \
# compressed_varnum_vector.cpp \
png_memory_encoder.cpp \
+ huffman.cpp \
HEADERS += \
internal/xmlparser.hpp \
@@ -104,3 +105,4 @@ HEADERS += \
varint_misc.hpp \
bit_streams.hpp \
png_memory_encoder.hpp \
+ huffman.hpp \
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index 4f92622d01..a8f5845170 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -30,6 +30,7 @@ SOURCES += ../../testing/testingmain.cpp \
file_utils_test.cpp \
gzip_test.cpp \
hex_test.cpp \
+ huffman_test.cpp \
mem_file_reader_test.cpp \
mem_file_writer_test.cpp \
multilang_utf8_string_test.cpp \
diff --git a/coding/coding_tests/huffman_test.cpp b/coding/coding_tests/huffman_test.cpp
new file mode 100644
index 0000000000..0aff25126a
--- /dev/null
+++ b/coding/coding_tests/huffman_test.cpp
@@ -0,0 +1,99 @@
+#include "testing/testing.hpp"
+
+#include "coding/huffman.hpp"
+#include "coding/reader.hpp"
+#include "coding/writer.hpp"
+
+#include "base/string_utils.hpp"
+#include "std/vector.hpp"
+
+namespace
+{
+vector<strings::UniString> MakeUniStringVector(vector<string> const & v)
+{
+ vector<strings::UniString> result(v.size());
+ for (size_t i = 0; i < v.size(); ++i)
+ result[i] = strings::UniString(v[i].begin(), v[i].end());
+ return result;
+}
+
+void TestDecode(coding::HuffmanCoder const & h, uint32_t bits, uint32_t len, uint32_t expected)
+{
+ coding::HuffmanCoder::Code code(bits, len);
+ uint32_t received;
+ TEST(h.Decode(code, received), ("Could not decode", code.bits, "( length", code.len, ")"));
+ TEST_EQUAL(expected, received, ());
+}
+} // namespace
+
+namespace coding
+{
+UNIT_TEST(Huffman_Smoke)
+{
+ HuffmanCoder h;
+ h.Init(MakeUniStringVector(vector<string>{"ab", "ac"}));
+
+ TestDecode(h, 0, 1, static_cast<uint32_t>('a')); // 0
+ TestDecode(h, 1, 2, static_cast<uint32_t>('b')); // 10
+ TestDecode(h, 3, 2, static_cast<uint32_t>('c')); // 11
+}
+
+UNIT_TEST(Huffman_OneSymbol)
+{
+ HuffmanCoder h;
+ h.Init(MakeUniStringVector(vector<string>{string(5, 0)}));
+
+ TestDecode(h, 0, 0, 0);
+}
+
+UNIT_TEST(Huffman_Serialization_Encoding)
+{
+ HuffmanCoder hW;
+ hW.Init(MakeUniStringVector(
+ vector<string>{"aaaaaaaaaa", "bbbbbbbbbb", "ccccc", "ddddd"})); // 10, 10, 5, 5
+ vector<uint8_t> buf;
+ MemWriter<vector<uint8_t>> writer(buf);
+ hW.WriteEncoding(writer);
+
+ HuffmanCoder hR;
+ MemReader memReader(&buf[0], buf.size());
+ ReaderSource<MemReader> reader(memReader);
+ hR.ReadEncoding(reader);
+
+ TEST_EQUAL(reader.Pos(), writer.Pos(), ());
+
+ TestDecode(hW, 0, 2, static_cast<uint32_t>('a')); // 00
+ TestDecode(hW, 2, 2, static_cast<uint32_t>('b')); // 01
+ TestDecode(hW, 1, 2, static_cast<uint32_t>('c')); // 10
+ TestDecode(hW, 3, 2, static_cast<uint32_t>('d')); // 11
+
+ TestDecode(hR, 0, 2, static_cast<uint32_t>('a'));
+ TestDecode(hR, 2, 2, static_cast<uint32_t>('b'));
+ TestDecode(hR, 1, 2, static_cast<uint32_t>('c'));
+ TestDecode(hR, 3, 2, static_cast<uint32_t>('d'));
+}
+
+UNIT_TEST(Huffman_Serialization_Data)
+{
+ HuffmanCoder hW;
+ hW.Init(MakeUniStringVector(
+ vector<string>{"aaaaaaaaaa", "bbbbbbbbbb", "ccccc", "ddddd"})); // 10, 10, 5, 5
+ vector<uint8_t> buf;
+
+ string const data = "abacabaddddaaabbcabacabadbabd";
+ strings::UniString expected = strings::UniString(data.begin(), data.end());
+
+ MemWriter<vector<uint8_t>> writer(buf);
+ hW.WriteEncoding(writer);
+ hW.EncodeAndWrite(writer, expected);
+
+ HuffmanCoder hR;
+ MemReader memReader(&buf[0], buf.size());
+ ReaderSource<MemReader> reader(memReader);
+ hR.ReadEncoding(reader);
+ strings::UniString received = hR.ReadAndDecode(reader);
+
+ TEST_EQUAL(expected, received, ());
+}
+
+} // namespace coding
diff --git a/coding/huffman.cpp b/coding/huffman.cpp
new file mode 100644
index 0000000000..acdd945303
--- /dev/null
+++ b/coding/huffman.cpp
@@ -0,0 +1,78 @@
+#include "coding/huffman.hpp"
+
+#include "base/logging.hpp"
+
+namespace coding
+{
+HuffmanCoder::~HuffmanCoder()
+{
+ try
+ {
+ DeleteHuffmanTree(m_root);
+ }
+ catch (...)
+ {
+ LOG(LWARNING, ("Caught an exception when deleting a Huffman tree."));
+ }
+}
+
+void HuffmanCoder::Init(vector<strings::UniString> const & data)
+{
+ BuildHuffmanTree(data.begin(), data.end());
+ BuildTables(m_root, 0);
+ DeleteHuffmanTree(m_root);
+ m_root = nullptr;
+}
+
+bool HuffmanCoder::Encode(uint32_t symbol, Code & code) const
+{
+ auto it = m_encoderTable.find(symbol);
+ if (it == m_encoderTable.end())
+ return false;
+ code = it->second;
+ return true;
+}
+
+bool HuffmanCoder::Decode(Code const & code, uint32_t & symbol) const
+{
+ auto it = m_decoderTable.find(code);
+ if (it == m_decoderTable.end())
+ return false;
+ symbol = it->second;
+ return true;
+}
+
+void HuffmanCoder::BuildTables(Node * root, uint32_t path)
+{
+ if (!root)
+ return;
+ if (root->isLeaf)
+ {
+ Code code(path, root->depth);
+ m_encoderTable[root->symbol] = code;
+ m_decoderTable[code] = root->symbol;
+ return;
+ }
+ BuildTables(root->l, path);
+ BuildTables(root->r, path + (static_cast<uint32_t>(1) << root->depth));
+}
+
+void HuffmanCoder::DeleteHuffmanTree(Node * root)
+{
+ if (!root)
+ return;
+ DeleteHuffmanTree(root->l);
+ DeleteHuffmanTree(root->r);
+ delete root;
+}
+
+void HuffmanCoder::SetDepths(Node * root, uint32_t depth)
+{
+ if (!root)
+ return;
+ root->depth = depth;
+ SetDepths(root->l, depth + 1);
+ SetDepths(root->r, depth + 1);
+}
+
+} // namespace coding
diff --git a/coding/huffman.hpp b/coding/huffman.hpp
new file mode 100644
index 0000000000..b004a00268
--- /dev/null
+++ b/coding/huffman.hpp
@@ -0,0 +1,254 @@
+#pragma once
+
+#include "coding/bit_streams.hpp"
+#include "coding/varint.hpp"
+
+#include "base/assert.hpp"
+#include "base/string_utils.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/map.hpp"
+#include "std/queue.hpp"
+#include "std/vector.hpp"
+
+namespace coding
+{
+class HuffmanCoder
+{
+public:
+ // A Code encodes a path to a leaf. It is read starting from
+ // the least significant bit.
+ struct Code
+ {
+ uint32_t bits;
+ uint32_t len;
+
+ Code() = default;
+ Code(uint32_t bits, uint32_t len) : bits(bits), len(len) {}
+
+ bool operator<(const Code & o) const
+ {
+ if (bits != o.bits)
+ return bits < o.bits;
+ return len < o.len;
+ }
+ };
+
+ HuffmanCoder() : m_root(nullptr) {}
+ ~HuffmanCoder();
+
+ // Internally builds a Huffman tree and makes
+ // the EncodeAndWrite and ReadAndDecode methods available.
+ void Init(vector<strings::UniString> const & data);
+
+ // One way to store the encoding would be
+ // -- the succinct representation of the topology of Huffman tree;
+ // -- the list of symbols that are stored in the leaves, as varuints in delta encoding.
+ // This would probably be an overkill.
+ template <typename TWriter>
+ void WriteEncoding(TWriter & writer)
+ {
+ // @todo Do not waste space, use BitWriter.
+ WriteVarUint(writer, m_decoderTable.size());
+ for (auto const & kv : m_decoderTable)
+ {
+ WriteVarUint(writer, kv.first.bits);
+ WriteVarUint(writer, kv.first.len);
+ WriteVarUint(writer, kv.second);
+ }
+ }
+
+ template <typename TReader>
+ void ReadEncoding(TReader & reader)
+ {
+ DeleteHuffmanTree(m_root);
+ m_root = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */);
+
+ m_encoderTable.clear();
+ m_decoderTable.clear();
+
+ size_t sz = static_cast<size_t>(ReadVarUint<uint32_t, TReader>(reader));
+ for (size_t i = 0; i < sz; ++i)
+ {
+ uint32_t bits = ReadVarUint<uint32_t, TReader>(reader);
+ uint32_t len = ReadVarUint<uint32_t, TReader>(reader);
+ uint32_t symbol = ReadVarUint<uint32_t, TReader>(reader);
+ Code code(bits, len);
+
+ m_encoderTable[symbol] = code;
+ m_decoderTable[code] = symbol;
+
+ Node * cur = m_root;
+ for (size_t j = 0; j < len; ++j)
+ {
+ if (((bits >> j) & 1) == 0)
+ {
+ if (!cur->l)
+ cur->l = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */);
+ cur = cur->l;
+ }
+ else
+ {
+ if (!cur->r)
+ cur->r = new Node(0 /* symbol */, 0 /* freq */, false /* isLeaf */);
+ cur = cur->r;
+ }
+ cur->depth = j + 1;
+ }
+ cur->isLeaf = true;
+ cur->symbol = symbol;
+ }
+ }
+
+ bool Encode(uint32_t symbol, Code & code) const;
+ bool Decode(Code const & code, uint32_t & symbol) const;
+
+ template <typename TWriter>
+ void EncodeAndWrite(TWriter & writer, strings::UniString const & s)
+ {
+ BitWriter<TWriter> bitWriter(writer);
+ WriteVarUint(writer, s.size());
+ for (size_t i = 0; i < s.size(); ++i)
+ EncodeAndWrite(bitWriter, static_cast<uint32_t>(s[i]));
+ }
+
+ template <typename TReader>
+ strings::UniString ReadAndDecode(TReader & reader) const
+ {
+ BitReader<TReader> bitReader(reader);
+ size_t sz = static_cast<size_t>(ReadVarUint<uint32_t, TReader>(reader));
+ vector<strings::UniChar> v(sz);
+ for (size_t i = 0; i < sz; ++i)
+ v[i] = static_cast<strings::UniChar>(ReadAndDecode(bitReader));
+ return strings::UniString(v.begin(), v.end());
+ }
+
+private:
+ // One would need more than 2^32 symbols to build a code that long.
+ // On the other hand, 32 is short enough for our purposes, so do not
+ // try to shrink the trees beyond this threshold.
+ const uint32_t kMaxDepth = 32;
+
+ struct Node
+ {
+ Node *l, *r;
+ uint32_t symbol;
+ uint32_t freq;
+ uint32_t depth;
+ bool isLeaf;
+
+ Node(uint32_t symbol, uint32_t freq, bool isLeaf)
+ : l(nullptr), r(nullptr), symbol(symbol), freq(freq), depth(0), isLeaf(isLeaf)
+ {
+ }
+ };
+
+ struct NodeComparator
+ {
+ bool operator()(Node const * const a, Node const * const b) const
+ {
+ if (a->freq != b->freq)
+ return a->freq > b->freq;
+ return a->symbol > b->symbol;
+ }
+ };
+
+ // No need to clump the interface: keep private the methods
+ // that encode one symbol only.
+ template <typename TWriter>
+ void EncodeAndWrite(BitWriter<TWriter> & bitWriter, uint32_t symbol)
+ {
+ Code code;
+ CHECK(Encode(symbol, code), ());
+ for (size_t i = 0; i < code.len; ++i)
+ bitWriter.Write((code.bits >> i) & 1, 1);
+ }
+
+ template <typename TReader>
+ uint32_t ReadAndDecode(BitReader<TReader> & bitReader) const
+ {
+ Node * cur = m_root;
+ while (cur)
+ {
+ if (cur->isLeaf)
+ return cur->symbol;
+ uint8_t bit = bitReader.Read(1);
+ if (bit == 0)
+ cur = cur->l;
+ else
+ cur = cur->r;
+ }
+ CHECK(false, ("Could not decode a Huffman-encoded symbol."));
+ return 0;
+ }
+
+ // Converts a Huffman tree into the more convenient representation
+ // of encoding and decoding tables.
+ void BuildTables(Node * root, uint32_t path);
+
+ void DeleteHuffmanTree(Node * root);
+
+ // Builds a fixed Huffman tree for a collection of strings::UniStrings.
+ // UniString is always UTF-32. Every code point is treated as a symbol for the encoder.
+ template <typename TIter>
+ void BuildHuffmanTree(TIter beg, TIter end)
+ {
+ if (beg == end)
+ {
+ m_root = nullptr;
+ return;
+ }
+
+ map<uint32_t, uint32_t> freqs;
+ for (auto it = beg; it != end; ++it)
+ {
+ auto const & e = *it;
+ for (size_t i = 0; i < e.size(); ++i)
+ {
+ ++freqs[static_cast<uint32_t>(e[i])];
+ }
+ }
+
+ priority_queue<Node *, vector<Node *>, NodeComparator> pq;
+ for (auto const & e : freqs)
+ pq.push(new Node(e.first, e.second, true /* isLeaf */));
+
+ if (pq.empty())
+ {
+ m_root = nullptr;
+ return;
+ }
+
+ while (pq.size() > 1)
+ {
+ auto a = pq.top();
+ pq.pop();
+ auto b = pq.top();
+ pq.pop();
+ if (a->symbol > b->symbol)
+ swap(a, b);
+ // Give it the smaller symbol to make the resulting encoding more predictable.
+ auto ab = new Node(a->symbol, a->freq + b->freq, false /* isLeaf */);
+ ab->l = a;
+ ab->r = b;
+ CHECK_LESS_OR_EQUAL(a->depth, kMaxDepth, ());
+ CHECK_LESS_OR_EQUAL(b->depth, kMaxDepth, ());
+ pq.push(ab);
+ }
+
+ m_root = pq.top();
+ pq.pop();
+
+ SetDepths(m_root, 0);
+ }
+
+ // Properly sets the depth field in the subtree rooted at root.
+ // It is easier to do it after the tree is built.
+ void SetDepths(Node * root, uint32_t depth);
+
+ Node * m_root; // m_pRoot?
+ map<Code, uint32_t> m_decoderTable;
+ map<uint32_t, Code> m_encoderTable;
+};
+
+} // namespace coding