Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/coding
diff options
context:
space:
mode:
authorMaxim Pimenov <m@maps.me>2015-11-03 20:04:40 +0300
committerSergey Yershov <yershov@corp.mail.ru>2016-03-23 16:02:37 +0300
commita818bb4b37e4de58fe6a251c4a6cfcce53b34c6a (patch)
tree4b3ac8ad61318f6e62bbaba29d5f5d2941771dce /coding
parent907cb3786b0d10e2fcf44c906ee4c8625f41630b (diff)
Moved search index from coding to indexer.
Diffstat (limited to 'coding')
-rw-r--r--coding/coding.pro6
-rw-r--r--coding/coding_tests/coding_tests.pro2
-rw-r--r--coding/coding_tests/succinct_trie_test.cpp271
-rw-r--r--coding/coding_tests/trie_test.cpp262
-rw-r--r--coding/succinct_trie_builder.hpp199
-rw-r--r--coding/succinct_trie_reader.hpp257
-rw-r--r--coding/trie.hpp85
-rw-r--r--coding/trie_builder.hpp265
-rw-r--r--coding/trie_reader.hpp159
9 files changed, 0 insertions, 1506 deletions
diff --git a/coding/coding.pro b/coding/coding.pro
index f159e3fb3a..adf2dd5515 100644
--- a/coding/coding.pro
+++ b/coding/coding.pro
@@ -89,12 +89,6 @@ HEADERS += \
streams_common.hpp \
streams_sink.hpp \
succinct_mapper.hpp \
- succinct_trie.hpp \
- succinct_trie_builder.hpp \
- succinct_trie_reader.hpp \
- trie.hpp \
- trie_builder.hpp \
- trie_reader.hpp \
uri.hpp \
url_encode.hpp \
value_opt_string.hpp \
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro
index f782173851..adb3676b54 100644
--- a/coding/coding_tests/coding_tests.pro
+++ b/coding/coding_tests/coding_tests.pro
@@ -39,8 +39,6 @@ SOURCES += ../../testing/testingmain.cpp \
simple_dense_coding_test.cpp \
sha2_test.cpp \
succinct_mapper_test.cpp \
- succinct_trie_test.cpp \
- trie_test.cpp \
uri_test.cpp \
url_encode_test.cpp \
value_opt_string_test.cpp \
diff --git a/coding/coding_tests/succinct_trie_test.cpp b/coding/coding_tests/succinct_trie_test.cpp
deleted file mode 100644
index 4eeae0f9cf..0000000000
--- a/coding/coding_tests/succinct_trie_test.cpp
+++ /dev/null
@@ -1,271 +0,0 @@
-#include "testing/testing.hpp"
-
-#include "coding/succinct_trie_builder.hpp"
-#include "coding/succinct_trie_reader.hpp"
-#include "coding/trie_builder.hpp"
-#include "coding/trie.hpp"
-#include "coding/reader.hpp"
-#include "coding/writer.hpp"
-
-#include "coding/trie_reader.hpp"
-
-#include "indexer/search_trie.hpp"
-
-#include "base/string_utils.hpp"
-
-#include "std/algorithm.hpp"
-#include "std/vector.hpp"
-
-namespace
-{
-struct StringsFileEntryMock
-{
- StringsFileEntryMock() = default;
- StringsFileEntryMock(string const & key, uint8_t value)
- : m_key(key.begin(), key.end()), m_value(value)
- {
- }
-
- trie::TrieChar const * GetKeyData() { return m_key.data(); }
-
- size_t GetKeySize() const { return m_key.size(); }
-
- uint8_t GetValue() const { return m_value; }
- void * value_data() const { return nullptr; }
- size_t value_size() const { return 0; }
-
- void Swap(StringsFileEntryMock & o)
- {
- swap(m_key, o.m_key);
- swap(m_value, o.m_value);
- }
-
- bool operator==(const StringsFileEntryMock & o) const
- {
- return m_key == o.m_key && m_value == o.m_value;
- }
-
- bool operator<(const StringsFileEntryMock & o) const
- {
- if (m_key != o.m_key)
- return m_key < o.m_key;
- return m_value < o.m_value;
- }
-
- vector<trie::TrieChar> m_key;
- uint8_t m_value;
-};
-
-template <typename TWriter>
-struct EmptyValueList
-{
- size_t size() const { return 0; }
- void Dump(TWriter &) const {}
- void Append(int) {}
-};
-
-struct SimpleValueReader
-{
-public:
- SimpleValueReader() = default;
-
- using ValueType = uint8_t;
-
- template <typename TReader>
- void operator()(TReader & reader, ValueType & v) const
- {
- v = ReadPrimitiveFromSource<uint8_t>(reader);
- }
-
- template <class TWriter>
- void Save(TWriter & writer, ValueType const & v) const
- {
- WriteToSink(writer, v);
- }
-};
-
-template <typename TWriter>
-struct SimpleValueList
-{
- size_t size() const { return m_valueList.size(); }
-
- void Dump(TWriter & writer) const
- {
- for (uint8_t x : m_valueList)
- WriteToSink(writer, x);
- }
-
- void Append(uint8_t x) { m_valueList.push_back(x); }
-
- vector<uint8_t> m_valueList;
-};
-
-using TSimpleIterator = trie::SuccinctTrieIterator<MemReader, SimpleValueReader>;
-
-void ReadAllValues(TSimpleIterator & root, vector<uint8_t> & values)
-{
- for (size_t i = 0; i < root.NumValues(); ++i)
- values.push_back(root.GetValue(i));
-}
-
-void CollectInSubtree(TSimpleIterator & root, vector<uint8_t> & collectedValues)
-{
- ReadAllValues(root, collectedValues);
-
- if (auto l = root.GoToEdge(0))
- CollectInSubtree(*l, collectedValues);
- if (auto r = root.GoToEdge(1))
- CollectInSubtree(*r, collectedValues);
-}
-
-template <typename TWriter>
-void BuildFromSimpleValueList(TWriter & writer, vector<StringsFileEntryMock> & data)
-{
- trie::BuildSuccinctTrie<TWriter, vector<StringsFileEntryMock>::iterator,
- SimpleValueList<TWriter>>(writer, data.begin(), data.end());
-}
-} // namespace
-
-namespace trie
-{
-// todo(@pimenov): It may be worth it to write a test
-// for the trie's topology but the test may be flaky because
-// it has to depend on the particular Huffman encoding of the key strings.
-// This is remedied by separation of the string-encoding and trie-building
-// parts, but they are not separated now, hence there is no such test.
-
-UNIT_TEST(SuccinctTrie_Serialization_Smoke1)
-{
- vector<uint8_t> buf;
-
- using TWriter = MemWriter<vector<uint8_t>>;
- TWriter memWriter(buf);
-
- vector<StringsFileEntryMock> data = {StringsFileEntryMock("abacaba", 1)};
-
- trie::BuildSuccinctTrie<TWriter, vector<StringsFileEntryMock>::iterator, EmptyValueList<TWriter>>(
- memWriter, data.begin(), data.end());
-
- MemReader memReader(buf.data(), buf.size());
-
- using TEmptyValue = trie::EmptyValueReader::ValueType;
-
- auto trieRoot = trie::ReadSuccinctTrie(memReader, trie::EmptyValueReader());
- TEST(trieRoot, ());
-}
-
-UNIT_TEST(SuccinctTrie_Serialization_Smoke2)
-{
- vector<uint8_t> buf;
-
- using TWriter = MemWriter<vector<uint8_t>>;
- TWriter memWriter(buf);
-
- vector<StringsFileEntryMock> data = {StringsFileEntryMock("abacaba", 1)};
-
- BuildFromSimpleValueList(memWriter, data);
-
- MemReader memReader(buf.data(), buf.size());
-
- using TEmptyValue = trie::EmptyValueReader::ValueType;
-
- auto trieRoot = trie::ReadSuccinctTrie(memReader, SimpleValueReader());
- TEST(trieRoot, ());
-}
-
-UNIT_TEST(SuccinctTrie_Iterator)
-{
- vector<uint8_t> buf;
-
- using TWriter = MemWriter<vector<uint8_t>>;
- TWriter memWriter(buf);
-
- vector<StringsFileEntryMock> data = {StringsFileEntryMock("a", 1), StringsFileEntryMock("b", 2),
- StringsFileEntryMock("ab", 3), StringsFileEntryMock("ba", 4),
- StringsFileEntryMock("abc", 5)};
- sort(data.begin(), data.end());
-
- BuildFromSimpleValueList(memWriter, data);
-
- MemReader memReader(buf.data(), buf.size());
-
- using TEmptyValue = trie::EmptyValueReader::ValueType;
-
- auto trieRoot = trie::ReadSuccinctTrie(memReader, SimpleValueReader());
- TEST(trieRoot, ());
-
- vector<uint8_t> collectedValues;
- CollectInSubtree(*trieRoot, collectedValues);
- sort(collectedValues.begin(), collectedValues.end());
- TEST_EQUAL(collectedValues.size(), 5, ());
- for (size_t i = 0; i < collectedValues.size(); ++i)
- TEST_EQUAL(collectedValues[i], i + 1, ());
-}
-
-UNIT_TEST(SuccinctTrie_MoveToString)
-{
- vector<uint8_t> buf;
-
- using TWriter = MemWriter<vector<uint8_t>>;
- TWriter memWriter(buf);
-
- vector<StringsFileEntryMock> data = {
- StringsFileEntryMock("abcde", 1), StringsFileEntryMock("aaaaa", 2),
- StringsFileEntryMock("aaa", 3), StringsFileEntryMock("aaa", 4)};
- sort(data.begin(), data.end());
-
- BuildFromSimpleValueList(memWriter, data);
- MemReader memReader(buf.data(), buf.size());
-
- using TEmptyValue = trie::EmptyValueReader::ValueType;
-
- auto trieRoot = trie::ReadSuccinctTrie(memReader, SimpleValueReader());
-
- {
- auto it = trieRoot->GoToString(strings::MakeUniString("a"));
- TEST(it != nullptr, ());
- vector<uint8_t> expectedValues;
- vector<uint8_t> receivedValues;
- ReadAllValues(*it.get(), receivedValues);
- TEST_EQUAL(expectedValues, receivedValues, ());
- }
-
- {
- auto it = trieRoot->GoToString(strings::MakeUniString("abcde"));
- TEST(it != nullptr, ());
- vector<uint8_t> expectedValues{1};
- vector<uint8_t> receivedValues;
- ReadAllValues(*it.get(), receivedValues);
- TEST_EQUAL(expectedValues, receivedValues, ());
- }
-
- {
- auto it = trieRoot->GoToString(strings::MakeUniString("aaaaa"));
- TEST(it != nullptr, ());
- vector<uint8_t> expectedValues{2};
- vector<uint8_t> receivedValues;
- ReadAllValues(*it.get(), receivedValues);
- TEST_EQUAL(expectedValues, receivedValues, ());
- }
-
- {
- auto it = trieRoot->GoToString(strings::MakeUniString("aaa"));
- TEST(it != nullptr, ());
- vector<uint8_t> expectedValues{3, 4};
- vector<uint8_t> receivedValues;
- ReadAllValues(*it.get(), receivedValues);
- TEST_EQUAL(expectedValues, receivedValues, ());
- }
-
- {
- auto it = trieRoot->GoToString(strings::MakeUniString("b"));
- TEST(it == nullptr, ());
- }
-
- {
- auto it = trieRoot->GoToString(strings::MakeUniString("bbbbb"));
- TEST(it == nullptr, ());
- }
-}
-
-} // namespace trie
diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp
deleted file mode 100644
index 890e6bdaff..0000000000
--- a/coding/coding_tests/trie_test.cpp
+++ /dev/null
@@ -1,262 +0,0 @@
-#include "testing/testing.hpp"
-#include "coding/byte_stream.hpp"
-#include "coding/reader.hpp"
-#include "coding/trie.hpp"
-#include "coding/trie_builder.hpp"
-#include "coding/trie_reader.hpp"
-#include "coding/write_to_sink.hpp"
-
-#include "base/logging.hpp"
-
-#include "std/algorithm.hpp"
-#include "std/cstring.hpp"
-#include "std/string.hpp"
-#include "std/vector.hpp"
-
-#include <boost/utility/binary.hpp>
-
-namespace
-{
-struct ChildNodeInfo
-{
- bool m_isLeaf;
- uint32_t m_size;
- vector<uint32_t> m_edge;
-
- ChildNodeInfo(bool isLeaf, uint32_t size, char const * edge) : m_isLeaf(isLeaf), m_size(size)
- {
- while (*edge)
- m_edge.push_back(*edge++);
- }
-
- uint32_t Size() const { return m_size; }
- bool IsLeaf() const { return m_isLeaf; }
- uint32_t const * GetEdge() const { return &m_edge[0]; }
- uint32_t GetEdgeSize() const { return m_edge.size(); }
-};
-
-struct KeyValuePair
-{
- buffer_vector<trie::TrieChar, 8> m_key;
- uint32_t m_value;
-
- KeyValuePair() {}
-
- template <class TString>
- KeyValuePair(TString const & key, int value)
- : m_key(key.begin(), key.end()), m_value(value)
- {}
-
- uint32_t GetKeySize() const { return m_key.size(); }
- trie::TrieChar const * GetKeyData() const { return m_key.data(); }
- uint32_t GetValue() const { return m_value; }
-
- inline void const * value_data() const { return &m_value; }
-
- inline size_t value_size() const { return sizeof(m_value); }
-
- bool operator == (KeyValuePair const & p) const
- {
- return (m_key == p.m_key && m_value == p.m_value);
- }
-
- bool operator < (KeyValuePair const & p) const
- {
- return ((m_key != p.m_key) ? m_key < p.m_key : m_value < p.m_value);
- }
-
- void Swap(KeyValuePair & r)
- {
- m_key.swap(r.m_key);
- swap(m_value, r.m_value);
- }
-};
-
-string DebugPrint(KeyValuePair const & p)
-{
- string keyS = ::DebugPrint(p.m_key);
- ostringstream out;
- out << "KVP(" << keyS << ", " << p.m_value << ")";
- return out.str();
-}
-
-struct KeyValuePairBackInserter
-{
- template <class TString>
- void operator()(TString const & s, uint32_t const & value)
- {
- m_v.push_back(KeyValuePair(s, value));
- }
-
- vector<KeyValuePair> m_v;
-};
-
-// The SingleValueSerializer and ValueList classes are similar to
-// those in indexer/string_file_values.hpp but that file
-// is not included to avoid coding_tests's dependency from indexer.
-template <typename TPrimitive>
-class SingleValueSerializer
-{
-public:
- static_assert(is_trivially_copyable<TPrimitive>::value, "");
-
- template <typename TWriter>
- void Serialize(TWriter & writer, TPrimitive const & v) const
- {
- WriteToSink(writer, v);
- }
-};
-
-template <typename TPrimitive>
-class ValueList
-{
-public:
- using TValue = TPrimitive;
- using TSerializer = SingleValueSerializer<TValue>;
-
- static_assert(is_trivially_copyable<TPrimitive>::value, "");
-
- ValueList() = default;
-
- void Init(vector<TValue> const & values) { m_values = values; }
-
- size_t Size() const { return m_values.size(); }
-
- bool IsEmpty() const { return m_values.empty(); }
-
- template <typename TSink>
- void Serialize(TSink & sink, TSerializer const & /* serializer */) const
- {
- for (auto const & value : m_values)
- WriteToSink(sink, value);
- }
-
- template <typename TSource>
- void Deserialize(TSource & src, uint32_t valueCount, TSerializer const & /* serializer */)
- {
- if (valueCount == 0)
- {
- Deserialize(src, TSerializer());
- return;
- }
- m_values.resize(valueCount);
- for (size_t i = 0; i < valueCount; ++i)
- m_values[i] = ReadPrimitiveFromSource<TValue>(src);
- }
-
- template <typename TSource>
- void Deserialize(TSource & src, TSerializer const & /* serializer */)
- {
- m_values.clear();
- while (src.Size() > 0)
- m_values.emplace_back(ReadPrimitiveFromSource<TValue>(src));
- }
-
- template <typename TF>
- void ForEach(TF && f) const
- {
- for (auto const & value : m_values)
- f(value);
- }
-
-private:
- vector<TValue> m_values;
-};
-} // namespace
-
-#define ZENC bits::ZigZagEncode
-#define MKSC(x) static_cast<signed char>(x)
-#define MKUC(x) static_cast<uint8_t>(x)
-
-UNIT_TEST(TrieBuilder_WriteNode_Smoke)
-{
- vector<uint8_t> buf;
- PushBackByteSink<vector<uint8_t>> sink(buf);
- ChildNodeInfo children[] = {
- ChildNodeInfo(true, 1, "1A"), ChildNodeInfo(false, 2, "B"), ChildNodeInfo(false, 3, "zz"),
- ChildNodeInfo(true, 4,
- "abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij"),
- ChildNodeInfo(true, 5, "a")};
-
- ValueList<char> valueList;
- valueList.Init({'1', '2', '3'});
- trie::WriteNode(sink, SingleValueSerializer<char>(), 0, valueList, &children[0],
- &children[0] + ARRAY_SIZE(children));
- uint8_t const expected [] =
- {
- BOOST_BINARY(11000101), // Header: [0b11] [0b000101]
- 3, // Number of values
- '1', '2', '3', // Values
- BOOST_BINARY(10000001), // Child 1: header: [+leaf] [-supershort] [2 symbols]
- MKUC(ZENC(MKSC('1'))), MKUC(ZENC(MKSC('A') - MKSC('1'))), // Child 1: edge
- 1, // Child 1: size
- MKUC(64 | ZENC(MKSC('B') - MKSC('1'))), // Child 2: header: [-leaf] [+supershort]
- 2, // Child 2: size
- BOOST_BINARY(00000001), // Child 3: header: [-leaf] [-supershort] [2 symbols]
- MKUC(ZENC(MKSC('z') - MKSC('B'))), 0, // Child 3: edge
- 3, // Child 3: size
- BOOST_BINARY(10111111), // Child 4: header: [+leaf] [-supershort] [>= 63 symbols]
- 69, // Child 4: edgeSize - 1
- MKUC(ZENC(MKSC('a') - MKSC('z'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
- MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
- MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
- MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
- MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
- MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
- MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge
- 4, // Child 4: size
- MKUC(BOOST_BINARY(11000000) | ZENC(0)), // Child 5: header: [+leaf] [+supershort]
- };
-
- TEST_EQUAL(buf, vector<uint8_t>(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ());
-}
-
-UNIT_TEST(TrieBuilder_Build)
-{
- int const base = 3;
- int const maxLen = 3;
-
- vector<string> possibleStrings(1, string());
- for (int len = 1; len <= maxLen; ++len)
- {
- for (int i = 0, p = static_cast<int>(pow((double) base, len)); i < p; ++i)
- {
- string s(len, 'A');
- int t = i;
- for (int l = len - 1; l >= 0; --l, t /= base)
- s[l] += (t % base);
- possibleStrings.push_back(s);
- }
- }
- sort(possibleStrings.begin(), possibleStrings.end());
- // LOG(LINFO, (possibleStrings));
-
- int const count = static_cast<int>(possibleStrings.size());
- for (int i0 = -1; i0 < count; ++i0)
- for (int i1 = i0; i1 < count; ++i1)
- for (int i2 = i1; i2 < count; ++i2)
- {
- vector<KeyValuePair> v;
- if (i0 >= 0) v.push_back(KeyValuePair(possibleStrings[i0], i0));
- if (i1 >= 0) v.push_back(KeyValuePair(possibleStrings[i1], i1 + 10));
- if (i2 >= 0) v.push_back(KeyValuePair(possibleStrings[i2], i2 + 100));
- vector<string> vs;
- for (size_t i = 0; i < v.size(); ++i)
- vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end()));
-
- vector<uint8_t> buf;
- PushBackByteSink<vector<uint8_t>> sink(buf);
- SingleValueSerializer<uint32_t> serializer;
- trie::Build<PushBackByteSink<vector<uint8_t>>, typename vector<KeyValuePair>::iterator,
- ValueList<uint32_t>>(sink, serializer, v.begin(), v.end());
- reverse(buf.begin(), buf.end());
-
- MemReader memReader = MemReader(&buf[0], buf.size());
- auto const root = trie::ReadTrie<MemReader, ValueList<uint32_t>>(memReader, serializer);
- vector<KeyValuePair> res;
- KeyValuePairBackInserter f;
- trie::ForEachRef(*root, f, vector<trie::TrieChar>());
- sort(f.m_v.begin(), f.m_v.end());
- TEST_EQUAL(v, f.m_v, ());
- }
-}
diff --git a/coding/succinct_trie_builder.hpp b/coding/succinct_trie_builder.hpp
deleted file mode 100644
index 5ba0f56cf0..0000000000
--- a/coding/succinct_trie_builder.hpp
+++ /dev/null
@@ -1,199 +0,0 @@
-#pragma once
-
-#include "coding/bit_streams.hpp"
-#include "coding/byte_stream.hpp"
-#include "coding/huffman.hpp"
-#include "coding/reader.hpp"
-#include "coding/varint.hpp"
-#include "coding/writer.hpp"
-
-#include "base/buffer_vector.hpp"
-#include "base/scope_guard.hpp"
-#include "base/string_utils.hpp"
-
-#include "std/algorithm.hpp"
-#include "std/bind.hpp"
-
-// Trie format:
-// -- Serialized Huffman encoding.
-// -- Topology of the trie built on Huffman-encoded input strings [2 bits per node, level-order representation].
-// -- List of pairs (node id, offset). One pair per final node (i.e. a node where a string ends).
-// The lists of node ids and offsets are both non-decreasing and are delta-encoded with varuints.
-// -- Values of final nodes in level-order. The values for final node |id| start at offset |offset|
-// if there is a pair (id, offset) in the list above.
-
-namespace trie
-{
-// Node is a temporary struct that is used to store the trie in memory
-// before it is converted to a succinct representation and put to disk.
-// It is rather verbose but hopefully is small enough to fit in memory.
-template <class TValueList>
-struct Node
-{
- // The left child is reached by 0, the right by 1.
- Node *l, *r;
-
- // A node is final if a key string ends in it.
- // In particular, all non-final nodes' valueLists are empty
- // and it is expected that a final node's valueList is non-empty.
- bool m_isFinal;
-
- // m_valueLists stores all the additional information that is related
- // to the key string which leads to this node.
- TValueList m_valueList;
-
- Node() : l(nullptr), r(nullptr), m_isFinal(false) {}
-};
-
-template <class TNode, class TReader>
-TNode * AddToTrie(TNode * root, TReader & bitEncoding, uint32_t numBits)
-{
- ReaderSource<TReader> src(bitEncoding);
- // String size.
- ReadVarUint<uint32_t, ReaderSource<TReader>>(src);
-
- BitReader<ReaderSource<TReader>> bitReader(src);
- auto cur = root;
- for (uint32_t i = 0; i < numBits; ++i)
- {
- auto nxt = bitReader.Read(1) == 0 ? &cur->l : &cur->r;
- if (!*nxt)
- *nxt = new TNode();
- cur = *nxt;
- }
- return cur;
-}
-
-template <class TNode>
-void DeleteTrie(TNode * root)
-{
- if (!root)
- return;
- DeleteTrie(root->l);
- DeleteTrie(root->r);
- delete root;
-}
-
-template <typename TNode>
-void WriteInLevelOrder(TNode * root, vector<TNode *> & levelOrder)
-{
- ASSERT(root, ());
- levelOrder.push_back(root);
- size_t i = 0;
- while (i < levelOrder.size())
- {
- TNode * cur = levelOrder[i++];
- if (cur->l)
- levelOrder.push_back(cur->l);
- if (cur->r)
- levelOrder.push_back(cur->r);
- }
-}
-
-template <typename TWriter, typename TIter, typename TValueList>
-void BuildSuccinctTrie(TWriter & writer, TIter const beg, TIter const end)
-{
- using TrieChar = uint32_t;
- using TTrieString = buffer_vector<TrieChar, 32>;
- using TNode = Node<TValueList>;
- using TEntry = typename TIter::value_type;
-
- TNode * root = new TNode();
- MY_SCOPE_GUARD(cleanup, bind(&DeleteTrie<TNode>, root));
- TTrieString prevKey;
- TEntry prevEntry;
-
- vector<TEntry> entries;
- vector<strings::UniString> entryStrings;
- for (TIter it = beg; it != end; ++it)
- {
- TEntry entry = *it;
- if (it != beg && entry == prevEntry)
- continue;
- TrieChar const * const keyData = entry.GetKeyData();
- TTrieString key(keyData, keyData + entry.GetKeySize());
- using namespace std::rel_ops; // ">=" for keys.
- CHECK_GREATER_OR_EQUAL(key, prevKey, (key, prevKey));
- entries.push_back(entry);
- entryStrings.push_back(strings::UniString(keyData, keyData + entry.GetKeySize()));
- prevKey.swap(key);
- prevEntry.Swap(entry);
- }
-
- coding::HuffmanCoder huffman;
- huffman.Init(entryStrings);
- huffman.WriteEncoding(writer);
-
- for (size_t i = 0; i < entries.size(); ++i)
- {
- TEntry const & entry = entries[i];
- auto const & key = entryStrings[i];
-
- vector<uint8_t> buf;
- MemWriter<vector<uint8_t>> memWriter(buf);
- uint32_t numBits = huffman.EncodeAndWrite(memWriter, key);
-
- MemReader bitEncoding(&buf[0], buf.size());
-
- TNode * cur = AddToTrie(root, bitEncoding, numBits);
- cur->m_isFinal = true;
- cur->m_valueList.Append(entry.GetValue());
- }
-
- vector<TNode *> levelOrder;
- WriteInLevelOrder(root, levelOrder);
-
- {
- // Trie topology.
- BitWriter<TWriter> bitWriter(writer);
- WriteVarUint(writer, levelOrder.size());
- for (size_t i = 0; i < levelOrder.size(); ++i)
- {
- auto const & cur = levelOrder[i];
- bitWriter.Write(cur->l ? 1 : 0, 1 /* numBits */);
- bitWriter.Write(cur->r ? 1 : 0, 1 /* numBits */);
- }
- }
-
- vector<uint32_t> finalNodeIds;
- vector<uint32_t> offsetTable;
- vector<uint8_t> valueBuf;
- MemWriter<vector<uint8_t>> valueWriter(valueBuf);
- for (size_t i = 0; i < levelOrder.size(); ++i)
- {
- TNode const * node = levelOrder[i];
- if (!node->m_isFinal)
- continue;
- finalNodeIds.push_back(static_cast<uint32_t>(i));
- offsetTable.push_back(valueWriter.Pos());
- node->m_valueList.Dump(valueWriter);
- }
-
- WriteVarUint(writer, finalNodeIds.size());
- for (size_t i = 0; i < finalNodeIds.size(); ++i)
- {
- if (i == 0)
- {
- WriteVarUint(writer, finalNodeIds[i]);
- WriteVarUint(writer, offsetTable[i]);
- }
- else
- {
- WriteVarUint(writer, finalNodeIds[i] - finalNodeIds[i - 1]);
- WriteVarUint(writer, offsetTable[i] - offsetTable[i - 1]);
- }
- }
-
- writer.Write(valueBuf.data(), valueBuf.size());
-
- // todo(@pimenov):
- // 1. Investigate the possibility of path compression (short edges + lcp table).
- // 2. It is highly probable that valueList will be only a list of feature ids in our
- // future implementations of the search.
- // We can then dispose of offsetTable and store finalNodeIds as another bit vector.
- // Alternatively (and probably better), we can append a special symbol to all those
- // strings that have non-empty value lists. We can then get the leaf number
- // of a final node by doing rank0.
-}
-
-} // namespace trie
diff --git a/coding/succinct_trie_reader.hpp b/coding/succinct_trie_reader.hpp
deleted file mode 100644
index cc61d6b0f2..0000000000
--- a/coding/succinct_trie_reader.hpp
+++ /dev/null
@@ -1,257 +0,0 @@
-#pragma once
-#include "coding/bit_streams.hpp"
-#include "coding/huffman.hpp"
-#include "coding/reader.hpp"
-#include "coding/trie.hpp"
-#include "coding/varint.hpp"
-
-#include "base/assert.hpp"
-#include "base/bits.hpp"
-#include "base/macros.hpp"
-#include "base/string_utils.hpp"
-
-#include "std/unique_ptr.hpp"
-#include "std/utility.hpp"
-#include "std/vector.hpp"
-
-#include "3party/succinct/rs_bit_vector.hpp"
-
-namespace trie
-{
-// This class is used to read the data that is common to every node
-// of the trie: the trie topology and the offsets into the data buffer.
-// The topology can then be used to navigate the trie and the offsets
-// can be used to extract the values associated with the key strings.
-template <class TReader, class TValueReader>
-class TopologyAndOffsets
-{
-public:
- using TValue = typename TValueReader::ValueType;
-
- TopologyAndOffsets(TReader const & reader, TValueReader const & valueReader)
- : m_reader(reader), m_valueReader(valueReader)
- {
- Parse();
- }
-
- // Returns the topology of the trie in the external node representation.
- // Arrange the nodes in the level order and write two bits for every
- // node: the first bit for its left child and the second bit for its right child.
- // Write '1' if the child is present and '0' if it is not.
- // Prepend the resulting bit string with '1' (for a super-root) and you will get
- // the external node representaiton.
- succinct::rs_bit_vector const & GetTopology() const { return m_trieTopology; }
-
- // Returns the number of trie nodes.
- uint32_t NumNodes() const { return m_numNodes; }
-
- // Returns the Huffman encoding that was used to encode the strings
- // before adding them to this trie.
- coding::HuffmanCoder const & GetEncoding() const { return m_huffman; }
-
- // Returns true if and only if the node is the last node of a string
- // that was used as a key when building this trie.
- bool NodeIsFinal(uint32_t id) const { return m_finalNodeIndex[id] >= 0; }
-
- // Returns the offset relative to the beginning of the start of the
- // value buffer where the data for node number |id| starts.
- uint32_t Offset(uint32_t id) const
- {
- CHECK(NodeIsFinal(id), ());
- return m_offsetTable[m_finalNodeIndex[id]];
- }
-
- // Returns the number of values associated with a final node |id|.
- uint32_t ValueListSize(uint32_t id)
- {
- CHECK(NodeIsFinal(id), ());
- int finalId = m_finalNodeIndex[id];
- return m_offsetTable[finalId + 1] - m_offsetTable[finalId];
- }
-
- // Returns the reader from which the value buffer can be read.
- TValueReader const & GetValueReader() const { return m_valueReader; }
-
- // Returns the reader from which the trie can be read.
- TReader const & GetReader() const { return m_reader; }
-
-private:
- void Parse()
- {
- ReaderSource<TReader> src(m_reader);
-
- m_huffman.ReadEncoding(src);
- m_numNodes = ReadVarUint<uint32_t, ReaderSource<TReader>>(src);
-
- BitReader<ReaderSource<TReader>> bitReader(src);
- vector<bool> bv(2 * m_numNodes + 1);
- // Convert the level order representation into the external node representation.
- bv[0] = 1;
- for (size_t i = 0; i < 2 * m_numNodes; ++i)
- bv[i + 1] = bitReader.Read(1) == 1;
-
- succinct::rs_bit_vector(bv).swap(m_trieTopology);
-
- uint32_t const numFinalNodes = ReadVarUint<uint32_t, ReaderSource<TReader>>(src);
- m_finalNodeIndex.assign(m_numNodes, -1);
- m_offsetTable.resize(numFinalNodes + 1);
- uint32_t id = 0;
- uint32_t offset = 0;
- for (size_t i = 0; i < numFinalNodes; ++i)
- {
- // ids and offsets are delta-encoded
- id += ReadVarUint<uint32_t, ReaderSource<TReader>>(src);
- offset += ReadVarUint<uint32_t, ReaderSource<TReader>>(src);
- m_finalNodeIndex[id] = i;
- m_offsetTable[i] = offset;
- }
- m_offsetTable[numFinalNodes] = src.Size();
- m_reader = m_reader.SubReader(src.Pos(), src.Size());
- }
-
- TReader m_reader;
-
- // todo(@pimenov) Why do we even need an instance? Type name is enough.
- TValueReader const & m_valueReader;
- uint32_t m_numNodes;
-
- coding::HuffmanCoder m_huffman;
- succinct::rs_bit_vector m_trieTopology;
-
- // m_finalNodeIndex[i] is the 0-based index of the i'th
- // node in the list of all final nodes, or -1 if the
- // node is not final.
- vector<int> m_finalNodeIndex;
- vector<uint32_t> m_offsetTable;
-};
-
-template <class TReader, class TValueReader>
-class SuccinctTrieIterator
-{
-public:
- using TValue = typename TValueReader::ValueType;
- using TCommonData = TopologyAndOffsets<TReader, TValueReader>;
-
- SuccinctTrieIterator(TReader const & reader, shared_ptr<TCommonData> common,
- uint32_t nodeBitPosition)
- : m_reader(reader), m_common(common), m_nodeBitPosition(nodeBitPosition), m_valuesRead(false)
- {
- }
-
- unique_ptr<SuccinctTrieIterator> Clone() const
- {
- return make_unique<SuccinctTrieIterator>(m_reader, m_common, m_nodeBitPosition);
- }
-
- unique_ptr<SuccinctTrieIterator> GoToEdge(size_t i) const
- {
- ASSERT_LESS(i, 2, ("Bad edge id of a binary trie."));
- ASSERT_EQUAL(m_common->GetTopology()[m_nodeBitPosition - 1], 1, (m_nodeBitPosition));
-
- // rank(x) returns the number of ones in [0, x) but we count bit positions from 1
- uint32_t childBitPosition = 2 * m_common->GetTopology().rank(m_nodeBitPosition);
- if (i == 1)
- ++childBitPosition;
- if (childBitPosition > 2 * m_common->NumNodes() ||
- m_common->GetTopology()[childBitPosition - 1] == 0)
- {
- return nullptr;
- }
- return make_unique<SuccinctTrieIterator>(m_reader, m_common, childBitPosition);
- }
-
- template <typename TEncodingReader>
- unique_ptr<SuccinctTrieIterator> GoToString(TEncodingReader & bitEncoding, uint32_t numBits)
- {
- ReaderSource<TEncodingReader> src(bitEncoding);
-
- // String size. Useful when the string is not alone in the writer to know where it
- // ends but useless for our purpose here.
- ReadVarUint<uint32_t, ReaderSource<TEncodingReader>>(src);
-
- BitReader<ReaderSource<TEncodingReader>> bitReader(src);
-
- auto ret = Clone();
- for (uint32_t i = 0; i < numBits; ++i)
- {
- uint8_t bit = bitReader.Read(1);
- auto nxt = ret->GoToEdge(bit);
- if (!nxt)
- return nullptr;
- ret = move(nxt);
- }
- return ret;
- }
-
- unique_ptr<SuccinctTrieIterator> GoToString(strings::UniString const & s)
- {
- vector<uint8_t> buf;
- uint32_t numBits;
- {
- MemWriter<vector<uint8_t>> writer(buf);
- numBits = m_common->GetEncoding().EncodeAndWrite(writer, s);
- }
- MemReader reader(&buf[0], buf.size());
- return GoToString(reader, numBits);
- }
-
- uint32_t NumValues()
- {
- if (!m_valuesRead)
- ReadValues();
- return m_values.size();
- }
-
- TValue GetValue(size_t i)
- {
- if (!m_valuesRead)
- ReadValues();
- ASSERT_LESS(i, m_values.size(), ());
- return m_values[i];
- }
-
-private:
- void ReadValues()
- {
- if (m_valuesRead)
- return;
- m_valuesRead = true;
- // Back to 0-based indices.
- uint32_t m_nodeId = m_common->GetTopology().rank(m_nodeBitPosition) - 1;
- if (!m_common->NodeIsFinal(m_nodeId))
- return;
- uint32_t offset = m_common->Offset(m_nodeId);
- uint32_t size = m_common->ValueListSize(m_nodeId);
- ReaderPtr<TReader> subReaderPtr(m_reader.CreateSubReader(offset, size));
- ReaderSource<ReaderPtr<TReader>> src(subReaderPtr);
- while (src.Size() > 0)
- {
- TValue val;
- m_common->GetValueReader()(src, val);
- m_values.push_back(val);
- }
- }
-
- TReader const & m_reader;
- shared_ptr<TCommonData> m_common;
-
- // The bit with this 1-based index represents this node
- // in the external node representation of binary trie.
- uint32_t m_nodeBitPosition;
-
- vector<TValue> m_values;
- bool m_valuesRead;
-};
-
-template <typename TReader, typename TValueReader>
-unique_ptr<SuccinctTrieIterator<TReader, TValueReader>> ReadSuccinctTrie(
- TReader const & reader, TValueReader valueReader = TValueReader())
-{
- using TCommonData = TopologyAndOffsets<TReader, TValueReader>;
- using TIter = SuccinctTrieIterator<TReader, TValueReader>;
-
- shared_ptr<TCommonData> common(new TCommonData(reader, valueReader));
- return make_unique<TIter>(common->GetReader(), common, 1 /* bitPosition */);
-}
-
-} // namespace trie
diff --git a/coding/trie.hpp b/coding/trie.hpp
deleted file mode 100644
index 4c1b397b08..0000000000
--- a/coding/trie.hpp
+++ /dev/null
@@ -1,85 +0,0 @@
-#pragma once
-
-#include "base/assert.hpp"
-#include "base/base.hpp"
-#include "base/buffer_vector.hpp"
-
-#include "std/unique_ptr.hpp"
-
-namespace trie
-{
-
-typedef uint32_t TrieChar;
-
-// 95 is a good value for the default baseChar, since both small and capital latin letters
-// are less than +/- 32 from it and thus can fit into supershort edge.
-// However 0 is used because the first byte is actually language id.
-static uint32_t const DEFAULT_CHAR = 0;
-
-template <typename TValueList>
-class Iterator
-{
- //dbg::ObjectTracker m_tracker;
-
-public:
- using TValue = typename TValueList::TValue;
-
- struct Edge
- {
- using TEdgeLabel = buffer_vector<TrieChar, 8>;
- TEdgeLabel m_label;
- };
-
- buffer_vector<Edge, 8> m_edge;
- TValueList m_valueList;
-
- virtual ~Iterator() {}
-
- virtual unique_ptr<Iterator<TValueList>> Clone() const = 0;
- virtual unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const = 0;
-};
-
-struct EmptyValueReader
-{
- typedef unsigned char ValueType;
-
- EmptyValueReader() = default;
-
- template <typename SourceT>
- void operator() (SourceT &, ValueType & value) const
- {
- value = 0;
- }
-};
-
-template <unsigned int N>
-struct FixedSizeValueReader
-{
- struct ValueType
- {
- unsigned char m_data[N];
- };
-
- template <typename SourceT>
- void operator() (SourceT & src, ValueType & value) const
- {
- src.Read(&value.m_data[0], N);
- }
-};
-
-template <typename TValueList, typename TF, typename TString>
-void ForEachRef(Iterator<TValueList> const & it, TF && f, TString const & s)
-{
- it.m_valueList.ForEach([&f, &s](typename TValueList::TValue const & value)
- {
- f(s, value);
- });
- for (size_t i = 0; i < it.m_edge.size(); ++i)
- {
- TString s1(s);
- s1.insert(s1.end(), it.m_edge[i].m_label.begin(), it.m_edge[i].m_label.end());
- auto nextIt = it.GoToEdge(i);
- ForEachRef(*nextIt, f, s1);
- }
-}
-} // namespace trie
diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp
deleted file mode 100644
index 714c8f02f6..0000000000
--- a/coding/trie_builder.hpp
+++ /dev/null
@@ -1,265 +0,0 @@
-#pragma once
-
-#include "coding/byte_stream.hpp"
-#include "coding/varint.hpp"
-
-#include "base/buffer_vector.hpp"
-#include "base/logging.hpp"
-
-#include "std/algorithm.hpp"
-#include "std/vector.hpp"
-
-// Trie format:
-// [1: header]
-// [node] ... [node]
-
-// Nodes are written in post-order (first child, last child, parent). Contents of nodes is writern
-// reversed. The resulting file should be reverese before use! Then its contents will appear in
-// pre-order alphabetically reversed (parent, last child, first child).
-
-// Leaf node format:
-// [valueList]
-
-// Internal node format:
-// [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)]
-// [vu valueCount]: if valueCount in header == 3
-// [vu childCount]: if childCount in header == 63
-// [valueList]
-// [childInfo] ... [childInfo]
-
-// Child info format:
-// Every char of the edge is encoded as varint difference from the previous char. First char is
-// encoded as varint difference from the base char, which is the last char of the current prefix.
-//
-// [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)]
-// [vu edgeLen-1]: if edgeLen-1 in header == 63
-// [vi edgeChar0 - baseChar]
-// [vi edgeChar1 - edgeChar0]
-// ...
-// [vi edgeCharN - edgeCharN-1]
-// [child size]: if the child is not the last one when reading
-
-namespace trie
-{
-template <typename TSink, typename TChildIter, typename TValueList, typename TSerializer>
-void WriteNode(TSink & sink, TSerializer const & serializer, TrieChar baseChar,
- TValueList const & valueList, TChildIter const begChild, TChildIter const endChild,
- bool isRoot = false)
-{
- uint32_t const valueCount = valueList.Size();
- if (begChild == endChild && !isRoot)
- {
- // Leaf node.
- valueList.Serialize(sink, serializer);
- return;
- }
- uint32_t const childCount = endChild - begChild;
- uint8_t const header = static_cast<uint32_t>((min(valueCount, 3U) << 6) + min(childCount, 63U));
- sink.Write(&header, 1);
- if (valueCount >= 3)
- WriteVarUint(sink, valueCount);
- if (childCount >= 63)
- WriteVarUint(sink, childCount);
- valueList.Serialize(sink, serializer);
- for (TChildIter it = begChild; it != endChild; /*++it*/)
- {
- uint8_t header = (it->IsLeaf() ? 128 : 0);
- TrieChar const * const edge = it->GetEdge();
- uint32_t const edgeSize = it->GetEdgeSize();
- CHECK_NOT_EQUAL(edgeSize, 0, ());
- CHECK_LESS(edgeSize, 100000, ());
- uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar));
- if (edgeSize == 1 && (diff0 & ~63U) == 0)
- {
- header |= 64;
- header |= diff0;
- WriteToSink(sink, header);
- }
- else
- {
- if (edgeSize - 1 < 63)
- {
- header |= edgeSize - 1;
- WriteToSink(sink, header);
- }
- else
- {
- header |= 63;
- WriteToSink(sink, header);
- WriteVarUint(sink, edgeSize - 1);
- }
- for (uint32_t i = 0; i < edgeSize; ++i)
- {
- WriteVarInt(sink, int32_t(edge[i] - baseChar));
- baseChar = edge[i];
- }
- }
- baseChar = edge[0];
-
- uint32_t const childSize = it->Size();
- if (++it != endChild)
- WriteVarUint(sink, childSize);
- }
-}
-
-struct ChildInfo
-{
- bool m_isLeaf;
- uint32_t m_size;
- vector<TrieChar> m_edge;
-
- ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c)
- {
- }
-
- uint32_t Size() const { return m_size; }
- bool IsLeaf() const { return m_isLeaf; }
- TrieChar const * GetEdge() const { return m_edge.data(); }
- uint32_t GetEdgeSize() const { return m_edge.size(); }
-};
-
-template <typename TValueList>
-struct NodeInfo
-{
- uint64_t m_begPos;
- TrieChar m_char;
- vector<ChildInfo> m_children;
-
- // This is ugly but will do until we rename ValueList.
- // Here is the rationale. ValueList<> is the entity that
- // we store in the nodes of the search trie. It can be read
- // or written via its methods but not directly as was assumed
- // in a previous version of this code. That version provided
- // serialization methods for ValueList but the deserialization
- // was ad hoc.
- // Because of the possibility of serialized ValueLists to represent
- // something completely different from an array of FeatureIds
- // (a compressed bit vector, for instance) and because of the
- // need to update a node's ValueList until the node is finalized
- // this vector is needed here. It is better to leave it here
- // than to expose it in ValueList.
- vector<typename TValueList::TValue> m_temporaryValueList;
- TValueList m_valueList;
- bool m_mayAppend;
-
- NodeInfo() : m_begPos(0), m_char(0), m_valueList(TValueList()), m_mayAppend(true) {}
- NodeInfo(uint64_t pos, TrieChar trieChar)
- : m_begPos(pos), m_char(trieChar), m_valueList(TValueList()), m_mayAppend(true)
- {
- }
-
- // It is finalized in the sense that no more appends are possible
- // so it is a fine moment to initialize the underlying ValueList.
- void FinalizeValueList()
- {
- m_valueList.Init(m_temporaryValueList);
- m_mayAppend = false;
- }
-};
-
-template <typename TSink, typename TValueList, typename TSerializer>
-void WriteNodeReverse(TSink & sink, TSerializer const & serializer, TrieChar baseChar,
- NodeInfo<TValueList> & node, bool isRoot = false)
-{
- using TOutStorage = buffer_vector<uint8_t, 64>;
- TOutStorage out;
- PushBackByteSink<TOutStorage> outSink(out);
- node.FinalizeValueList();
- WriteNode(outSink, serializer, baseChar, node.m_valueList, node.m_children.rbegin(),
- node.m_children.rend(), isRoot);
- reverse(out.begin(), out.end());
- sink.Write(out.data(), out.size());
-}
-
-template <typename TSink, typename TNodes, typename TSerializer>
-void PopNodes(TSink & sink, TSerializer const & serializer, TNodes & nodes, int nodesToPop)
-{
- using TNodeInfo = typename TNodes::value_type;
- ASSERT_GREATER(nodes.size(), nodesToPop, ());
- for (; nodesToPop > 0; --nodesToPop)
- {
- TNodeInfo & node = nodes.back();
- TNodeInfo & prevNode = nodes[nodes.size() - 2];
-
- if (node.m_temporaryValueList.empty() && node.m_children.size() <= 1)
- {
- ASSERT_EQUAL(node.m_children.size(), 1, ());
- ChildInfo & child = node.m_children[0];
- prevNode.m_children.emplace_back(child.m_isLeaf, child.m_size, node.m_char);
- auto & prevChild = prevNode.m_children.back();
- prevChild.m_edge.insert(prevChild.m_edge.end(), child.m_edge.begin(), child.m_edge.end());
- }
- else
- {
- WriteNodeReverse(sink, serializer, node.m_char, node);
- prevNode.m_children.emplace_back(
- node.m_children.empty(), static_cast<uint32_t>(sink.Pos() - node.m_begPos), node.m_char);
- }
-
- nodes.pop_back();
- }
-}
-
-template <typename TNodeInfo, typename TValue>
-void AppendValue(TNodeInfo & node, TValue const & value)
-{
- // External-memory trie adds <string, value> pairs in a sorted
- // order so the values are supposed to be accumulated in the
- // sorted order and we can avoid sorting them before doing
- // further operations such as ValueList construction.
- using namespace std::rel_ops;
- ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value, ());
- if (!node.m_temporaryValueList.empty() && node.m_temporaryValueList.back() == value)
- return;
- if (node.m_mayAppend)
- node.m_temporaryValueList.push_back(value);
- else
- LOG(LERROR, ("Cannot append to a finalized value list."));
-}
-
-template <typename TSink, typename TIter, typename TValueList, typename TSerializer>
-void Build(TSink & sink, TSerializer const & serializer, TIter const beg, TIter const end)
-{
- using TTrieString = buffer_vector<TrieChar, 32>;
- using TNodeInfo = NodeInfo<TValueList>;
-
- vector<TNodeInfo> nodes;
- nodes.emplace_back(sink.Pos(), DEFAULT_CHAR);
-
- TTrieString prevKey;
-
- using TElement = typename TIter::value_type;
- TElement prevE;
-
- for (TIter it = beg; it != end; ++it)
- {
- TElement e = *it;
- if (it != beg && e == prevE)
- continue;
-
- TrieChar const * const pKeyData = e.GetKeyData();
- TTrieString key(pKeyData, pKeyData + e.GetKeySize());
- CHECK(!(key < prevKey), (key, prevKey));
- size_t nCommon = 0;
- while (nCommon < min(key.size(), prevKey.size()) && prevKey[nCommon] == key[nCommon])
- ++nCommon;
-
- // Root is also a common node.
- PopNodes(sink, serializer, nodes, nodes.size() - nCommon - 1);
- uint64_t const pos = sink.Pos();
- for (size_t i = nCommon; i < key.size(); ++i)
- nodes.emplace_back(pos, key[i]);
- AppendValue(nodes.back(), e.GetValue());
-
- prevKey.swap(key);
- prevE.Swap(e);
- }
-
- // Pop all the nodes from the stack.
- PopNodes(sink, serializer, nodes, nodes.size() - 1);
-
- // Write the root.
- WriteNodeReverse(sink, serializer, DEFAULT_CHAR /* baseChar */, nodes.back(), true /* isRoot */);
-}
-
-} // namespace trie
diff --git a/coding/trie_reader.hpp b/coding/trie_reader.hpp
deleted file mode 100644
index 2b4c651762..0000000000
--- a/coding/trie_reader.hpp
+++ /dev/null
@@ -1,159 +0,0 @@
-#pragma once
-#include "coding/trie.hpp"
-#include "coding/reader.hpp"
-#include "coding/varint.hpp"
-
-#include "base/assert.hpp"
-#include "base/bits.hpp"
-#include "base/macros.hpp"
-
-namespace trie
-{
-template <class TValueList, typename TSerializer>
-class LeafIterator0 : public Iterator<TValueList>
-{
-public:
- using TValue = typename TValueList::TValue;
- using Iterator<TValueList>::m_valueList;
-
- template <class TReader>
- LeafIterator0(TReader const & reader, TSerializer const & serializer)
- {
- ReaderSource<TReader> src(reader);
- if (src.Size() > 0)
- m_valueList.Deserialize(src, 0 /* valueCount */, serializer);
- ASSERT_EQUAL(src.Size(), 0, ());
- }
-
- // trie::Iterator overrides:
- unique_ptr<Iterator<TValueList>> Clone() const override
- {
- return make_unique<LeafIterator0<TValueList, TSerializer>>(*this);
- }
-
- unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const override
- {
- ASSERT(false, (i));
- UNUSED_VALUE(i);
- return nullptr;
- }
-};
-
-template <typename TReader, typename TValueList, typename TSerializer>
-class Iterator0 : public Iterator<TValueList>
-{
-public:
- using TValue = typename TValueList::TValue;
- using Iterator<TValueList>::m_valueList;
- using Iterator<TValueList>::m_edge;
-
- Iterator0(TReader const & reader, TrieChar baseChar, TSerializer const & serializer)
- : m_reader(reader), m_serializer(serializer)
- {
- ParseNode(baseChar);
- }
-
- // trie::Iterator overrides:
- unique_ptr<Iterator<TValueList>> Clone() const override
- {
- return make_unique<Iterator0<TReader, TValueList, TSerializer>>(*this);
- }
-
- unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const override
- {
- ASSERT_LESS(i, this->m_edge.size(), ());
- uint32_t const offset = m_edgeInfo[i].m_offset;
- uint32_t const size = m_edgeInfo[i+1].m_offset - offset;
-
- if (m_edgeInfo[i].m_isLeaf)
- {
- return make_unique<LeafIterator0<TValueList, TSerializer>>(m_reader.SubReader(offset, size),
- m_serializer);
- }
-
- return make_unique<Iterator0<TReader, TValueList, TSerializer>>(
- m_reader.SubReader(offset, size), this->m_edge[i].m_label.back(), m_serializer);
- }
-
-private:
- void ParseNode(TrieChar baseChar)
- {
- ReaderSource<TReader> src(m_reader);
-
- // [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)]
- uint8_t const header = ReadPrimitiveFromSource<uint8_t>(src);
- uint32_t valueCount = (header >> 6);
- uint32_t childCount = (header & 63);
-
- // [vu valueCount]: if valueCount in header == 3
- if (valueCount == 3)
- valueCount = ReadVarUint<uint32_t>(src);
-
- // [vu childCount]: if childCount in header == 63
- if (childCount == 63)
- childCount = ReadVarUint<uint32_t>(src);
-
- // [valueList]
- m_valueList.Deserialize(src, valueCount, m_serializer);
-
- // [childInfo] ... [childInfo]
- this->m_edge.resize(childCount);
- m_edgeInfo.resize(childCount + 1);
- m_edgeInfo[0].m_offset = 0;
- for (uint32_t i = 0; i < childCount; ++i)
- {
- typename Iterator<TValueList>::Edge & e = this->m_edge[i];
-
- // [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)]
- uint8_t const header = ReadPrimitiveFromSource<uint8_t>(src);
- m_edgeInfo[i].m_isLeaf = ((header & 128) != 0);
- if (header & 64)
- e.m_label.push_back(baseChar + bits::ZigZagDecode(header & 63U));
- else
- {
- // [vu edgeLen-1]: if edgeLen-1 in header == 63
- uint32_t edgeLen = (header & 63);
- if (edgeLen == 63)
- edgeLen = ReadVarUint<uint32_t>(src);
- edgeLen += 1;
-
- // [vi edgeChar0 - baseChar] [vi edgeChar1 - edgeChar0] ... [vi edgeCharN - edgeCharN-1]
- e.m_label.reserve(edgeLen);
- for (uint32_t i = 0; i < edgeLen; ++i)
- e.m_label.push_back(baseChar += ReadVarInt<int32_t>(src));
- }
-
- // [child size]: if the child is not the last one
- m_edgeInfo[i + 1].m_offset = m_edgeInfo[i].m_offset;
- if (i != childCount - 1)
- m_edgeInfo[i + 1].m_offset += ReadVarUint<uint32_t>(src);
-
- baseChar = e.m_label[0];
- }
-
- uint32_t const currentOffset = static_cast<uint32_t>(src.Pos());
- for (size_t i = 0; i < m_edgeInfo.size(); ++i)
- m_edgeInfo[i].m_offset += currentOffset;
- m_edgeInfo.back().m_offset = static_cast<uint32_t>(m_reader.Size());
- }
-
- struct EdgeInfo
- {
- uint32_t m_offset;
- bool m_isLeaf;
- };
-
- buffer_vector<EdgeInfo, 9> m_edgeInfo;
-
- TReader m_reader;
- TSerializer m_serializer;
-};
-
-// Returns iterator to the root of the trie.
-template <class TReader, class TValueList, class TSerializer>
-unique_ptr<Iterator<TValueList>> ReadTrie(TReader const & reader, TSerializer const & serializer)
-{
- return make_unique<Iterator0<TReader, TValueList, TSerializer>>(reader, DEFAULT_CHAR, serializer);
-}
-
-} // namespace trie