diff options
author | Maxim Pimenov <m@maps.me> | 2015-11-03 20:04:40 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:02:37 +0300 |
commit | a818bb4b37e4de58fe6a251c4a6cfcce53b34c6a (patch) | |
tree | 4b3ac8ad61318f6e62bbaba29d5f5d2941771dce /coding | |
parent | 907cb3786b0d10e2fcf44c906ee4c8625f41630b (diff) |
Moved search index from coding to indexer.
Diffstat (limited to 'coding')
-rw-r--r-- | coding/coding.pro | 6 | ||||
-rw-r--r-- | coding/coding_tests/coding_tests.pro | 2 | ||||
-rw-r--r-- | coding/coding_tests/succinct_trie_test.cpp | 271 | ||||
-rw-r--r-- | coding/coding_tests/trie_test.cpp | 262 | ||||
-rw-r--r-- | coding/succinct_trie_builder.hpp | 199 | ||||
-rw-r--r-- | coding/succinct_trie_reader.hpp | 257 | ||||
-rw-r--r-- | coding/trie.hpp | 85 | ||||
-rw-r--r-- | coding/trie_builder.hpp | 265 | ||||
-rw-r--r-- | coding/trie_reader.hpp | 159 |
9 files changed, 0 insertions, 1506 deletions
diff --git a/coding/coding.pro b/coding/coding.pro index f159e3fb3a..adf2dd5515 100644 --- a/coding/coding.pro +++ b/coding/coding.pro @@ -89,12 +89,6 @@ HEADERS += \ streams_common.hpp \ streams_sink.hpp \ succinct_mapper.hpp \ - succinct_trie.hpp \ - succinct_trie_builder.hpp \ - succinct_trie_reader.hpp \ - trie.hpp \ - trie_builder.hpp \ - trie_reader.hpp \ uri.hpp \ url_encode.hpp \ value_opt_string.hpp \ diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro index f782173851..adb3676b54 100644 --- a/coding/coding_tests/coding_tests.pro +++ b/coding/coding_tests/coding_tests.pro @@ -39,8 +39,6 @@ SOURCES += ../../testing/testingmain.cpp \ simple_dense_coding_test.cpp \ sha2_test.cpp \ succinct_mapper_test.cpp \ - succinct_trie_test.cpp \ - trie_test.cpp \ uri_test.cpp \ url_encode_test.cpp \ value_opt_string_test.cpp \ diff --git a/coding/coding_tests/succinct_trie_test.cpp b/coding/coding_tests/succinct_trie_test.cpp deleted file mode 100644 index 4eeae0f9cf..0000000000 --- a/coding/coding_tests/succinct_trie_test.cpp +++ /dev/null @@ -1,271 +0,0 @@ -#include "testing/testing.hpp" - -#include "coding/succinct_trie_builder.hpp" -#include "coding/succinct_trie_reader.hpp" -#include "coding/trie_builder.hpp" -#include "coding/trie.hpp" -#include "coding/reader.hpp" -#include "coding/writer.hpp" - -#include "coding/trie_reader.hpp" - -#include "indexer/search_trie.hpp" - -#include "base/string_utils.hpp" - -#include "std/algorithm.hpp" -#include "std/vector.hpp" - -namespace -{ -struct StringsFileEntryMock -{ - StringsFileEntryMock() = default; - StringsFileEntryMock(string const & key, uint8_t value) - : m_key(key.begin(), key.end()), m_value(value) - { - } - - trie::TrieChar const * GetKeyData() { return m_key.data(); } - - size_t GetKeySize() const { return m_key.size(); } - - uint8_t GetValue() const { return m_value; } - void * value_data() const { return nullptr; } - size_t value_size() const { return 0; } - - void Swap(StringsFileEntryMock & o) - { - swap(m_key, o.m_key); - swap(m_value, o.m_value); - } - - bool operator==(const StringsFileEntryMock & o) const - { - return m_key == o.m_key && m_value == o.m_value; - } - - bool operator<(const StringsFileEntryMock & o) const - { - if (m_key != o.m_key) - return m_key < o.m_key; - return m_value < o.m_value; - } - - vector<trie::TrieChar> m_key; - uint8_t m_value; -}; - -template <typename TWriter> -struct EmptyValueList -{ - size_t size() const { return 0; } - void Dump(TWriter &) const {} - void Append(int) {} -}; - -struct SimpleValueReader -{ -public: - SimpleValueReader() = default; - - using ValueType = uint8_t; - - template <typename TReader> - void operator()(TReader & reader, ValueType & v) const - { - v = ReadPrimitiveFromSource<uint8_t>(reader); - } - - template <class TWriter> - void Save(TWriter & writer, ValueType const & v) const - { - WriteToSink(writer, v); - } -}; - -template <typename TWriter> -struct SimpleValueList -{ - size_t size() const { return m_valueList.size(); } - - void Dump(TWriter & writer) const - { - for (uint8_t x : m_valueList) - WriteToSink(writer, x); - } - - void Append(uint8_t x) { m_valueList.push_back(x); } - - vector<uint8_t> m_valueList; -}; - -using TSimpleIterator = trie::SuccinctTrieIterator<MemReader, SimpleValueReader>; - -void ReadAllValues(TSimpleIterator & root, vector<uint8_t> & values) -{ - for (size_t i = 0; i < root.NumValues(); ++i) - values.push_back(root.GetValue(i)); -} - -void CollectInSubtree(TSimpleIterator & root, vector<uint8_t> & collectedValues) -{ - ReadAllValues(root, collectedValues); - - if (auto l = root.GoToEdge(0)) - CollectInSubtree(*l, collectedValues); - if (auto r = root.GoToEdge(1)) - CollectInSubtree(*r, collectedValues); -} - -template <typename TWriter> -void BuildFromSimpleValueList(TWriter & writer, vector<StringsFileEntryMock> & data) -{ - trie::BuildSuccinctTrie<TWriter, vector<StringsFileEntryMock>::iterator, - SimpleValueList<TWriter>>(writer, data.begin(), data.end()); -} -} // namespace - -namespace trie -{ -// todo(@pimenov): It may be worth it to write a test -// for the trie's topology but the test may be flaky because -// it has to depend on the particular Huffman encoding of the key strings. -// This is remedied by separation of the string-encoding and trie-building -// parts, but they are not separated now, hence there is no such test. - -UNIT_TEST(SuccinctTrie_Serialization_Smoke1) -{ - vector<uint8_t> buf; - - using TWriter = MemWriter<vector<uint8_t>>; - TWriter memWriter(buf); - - vector<StringsFileEntryMock> data = {StringsFileEntryMock("abacaba", 1)}; - - trie::BuildSuccinctTrie<TWriter, vector<StringsFileEntryMock>::iterator, EmptyValueList<TWriter>>( - memWriter, data.begin(), data.end()); - - MemReader memReader(buf.data(), buf.size()); - - using TEmptyValue = trie::EmptyValueReader::ValueType; - - auto trieRoot = trie::ReadSuccinctTrie(memReader, trie::EmptyValueReader()); - TEST(trieRoot, ()); -} - -UNIT_TEST(SuccinctTrie_Serialization_Smoke2) -{ - vector<uint8_t> buf; - - using TWriter = MemWriter<vector<uint8_t>>; - TWriter memWriter(buf); - - vector<StringsFileEntryMock> data = {StringsFileEntryMock("abacaba", 1)}; - - BuildFromSimpleValueList(memWriter, data); - - MemReader memReader(buf.data(), buf.size()); - - using TEmptyValue = trie::EmptyValueReader::ValueType; - - auto trieRoot = trie::ReadSuccinctTrie(memReader, SimpleValueReader()); - TEST(trieRoot, ()); -} - -UNIT_TEST(SuccinctTrie_Iterator) -{ - vector<uint8_t> buf; - - using TWriter = MemWriter<vector<uint8_t>>; - TWriter memWriter(buf); - - vector<StringsFileEntryMock> data = {StringsFileEntryMock("a", 1), StringsFileEntryMock("b", 2), - StringsFileEntryMock("ab", 3), StringsFileEntryMock("ba", 4), - StringsFileEntryMock("abc", 5)}; - sort(data.begin(), data.end()); - - BuildFromSimpleValueList(memWriter, data); - - MemReader memReader(buf.data(), buf.size()); - - using TEmptyValue = trie::EmptyValueReader::ValueType; - - auto trieRoot = trie::ReadSuccinctTrie(memReader, SimpleValueReader()); - TEST(trieRoot, ()); - - vector<uint8_t> collectedValues; - CollectInSubtree(*trieRoot, collectedValues); - sort(collectedValues.begin(), collectedValues.end()); - TEST_EQUAL(collectedValues.size(), 5, ()); - for (size_t i = 0; i < collectedValues.size(); ++i) - TEST_EQUAL(collectedValues[i], i + 1, ()); -} - -UNIT_TEST(SuccinctTrie_MoveToString) -{ - vector<uint8_t> buf; - - using TWriter = MemWriter<vector<uint8_t>>; - TWriter memWriter(buf); - - vector<StringsFileEntryMock> data = { - StringsFileEntryMock("abcde", 1), StringsFileEntryMock("aaaaa", 2), - StringsFileEntryMock("aaa", 3), StringsFileEntryMock("aaa", 4)}; - sort(data.begin(), data.end()); - - BuildFromSimpleValueList(memWriter, data); - MemReader memReader(buf.data(), buf.size()); - - using TEmptyValue = trie::EmptyValueReader::ValueType; - - auto trieRoot = trie::ReadSuccinctTrie(memReader, SimpleValueReader()); - - { - auto it = trieRoot->GoToString(strings::MakeUniString("a")); - TEST(it != nullptr, ()); - vector<uint8_t> expectedValues; - vector<uint8_t> receivedValues; - ReadAllValues(*it.get(), receivedValues); - TEST_EQUAL(expectedValues, receivedValues, ()); - } - - { - auto it = trieRoot->GoToString(strings::MakeUniString("abcde")); - TEST(it != nullptr, ()); - vector<uint8_t> expectedValues{1}; - vector<uint8_t> receivedValues; - ReadAllValues(*it.get(), receivedValues); - TEST_EQUAL(expectedValues, receivedValues, ()); - } - - { - auto it = trieRoot->GoToString(strings::MakeUniString("aaaaa")); - TEST(it != nullptr, ()); - vector<uint8_t> expectedValues{2}; - vector<uint8_t> receivedValues; - ReadAllValues(*it.get(), receivedValues); - TEST_EQUAL(expectedValues, receivedValues, ()); - } - - { - auto it = trieRoot->GoToString(strings::MakeUniString("aaa")); - TEST(it != nullptr, ()); - vector<uint8_t> expectedValues{3, 4}; - vector<uint8_t> receivedValues; - ReadAllValues(*it.get(), receivedValues); - TEST_EQUAL(expectedValues, receivedValues, ()); - } - - { - auto it = trieRoot->GoToString(strings::MakeUniString("b")); - TEST(it == nullptr, ()); - } - - { - auto it = trieRoot->GoToString(strings::MakeUniString("bbbbb")); - TEST(it == nullptr, ()); - } -} - -} // namespace trie diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp deleted file mode 100644 index 890e6bdaff..0000000000 --- a/coding/coding_tests/trie_test.cpp +++ /dev/null @@ -1,262 +0,0 @@ -#include "testing/testing.hpp" -#include "coding/byte_stream.hpp" -#include "coding/reader.hpp" -#include "coding/trie.hpp" -#include "coding/trie_builder.hpp" -#include "coding/trie_reader.hpp" -#include "coding/write_to_sink.hpp" - -#include "base/logging.hpp" - -#include "std/algorithm.hpp" -#include "std/cstring.hpp" -#include "std/string.hpp" -#include "std/vector.hpp" - -#include <boost/utility/binary.hpp> - -namespace -{ -struct ChildNodeInfo -{ - bool m_isLeaf; - uint32_t m_size; - vector<uint32_t> m_edge; - - ChildNodeInfo(bool isLeaf, uint32_t size, char const * edge) : m_isLeaf(isLeaf), m_size(size) - { - while (*edge) - m_edge.push_back(*edge++); - } - - uint32_t Size() const { return m_size; } - bool IsLeaf() const { return m_isLeaf; } - uint32_t const * GetEdge() const { return &m_edge[0]; } - uint32_t GetEdgeSize() const { return m_edge.size(); } -}; - -struct KeyValuePair -{ - buffer_vector<trie::TrieChar, 8> m_key; - uint32_t m_value; - - KeyValuePair() {} - - template <class TString> - KeyValuePair(TString const & key, int value) - : m_key(key.begin(), key.end()), m_value(value) - {} - - uint32_t GetKeySize() const { return m_key.size(); } - trie::TrieChar const * GetKeyData() const { return m_key.data(); } - uint32_t GetValue() const { return m_value; } - - inline void const * value_data() const { return &m_value; } - - inline size_t value_size() const { return sizeof(m_value); } - - bool operator == (KeyValuePair const & p) const - { - return (m_key == p.m_key && m_value == p.m_value); - } - - bool operator < (KeyValuePair const & p) const - { - return ((m_key != p.m_key) ? m_key < p.m_key : m_value < p.m_value); - } - - void Swap(KeyValuePair & r) - { - m_key.swap(r.m_key); - swap(m_value, r.m_value); - } -}; - -string DebugPrint(KeyValuePair const & p) -{ - string keyS = ::DebugPrint(p.m_key); - ostringstream out; - out << "KVP(" << keyS << ", " << p.m_value << ")"; - return out.str(); -} - -struct KeyValuePairBackInserter -{ - template <class TString> - void operator()(TString const & s, uint32_t const & value) - { - m_v.push_back(KeyValuePair(s, value)); - } - - vector<KeyValuePair> m_v; -}; - -// The SingleValueSerializer and ValueList classes are similar to -// those in indexer/string_file_values.hpp but that file -// is not included to avoid coding_tests's dependency from indexer. -template <typename TPrimitive> -class SingleValueSerializer -{ -public: - static_assert(is_trivially_copyable<TPrimitive>::value, ""); - - template <typename TWriter> - void Serialize(TWriter & writer, TPrimitive const & v) const - { - WriteToSink(writer, v); - } -}; - -template <typename TPrimitive> -class ValueList -{ -public: - using TValue = TPrimitive; - using TSerializer = SingleValueSerializer<TValue>; - - static_assert(is_trivially_copyable<TPrimitive>::value, ""); - - ValueList() = default; - - void Init(vector<TValue> const & values) { m_values = values; } - - size_t Size() const { return m_values.size(); } - - bool IsEmpty() const { return m_values.empty(); } - - template <typename TSink> - void Serialize(TSink & sink, TSerializer const & /* serializer */) const - { - for (auto const & value : m_values) - WriteToSink(sink, value); - } - - template <typename TSource> - void Deserialize(TSource & src, uint32_t valueCount, TSerializer const & /* serializer */) - { - if (valueCount == 0) - { - Deserialize(src, TSerializer()); - return; - } - m_values.resize(valueCount); - for (size_t i = 0; i < valueCount; ++i) - m_values[i] = ReadPrimitiveFromSource<TValue>(src); - } - - template <typename TSource> - void Deserialize(TSource & src, TSerializer const & /* serializer */) - { - m_values.clear(); - while (src.Size() > 0) - m_values.emplace_back(ReadPrimitiveFromSource<TValue>(src)); - } - - template <typename TF> - void ForEach(TF && f) const - { - for (auto const & value : m_values) - f(value); - } - -private: - vector<TValue> m_values; -}; -} // namespace - -#define ZENC bits::ZigZagEncode -#define MKSC(x) static_cast<signed char>(x) -#define MKUC(x) static_cast<uint8_t>(x) - -UNIT_TEST(TrieBuilder_WriteNode_Smoke) -{ - vector<uint8_t> buf; - PushBackByteSink<vector<uint8_t>> sink(buf); - ChildNodeInfo children[] = { - ChildNodeInfo(true, 1, "1A"), ChildNodeInfo(false, 2, "B"), ChildNodeInfo(false, 3, "zz"), - ChildNodeInfo(true, 4, - "abcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghijabcdefghij"), - ChildNodeInfo(true, 5, "a")}; - - ValueList<char> valueList; - valueList.Init({'1', '2', '3'}); - trie::WriteNode(sink, SingleValueSerializer<char>(), 0, valueList, &children[0], - &children[0] + ARRAY_SIZE(children)); - uint8_t const expected [] = - { - BOOST_BINARY(11000101), // Header: [0b11] [0b000101] - 3, // Number of values - '1', '2', '3', // Values - BOOST_BINARY(10000001), // Child 1: header: [+leaf] [-supershort] [2 symbols] - MKUC(ZENC(MKSC('1'))), MKUC(ZENC(MKSC('A') - MKSC('1'))), // Child 1: edge - 1, // Child 1: size - MKUC(64 | ZENC(MKSC('B') - MKSC('1'))), // Child 2: header: [-leaf] [+supershort] - 2, // Child 2: size - BOOST_BINARY(00000001), // Child 3: header: [-leaf] [-supershort] [2 symbols] - MKUC(ZENC(MKSC('z') - MKSC('B'))), 0, // Child 3: edge - 3, // Child 3: size - BOOST_BINARY(10111111), // Child 4: header: [+leaf] [-supershort] [>= 63 symbols] - 69, // Child 4: edgeSize - 1 - MKUC(ZENC(MKSC('a') - MKSC('z'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge - MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge - MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge - MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge - MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge - MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge - MKUC(ZENC(MKSC('a') - MKSC('j'))), 2,2,2,2,2,2,2,2,2, // Child 4: edge - 4, // Child 4: size - MKUC(BOOST_BINARY(11000000) | ZENC(0)), // Child 5: header: [+leaf] [+supershort] - }; - - TEST_EQUAL(buf, vector<uint8_t>(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ()); -} - -UNIT_TEST(TrieBuilder_Build) -{ - int const base = 3; - int const maxLen = 3; - - vector<string> possibleStrings(1, string()); - for (int len = 1; len <= maxLen; ++len) - { - for (int i = 0, p = static_cast<int>(pow((double) base, len)); i < p; ++i) - { - string s(len, 'A'); - int t = i; - for (int l = len - 1; l >= 0; --l, t /= base) - s[l] += (t % base); - possibleStrings.push_back(s); - } - } - sort(possibleStrings.begin(), possibleStrings.end()); - // LOG(LINFO, (possibleStrings)); - - int const count = static_cast<int>(possibleStrings.size()); - for (int i0 = -1; i0 < count; ++i0) - for (int i1 = i0; i1 < count; ++i1) - for (int i2 = i1; i2 < count; ++i2) - { - vector<KeyValuePair> v; - if (i0 >= 0) v.push_back(KeyValuePair(possibleStrings[i0], i0)); - if (i1 >= 0) v.push_back(KeyValuePair(possibleStrings[i1], i1 + 10)); - if (i2 >= 0) v.push_back(KeyValuePair(possibleStrings[i2], i2 + 100)); - vector<string> vs; - for (size_t i = 0; i < v.size(); ++i) - vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end())); - - vector<uint8_t> buf; - PushBackByteSink<vector<uint8_t>> sink(buf); - SingleValueSerializer<uint32_t> serializer; - trie::Build<PushBackByteSink<vector<uint8_t>>, typename vector<KeyValuePair>::iterator, - ValueList<uint32_t>>(sink, serializer, v.begin(), v.end()); - reverse(buf.begin(), buf.end()); - - MemReader memReader = MemReader(&buf[0], buf.size()); - auto const root = trie::ReadTrie<MemReader, ValueList<uint32_t>>(memReader, serializer); - vector<KeyValuePair> res; - KeyValuePairBackInserter f; - trie::ForEachRef(*root, f, vector<trie::TrieChar>()); - sort(f.m_v.begin(), f.m_v.end()); - TEST_EQUAL(v, f.m_v, ()); - } -} diff --git a/coding/succinct_trie_builder.hpp b/coding/succinct_trie_builder.hpp deleted file mode 100644 index 5ba0f56cf0..0000000000 --- a/coding/succinct_trie_builder.hpp +++ /dev/null @@ -1,199 +0,0 @@ -#pragma once - -#include "coding/bit_streams.hpp" -#include "coding/byte_stream.hpp" -#include "coding/huffman.hpp" -#include "coding/reader.hpp" -#include "coding/varint.hpp" -#include "coding/writer.hpp" - -#include "base/buffer_vector.hpp" -#include "base/scope_guard.hpp" -#include "base/string_utils.hpp" - -#include "std/algorithm.hpp" -#include "std/bind.hpp" - -// Trie format: -// -- Serialized Huffman encoding. -// -- Topology of the trie built on Huffman-encoded input strings [2 bits per node, level-order representation]. -// -- List of pairs (node id, offset). One pair per final node (i.e. a node where a string ends). -// The lists of node ids and offsets are both non-decreasing and are delta-encoded with varuints. -// -- Values of final nodes in level-order. The values for final node |id| start at offset |offset| -// if there is a pair (id, offset) in the list above. - -namespace trie -{ -// Node is a temporary struct that is used to store the trie in memory -// before it is converted to a succinct representation and put to disk. -// It is rather verbose but hopefully is small enough to fit in memory. -template <class TValueList> -struct Node -{ - // The left child is reached by 0, the right by 1. - Node *l, *r; - - // A node is final if a key string ends in it. - // In particular, all non-final nodes' valueLists are empty - // and it is expected that a final node's valueList is non-empty. - bool m_isFinal; - - // m_valueLists stores all the additional information that is related - // to the key string which leads to this node. - TValueList m_valueList; - - Node() : l(nullptr), r(nullptr), m_isFinal(false) {} -}; - -template <class TNode, class TReader> -TNode * AddToTrie(TNode * root, TReader & bitEncoding, uint32_t numBits) -{ - ReaderSource<TReader> src(bitEncoding); - // String size. - ReadVarUint<uint32_t, ReaderSource<TReader>>(src); - - BitReader<ReaderSource<TReader>> bitReader(src); - auto cur = root; - for (uint32_t i = 0; i < numBits; ++i) - { - auto nxt = bitReader.Read(1) == 0 ? &cur->l : &cur->r; - if (!*nxt) - *nxt = new TNode(); - cur = *nxt; - } - return cur; -} - -template <class TNode> -void DeleteTrie(TNode * root) -{ - if (!root) - return; - DeleteTrie(root->l); - DeleteTrie(root->r); - delete root; -} - -template <typename TNode> -void WriteInLevelOrder(TNode * root, vector<TNode *> & levelOrder) -{ - ASSERT(root, ()); - levelOrder.push_back(root); - size_t i = 0; - while (i < levelOrder.size()) - { - TNode * cur = levelOrder[i++]; - if (cur->l) - levelOrder.push_back(cur->l); - if (cur->r) - levelOrder.push_back(cur->r); - } -} - -template <typename TWriter, typename TIter, typename TValueList> -void BuildSuccinctTrie(TWriter & writer, TIter const beg, TIter const end) -{ - using TrieChar = uint32_t; - using TTrieString = buffer_vector<TrieChar, 32>; - using TNode = Node<TValueList>; - using TEntry = typename TIter::value_type; - - TNode * root = new TNode(); - MY_SCOPE_GUARD(cleanup, bind(&DeleteTrie<TNode>, root)); - TTrieString prevKey; - TEntry prevEntry; - - vector<TEntry> entries; - vector<strings::UniString> entryStrings; - for (TIter it = beg; it != end; ++it) - { - TEntry entry = *it; - if (it != beg && entry == prevEntry) - continue; - TrieChar const * const keyData = entry.GetKeyData(); - TTrieString key(keyData, keyData + entry.GetKeySize()); - using namespace std::rel_ops; // ">=" for keys. - CHECK_GREATER_OR_EQUAL(key, prevKey, (key, prevKey)); - entries.push_back(entry); - entryStrings.push_back(strings::UniString(keyData, keyData + entry.GetKeySize())); - prevKey.swap(key); - prevEntry.Swap(entry); - } - - coding::HuffmanCoder huffman; - huffman.Init(entryStrings); - huffman.WriteEncoding(writer); - - for (size_t i = 0; i < entries.size(); ++i) - { - TEntry const & entry = entries[i]; - auto const & key = entryStrings[i]; - - vector<uint8_t> buf; - MemWriter<vector<uint8_t>> memWriter(buf); - uint32_t numBits = huffman.EncodeAndWrite(memWriter, key); - - MemReader bitEncoding(&buf[0], buf.size()); - - TNode * cur = AddToTrie(root, bitEncoding, numBits); - cur->m_isFinal = true; - cur->m_valueList.Append(entry.GetValue()); - } - - vector<TNode *> levelOrder; - WriteInLevelOrder(root, levelOrder); - - { - // Trie topology. - BitWriter<TWriter> bitWriter(writer); - WriteVarUint(writer, levelOrder.size()); - for (size_t i = 0; i < levelOrder.size(); ++i) - { - auto const & cur = levelOrder[i]; - bitWriter.Write(cur->l ? 1 : 0, 1 /* numBits */); - bitWriter.Write(cur->r ? 1 : 0, 1 /* numBits */); - } - } - - vector<uint32_t> finalNodeIds; - vector<uint32_t> offsetTable; - vector<uint8_t> valueBuf; - MemWriter<vector<uint8_t>> valueWriter(valueBuf); - for (size_t i = 0; i < levelOrder.size(); ++i) - { - TNode const * node = levelOrder[i]; - if (!node->m_isFinal) - continue; - finalNodeIds.push_back(static_cast<uint32_t>(i)); - offsetTable.push_back(valueWriter.Pos()); - node->m_valueList.Dump(valueWriter); - } - - WriteVarUint(writer, finalNodeIds.size()); - for (size_t i = 0; i < finalNodeIds.size(); ++i) - { - if (i == 0) - { - WriteVarUint(writer, finalNodeIds[i]); - WriteVarUint(writer, offsetTable[i]); - } - else - { - WriteVarUint(writer, finalNodeIds[i] - finalNodeIds[i - 1]); - WriteVarUint(writer, offsetTable[i] - offsetTable[i - 1]); - } - } - - writer.Write(valueBuf.data(), valueBuf.size()); - - // todo(@pimenov): - // 1. Investigate the possibility of path compression (short edges + lcp table). - // 2. It is highly probable that valueList will be only a list of feature ids in our - // future implementations of the search. - // We can then dispose of offsetTable and store finalNodeIds as another bit vector. - // Alternatively (and probably better), we can append a special symbol to all those - // strings that have non-empty value lists. We can then get the leaf number - // of a final node by doing rank0. -} - -} // namespace trie diff --git a/coding/succinct_trie_reader.hpp b/coding/succinct_trie_reader.hpp deleted file mode 100644 index cc61d6b0f2..0000000000 --- a/coding/succinct_trie_reader.hpp +++ /dev/null @@ -1,257 +0,0 @@ -#pragma once -#include "coding/bit_streams.hpp" -#include "coding/huffman.hpp" -#include "coding/reader.hpp" -#include "coding/trie.hpp" -#include "coding/varint.hpp" - -#include "base/assert.hpp" -#include "base/bits.hpp" -#include "base/macros.hpp" -#include "base/string_utils.hpp" - -#include "std/unique_ptr.hpp" -#include "std/utility.hpp" -#include "std/vector.hpp" - -#include "3party/succinct/rs_bit_vector.hpp" - -namespace trie -{ -// This class is used to read the data that is common to every node -// of the trie: the trie topology and the offsets into the data buffer. -// The topology can then be used to navigate the trie and the offsets -// can be used to extract the values associated with the key strings. -template <class TReader, class TValueReader> -class TopologyAndOffsets -{ -public: - using TValue = typename TValueReader::ValueType; - - TopologyAndOffsets(TReader const & reader, TValueReader const & valueReader) - : m_reader(reader), m_valueReader(valueReader) - { - Parse(); - } - - // Returns the topology of the trie in the external node representation. - // Arrange the nodes in the level order and write two bits for every - // node: the first bit for its left child and the second bit for its right child. - // Write '1' if the child is present and '0' if it is not. - // Prepend the resulting bit string with '1' (for a super-root) and you will get - // the external node representaiton. - succinct::rs_bit_vector const & GetTopology() const { return m_trieTopology; } - - // Returns the number of trie nodes. - uint32_t NumNodes() const { return m_numNodes; } - - // Returns the Huffman encoding that was used to encode the strings - // before adding them to this trie. - coding::HuffmanCoder const & GetEncoding() const { return m_huffman; } - - // Returns true if and only if the node is the last node of a string - // that was used as a key when building this trie. - bool NodeIsFinal(uint32_t id) const { return m_finalNodeIndex[id] >= 0; } - - // Returns the offset relative to the beginning of the start of the - // value buffer where the data for node number |id| starts. - uint32_t Offset(uint32_t id) const - { - CHECK(NodeIsFinal(id), ()); - return m_offsetTable[m_finalNodeIndex[id]]; - } - - // Returns the number of values associated with a final node |id|. - uint32_t ValueListSize(uint32_t id) - { - CHECK(NodeIsFinal(id), ()); - int finalId = m_finalNodeIndex[id]; - return m_offsetTable[finalId + 1] - m_offsetTable[finalId]; - } - - // Returns the reader from which the value buffer can be read. - TValueReader const & GetValueReader() const { return m_valueReader; } - - // Returns the reader from which the trie can be read. - TReader const & GetReader() const { return m_reader; } - -private: - void Parse() - { - ReaderSource<TReader> src(m_reader); - - m_huffman.ReadEncoding(src); - m_numNodes = ReadVarUint<uint32_t, ReaderSource<TReader>>(src); - - BitReader<ReaderSource<TReader>> bitReader(src); - vector<bool> bv(2 * m_numNodes + 1); - // Convert the level order representation into the external node representation. - bv[0] = 1; - for (size_t i = 0; i < 2 * m_numNodes; ++i) - bv[i + 1] = bitReader.Read(1) == 1; - - succinct::rs_bit_vector(bv).swap(m_trieTopology); - - uint32_t const numFinalNodes = ReadVarUint<uint32_t, ReaderSource<TReader>>(src); - m_finalNodeIndex.assign(m_numNodes, -1); - m_offsetTable.resize(numFinalNodes + 1); - uint32_t id = 0; - uint32_t offset = 0; - for (size_t i = 0; i < numFinalNodes; ++i) - { - // ids and offsets are delta-encoded - id += ReadVarUint<uint32_t, ReaderSource<TReader>>(src); - offset += ReadVarUint<uint32_t, ReaderSource<TReader>>(src); - m_finalNodeIndex[id] = i; - m_offsetTable[i] = offset; - } - m_offsetTable[numFinalNodes] = src.Size(); - m_reader = m_reader.SubReader(src.Pos(), src.Size()); - } - - TReader m_reader; - - // todo(@pimenov) Why do we even need an instance? Type name is enough. - TValueReader const & m_valueReader; - uint32_t m_numNodes; - - coding::HuffmanCoder m_huffman; - succinct::rs_bit_vector m_trieTopology; - - // m_finalNodeIndex[i] is the 0-based index of the i'th - // node in the list of all final nodes, or -1 if the - // node is not final. - vector<int> m_finalNodeIndex; - vector<uint32_t> m_offsetTable; -}; - -template <class TReader, class TValueReader> -class SuccinctTrieIterator -{ -public: - using TValue = typename TValueReader::ValueType; - using TCommonData = TopologyAndOffsets<TReader, TValueReader>; - - SuccinctTrieIterator(TReader const & reader, shared_ptr<TCommonData> common, - uint32_t nodeBitPosition) - : m_reader(reader), m_common(common), m_nodeBitPosition(nodeBitPosition), m_valuesRead(false) - { - } - - unique_ptr<SuccinctTrieIterator> Clone() const - { - return make_unique<SuccinctTrieIterator>(m_reader, m_common, m_nodeBitPosition); - } - - unique_ptr<SuccinctTrieIterator> GoToEdge(size_t i) const - { - ASSERT_LESS(i, 2, ("Bad edge id of a binary trie.")); - ASSERT_EQUAL(m_common->GetTopology()[m_nodeBitPosition - 1], 1, (m_nodeBitPosition)); - - // rank(x) returns the number of ones in [0, x) but we count bit positions from 1 - uint32_t childBitPosition = 2 * m_common->GetTopology().rank(m_nodeBitPosition); - if (i == 1) - ++childBitPosition; - if (childBitPosition > 2 * m_common->NumNodes() || - m_common->GetTopology()[childBitPosition - 1] == 0) - { - return nullptr; - } - return make_unique<SuccinctTrieIterator>(m_reader, m_common, childBitPosition); - } - - template <typename TEncodingReader> - unique_ptr<SuccinctTrieIterator> GoToString(TEncodingReader & bitEncoding, uint32_t numBits) - { - ReaderSource<TEncodingReader> src(bitEncoding); - - // String size. Useful when the string is not alone in the writer to know where it - // ends but useless for our purpose here. - ReadVarUint<uint32_t, ReaderSource<TEncodingReader>>(src); - - BitReader<ReaderSource<TEncodingReader>> bitReader(src); - - auto ret = Clone(); - for (uint32_t i = 0; i < numBits; ++i) - { - uint8_t bit = bitReader.Read(1); - auto nxt = ret->GoToEdge(bit); - if (!nxt) - return nullptr; - ret = move(nxt); - } - return ret; - } - - unique_ptr<SuccinctTrieIterator> GoToString(strings::UniString const & s) - { - vector<uint8_t> buf; - uint32_t numBits; - { - MemWriter<vector<uint8_t>> writer(buf); - numBits = m_common->GetEncoding().EncodeAndWrite(writer, s); - } - MemReader reader(&buf[0], buf.size()); - return GoToString(reader, numBits); - } - - uint32_t NumValues() - { - if (!m_valuesRead) - ReadValues(); - return m_values.size(); - } - - TValue GetValue(size_t i) - { - if (!m_valuesRead) - ReadValues(); - ASSERT_LESS(i, m_values.size(), ()); - return m_values[i]; - } - -private: - void ReadValues() - { - if (m_valuesRead) - return; - m_valuesRead = true; - // Back to 0-based indices. - uint32_t m_nodeId = m_common->GetTopology().rank(m_nodeBitPosition) - 1; - if (!m_common->NodeIsFinal(m_nodeId)) - return; - uint32_t offset = m_common->Offset(m_nodeId); - uint32_t size = m_common->ValueListSize(m_nodeId); - ReaderPtr<TReader> subReaderPtr(m_reader.CreateSubReader(offset, size)); - ReaderSource<ReaderPtr<TReader>> src(subReaderPtr); - while (src.Size() > 0) - { - TValue val; - m_common->GetValueReader()(src, val); - m_values.push_back(val); - } - } - - TReader const & m_reader; - shared_ptr<TCommonData> m_common; - - // The bit with this 1-based index represents this node - // in the external node representation of binary trie. - uint32_t m_nodeBitPosition; - - vector<TValue> m_values; - bool m_valuesRead; -}; - -template <typename TReader, typename TValueReader> -unique_ptr<SuccinctTrieIterator<TReader, TValueReader>> ReadSuccinctTrie( - TReader const & reader, TValueReader valueReader = TValueReader()) -{ - using TCommonData = TopologyAndOffsets<TReader, TValueReader>; - using TIter = SuccinctTrieIterator<TReader, TValueReader>; - - shared_ptr<TCommonData> common(new TCommonData(reader, valueReader)); - return make_unique<TIter>(common->GetReader(), common, 1 /* bitPosition */); -} - -} // namespace trie diff --git a/coding/trie.hpp b/coding/trie.hpp deleted file mode 100644 index 4c1b397b08..0000000000 --- a/coding/trie.hpp +++ /dev/null @@ -1,85 +0,0 @@ -#pragma once - -#include "base/assert.hpp" -#include "base/base.hpp" -#include "base/buffer_vector.hpp" - -#include "std/unique_ptr.hpp" - -namespace trie -{ - -typedef uint32_t TrieChar; - -// 95 is a good value for the default baseChar, since both small and capital latin letters -// are less than +/- 32 from it and thus can fit into supershort edge. -// However 0 is used because the first byte is actually language id. -static uint32_t const DEFAULT_CHAR = 0; - -template <typename TValueList> -class Iterator -{ - //dbg::ObjectTracker m_tracker; - -public: - using TValue = typename TValueList::TValue; - - struct Edge - { - using TEdgeLabel = buffer_vector<TrieChar, 8>; - TEdgeLabel m_label; - }; - - buffer_vector<Edge, 8> m_edge; - TValueList m_valueList; - - virtual ~Iterator() {} - - virtual unique_ptr<Iterator<TValueList>> Clone() const = 0; - virtual unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const = 0; -}; - -struct EmptyValueReader -{ - typedef unsigned char ValueType; - - EmptyValueReader() = default; - - template <typename SourceT> - void operator() (SourceT &, ValueType & value) const - { - value = 0; - } -}; - -template <unsigned int N> -struct FixedSizeValueReader -{ - struct ValueType - { - unsigned char m_data[N]; - }; - - template <typename SourceT> - void operator() (SourceT & src, ValueType & value) const - { - src.Read(&value.m_data[0], N); - } -}; - -template <typename TValueList, typename TF, typename TString> -void ForEachRef(Iterator<TValueList> const & it, TF && f, TString const & s) -{ - it.m_valueList.ForEach([&f, &s](typename TValueList::TValue const & value) - { - f(s, value); - }); - for (size_t i = 0; i < it.m_edge.size(); ++i) - { - TString s1(s); - s1.insert(s1.end(), it.m_edge[i].m_label.begin(), it.m_edge[i].m_label.end()); - auto nextIt = it.GoToEdge(i); - ForEachRef(*nextIt, f, s1); - } -} -} // namespace trie diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp deleted file mode 100644 index 714c8f02f6..0000000000 --- a/coding/trie_builder.hpp +++ /dev/null @@ -1,265 +0,0 @@ -#pragma once - -#include "coding/byte_stream.hpp" -#include "coding/varint.hpp" - -#include "base/buffer_vector.hpp" -#include "base/logging.hpp" - -#include "std/algorithm.hpp" -#include "std/vector.hpp" - -// Trie format: -// [1: header] -// [node] ... [node] - -// Nodes are written in post-order (first child, last child, parent). Contents of nodes is writern -// reversed. The resulting file should be reverese before use! Then its contents will appear in -// pre-order alphabetically reversed (parent, last child, first child). - -// Leaf node format: -// [valueList] - -// Internal node format: -// [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)] -// [vu valueCount]: if valueCount in header == 3 -// [vu childCount]: if childCount in header == 63 -// [valueList] -// [childInfo] ... [childInfo] - -// Child info format: -// Every char of the edge is encoded as varint difference from the previous char. First char is -// encoded as varint difference from the base char, which is the last char of the current prefix. -// -// [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] -// [vu edgeLen-1]: if edgeLen-1 in header == 63 -// [vi edgeChar0 - baseChar] -// [vi edgeChar1 - edgeChar0] -// ... -// [vi edgeCharN - edgeCharN-1] -// [child size]: if the child is not the last one when reading - -namespace trie -{ -template <typename TSink, typename TChildIter, typename TValueList, typename TSerializer> -void WriteNode(TSink & sink, TSerializer const & serializer, TrieChar baseChar, - TValueList const & valueList, TChildIter const begChild, TChildIter const endChild, - bool isRoot = false) -{ - uint32_t const valueCount = valueList.Size(); - if (begChild == endChild && !isRoot) - { - // Leaf node. - valueList.Serialize(sink, serializer); - return; - } - uint32_t const childCount = endChild - begChild; - uint8_t const header = static_cast<uint32_t>((min(valueCount, 3U) << 6) + min(childCount, 63U)); - sink.Write(&header, 1); - if (valueCount >= 3) - WriteVarUint(sink, valueCount); - if (childCount >= 63) - WriteVarUint(sink, childCount); - valueList.Serialize(sink, serializer); - for (TChildIter it = begChild; it != endChild; /*++it*/) - { - uint8_t header = (it->IsLeaf() ? 128 : 0); - TrieChar const * const edge = it->GetEdge(); - uint32_t const edgeSize = it->GetEdgeSize(); - CHECK_NOT_EQUAL(edgeSize, 0, ()); - CHECK_LESS(edgeSize, 100000, ()); - uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar)); - if (edgeSize == 1 && (diff0 & ~63U) == 0) - { - header |= 64; - header |= diff0; - WriteToSink(sink, header); - } - else - { - if (edgeSize - 1 < 63) - { - header |= edgeSize - 1; - WriteToSink(sink, header); - } - else - { - header |= 63; - WriteToSink(sink, header); - WriteVarUint(sink, edgeSize - 1); - } - for (uint32_t i = 0; i < edgeSize; ++i) - { - WriteVarInt(sink, int32_t(edge[i] - baseChar)); - baseChar = edge[i]; - } - } - baseChar = edge[0]; - - uint32_t const childSize = it->Size(); - if (++it != endChild) - WriteVarUint(sink, childSize); - } -} - -struct ChildInfo -{ - bool m_isLeaf; - uint32_t m_size; - vector<TrieChar> m_edge; - - ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c) - { - } - - uint32_t Size() const { return m_size; } - bool IsLeaf() const { return m_isLeaf; } - TrieChar const * GetEdge() const { return m_edge.data(); } - uint32_t GetEdgeSize() const { return m_edge.size(); } -}; - -template <typename TValueList> -struct NodeInfo -{ - uint64_t m_begPos; - TrieChar m_char; - vector<ChildInfo> m_children; - - // This is ugly but will do until we rename ValueList. - // Here is the rationale. ValueList<> is the entity that - // we store in the nodes of the search trie. It can be read - // or written via its methods but not directly as was assumed - // in a previous version of this code. That version provided - // serialization methods for ValueList but the deserialization - // was ad hoc. - // Because of the possibility of serialized ValueLists to represent - // something completely different from an array of FeatureIds - // (a compressed bit vector, for instance) and because of the - // need to update a node's ValueList until the node is finalized - // this vector is needed here. It is better to leave it here - // than to expose it in ValueList. - vector<typename TValueList::TValue> m_temporaryValueList; - TValueList m_valueList; - bool m_mayAppend; - - NodeInfo() : m_begPos(0), m_char(0), m_valueList(TValueList()), m_mayAppend(true) {} - NodeInfo(uint64_t pos, TrieChar trieChar) - : m_begPos(pos), m_char(trieChar), m_valueList(TValueList()), m_mayAppend(true) - { - } - - // It is finalized in the sense that no more appends are possible - // so it is a fine moment to initialize the underlying ValueList. - void FinalizeValueList() - { - m_valueList.Init(m_temporaryValueList); - m_mayAppend = false; - } -}; - -template <typename TSink, typename TValueList, typename TSerializer> -void WriteNodeReverse(TSink & sink, TSerializer const & serializer, TrieChar baseChar, - NodeInfo<TValueList> & node, bool isRoot = false) -{ - using TOutStorage = buffer_vector<uint8_t, 64>; - TOutStorage out; - PushBackByteSink<TOutStorage> outSink(out); - node.FinalizeValueList(); - WriteNode(outSink, serializer, baseChar, node.m_valueList, node.m_children.rbegin(), - node.m_children.rend(), isRoot); - reverse(out.begin(), out.end()); - sink.Write(out.data(), out.size()); -} - -template <typename TSink, typename TNodes, typename TSerializer> -void PopNodes(TSink & sink, TSerializer const & serializer, TNodes & nodes, int nodesToPop) -{ - using TNodeInfo = typename TNodes::value_type; - ASSERT_GREATER(nodes.size(), nodesToPop, ()); - for (; nodesToPop > 0; --nodesToPop) - { - TNodeInfo & node = nodes.back(); - TNodeInfo & prevNode = nodes[nodes.size() - 2]; - - if (node.m_temporaryValueList.empty() && node.m_children.size() <= 1) - { - ASSERT_EQUAL(node.m_children.size(), 1, ()); - ChildInfo & child = node.m_children[0]; - prevNode.m_children.emplace_back(child.m_isLeaf, child.m_size, node.m_char); - auto & prevChild = prevNode.m_children.back(); - prevChild.m_edge.insert(prevChild.m_edge.end(), child.m_edge.begin(), child.m_edge.end()); - } - else - { - WriteNodeReverse(sink, serializer, node.m_char, node); - prevNode.m_children.emplace_back( - node.m_children.empty(), static_cast<uint32_t>(sink.Pos() - node.m_begPos), node.m_char); - } - - nodes.pop_back(); - } -} - -template <typename TNodeInfo, typename TValue> -void AppendValue(TNodeInfo & node, TValue const & value) -{ - // External-memory trie adds <string, value> pairs in a sorted - // order so the values are supposed to be accumulated in the - // sorted order and we can avoid sorting them before doing - // further operations such as ValueList construction. - using namespace std::rel_ops; - ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value, ()); - if (!node.m_temporaryValueList.empty() && node.m_temporaryValueList.back() == value) - return; - if (node.m_mayAppend) - node.m_temporaryValueList.push_back(value); - else - LOG(LERROR, ("Cannot append to a finalized value list.")); -} - -template <typename TSink, typename TIter, typename TValueList, typename TSerializer> -void Build(TSink & sink, TSerializer const & serializer, TIter const beg, TIter const end) -{ - using TTrieString = buffer_vector<TrieChar, 32>; - using TNodeInfo = NodeInfo<TValueList>; - - vector<TNodeInfo> nodes; - nodes.emplace_back(sink.Pos(), DEFAULT_CHAR); - - TTrieString prevKey; - - using TElement = typename TIter::value_type; - TElement prevE; - - for (TIter it = beg; it != end; ++it) - { - TElement e = *it; - if (it != beg && e == prevE) - continue; - - TrieChar const * const pKeyData = e.GetKeyData(); - TTrieString key(pKeyData, pKeyData + e.GetKeySize()); - CHECK(!(key < prevKey), (key, prevKey)); - size_t nCommon = 0; - while (nCommon < min(key.size(), prevKey.size()) && prevKey[nCommon] == key[nCommon]) - ++nCommon; - - // Root is also a common node. - PopNodes(sink, serializer, nodes, nodes.size() - nCommon - 1); - uint64_t const pos = sink.Pos(); - for (size_t i = nCommon; i < key.size(); ++i) - nodes.emplace_back(pos, key[i]); - AppendValue(nodes.back(), e.GetValue()); - - prevKey.swap(key); - prevE.Swap(e); - } - - // Pop all the nodes from the stack. - PopNodes(sink, serializer, nodes, nodes.size() - 1); - - // Write the root. - WriteNodeReverse(sink, serializer, DEFAULT_CHAR /* baseChar */, nodes.back(), true /* isRoot */); -} - -} // namespace trie diff --git a/coding/trie_reader.hpp b/coding/trie_reader.hpp deleted file mode 100644 index 2b4c651762..0000000000 --- a/coding/trie_reader.hpp +++ /dev/null @@ -1,159 +0,0 @@ -#pragma once -#include "coding/trie.hpp" -#include "coding/reader.hpp" -#include "coding/varint.hpp" - -#include "base/assert.hpp" -#include "base/bits.hpp" -#include "base/macros.hpp" - -namespace trie -{ -template <class TValueList, typename TSerializer> -class LeafIterator0 : public Iterator<TValueList> -{ -public: - using TValue = typename TValueList::TValue; - using Iterator<TValueList>::m_valueList; - - template <class TReader> - LeafIterator0(TReader const & reader, TSerializer const & serializer) - { - ReaderSource<TReader> src(reader); - if (src.Size() > 0) - m_valueList.Deserialize(src, 0 /* valueCount */, serializer); - ASSERT_EQUAL(src.Size(), 0, ()); - } - - // trie::Iterator overrides: - unique_ptr<Iterator<TValueList>> Clone() const override - { - return make_unique<LeafIterator0<TValueList, TSerializer>>(*this); - } - - unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const override - { - ASSERT(false, (i)); - UNUSED_VALUE(i); - return nullptr; - } -}; - -template <typename TReader, typename TValueList, typename TSerializer> -class Iterator0 : public Iterator<TValueList> -{ -public: - using TValue = typename TValueList::TValue; - using Iterator<TValueList>::m_valueList; - using Iterator<TValueList>::m_edge; - - Iterator0(TReader const & reader, TrieChar baseChar, TSerializer const & serializer) - : m_reader(reader), m_serializer(serializer) - { - ParseNode(baseChar); - } - - // trie::Iterator overrides: - unique_ptr<Iterator<TValueList>> Clone() const override - { - return make_unique<Iterator0<TReader, TValueList, TSerializer>>(*this); - } - - unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const override - { - ASSERT_LESS(i, this->m_edge.size(), ()); - uint32_t const offset = m_edgeInfo[i].m_offset; - uint32_t const size = m_edgeInfo[i+1].m_offset - offset; - - if (m_edgeInfo[i].m_isLeaf) - { - return make_unique<LeafIterator0<TValueList, TSerializer>>(m_reader.SubReader(offset, size), - m_serializer); - } - - return make_unique<Iterator0<TReader, TValueList, TSerializer>>( - m_reader.SubReader(offset, size), this->m_edge[i].m_label.back(), m_serializer); - } - -private: - void ParseNode(TrieChar baseChar) - { - ReaderSource<TReader> src(m_reader); - - // [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)] - uint8_t const header = ReadPrimitiveFromSource<uint8_t>(src); - uint32_t valueCount = (header >> 6); - uint32_t childCount = (header & 63); - - // [vu valueCount]: if valueCount in header == 3 - if (valueCount == 3) - valueCount = ReadVarUint<uint32_t>(src); - - // [vu childCount]: if childCount in header == 63 - if (childCount == 63) - childCount = ReadVarUint<uint32_t>(src); - - // [valueList] - m_valueList.Deserialize(src, valueCount, m_serializer); - - // [childInfo] ... [childInfo] - this->m_edge.resize(childCount); - m_edgeInfo.resize(childCount + 1); - m_edgeInfo[0].m_offset = 0; - for (uint32_t i = 0; i < childCount; ++i) - { - typename Iterator<TValueList>::Edge & e = this->m_edge[i]; - - // [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] - uint8_t const header = ReadPrimitiveFromSource<uint8_t>(src); - m_edgeInfo[i].m_isLeaf = ((header & 128) != 0); - if (header & 64) - e.m_label.push_back(baseChar + bits::ZigZagDecode(header & 63U)); - else - { - // [vu edgeLen-1]: if edgeLen-1 in header == 63 - uint32_t edgeLen = (header & 63); - if (edgeLen == 63) - edgeLen = ReadVarUint<uint32_t>(src); - edgeLen += 1; - - // [vi edgeChar0 - baseChar] [vi edgeChar1 - edgeChar0] ... [vi edgeCharN - edgeCharN-1] - e.m_label.reserve(edgeLen); - for (uint32_t i = 0; i < edgeLen; ++i) - e.m_label.push_back(baseChar += ReadVarInt<int32_t>(src)); - } - - // [child size]: if the child is not the last one - m_edgeInfo[i + 1].m_offset = m_edgeInfo[i].m_offset; - if (i != childCount - 1) - m_edgeInfo[i + 1].m_offset += ReadVarUint<uint32_t>(src); - - baseChar = e.m_label[0]; - } - - uint32_t const currentOffset = static_cast<uint32_t>(src.Pos()); - for (size_t i = 0; i < m_edgeInfo.size(); ++i) - m_edgeInfo[i].m_offset += currentOffset; - m_edgeInfo.back().m_offset = static_cast<uint32_t>(m_reader.Size()); - } - - struct EdgeInfo - { - uint32_t m_offset; - bool m_isLeaf; - }; - - buffer_vector<EdgeInfo, 9> m_edgeInfo; - - TReader m_reader; - TSerializer m_serializer; -}; - -// Returns iterator to the root of the trie. -template <class TReader, class TValueList, class TSerializer> -unique_ptr<Iterator<TValueList>> ReadTrie(TReader const & reader, TSerializer const & serializer) -{ - return make_unique<Iterator0<TReader, TValueList, TSerializer>>(reader, DEFAULT_CHAR, serializer); -} - -} // namespace trie |