Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMaxim Pimenov <m@maps.me>2015-11-03 20:04:40 +0300
committerSergey Yershov <yershov@corp.mail.ru>2016-03-23 16:02:37 +0300
commita818bb4b37e4de58fe6a251c4a6cfcce53b34c6a (patch)
tree4b3ac8ad61318f6e62bbaba29d5f5d2941771dce /indexer/succinct_trie_builder.hpp
parent907cb3786b0d10e2fcf44c906ee4c8625f41630b (diff)
Moved search index from coding to indexer.
Diffstat (limited to 'indexer/succinct_trie_builder.hpp')
-rw-r--r--indexer/succinct_trie_builder.hpp199
1 files changed, 199 insertions, 0 deletions
diff --git a/indexer/succinct_trie_builder.hpp b/indexer/succinct_trie_builder.hpp
new file mode 100644
index 0000000000..5ba0f56cf0
--- /dev/null
+++ b/indexer/succinct_trie_builder.hpp
@@ -0,0 +1,199 @@
+#pragma once
+
+#include "coding/bit_streams.hpp"
+#include "coding/byte_stream.hpp"
+#include "coding/huffman.hpp"
+#include "coding/reader.hpp"
+#include "coding/varint.hpp"
+#include "coding/writer.hpp"
+
+#include "base/buffer_vector.hpp"
+#include "base/scope_guard.hpp"
+#include "base/string_utils.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/bind.hpp"
+
+// Trie format:
+// -- Serialized Huffman encoding.
+// -- Topology of the trie built on Huffman-encoded input strings [2 bits per node, level-order representation].
+// -- List of pairs (node id, offset). One pair per final node (i.e. a node where a string ends).
+// The lists of node ids and offsets are both non-decreasing and are delta-encoded with varuints.
+// -- Values of final nodes in level-order. The values for final node |id| start at offset |offset|
+// if there is a pair (id, offset) in the list above.
+
+namespace trie
+{
+// Node is a temporary struct that is used to store the trie in memory
+// before it is converted to a succinct representation and put to disk.
+// It is rather verbose but hopefully is small enough to fit in memory.
+template <class TValueList>
+struct Node
+{
+ // The left child is reached by 0, the right by 1.
+ Node *l, *r;
+
+ // A node is final if a key string ends in it.
+ // In particular, all non-final nodes' valueLists are empty
+ // and it is expected that a final node's valueList is non-empty.
+ bool m_isFinal;
+
+ // m_valueLists stores all the additional information that is related
+ // to the key string which leads to this node.
+ TValueList m_valueList;
+
+ Node() : l(nullptr), r(nullptr), m_isFinal(false) {}
+};
+
+template <class TNode, class TReader>
+TNode * AddToTrie(TNode * root, TReader & bitEncoding, uint32_t numBits)
+{
+ ReaderSource<TReader> src(bitEncoding);
+ // String size.
+ ReadVarUint<uint32_t, ReaderSource<TReader>>(src);
+
+ BitReader<ReaderSource<TReader>> bitReader(src);
+ auto cur = root;
+ for (uint32_t i = 0; i < numBits; ++i)
+ {
+ auto nxt = bitReader.Read(1) == 0 ? &cur->l : &cur->r;
+ if (!*nxt)
+ *nxt = new TNode();
+ cur = *nxt;
+ }
+ return cur;
+}
+
+template <class TNode>
+void DeleteTrie(TNode * root)
+{
+ if (!root)
+ return;
+ DeleteTrie(root->l);
+ DeleteTrie(root->r);
+ delete root;
+}
+
+template <typename TNode>
+void WriteInLevelOrder(TNode * root, vector<TNode *> & levelOrder)
+{
+ ASSERT(root, ());
+ levelOrder.push_back(root);
+ size_t i = 0;
+ while (i < levelOrder.size())
+ {
+ TNode * cur = levelOrder[i++];
+ if (cur->l)
+ levelOrder.push_back(cur->l);
+ if (cur->r)
+ levelOrder.push_back(cur->r);
+ }
+}
+
+template <typename TWriter, typename TIter, typename TValueList>
+void BuildSuccinctTrie(TWriter & writer, TIter const beg, TIter const end)
+{
+ using TrieChar = uint32_t;
+ using TTrieString = buffer_vector<TrieChar, 32>;
+ using TNode = Node<TValueList>;
+ using TEntry = typename TIter::value_type;
+
+ TNode * root = new TNode();
+ MY_SCOPE_GUARD(cleanup, bind(&DeleteTrie<TNode>, root));
+ TTrieString prevKey;
+ TEntry prevEntry;
+
+ vector<TEntry> entries;
+ vector<strings::UniString> entryStrings;
+ for (TIter it = beg; it != end; ++it)
+ {
+ TEntry entry = *it;
+ if (it != beg && entry == prevEntry)
+ continue;
+ TrieChar const * const keyData = entry.GetKeyData();
+ TTrieString key(keyData, keyData + entry.GetKeySize());
+ using namespace std::rel_ops; // ">=" for keys.
+ CHECK_GREATER_OR_EQUAL(key, prevKey, (key, prevKey));
+ entries.push_back(entry);
+ entryStrings.push_back(strings::UniString(keyData, keyData + entry.GetKeySize()));
+ prevKey.swap(key);
+ prevEntry.Swap(entry);
+ }
+
+ coding::HuffmanCoder huffman;
+ huffman.Init(entryStrings);
+ huffman.WriteEncoding(writer);
+
+ for (size_t i = 0; i < entries.size(); ++i)
+ {
+ TEntry const & entry = entries[i];
+ auto const & key = entryStrings[i];
+
+ vector<uint8_t> buf;
+ MemWriter<vector<uint8_t>> memWriter(buf);
+ uint32_t numBits = huffman.EncodeAndWrite(memWriter, key);
+
+ MemReader bitEncoding(&buf[0], buf.size());
+
+ TNode * cur = AddToTrie(root, bitEncoding, numBits);
+ cur->m_isFinal = true;
+ cur->m_valueList.Append(entry.GetValue());
+ }
+
+ vector<TNode *> levelOrder;
+ WriteInLevelOrder(root, levelOrder);
+
+ {
+ // Trie topology.
+ BitWriter<TWriter> bitWriter(writer);
+ WriteVarUint(writer, levelOrder.size());
+ for (size_t i = 0; i < levelOrder.size(); ++i)
+ {
+ auto const & cur = levelOrder[i];
+ bitWriter.Write(cur->l ? 1 : 0, 1 /* numBits */);
+ bitWriter.Write(cur->r ? 1 : 0, 1 /* numBits */);
+ }
+ }
+
+ vector<uint32_t> finalNodeIds;
+ vector<uint32_t> offsetTable;
+ vector<uint8_t> valueBuf;
+ MemWriter<vector<uint8_t>> valueWriter(valueBuf);
+ for (size_t i = 0; i < levelOrder.size(); ++i)
+ {
+ TNode const * node = levelOrder[i];
+ if (!node->m_isFinal)
+ continue;
+ finalNodeIds.push_back(static_cast<uint32_t>(i));
+ offsetTable.push_back(valueWriter.Pos());
+ node->m_valueList.Dump(valueWriter);
+ }
+
+ WriteVarUint(writer, finalNodeIds.size());
+ for (size_t i = 0; i < finalNodeIds.size(); ++i)
+ {
+ if (i == 0)
+ {
+ WriteVarUint(writer, finalNodeIds[i]);
+ WriteVarUint(writer, offsetTable[i]);
+ }
+ else
+ {
+ WriteVarUint(writer, finalNodeIds[i] - finalNodeIds[i - 1]);
+ WriteVarUint(writer, offsetTable[i] - offsetTable[i - 1]);
+ }
+ }
+
+ writer.Write(valueBuf.data(), valueBuf.size());
+
+ // todo(@pimenov):
+ // 1. Investigate the possibility of path compression (short edges + lcp table).
+ // 2. It is highly probable that valueList will be only a list of feature ids in our
+ // future implementations of the search.
+ // We can then dispose of offsetTable and store finalNodeIds as another bit vector.
+ // Alternatively (and probably better), we can append a special symbol to all those
+ // strings that have non-empty value lists. We can then get the leaf number
+ // of a final node by doing rank0.
+}
+
+} // namespace trie