Moved search index from coding to indexer.

author: Maxim Pimenov <m@maps.me> 2015-11-03 20:04:40 +0300
committer: Sergey Yershov <yershov@corp.mail.ru> 2016-03-23 16:02:37 +0300
commit: a818bb4b37e4de58fe6a251c4a6cfcce53b34c6a (patch)
tree: 4b3ac8ad61318f6e62bbaba29d5f5d2941771dce /indexer/succinct_trie_builder.hpp
parent: 907cb3786b0d10e2fcf44c906ee4c8625f41630b (diff)
1 files changed, 199 insertions, 0 deletions
diff --git a/indexer/succinct_trie_builder.hpp b/indexer/succinct_trie_builder.hpp
new file mode 100644
index 0000000000..5ba0f56cf0
--- /dev/null
+++ b/indexer/succinct_trie_builder.hpp
@@ -0,0 +1,199 @@
+#pragma once
+
+#include "coding/bit_streams.hpp"
+#include "coding/byte_stream.hpp"
+#include "coding/huffman.hpp"
+#include "coding/reader.hpp"
+#include "coding/varint.hpp"
+#include "coding/writer.hpp"
+
+#include "base/buffer_vector.hpp"
+#include "base/scope_guard.hpp"
+#include "base/string_utils.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/bind.hpp"
+
+// Trie format:
+//   -- Serialized Huffman encoding.
+//   -- Topology of the trie built on Huffman-encoded input strings [2 bits per node, level-order representation].
+//   -- List of pairs (node id, offset). One pair per final node (i.e. a node where a string ends).
+//      The lists of node ids and offsets are both non-decreasing and are delta-encoded with varuints.
+//   -- Values of final nodes in level-order. The values for final node |id| start at offset |offset|
+//      if there is a pair (id, offset) in the list above.
+
+namespace trie
+{
+// Node is a temporary struct that is used to store the trie in memory
+// before it is converted to a succinct representation and put to disk.
+// It is rather verbose but hopefully is small enough to fit in memory.
+template <class TValueList>
+struct Node
+{
+  // The left child is reached by 0, the right by 1.
+  Node *l, *r;
+
+  // A node is final if a key string ends in it.
+  // In particular, all non-final nodes' valueLists are empty
+  // and it is expected that a final node's valueList is non-empty.
+  bool m_isFinal;
+
+  // m_valueLists stores all the additional information that is related
+  // to the key string which leads to this node.
+  TValueList m_valueList;
+
+  Node() : l(nullptr), r(nullptr), m_isFinal(false) {}
+};
+
+template <class TNode, class TReader>
+TNode * AddToTrie(TNode * root, TReader & bitEncoding, uint32_t numBits)
+{
+  ReaderSource<TReader> src(bitEncoding);
+  // String size.
+  ReadVarUint<uint32_t, ReaderSource<TReader>>(src);
+
+  BitReader<ReaderSource<TReader>> bitReader(src);
+  auto cur = root;
+  for (uint32_t i = 0; i < numBits; ++i)
+  {
+    auto nxt = bitReader.Read(1) == 0 ? &cur->l : &cur->r;
+    if (!*nxt)
+      *nxt = new TNode();
+    cur = *nxt;
+  }
+  return cur;
+}
+
+template <class TNode>
+void DeleteTrie(TNode * root)
+{
+  if (!root)
+    return;
+  DeleteTrie(root->l);
+  DeleteTrie(root->r);
+  delete root;
+}
+
+template <typename TNode>
+void WriteInLevelOrder(TNode * root, vector<TNode *> & levelOrder)
+{
+  ASSERT(root, ());
+  levelOrder.push_back(root);
+  size_t i = 0;
+  while (i < levelOrder.size())
+  {
+    TNode * cur = levelOrder[i++];
+    if (cur->l)
+      levelOrder.push_back(cur->l);
+    if (cur->r)
+      levelOrder.push_back(cur->r);
+  }
+}
+
+template <typename TWriter, typename TIter, typename TValueList>
+void BuildSuccinctTrie(TWriter & writer, TIter const beg, TIter const end)
+{
+  using TrieChar = uint32_t;
+  using TTrieString = buffer_vector<TrieChar, 32>;
+  using TNode = Node<TValueList>;
+  using TEntry = typename TIter::value_type;
+
+  TNode * root = new TNode();
+  MY_SCOPE_GUARD(cleanup, bind(&DeleteTrie<TNode>, root));
+  TTrieString prevKey;
+  TEntry prevEntry;
+
+  vector<TEntry> entries;
+  vector<strings::UniString> entryStrings;
+  for (TIter it = beg; it != end; ++it)
+  {
+    TEntry entry = *it;
+    if (it != beg && entry == prevEntry)
+      continue;
+    TrieChar const * const keyData = entry.GetKeyData();
+    TTrieString key(keyData, keyData + entry.GetKeySize());
+    using namespace std::rel_ops;  // ">=" for keys.
+    CHECK_GREATER_OR_EQUAL(key, prevKey, (key, prevKey));
+    entries.push_back(entry);
+    entryStrings.push_back(strings::UniString(keyData, keyData + entry.GetKeySize()));
+    prevKey.swap(key);
+    prevEntry.Swap(entry);
+  }
+
+  coding::HuffmanCoder huffman;
+  huffman.Init(entryStrings);
+  huffman.WriteEncoding(writer);
+
+  for (size_t i = 0; i < entries.size(); ++i)
+  {
+    TEntry const & entry = entries[i];
+    auto const & key = entryStrings[i];
+
+    vector<uint8_t> buf;
+    MemWriter<vector<uint8_t>> memWriter(buf);
+    uint32_t numBits = huffman.EncodeAndWrite(memWriter, key);
+
+    MemReader bitEncoding(&buf[0], buf.size());
+
+    TNode * cur = AddToTrie(root, bitEncoding, numBits);
+    cur->m_isFinal = true;
+    cur->m_valueList.Append(entry.GetValue());
+  }
+
+  vector<TNode *> levelOrder;
+  WriteInLevelOrder(root, levelOrder);
+
+  {
+    // Trie topology.
+    BitWriter<TWriter> bitWriter(writer);
+    WriteVarUint(writer, levelOrder.size());
+    for (size_t i = 0; i < levelOrder.size(); ++i)
+    {
+      auto const & cur = levelOrder[i];
+      bitWriter.Write(cur->l ? 1 : 0, 1 /* numBits */);
+      bitWriter.Write(cur->r ? 1 : 0, 1 /* numBits */);
+    }
+  }
+
+  vector<uint32_t> finalNodeIds;
+  vector<uint32_t> offsetTable;
+  vector<uint8_t> valueBuf;
+  MemWriter<vector<uint8_t>> valueWriter(valueBuf);
+  for (size_t i = 0; i < levelOrder.size(); ++i)
+  {
+    TNode const * node = levelOrder[i];
+    if (!node->m_isFinal)
+      continue;
+    finalNodeIds.push_back(static_cast<uint32_t>(i));
+    offsetTable.push_back(valueWriter.Pos());
+    node->m_valueList.Dump(valueWriter);
+  }
+
+  WriteVarUint(writer, finalNodeIds.size());
+  for (size_t i = 0; i < finalNodeIds.size(); ++i)
+  {
+    if (i == 0)
+    {
+      WriteVarUint(writer, finalNodeIds[i]);
+      WriteVarUint(writer, offsetTable[i]);
+    }
+    else
+    {
+      WriteVarUint(writer, finalNodeIds[i] - finalNodeIds[i - 1]);
+      WriteVarUint(writer, offsetTable[i] - offsetTable[i - 1]);
+    }
+  }
+
+  writer.Write(valueBuf.data(), valueBuf.size());
+
+  // todo(@pimenov):
+  // 1. Investigate the possibility of path compression (short edges + lcp table).
+  // 2. It is highly probable that valueList will be only a list of feature ids in our
+  //    future implementations of the search.
+  //    We can then dispose of offsetTable and store finalNodeIds as another bit vector.
+  //    Alternatively (and probably better), we can append a special symbol to all those
+  //    strings that have non-empty value lists. We can then get the leaf number
+  //    of a final node by doing rank0.
+}
+
+}  // namespace trie
author	Maxim Pimenov <m@maps.me>	2015-11-03 20:04:40 +0300
committer	Sergey Yershov <yershov@corp.mail.ru>	2016-03-23 16:02:37 +0300
commit	a818bb4b37e4de58fe6a251c4a6cfcce53b34c6a (patch)
tree	4b3ac8ad61318f6e62bbaba29d5f5d2941771dce /indexer/succinct_trie_builder.hpp
parent	907cb3786b0d10e2fcf44c906ee4c8625f41630b (diff)