diff options
author | Maxim Pimenov <m@maps.me> | 2015-11-03 20:04:40 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:02:37 +0300 |
commit | a818bb4b37e4de58fe6a251c4a6cfcce53b34c6a (patch) | |
tree | 4b3ac8ad61318f6e62bbaba29d5f5d2941771dce /indexer/trie_builder.hpp | |
parent | 907cb3786b0d10e2fcf44c906ee4c8625f41630b (diff) |
Moved search index from coding to indexer.
Diffstat (limited to 'indexer/trie_builder.hpp')
-rw-r--r-- | indexer/trie_builder.hpp | 272 |
1 files changed, 272 insertions, 0 deletions
diff --git a/indexer/trie_builder.hpp b/indexer/trie_builder.hpp new file mode 100644 index 0000000000..6ba69c2430 --- /dev/null +++ b/indexer/trie_builder.hpp @@ -0,0 +1,272 @@ +#pragma once + +#include "coding/byte_stream.hpp" +#include "coding/varint.hpp" + +#include "base/buffer_vector.hpp" +#include "base/logging.hpp" + +#include "std/algorithm.hpp" +#include "std/vector.hpp" + +// Trie format: +// [1: header] +// [node] ... [node] + +// Nodes are written in post-order (first child, last child, parent). Contents of nodes is writern +// reversed. The resulting file should be reverese before use! Then its contents will appear in +// pre-order alphabetically reversed (parent, last child, first child). + +// Leaf node format: +// [valueList] + +// Internal node format: +// [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)] +// [vu valueCount]: if valueCount in header == 3 +// [vu childCount]: if childCount in header == 63 +// [valueList] +// [childInfo] ... [childInfo] + +// Child info format: +// Every char of the edge is encoded as varint difference from the previous char. First char is +// encoded as varint difference from the base char, which is the last char of the current prefix. +// +// [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] +// [vu edgeLen-1]: if edgeLen-1 in header == 63 +// [vi edgeChar0 - baseChar] +// [vi edgeChar1 - edgeChar0] +// ... +// [vi edgeCharN - edgeCharN-1] +// [child size]: if the child is not the last one when reading + +namespace trie +{ +template <typename TSink, typename TChildIter, typename TValueList, typename TSerializer> +void WriteNode(TSink & sink, TSerializer const & serializer, TrieChar baseChar, + TValueList const & valueList, TChildIter const begChild, TChildIter const endChild, + bool isRoot = false) +{ + uint32_t const valueCount = valueList.Size(); + if (begChild == endChild && !isRoot) + { + // Leaf node. +#ifdef DEBUG + auto posBefore = sink.Pos(); +#endif + valueList.Serialize(sink, serializer); +#ifdef DEBUG + if (valueCount == 0) + ASSERT_EQUAL(sink.Pos(), posBefore, ("Empty valueList must produce an empty serialization.")); +#endif + return; + } + uint32_t const childCount = endChild - begChild; + uint8_t const header = static_cast<uint32_t>((min(valueCount, 3U) << 6) + min(childCount, 63U)); + sink.Write(&header, 1); + if (valueCount >= 3) + WriteVarUint(sink, valueCount); + if (childCount >= 63) + WriteVarUint(sink, childCount); + valueList.Serialize(sink, serializer); + for (TChildIter it = begChild; it != endChild; /*++it*/) + { + uint8_t header = (it->IsLeaf() ? 128 : 0); + TrieChar const * const edge = it->GetEdge(); + uint32_t const edgeSize = it->GetEdgeSize(); + CHECK_NOT_EQUAL(edgeSize, 0, ()); + CHECK_LESS(edgeSize, 100000, ()); + uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar)); + if (edgeSize == 1 && (diff0 & ~63U) == 0) + { + header |= 64; + header |= diff0; + WriteToSink(sink, header); + } + else + { + if (edgeSize - 1 < 63) + { + header |= edgeSize - 1; + WriteToSink(sink, header); + } + else + { + header |= 63; + WriteToSink(sink, header); + WriteVarUint(sink, edgeSize - 1); + } + for (uint32_t i = 0; i < edgeSize; ++i) + { + WriteVarInt(sink, int32_t(edge[i] - baseChar)); + baseChar = edge[i]; + } + } + baseChar = edge[0]; + + uint32_t const childSize = it->Size(); + if (++it != endChild) + WriteVarUint(sink, childSize); + } +} + +struct ChildInfo +{ + bool m_isLeaf; + uint32_t m_size; + vector<TrieChar> m_edge; + + ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c) + { + } + + uint32_t Size() const { return m_size; } + bool IsLeaf() const { return m_isLeaf; } + TrieChar const * GetEdge() const { return m_edge.data(); } + uint32_t GetEdgeSize() const { return m_edge.size(); } +}; + +template <typename TValueList> +struct NodeInfo +{ + uint64_t m_begPos; + TrieChar m_char; + vector<ChildInfo> m_children; + + // This is ugly but will do until we rename ValueList. + // Here is the rationale. ValueList<> is the entity that + // we store in the nodes of the search trie. It can be read + // or written via its methods but not directly as was assumed + // in a previous version of this code. That version provided + // serialization methods for ValueList but the deserialization + // was ad hoc. + // Because of the possibility of serialized ValueLists to represent + // something completely different from an array of FeatureIds + // (a compressed bit vector, for instance) and because of the + // need to update a node's ValueList until the node is finalized + // this vector is needed here. It is better to leave it here + // than to expose it in ValueList. + vector<typename TValueList::TValue> m_temporaryValueList; + TValueList m_valueList; + bool m_mayAppend; + + NodeInfo() : m_begPos(0), m_char(0), m_valueList(TValueList()), m_mayAppend(true) {} + NodeInfo(uint64_t pos, TrieChar trieChar) + : m_begPos(pos), m_char(trieChar), m_valueList(TValueList()), m_mayAppend(true) + { + } + + // It is finalized in the sense that no more appends are possible + // so it is a fine moment to initialize the underlying ValueList. + void FinalizeValueList() + { + m_valueList.Init(m_temporaryValueList); + m_mayAppend = false; + } +}; + +template <typename TSink, typename TValueList, typename TSerializer> +void WriteNodeReverse(TSink & sink, TSerializer const & serializer, TrieChar baseChar, + NodeInfo<TValueList> & node, bool isRoot = false) +{ + using TOutStorage = buffer_vector<uint8_t, 64>; + TOutStorage out; + PushBackByteSink<TOutStorage> outSink(out); + node.FinalizeValueList(); + WriteNode(outSink, serializer, baseChar, node.m_valueList, node.m_children.rbegin(), + node.m_children.rend(), isRoot); + reverse(out.begin(), out.end()); + sink.Write(out.data(), out.size()); +} + +template <typename TSink, typename TNodes, typename TSerializer> +void PopNodes(TSink & sink, TSerializer const & serializer, TNodes & nodes, int nodesToPop) +{ + using TNodeInfo = typename TNodes::value_type; + ASSERT_GREATER(nodes.size(), nodesToPop, ()); + for (; nodesToPop > 0; --nodesToPop) + { + TNodeInfo & node = nodes.back(); + TNodeInfo & prevNode = nodes[nodes.size() - 2]; + + if (node.m_temporaryValueList.empty() && node.m_children.size() <= 1) + { + ASSERT_EQUAL(node.m_children.size(), 1, ()); + ChildInfo & child = node.m_children[0]; + prevNode.m_children.emplace_back(child.m_isLeaf, child.m_size, node.m_char); + auto & prevChild = prevNode.m_children.back(); + prevChild.m_edge.insert(prevChild.m_edge.end(), child.m_edge.begin(), child.m_edge.end()); + } + else + { + WriteNodeReverse(sink, serializer, node.m_char, node); + prevNode.m_children.emplace_back( + node.m_children.empty(), static_cast<uint32_t>(sink.Pos() - node.m_begPos), node.m_char); + } + + nodes.pop_back(); + } +} + +template <typename TNodeInfo, typename TValue> +void AppendValue(TNodeInfo & node, TValue const & value) +{ + // External-memory trie adds <string, value> pairs in a sorted + // order so the values are supposed to be accumulated in the + // sorted order and we can avoid sorting them before doing + // further operations such as ValueList construction. + using namespace std::rel_ops; + ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value, ()); + if (!node.m_temporaryValueList.empty() && node.m_temporaryValueList.back() == value) + return; + if (node.m_mayAppend) + node.m_temporaryValueList.push_back(value); + else + LOG(LERROR, ("Cannot append to a finalized value list.")); +} + +template <typename TSink, typename TIter, typename TValueList, typename TSerializer> +void Build(TSink & sink, TSerializer const & serializer, TIter const beg, TIter const end) +{ + using TTrieString = buffer_vector<TrieChar, 32>; + using TNodeInfo = NodeInfo<TValueList>; + + vector<TNodeInfo> nodes; + nodes.emplace_back(sink.Pos(), DEFAULT_CHAR); + + TTrieString prevKey; + + using TElement = typename TIter::value_type; + TElement prevE; + + for (TIter it = beg; it != end; ++it) + { + TElement e = *it; + if (it != beg && e == prevE) + continue; + + TrieChar const * const pKeyData = e.GetKeyData(); + TTrieString key(pKeyData, pKeyData + e.GetKeySize()); + CHECK(!(key < prevKey), (key, prevKey)); + size_t nCommon = 0; + while (nCommon < min(key.size(), prevKey.size()) && prevKey[nCommon] == key[nCommon]) + ++nCommon; + + // Root is also a common node. + PopNodes(sink, serializer, nodes, nodes.size() - nCommon - 1); + uint64_t const pos = sink.Pos(); + for (size_t i = nCommon; i < key.size(); ++i) + nodes.emplace_back(pos, key[i]); + AppendValue(nodes.back(), e.GetValue()); + + prevKey.swap(key); + prevE.Swap(e); + } + + // Pop all the nodes from the stack. + PopNodes(sink, serializer, nodes, nodes.size() - 1); + + // Write the root. + WriteNodeReverse(sink, serializer, DEFAULT_CHAR /* baseChar */, nodes.back(), true /* isRoot */); +} + +} // namespace trie |