#pragma once #include "trie.hpp" #include "coding/byte_stream.hpp" #include "coding/varint.hpp" #include "base/buffer_vector.hpp" #include "base/checked_cast.hpp" #include "base/logging.hpp" #include "std/algorithm.hpp" #include "std/vector.hpp" #include "std/utility.hpp" // Trie format: // [1: header] // [node] ... [node] // Nodes are written in post-order (first child, last child, parent). Contents of nodes is writern // reversed. The resulting file should be reverese before use! Then its contents will appear in // pre-order alphabetically reversed (parent, last child, first child). // Leaf node format: // [valueList] // Internal node format: // [1: header]: [2: min(valueCount, 3)] [6: min(childCount, 63)] // [vu valueCount]: if valueCount in header == 3 // [vu childCount]: if childCount in header == 63 // [valueList] // [childInfo] ... [childInfo] // Child info format: // Every char of the edge is encoded as varint difference from the previous char. First char is // encoded as varint difference from the base char, which is the last char of the current prefix. // // [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] // [vu edgeLen-1]: if edgeLen-1 in header == 63 // [vi edgeChar0 - baseChar] // [vi edgeChar1 - edgeChar0] // ... // [vi edgeCharN - edgeCharN-1] // [child size]: if the child is not the last one when reading namespace trie { template void WriteNode(TSink & sink, TSerializer const & serializer, TrieChar baseChar, TValueList const & valueList, TChildIter const begChild, TChildIter const endChild, bool isRoot = false) { uint32_t const valueCount = base::asserted_cast(valueList.Size()); if (begChild == endChild && !isRoot) { // Leaf node. #ifdef DEBUG auto posBefore = sink.Pos(); #endif valueList.Serialize(sink, serializer); #ifdef DEBUG if (valueCount == 0) ASSERT_EQUAL(sink.Pos(), posBefore, ("Empty valueList must produce an empty serialization.")); #endif return; } uint32_t const childCount = base::asserted_cast(endChild - begChild); uint8_t const header = static_cast((min(valueCount, 3U) << 6) + min(childCount, 63U)); sink.Write(&header, 1); if (valueCount >= 3) WriteVarUint(sink, valueCount); if (childCount >= 63) WriteVarUint(sink, childCount); valueList.Serialize(sink, serializer); for (TChildIter it = begChild; it != endChild; /*++it*/) { uint8_t header = (it->IsLeaf() ? 128 : 0); TrieChar const * const edge = it->GetEdge(); uint32_t const edgeSize = base::asserted_cast(it->GetEdgeSize()); CHECK_NOT_EQUAL(edgeSize, 0, ()); CHECK_LESS(edgeSize, 100000, ()); uint32_t const diff0 = bits::ZigZagEncode(int32_t(edge[0] - baseChar)); if (edgeSize == 1 && (diff0 & ~63U) == 0) { header |= 64; header |= diff0; WriteToSink(sink, header); } else { if (edgeSize - 1 < 63) { header |= edgeSize - 1; WriteToSink(sink, header); } else { header |= 63; WriteToSink(sink, header); WriteVarUint(sink, edgeSize - 1); } for (uint32_t i = 0; i < edgeSize; ++i) { WriteVarInt(sink, int32_t(edge[i] - baseChar)); baseChar = edge[i]; } } baseChar = edge[0]; uint32_t const childSize = it->Size(); if (++it != endChild) WriteVarUint(sink, childSize); } } struct ChildInfo { bool m_isLeaf; uint32_t m_size; vector m_edge; ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c) { } uint32_t Size() const { return m_size; } bool IsLeaf() const { return m_isLeaf; } TrieChar const * GetEdge() const { return m_edge.data(); } size_t GetEdgeSize() const { return m_edge.size(); } }; template struct NodeInfo { uint64_t m_begPos; TrieChar m_char; vector m_children; // This is ugly but will do until we rename ValueList. // Here is the rationale. ValueList<> is the entity that // we store in the nodes of the search trie. It can be read // or written via its methods but not directly as was assumed // in a previous version of this code. That version provided // serialization methods for ValueList but the deserialization // was ad hoc. // Because of the possibility of serialized ValueLists to represent // something completely different from an array of FeatureIds // (a compressed bit vector, for instance) and because of the // need to update a node's ValueList until the node is finalized // this vector is needed here. It is better to leave it here // than to expose it in ValueList. vector m_temporaryValueList; TValueList m_valueList; bool m_mayAppend; NodeInfo() : m_begPos(0), m_char(0), m_valueList(TValueList()), m_mayAppend(true) {} NodeInfo(uint64_t pos, TrieChar trieChar) : m_begPos(pos), m_char(trieChar), m_valueList(TValueList()), m_mayAppend(true) { } // It is finalized in the sense that no more appends are possible // so it is a fine moment to initialize the underlying ValueList. void FinalizeValueList() { m_valueList.Init(m_temporaryValueList); m_mayAppend = false; } }; template void WriteNodeReverse(TSink & sink, TSerializer const & serializer, TrieChar baseChar, NodeInfo & node, bool isRoot = false) { using TOutStorage = buffer_vector; TOutStorage out; PushBackByteSink outSink(out); node.FinalizeValueList(); WriteNode(outSink, serializer, baseChar, node.m_valueList, node.m_children.rbegin(), node.m_children.rend(), isRoot); reverse(out.begin(), out.end()); sink.Write(out.data(), out.size()); } template void PopNodes(TSink & sink, TSerializer const & serializer, TNodes & nodes, size_t nodesToPop) { using TNodeInfo = typename TNodes::value_type; ASSERT_GREATER(nodes.size(), nodesToPop, ()); for (; nodesToPop > 0; --nodesToPop) { TNodeInfo & node = nodes.back(); TNodeInfo & prevNode = nodes[nodes.size() - 2]; if (node.m_temporaryValueList.empty() && node.m_children.size() <= 1) { ASSERT_EQUAL(node.m_children.size(), 1, ()); ChildInfo & child = node.m_children[0]; prevNode.m_children.emplace_back(child.m_isLeaf, child.m_size, node.m_char); auto & prevChild = prevNode.m_children.back(); prevChild.m_edge.insert(prevChild.m_edge.end(), child.m_edge.begin(), child.m_edge.end()); } else { WriteNodeReverse(sink, serializer, node.m_char, node); prevNode.m_children.emplace_back( node.m_children.empty(), static_cast(sink.Pos() - node.m_begPos), node.m_char); } nodes.pop_back(); } } template void AppendValue(TNodeInfo & node, TValue const & value) { // External-memory trie adds pairs in a sorted // order so the values are supposed to be accumulated in the // sorted order and we can avoid sorting them before doing // further operations such as ValueList construction. using namespace std::rel_ops; ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value, (node.m_temporaryValueList.size())); if (!node.m_temporaryValueList.empty() && node.m_temporaryValueList.back() == value) return; if (node.m_mayAppend) node.m_temporaryValueList.push_back(value); else LOG(LERROR, ("Cannot append to a finalized value list.")); } template void Build(TSink & sink, TSerializer const & serializer, vector> const & data) { using TValue = typename TValueList::TValue; using TNodeInfo = NodeInfo; vector nodes; nodes.emplace_back(sink.Pos(), kDefaultChar); TKey prevKey; pair prevE; // e for "element". for (auto it = data.begin(); it != data.end(); ++it) { auto e = *it; if (it != data.begin() && e == prevE) continue; auto const & key = e.first; CHECK(!(key < prevKey), (key, prevKey)); size_t nCommon = 0; while (nCommon < min(key.size(), prevKey.size()) && prevKey[nCommon] == key[nCommon]) ++nCommon; // Root is also a common node. PopNodes(sink, serializer, nodes, nodes.size() - nCommon - 1); uint64_t const pos = sink.Pos(); for (size_t i = nCommon; i < key.size(); ++i) nodes.emplace_back(pos, key[i]); AppendValue(nodes.back(), e.second); prevKey = key; swap(e, prevE); } // Pop all the nodes from the stack. PopNodes(sink, serializer, nodes, nodes.size() - 1); // Write the root. WriteNodeReverse(sink, serializer, kDefaultChar /* baseChar */, nodes.back(), true /* isRoot */); } } // namespace trie