diff options
author | Maxim Pimenov <m@maps.me> | 2015-10-21 14:45:17 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:02:27 +0300 |
commit | 7e2abfba0bffbc3e96314f6c1dabe5e27f9782e8 (patch) | |
tree | 38b4a3b4c8d7cf3696ba045ff529004e8ce65dc6 /coding | |
parent | 469e0b9b0c4eaee75fdee4ec4556f94a27f4e7d6 (diff) |
[search] A refactoring of ValueList.
Diffstat (limited to 'coding')
-rw-r--r-- | coding/coding_tests/coding_tests.pro | 2 | ||||
-rw-r--r-- | coding/coding_tests/trie_test.cpp | 100 | ||||
-rw-r--r-- | coding/compressed_bit_vector.hpp | 10 | ||||
-rw-r--r-- | coding/trie.hpp | 22 | ||||
-rw-r--r-- | coding/trie_builder.hpp | 73 | ||||
-rw-r--r-- | coding/trie_reader.hpp | 112 |
6 files changed, 185 insertions, 134 deletions
diff --git a/coding/coding_tests/coding_tests.pro b/coding/coding_tests/coding_tests.pro index f782173851..3241f9eea5 100644 --- a/coding/coding_tests/coding_tests.pro +++ b/coding/coding_tests/coding_tests.pro @@ -6,7 +6,7 @@ TEMPLATE = app ROOT_DIR = ../.. -DEPENDENCIES = coding base minizip tomcrypt succinct +DEPENDENCIES = coding base indexer minizip tomcrypt succinct include($$ROOT_DIR/common.pri) diff --git a/coding/coding_tests/trie_test.cpp b/coding/coding_tests/trie_test.cpp index 8576cad436..8935d42563 100644 --- a/coding/coding_tests/trie_test.cpp +++ b/coding/coding_tests/trie_test.cpp @@ -1,20 +1,23 @@ #include "testing/testing.hpp" +#include "coding/byte_stream.hpp" +#include "coding/reader.hpp" #include "coding/trie.hpp" #include "coding/trie_builder.hpp" #include "coding/trie_reader.hpp" -#include "coding/byte_stream.hpp" #include "coding/write_to_sink.hpp" +#include "indexer/coding_params.hpp" +#include "indexer/string_file_values.hpp" + #include "base/logging.hpp" #include "std/algorithm.hpp" +#include "std/cstring.hpp" #include "std/string.hpp" #include "std/vector.hpp" -#include "std/cstring.hpp" #include <boost/utility/binary.hpp> - namespace { @@ -83,14 +86,13 @@ string DebugPrint(KeyValuePair const & p) struct KeyValuePairBackInserter { - vector<KeyValuePair> m_v; template <class TString> - void operator()(TString const & s, trie::FixedSizeValueReader<4>::ValueType const & rawValue) + void operator()(TString const & s, uint32_t const & value) { - uint32_t value; - memcpy(&value, &rawValue, 4); m_v.push_back(KeyValuePair(s, value)); } + + vector<KeyValuePair> m_v; }; struct MaxValueCalc @@ -110,14 +112,18 @@ struct MaxValueCalc class CharValueList { public: + using TValue = char; + + void Init(vector<TValue> const &) {} + CharValueList(const string & s) : m_string(s) {} - size_t size() const { return m_string.size(); } + size_t Size() const { return m_string.size(); } - bool empty() const { return m_string.empty(); } + bool IsEmpty() const { return m_string.empty(); } template <typename TSink> - void Dump(TSink & sink) const + void Serialize(TSink & sink) const { sink.Write(m_string.data(), m_string.size()); } @@ -126,40 +132,70 @@ private: string m_string; }; -class Uint32ValueList +} // namespace + +template <> +class ValueList<uint32_t> { public: - using TBuffer = vector<uint32_t>; + using TValue = uint32_t; + + ValueList() = default; + ValueList(serial::CodingParams const & codingParams) : m_codingParams(codingParams) {} - void Append(uint32_t value) + void Init(vector<TValue> const & values) { m_values = values; } + + size_t Size() const { return m_values.size(); } + + bool IsEmpty() const { return m_values.empty(); } + + template <typename TSink> + void Serialize(TSink & sink) const { - m_values.push_back(value); + for (auto const & value : m_values) + WriteToSink(sink, value); } - uint32_t size() const { return m_values.size(); } + template <typename TSource> + void Deserialize(TSource & src, uint32_t valueCount) + { + m_values.resize(valueCount); + for (size_t i = 0; i < valueCount; ++i) + m_values[i] = ReadPrimitiveFromSource<TValue>(src); + } - bool empty() const { return m_values.empty(); } + template <typename TSource> + void Deserialize(TSource & src) + { + while (src.Size() > 0) + { + m_values.push_back(TValue()); + m_values.back() = ReadPrimitiveFromSource<TValue>(src); + } + } - template <typename TSink> - void Dump(TSink & sink) const + template <typename TF> + void ForEach(TF && f) const { - sink.Write(m_values.data(), m_values.size() * sizeof(TBuffer::value_type)); + for (auto const & value : m_values) + f(value); } + void SetCodingParams(serial::CodingParams const & codingParams) { m_codingParams = codingParams; } + private: - TBuffer m_values; + vector<TValue> m_values; + serial::CodingParams m_codingParams; }; -} // unnamed namespace - #define ZENC bits::ZigZagEncode #define MKSC(x) static_cast<signed char>(x) #define MKUC(x) static_cast<uint8_t>(x) UNIT_TEST(TrieBuilder_WriteNode_Smoke) { - vector<uint8_t> serial; - PushBackByteSink<vector<uint8_t> > sink(serial); + vector<uint8_t> buf; + PushBackByteSink<vector<uint8_t>> sink(buf); ChildNodeInfo children[] = { ChildNodeInfo(true, 1, "1A"), ChildNodeInfo(false, 2, "B"), ChildNodeInfo(false, 3, "zz"), ChildNodeInfo(true, 4, @@ -194,7 +230,7 @@ UNIT_TEST(TrieBuilder_WriteNode_Smoke) MKUC(BOOST_BINARY(11000000) | ZENC(0)), // Child 5: header: [+leaf] [+supershort] }; - TEST_EQUAL(serial, vector<uint8_t>(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ()); + TEST_EQUAL(buf, vector<uint8_t>(&expected[0], &expected[0] + ARRAY_SIZE(expected)), ()); } UNIT_TEST(TrieBuilder_Build) @@ -230,15 +266,15 @@ UNIT_TEST(TrieBuilder_Build) for (size_t i = 0; i < v.size(); ++i) vs.push_back(string(v[i].m_key.begin(), v[i].m_key.end())); - vector<uint8_t> serial; - PushBackByteSink<vector<uint8_t> > sink(serial); + vector<uint8_t> buf; + PushBackByteSink<vector<uint8_t>> sink(buf); trie::Build<PushBackByteSink<vector<uint8_t>>, typename vector<KeyValuePair>::iterator, - Uint32ValueList>(sink, v.begin(), v.end()); - reverse(serial.begin(), serial.end()); + ValueList<uint32_t>>(sink, v.begin(), v.end()); + reverse(buf.begin(), buf.end()); - MemReader memReader = MemReader(&serial[0], serial.size()); - using TIterator = trie::Iterator<trie::FixedSizeValueReader<4>::ValueType>; - auto const root = trie::ReadTrie(memReader, trie::FixedSizeValueReader<4>()); + MemReader memReader = MemReader(&buf[0], buf.size()); + auto const root = + trie::ReadTrie<MemReader, ValueList<uint32_t>>(memReader, serial::CodingParams()); vector<KeyValuePair> res; KeyValuePairBackInserter f; trie::ForEachRef(*root, f, vector<trie::TrieChar>()); diff --git a/coding/compressed_bit_vector.hpp b/coding/compressed_bit_vector.hpp index d048c86767..95d2d86ac4 100644 --- a/coding/compressed_bit_vector.hpp +++ b/coding/compressed_bit_vector.hpp @@ -156,6 +156,14 @@ public: static unique_ptr<CompressedBitVector> Deserialize(TReader & reader) { ReaderSource<TReader> src(reader); + return DeserializeFromSource(src); + } + + // Reads a bit vector from source which must contain a valid + // bit vector representation (see CompressedBitVector::Serialize for the format). + template <typename TSource> + static unique_ptr<CompressedBitVector> DeserializeFromSource(TSource & src) + { uint8_t header = ReadPrimitiveFromSource<uint8_t>(src); CompressedBitVector::StorageStrategy strat = static_cast<CompressedBitVector::StorageStrategy>(header); @@ -174,7 +182,7 @@ public: return make_unique<SparseCBV>(move(setBits)); } } - return nullptr; + return unique_ptr<CompressedBitVector>(); } }; diff --git a/coding/trie.hpp b/coding/trie.hpp index 589541f450..ef80a14e11 100644 --- a/coding/trie.hpp +++ b/coding/trie.hpp @@ -4,6 +4,8 @@ #include "base/base.hpp" #include "base/buffer_vector.hpp" +#include "indexer/string_file_values.hpp" + #include "std/unique_ptr.hpp" namespace trie @@ -16,7 +18,7 @@ typedef uint32_t TrieChar; // However 0 is used because the first byte is actually language id. static uint32_t const DEFAULT_CHAR = 0; -template <typename TValue> +template <typename TValueList> class Iterator { //dbg::ObjectTracker m_tracker; @@ -29,12 +31,12 @@ public: }; buffer_vector<Edge, 8> m_edge; - buffer_vector<TValue, 2> m_value; + TValueList m_valueList; virtual ~Iterator() {} - virtual unique_ptr<Iterator<TValue>> Clone() const = 0; - virtual unique_ptr<Iterator<TValue>> GoToEdge(size_t i) const = 0; + virtual unique_ptr<Iterator<TValueList>> Clone() const = 0; + virtual unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const = 0; }; struct EmptyValueReader @@ -65,11 +67,13 @@ struct FixedSizeValueReader } }; -template <typename TValue, typename F, typename TString> -void ForEachRef(Iterator<TValue> const & iter, F & f, TString const & s) +template <typename TValueList, typename TF, typename TString> +void ForEachRef(Iterator<TValueList> const & iter, TF && f, TString const & s) { - for (size_t i = 0; i < iter.m_value.size(); ++i) - f(s, iter.m_value[i]); + iter.m_valueList.ForEach([&f, &s](typename TValueList::TValue value) + { + f(s, value); + }); for (size_t i = 0; i < iter.m_edge.size(); ++i) { TString s1(s); @@ -79,4 +83,4 @@ void ForEachRef(Iterator<TValue> const & iter, F & f, TString const & s) } } -} // namespace Trie +} // namespace trie diff --git a/coding/trie_builder.hpp b/coding/trie_builder.hpp index 03ab9f3614..d42ebaf119 100644 --- a/coding/trie_builder.hpp +++ b/coding/trie_builder.hpp @@ -6,6 +6,7 @@ #include "base/buffer_vector.hpp" #include "std/algorithm.hpp" +#include "std/vector.hpp" // Trie format: // [1: header] @@ -46,18 +47,18 @@ void WriteNode(TSink & sink, TrieChar baseChar, TValueList const & valueList, if (begChild == endChild && !isRoot) { // Leaf node. - valueList.Dump(sink); + valueList.Serialize(sink); return; } uint32_t const childCount = endChild - begChild; - uint32_t const valueCount = valueList.size(); + uint32_t const valueCount = valueList.Size(); uint8_t const header = static_cast<uint32_t>((min(valueCount, 3U) << 6) + min(childCount, 63U)); sink.Write(&header, 1); if (valueCount >= 3) WriteVarUint(sink, valueCount); if (childCount >= 63) WriteVarUint(sink, childCount); - valueList.Dump(sink); + valueList.Serialize(sink); for (TChildIter it = begChild; it != endChild; /*++it*/) { uint8_t header = (it->IsLeaf() ? 128 : 0); @@ -103,7 +104,7 @@ struct ChildInfo { bool m_isLeaf; uint32_t m_size; - buffer_vector<TrieChar, 8> m_edge; + vector<TrieChar> m_edge; ChildInfo(bool isLeaf, uint32_t size, TrieChar c) : m_isLeaf(isLeaf), m_size(size), m_edge(1, c) { @@ -121,19 +122,44 @@ struct NodeInfo uint64_t m_begPos; TrieChar m_char; vector<ChildInfo> m_children; + + // This is ugly but will do until we rename ValueList. + // Here is the rationale. ValueList<> is the entity that + // we store in the nodes of the search trie. It can be read + // or written via its methods but not directly as was assumed + // in a previous version of this code. That version provided + // serialization methods for ValueList but the deserialization + // was ad hoc. + // Because of the possibility of serialized ValueLists to represent + // something completely different from an array of FeatureIds + // (a compressed bit vector, for instance) and because of the + // need to update a node's ValueList until the node is finalized + // this vector is needed here. It is better to leave it here + // than to expose it in ValueList. + vector<typename TValueList::TValue> m_temporaryValueList; TValueList m_valueList; + bool m_mayAppend; NodeInfo() : m_begPos(0), m_char(0) {} - NodeInfo(uint64_t pos, TrieChar trieChar) : m_begPos(pos), m_char(trieChar) {} + NodeInfo(uint64_t pos, TrieChar trieChar) : m_begPos(pos), m_char(trieChar), m_mayAppend(true) {} + + // It is finalized in the sense that no more appends are possible + // so it is a fine moment to initialize the underlying ValueList. + void FinalizeValueList() + { + m_valueList.Init(m_temporaryValueList); + m_mayAppend = false; + } }; template <typename TSink, typename TValueList> -void WriteNodeReverse(TSink & sink, TrieChar baseChar, NodeInfo<TValueList> const & node, +void WriteNodeReverse(TSink & sink, TrieChar baseChar, NodeInfo<TValueList> & node, bool isRoot = false) { using TOutStorage = buffer_vector<uint8_t, 64>; TOutStorage out; PushBackByteSink<TOutStorage> outSink(out); + node.FinalizeValueList(); WriteNode(outSink, baseChar, node.m_valueList, node.m_children.rbegin(), node.m_children.rend(), isRoot); reverse(out.begin(), out.end()); @@ -150,33 +176,48 @@ void PopNodes(TSink & sink, TNodes & nodes, int nodesToPop) TNodeInfo & node = nodes.back(); TNodeInfo & prevNode = nodes[nodes.size() - 2]; - if (node.m_valueList.empty() && node.m_children.size() <= 1) + if (node.m_temporaryValueList.empty() && node.m_children.size() <= 1) { ASSERT_EQUAL(node.m_children.size(), 1, ()); ChildInfo & child = node.m_children[0]; - prevNode.m_children.push_back(ChildInfo(child.m_isLeaf, child.m_size, node.m_char)); - prevNode.m_children.back().m_edge.append(child.m_edge.begin(), child.m_edge.end()); + prevNode.m_children.emplace_back(child.m_isLeaf, child.m_size, node.m_char); + auto & prevChild = prevNode.m_children.back(); + prevChild.m_edge.insert(prevChild.m_edge.end(), child.m_edge.begin(), child.m_edge.end()); } else { WriteNodeReverse(sink, node.m_char, node); - prevNode.m_children.push_back(ChildInfo(node.m_children.empty(), - static_cast<uint32_t>(sink.Pos() - node.m_begPos), - node.m_char)); + prevNode.m_children.emplace_back( + node.m_children.empty(), static_cast<uint32_t>(sink.Pos() - node.m_begPos), node.m_char); } nodes.pop_back(); } } +template <typename TNodeInfo, typename TValue> +void AppendValue(TNodeInfo & node, TValue const & value) +{ + // External-memory trie adds <string, value> pairs in a sorted + // order so the values are supposed to be accumulated in the + // sorted order and we can avoid sorting them before doing + // further operations such as ValueList construction. + using namespace std::rel_ops; + ASSERT(node.m_temporaryValueList.empty() || node.m_temporaryValueList.back() <= value, ()); + if (!node.m_temporaryValueList.empty() && node.m_temporaryValueList.back() == value) + return; + ASSERT(node.m_mayAppend, ()); + node.m_temporaryValueList.push_back(value); +} + template <typename TSink, typename TIter, typename TValueList> void Build(TSink & sink, TIter const beg, TIter const end) { using TTrieString = buffer_vector<TrieChar, 32>; using TNodeInfo = NodeInfo<TValueList>; - buffer_vector<TNodeInfo, 32> nodes; - nodes.push_back(TNodeInfo(sink.Pos(), DEFAULT_CHAR)); + vector<TNodeInfo> nodes; + nodes.emplace_back(sink.Pos(), DEFAULT_CHAR); TTrieString prevKey; @@ -200,8 +241,8 @@ void Build(TSink & sink, TIter const beg, TIter const end) uint64_t const pos = sink.Pos(); for (size_t i = nCommon; i < key.size(); ++i) - nodes.push_back(TNodeInfo(pos, key[i])); - nodes.back().m_valueList.Append(e.GetValue()); + nodes.emplace_back(pos, key[i]); + AppendValue(nodes.back(), e.GetValue()); prevKey.swap(key); prevE.Swap(e); diff --git a/coding/trie_reader.hpp b/coding/trie_reader.hpp index 30c4596fce..f3978f460a 100644 --- a/coding/trie_reader.hpp +++ b/coding/trie_reader.hpp @@ -3,42 +3,38 @@ #include "coding/reader.hpp" #include "coding/varint.hpp" +#include "indexer/coding_params.hpp" +#include "indexer/string_file_values.hpp" + #include "base/assert.hpp" #include "base/bits.hpp" #include "base/macros.hpp" namespace trie { -template <class TValueReader> -class LeafIterator0 : public Iterator<typename TValueReader::ValueType> +template <class TValueList> +class LeafIterator0 : public Iterator<TValueList> { public: - using ValueType = typename TValueReader::ValueType; + using Iterator<TValueList>::m_valueList; template <class TReader> - LeafIterator0(TReader const & reader, TValueReader const & valueReader) + LeafIterator0(TReader const & reader, serial::CodingParams const & codingParams) { - uint32_t const size = static_cast<uint32_t>(reader.Size()); ReaderSource<TReader> src(reader); - while (src.Pos() < size) - { - this->m_value.push_back(ValueType()); -#ifdef DEBUG - uint64_t const pos = src.Pos(); -#endif - valueReader(src, this->m_value.back()); - ASSERT_NOT_EQUAL(pos, src.Pos(), ()); - } - ASSERT_EQUAL(size, src.Pos(), ()); + m_valueList.SetCodingParams(codingParams); + m_valueList.Deserialize(src); + // todo(@mpimenov) There used to be an assert here + // that src is completely exhausted by this time. } // trie::Iterator overrides: - unique_ptr<Iterator<ValueType>> Clone() const override + unique_ptr<Iterator<TValueList>> Clone() const override { - return make_unique<LeafIterator0<TValueReader>>(*this); + return make_unique<LeafIterator0<TValueList>>(*this); } - unique_ptr<Iterator<ValueType>> GoToEdge(size_t i) const override + unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const override { ASSERT(false, (i)); UNUSED_VALUE(i); @@ -46,70 +42,38 @@ public: } }; -template <class TReader, class TValueReader> -class IteratorImplBase : public Iterator<typename TValueReader::ValueType> -{ -protected: - enum { IS_READER_IN_MEMORY = 0 }; -}; - -template <class TValueReader> -class IteratorImplBase<SharedMemReader, TValueReader> - : public Iterator<typename TValueReader::ValueType> -{ -protected: - enum { IS_READER_IN_MEMORY = 1 }; -}; - -template <class TReader, class TValueReader> -class Iterator0 : public IteratorImplBase<TReader, TValueReader> +template <class TReader, class TValueList> +class Iterator0 : public Iterator<TValueList> { public: - typedef typename TValueReader::ValueType ValueType; + using Iterator<TValueList>::m_valueList; + using Iterator<TValueList>::m_edge; - Iterator0(TReader const & reader, TValueReader const & valueReader, TrieChar baseChar) - : m_reader(reader), m_valueReader(valueReader) + Iterator0(TReader const & reader, TrieChar baseChar, serial::CodingParams const & codingParams) + : m_reader(reader), m_codingParams(codingParams) { + m_valueList.SetCodingParams(m_codingParams); ParseNode(baseChar); } // trie::Iterator overrides: - unique_ptr<Iterator<ValueType>> Clone() const override + unique_ptr<Iterator<TValueList>> Clone() const override { - return make_unique<Iterator0<TReader, TValueReader>>(*this); + return make_unique<Iterator0<TReader, TValueList>>(*this); } - unique_ptr<Iterator<ValueType>> GoToEdge(size_t i) const override + unique_ptr<Iterator<TValueList>> GoToEdge(size_t i) const override { ASSERT_LESS(i, this->m_edge.size(), ()); uint32_t const offset = m_edgeInfo[i].m_offset; uint32_t const size = m_edgeInfo[i+1].m_offset - offset; - // TODO: Profile to check that MemReader optimization helps? - /* - if (!IteratorImplBase<TReader, TValueReader>::IS_READER_IN_MEMORY && - size < 1024) - { - SharedMemReader memReader(size); - m_reader.Read(offset, memReader.Data(), size); - if (m_edgeInfo[i].m_isLeaf) - return make_unique<LeafIterator0<SharedMemReader, TValueReader>>( - memReader, m_valueReader); - else - return make_unique<Iterator0<SharedMemReader, TValueReader>>( - memReader, m_valueReader, - this->m_edge[i].m_str.back()); - } - else - */ - { - if (m_edgeInfo[i].m_isLeaf) - return make_unique<LeafIterator0<TValueReader>>(m_reader.SubReader(offset, size), - m_valueReader); - else - return make_unique<Iterator0<TReader, TValueReader>>( - m_reader.SubReader(offset, size), m_valueReader, this->m_edge[i].m_str.back()); - } + if (m_edgeInfo[i].m_isLeaf) + return make_unique<LeafIterator0<TValueList>>(m_reader.SubReader(offset, size), + m_codingParams); + + return make_unique<Iterator0<TReader, TValueList>>( + m_reader.SubReader(offset, size), this->m_edge[i].m_str.back(), m_codingParams); } private: @@ -131,9 +95,7 @@ private: childCount = ReadVarUint<uint32_t>(src); // [value] ... [value] - this->m_value.resize(valueCount); - for (uint32_t i = 0; i < valueCount; ++i) - m_valueReader(src, this->m_value[i]); + m_valueList.Deserialize(src, valueCount); // [childInfo] ... [childInfo] this->m_edge.resize(childCount); @@ -141,7 +103,7 @@ private: m_edgeInfo[0].m_offset = 0; for (uint32_t i = 0; i < childCount; ++i) { - typename Iterator<ValueType>::Edge & e = this->m_edge[i]; + typename Iterator<TValueList>::Edge & e = this->m_edge[i]; // [1: header]: [1: isLeaf] [1: isShortEdge] [6: (edgeChar0 - baseChar) or min(edgeLen-1, 63)] uint8_t const header = ReadPrimitiveFromSource<uint8_t>(src); @@ -185,15 +147,15 @@ private: buffer_vector<EdgeInfo, 9> m_edgeInfo; TReader m_reader; - TValueReader m_valueReader; + serial::CodingParams m_codingParams; }; // Returns iterator to the root of the trie. -template <class TReader, class TValueReader> -unique_ptr<Iterator<typename TValueReader::ValueType>> ReadTrie( - TReader const & reader, TValueReader valueReader = TValueReader()) +template <class TReader, class TValueList> +unique_ptr<Iterator<TValueList>> ReadTrie(TReader const & reader, + serial::CodingParams const & codingParams) { - return make_unique<Iterator0<TReader, TValueReader>>(reader, valueReader, DEFAULT_CHAR); + return make_unique<Iterator0<TReader, TValueList>>(reader, DEFAULT_CHAR, codingParams); } } // namespace trie |