Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/search
diff options
context:
space:
mode:
authorMaxim Pimenov <m@maps.me>2018-06-29 17:11:05 +0300
committerTatiana Yan <tatiana.kondakova@gmail.com>2018-07-02 15:45:54 +0300
commit9f8508bb7bffc7ef2b7aba18fd46e26ff6fa0a79 (patch)
treee3b8f57bb2066799f88ce16eef049422128d8b99 /search
parentcb504d0504cc047f15b5e310092a44af7f9faa0e (diff)
[search] Broke the text index into multiple files.
Diffstat (limited to 'search')
-rw-r--r--search/CMakeLists.txt10
-rw-r--r--search/base/text_index.hpp429
-rw-r--r--search/base/text_index/dictionary.hpp118
-rw-r--r--search/base/text_index/header.cpp12
-rw-r--r--search/base/text_index/header.hpp59
-rw-r--r--search/base/text_index/mem.cpp37
-rw-r--r--search/base/text_index/mem.hpp179
-rw-r--r--search/base/text_index/reader.hpp77
-rw-r--r--search/base/text_index/text_index.cpp (renamed from search/base/text_index.cpp)5
-rw-r--r--search/base/text_index/text_index.hpp45
-rw-r--r--search/search_tests/text_index_tests.cpp4
11 files changed, 539 insertions, 436 deletions
diff --git a/search/CMakeLists.txt b/search/CMakeLists.txt
index 5ef9ab23c2..5fe22c5437 100644
--- a/search/CMakeLists.txt
+++ b/search/CMakeLists.txt
@@ -7,8 +7,14 @@ set(
approximate_string_match.hpp
base/inverted_list.hpp
base/mem_search_index.hpp
- base/text_index.cpp
- base/text_index.hpp
+ base/text_index/dictionary.hpp
+ base/text_index/header.cpp
+ base/text_index/header.hpp
+ base/text_index/mem.cpp
+ base/text_index/mem.hpp
+ base/text_index/reader.hpp
+ base/text_index/text_index.cpp
+ base/text_index/text_index.hpp
bookmarks/data.cpp
bookmarks/data.hpp
bookmarks/processor.cpp
diff --git a/search/base/text_index.hpp b/search/base/text_index.hpp
deleted file mode 100644
index 3f69898389..0000000000
--- a/search/base/text_index.hpp
+++ /dev/null
@@ -1,429 +0,0 @@
-#pragma once
-
-#include "coding/file_reader.hpp"
-#include "coding/reader.hpp"
-#include "coding/varint.hpp"
-#include "coding/write_to_sink.hpp"
-
-#include "base/assert.hpp"
-#include "base/checked_cast.hpp"
-#include "base/stl_helpers.hpp"
-#include "base/string_utils.hpp"
-
-#include <algorithm>
-#include <cstdint>
-#include <map>
-#include <memory>
-#include <string>
-#include <vector>
-
-// This file contains the structures needed to store an
-// updatable text index on disk.
-//
-// The index maps tokens of string type (typically std::string or
-// strings::UniString) to postings lists, i.e. to lists of entities
-// called postings that encode the locations of the strings in the collection
-// of the text documents that is being indexed. An example of a posting
-// is a document id (docid). Another example is a pair of a document id and
-// a position within the corresponding document.
-//
-// The updates are performed by rebuilding the index, either as a result
-// of merging several indexes together, or as a result of clearing outdated
-// entries from an old index.
-//
-// For version 0, the postings lists are docid arrays, i.e. arrays of unsigned
-// 32-bit integers stored in increasing order.
-// The structure of the index is:
-// [header: version and offsets]
-// [array containing the starting positions of tokens]
-// [tokens, written without separators in the lexicographical order]
-// [array containing the offsets for the postings lists]
-// [postings lists, stored as delta-encoded varints]
-//
-// All offsets are measured relative to the start of the index.
-namespace search
-{
-namespace base
-{
-using Token = std::string;
-using Posting = uint32_t;
-
-enum class TextIndexVersion : uint8_t
-{
- V0 = 0,
- Latest = V0
-};
-
-struct TextIndexHeader
-{
- template <typename Sink>
- void Serialize(Sink & sink) const
- {
- CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
-
- sink.Write(kHeaderMagic.data(), kHeaderMagic.size());
- WriteToSink(sink, static_cast<uint8_t>(m_version));
- WriteToSink(sink, m_numTokens);
- WriteToSink(sink, m_dictPositionsOffset);
- WriteToSink(sink, m_dictWordsOffset);
- WriteToSink(sink, m_postingsStartsOffset);
- WriteToSink(sink, m_postingsListsOffset);
- }
-
- template <typename Source>
- void Deserialize(Source & source)
- {
- CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
-
- std::string headerMagic(kHeaderMagic.size(), ' ');
- source.Read(&headerMagic[0], headerMagic.size());
- CHECK_EQUAL(headerMagic, kHeaderMagic, ());
- m_version = static_cast<TextIndexVersion>(ReadPrimitiveFromSource<uint8_t>(source));
- CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
- m_numTokens = ReadPrimitiveFromSource<uint32_t>(source);
- m_dictPositionsOffset = ReadPrimitiveFromSource<uint32_t>(source);
- m_dictWordsOffset = ReadPrimitiveFromSource<uint32_t>(source);
- m_postingsStartsOffset = ReadPrimitiveFromSource<uint32_t>(source);
- m_postingsListsOffset = ReadPrimitiveFromSource<uint32_t>(source);
- }
-
- static std::string const kHeaderMagic;
- TextIndexVersion m_version = TextIndexVersion::Latest;
- uint32_t m_numTokens = 0;
- uint32_t m_dictPositionsOffset = 0;
- uint32_t m_dictWordsOffset = 0;
- uint32_t m_postingsStartsOffset = 0;
- uint32_t m_postingsListsOffset = 0;
-};
-
-// The dictionary contains all tokens that are present
-// in the text index.
-class TextIndexDictionary
-{
-public:
- bool GetTokenId(Token const & token, size_t & id) const
- {
- auto const it = std::lower_bound(m_tokens.cbegin(), m_tokens.cend(), token);
- if (it == m_tokens.cend() || *it != token)
- return false;
- id = ::base::checked_cast<size_t>(std::distance(m_tokens.cbegin(), it));
- return true;
- }
-
- void SetTokens(std::vector<Token> && tokens)
- {
- ASSERT(std::is_sorted(tokens.begin(), tokens.end()), ());
- m_tokens = std::move(tokens);
- }
- std::vector<Token> const & GetTokens() const { return m_tokens; }
-
- template <typename Sink>
- void Serialize(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
- {
- header.m_numTokens = ::base::checked_cast<uint32_t>(m_tokens.size());
-
- header.m_dictPositionsOffset = RelativePos(sink, startPos);
- // An uint32_t for each 32-bit offset and an uint32_t for the dummy entry at the end.
- WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));
- header.m_dictWordsOffset = RelativePos(sink, startPos);
-
- std::vector<uint32_t> offsets;
- offsets.reserve(header.m_numTokens + 1);
- for (auto const & token : m_tokens)
- {
- offsets.emplace_back(RelativePos(sink, startPos));
- SerializeToken(sink, token);
- }
- offsets.emplace_back(RelativePos(sink, startPos));
-
- {
- uint64_t const savedPos = sink.Pos();
- sink.Seek(startPos + header.m_dictPositionsOffset);
-
- for (uint32_t const o : offsets)
- WriteToSink(sink, o);
-
- CHECK_EQUAL(sink.Pos(), startPos + header.m_dictWordsOffset, ());
- sink.Seek(savedPos);
- }
- }
-
- template <typename Source>
- void Deserialize(Source & source, TextIndexHeader const & header)
- {
- auto const startPos = source.Pos();
-
- std::vector<uint32_t> tokenOffsets(header.m_numTokens + 1);
- for (uint32_t & offset : tokenOffsets)
- offset = ReadPrimitiveFromSource<uint32_t>(source);
-
- uint64_t const expectedSize = header.m_dictWordsOffset - header.m_dictPositionsOffset;
- CHECK_EQUAL(source.Pos(), startPos + expectedSize, ());
- m_tokens.resize(header.m_numTokens);
- for (size_t i = 0; i < m_tokens.size(); ++i)
- {
- size_t const size = ::base::checked_cast<size_t>(tokenOffsets[i + 1] - tokenOffsets[i]);
- DeserializeToken(source, m_tokens[i], size);
- }
- }
-
-private:
- template <typename Sink>
- static void SerializeToken(Sink & sink, Token const & token)
- {
- CHECK(!token.empty(), ());
- // todo(@m) Endianness.
- sink.Write(token.data(), token.size() * sizeof(typename Token::value_type));
- }
-
- template <typename Source>
- static void DeserializeToken(Source & source, Token & token, size_t size)
- {
- CHECK_GREATER(size, 0, ());
- ASSERT_EQUAL(size % sizeof(typename Token::value_type), 0, ());
- token.resize(size / sizeof(typename Token::value_type));
- source.Read(&token[0], size);
- }
-
- template <typename Sink>
- static uint32_t RelativePos(Sink & sink, uint64_t startPos)
- {
- return ::base::checked_cast<uint32_t>(sink.Pos() - startPos);
- }
-
- std::vector<Token> m_tokens;
-};
-
-class MemTextIndex
-{
-public:
- MemTextIndex() = default;
-
- void AddPosting(Token const & token, Posting const & posting)
- {
- m_postingsByToken[token].emplace_back(posting);
- }
-
- // Executes |fn| on every posting associated with |token|.
- // The order of postings is not specified.
- template <typename Fn>
- void ForEachPosting(Token const & token, Fn && fn) const
- {
- auto const it = m_postingsByToken.find(token);
- if (it == m_postingsByToken.end())
- return;
- for (auto const p : it->second)
- fn(p);
- }
-
- template <typename Fn>
- void ForEachPosting(strings::UniString const & token, Fn && fn) const
- {
- auto const utf8s = strings::ToUtf8(token);
- ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
- }
-
- template <typename Sink>
- void Serialize(Sink & sink)
- {
- SortPostings();
- BuildDictionary();
-
- TextIndexHeader header;
-
- uint64_t const startPos = sink.Pos();
- // Will be filled in later.
- header.Serialize(sink);
-
- SerializeDictionary(sink, header, startPos);
- SerializePostingsLists(sink, header, startPos);
-
- uint64_t const finishPos = sink.Pos();
- sink.Seek(startPos);
- header.Serialize(sink);
- sink.Seek(finishPos);
- }
-
- template <typename Source>
- void Deserialize(Source & source)
- {
- uint64_t startPos = source.Pos();
-
- TextIndexHeader header;
- header.Deserialize(source);
-
- DeserializeDictionary(source, header, startPos);
- DeserializePostingsLists(source, header, startPos);
- }
-
-private:
- void SortPostings()
- {
- for (auto & entry : m_postingsByToken)
- {
- // A posting may occur several times in a document,
- // so we remove duplicates for the docid index.
- // If the count is needed for ranking it may be stored
- // separately.
- my::SortUnique(entry.second);
- }
- }
-
- void BuildDictionary()
- {
- std::vector<Token> tokens;
- tokens.reserve(m_postingsByToken.size());
- for (auto const & entry : m_postingsByToken)
- tokens.emplace_back(entry.first);
- m_dictionary.SetTokens(std::move(tokens));
- }
-
- template <typename Sink>
- void SerializeDictionary(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
- {
- m_dictionary.Serialize(sink, header, startPos);
- }
-
- template <typename Source>
- void DeserializeDictionary(Source & source, TextIndexHeader const & header, uint64_t startPos)
- {
- CHECK_EQUAL(source.Pos(), startPos + header.m_dictPositionsOffset, ());
- m_dictionary.Deserialize(source, header);
- }
-
- template <typename Sink>
- void SerializePostingsLists(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
- {
- header.m_postingsStartsOffset = RelativePos(sink, startPos);
- // An uint32_t for each 32-bit offset and an uint32_t for the dummy entry at the end.
- WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));
-
- header.m_postingsListsOffset = RelativePos(sink, startPos);
-
- std::vector<uint32_t> postingsStarts;
- postingsStarts.reserve(header.m_numTokens);
- for (auto const & entry : m_postingsByToken)
- {
- auto const & postings = entry.second;
-
- postingsStarts.emplace_back(RelativePos(sink, startPos));
-
- uint32_t last = 0;
- for (auto const p : postings)
- {
- CHECK(last == 0 || last < p, (last, p));
- uint32_t const delta = p - last;
- WriteVarUint(sink, delta);
- last = p;
- }
- }
- // One more for convenience.
- postingsStarts.emplace_back(RelativePos(sink, startPos));
-
- {
- uint64_t const savedPos = sink.Pos();
- sink.Seek(startPos + header.m_postingsStartsOffset);
- for (uint32_t const s : postingsStarts)
- WriteToSink(sink, s);
-
- CHECK_EQUAL(sink.Pos(), startPos + header.m_postingsListsOffset, ());
- sink.Seek(savedPos);
- }
- }
-
- template <typename Source>
- void DeserializePostingsLists(Source & source, TextIndexHeader const & header, uint64_t startPos)
- {
- CHECK_EQUAL(source.Pos(), startPos + header.m_postingsStartsOffset, ());
- std::vector<uint32_t> postingsStarts(header.m_numTokens + 1);
- for (uint32_t & start : postingsStarts)
- start = ReadPrimitiveFromSource<uint32_t>(source);
-
- auto const & tokens = m_dictionary.GetTokens();
- CHECK_EQUAL(source.Pos(), startPos + header.m_postingsListsOffset, ());
- m_postingsByToken.clear();
- for (size_t i = 0; i < header.m_numTokens; ++i)
- {
- std::vector<uint32_t> postings;
- uint32_t last = 0;
- while (source.Pos() < startPos + postingsStarts[i + 1])
- {
- last += ReadVarUint<uint32_t>(source);
- postings.emplace_back(last);
- }
- CHECK_EQUAL(source.Pos(), postingsStarts[i + 1], ());
-
- m_postingsByToken.emplace(tokens[i], postings);
- }
- }
-
- template <typename Sink>
- static uint32_t RelativePos(Sink & sink, uint64_t startPos)
- {
- return ::base::checked_cast<uint32_t>(sink.Pos() - startPos);
- }
-
- std::map<Token, std::vector<Posting>> m_postingsByToken;
- TextIndexDictionary m_dictionary;
-};
-
-// A reader class for on-demand reading of postings lists from disk.
-class TextIndexReader
-{
-public:
- explicit TextIndexReader(FileReader const & fileReader) : m_fileReader(fileReader)
- {
- TextIndexHeader header;
- ReaderSource<FileReader> headerSource(m_fileReader);
- header.Deserialize(headerSource);
-
- uint64_t const dictStart = header.m_dictPositionsOffset;
- uint64_t const dictEnd = header.m_postingsStartsOffset;
- ReaderSource<FileReader> dictSource(m_fileReader.SubReader(dictStart, dictEnd - dictStart));
- m_dictionary.Deserialize(dictSource, header);
-
- uint64_t const postStart = header.m_postingsStartsOffset;
- uint64_t const postEnd = header.m_postingsListsOffset;
- ReaderSource<FileReader> postingsSource(m_fileReader.SubReader(postStart, postEnd - postStart));
- m_postingsStarts.resize(header.m_numTokens + 1);
- for (uint32_t & start : m_postingsStarts)
- start = ReadPrimitiveFromSource<uint32_t>(postingsSource);
- }
-
- // Executes |fn| on every posting associated with |token|.
- // The order of postings is not specified.
- template <typename Fn>
- void ForEachPosting(Token const & token, Fn && fn) const
- {
- size_t tokenId = 0;
- if (!m_dictionary.GetTokenId(token, tokenId))
- return;
- CHECK_LESS(tokenId + 1, m_postingsStarts.size(), ());
-
- ReaderSource<FileReader> source(m_fileReader.SubReader(
- m_postingsStarts[tokenId], m_postingsStarts[tokenId + 1] - m_postingsStarts[tokenId]));
-
- uint32_t last = 0;
- while (source.Size() > 0)
- {
- last += ReadVarUint<uint32_t>(source);
- fn(last);
- }
- }
-
- template <typename Fn>
- void ForEachPosting(strings::UniString const & token, Fn && fn) const
- {
- auto const utf8s = strings::ToUtf8(token);
- ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
- }
-
-private:
- FileReader m_fileReader;
- TextIndexDictionary m_dictionary;
- std::vector<uint32_t> m_postingsStarts;
-};
-
-std::string DebugPrint(TextIndexVersion const & version);
-} // namespace base
-} // namespace search
diff --git a/search/base/text_index/dictionary.hpp b/search/base/text_index/dictionary.hpp
new file mode 100644
index 0000000000..01670d2f9a
--- /dev/null
+++ b/search/base/text_index/dictionary.hpp
@@ -0,0 +1,118 @@
+#pragma once
+
+#include "search/base/text_index/header.hpp"
+#include "search/base/text_index/text_index.hpp"
+
+#include "coding/write_to_sink.hpp"
+
+#include "base/assert.hpp"
+#include "base/checked_cast.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <vector>
+
+namespace search
+{
+namespace base
+{
+// The dictionary contains all tokens that are present
+// in the text index.
+class TextIndexDictionary
+{
+public:
+ bool GetTokenId(Token const & token, size_t & id) const
+ {
+ auto const it = std::lower_bound(m_tokens.cbegin(), m_tokens.cend(), token);
+ if (it == m_tokens.cend() || *it != token)
+ return false;
+ id = ::base::checked_cast<uint32_t>(std::distance(m_tokens.cbegin(), it));
+ return true;
+ }
+
+ void SetTokens(std::vector<Token> && tokens)
+ {
+ ASSERT(std::is_sorted(tokens.begin(), tokens.end()), ());
+ m_tokens = std::move(tokens);
+ }
+
+ std::vector<Token> const & GetTokens() const { return m_tokens; }
+
+ template <typename Sink>
+ void Serialize(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
+ {
+ header.m_numTokens = ::base::checked_cast<uint32_t>(m_tokens.size());
+
+ header.m_dictPositionsOffset = RelativePos(sink, startPos);
+ // An uint32_t for each 32-bit offset and an uint32_t for the dummy entry at the end.
+ WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));
+ header.m_dictWordsOffset = RelativePos(sink, startPos);
+
+ std::vector<uint32_t> offsets;
+ offsets.reserve(header.m_numTokens + 1);
+ for (auto const & token : m_tokens)
+ {
+ offsets.emplace_back(RelativePos(sink, startPos));
+ SerializeToken(sink, token);
+ }
+ offsets.emplace_back(RelativePos(sink, startPos));
+
+ {
+ uint64_t const savedPos = sink.Pos();
+ sink.Seek(startPos + header.m_dictPositionsOffset);
+
+ for (uint32_t const o : offsets)
+ WriteToSink(sink, o);
+
+ CHECK_EQUAL(sink.Pos(), startPos + header.m_dictWordsOffset, ());
+ sink.Seek(savedPos);
+ }
+ }
+
+ template <typename Source>
+ void Deserialize(Source & source, TextIndexHeader const & header)
+ {
+ auto const startPos = source.Pos();
+
+ std::vector<uint32_t> tokenOffsets(header.m_numTokens + 1);
+ for (uint32_t & offset : tokenOffsets)
+ offset = ReadPrimitiveFromSource<uint32_t>(source);
+
+ uint64_t const expectedSize = header.m_dictWordsOffset - header.m_dictPositionsOffset;
+ CHECK_EQUAL(source.Pos(), startPos + expectedSize, ());
+ m_tokens.resize(header.m_numTokens);
+ for (size_t i = 0; i < m_tokens.size(); ++i)
+ {
+ size_t const size = ::base::checked_cast<size_t>(tokenOffsets[i + 1] - tokenOffsets[i]);
+ DeserializeToken(source, m_tokens[i], size);
+ }
+ }
+
+private:
+ template <typename Sink>
+ static void SerializeToken(Sink & sink, Token const & token)
+ {
+ CHECK(!token.empty(), ());
+ // todo(@m) Endianness.
+ sink.Write(token.data(), token.size() * sizeof(typename Token::value_type));
+ }
+
+ template <typename Source>
+ static void DeserializeToken(Source & source, Token & token, size_t size)
+ {
+ CHECK_GREATER(size, 0, ());
+ ASSERT_EQUAL(size % sizeof(typename Token::value_type), 0, ());
+ token.resize(size / sizeof(typename Token::value_type));
+ source.Read(&token[0], size);
+ }
+
+ template <typename Sink>
+ static uint32_t RelativePos(Sink & sink, uint64_t startPos)
+ {
+ return ::base::checked_cast<uint32_t>(sink.Pos() - startPos);
+ }
+
+ std::vector<Token> m_tokens;
+};
+} // namespace base
+} // namespace search
diff --git a/search/base/text_index/header.cpp b/search/base/text_index/header.cpp
new file mode 100644
index 0000000000..c0f987e16f
--- /dev/null
+++ b/search/base/text_index/header.cpp
@@ -0,0 +1,12 @@
+#include "search/base/text_index/header.hpp"
+
+using namespace std;
+
+namespace search
+{
+namespace base
+{
+// static
+string const TextIndexHeader::kHeaderMagic = "mapsmetextidx";
+} // namespace base
+} // namespace search
diff --git a/search/base/text_index/header.hpp b/search/base/text_index/header.hpp
new file mode 100644
index 0000000000..71b0eb7df6
--- /dev/null
+++ b/search/base/text_index/header.hpp
@@ -0,0 +1,59 @@
+#pragma once
+
+#include "search/base/text_index/text_index.hpp"
+
+#include "coding/reader.hpp"
+#include "coding/write_to_sink.hpp"
+
+#include "base/assert.hpp"
+
+#include <cstdint>
+#include <string>
+
+namespace search
+{
+namespace base
+{
+struct TextIndexHeader
+{
+ template <typename Sink>
+ void Serialize(Sink & sink) const
+ {
+ CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
+
+ sink.Write(kHeaderMagic.data(), kHeaderMagic.size());
+ WriteToSink(sink, static_cast<uint8_t>(m_version));
+ WriteToSink(sink, m_numTokens);
+ WriteToSink(sink, m_dictPositionsOffset);
+ WriteToSink(sink, m_dictWordsOffset);
+ WriteToSink(sink, m_postingsStartsOffset);
+ WriteToSink(sink, m_postingsListsOffset);
+ }
+
+ template <typename Source>
+ void Deserialize(Source & source)
+ {
+ CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
+
+ std::string headerMagic(kHeaderMagic.size(), ' ');
+ source.Read(&headerMagic[0], headerMagic.size());
+ CHECK_EQUAL(headerMagic, kHeaderMagic, ());
+ m_version = static_cast<TextIndexVersion>(ReadPrimitiveFromSource<uint8_t>(source));
+ CHECK_EQUAL(m_version, TextIndexVersion::V0, ());
+ m_numTokens = ReadPrimitiveFromSource<uint32_t>(source);
+ m_dictPositionsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+ m_dictWordsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+ m_postingsStartsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+ m_postingsListsOffset = ReadPrimitiveFromSource<uint32_t>(source);
+ }
+
+ static std::string const kHeaderMagic;
+ TextIndexVersion m_version = TextIndexVersion::Latest;
+ uint32_t m_numTokens = 0;
+ uint32_t m_dictPositionsOffset = 0;
+ uint32_t m_dictWordsOffset = 0;
+ uint32_t m_postingsStartsOffset = 0;
+ uint32_t m_postingsListsOffset = 0;
+};
+} // namespace base
+} // namespace search
diff --git a/search/base/text_index/mem.cpp b/search/base/text_index/mem.cpp
new file mode 100644
index 0000000000..4c18f3b90f
--- /dev/null
+++ b/search/base/text_index/mem.cpp
@@ -0,0 +1,37 @@
+#include "search/base/text_index/mem.hpp"
+
+#include "base/stl_helpers.hpp"
+
+using namespace std;
+
+namespace search
+{
+namespace base
+{
+void MemTextIndex::AddPosting(Token const & token, Posting const & posting)
+{
+ m_postingsByToken[token].emplace_back(posting);
+}
+
+void MemTextIndex::SortPostings()
+{
+ for (auto & entry : m_postingsByToken)
+ {
+ // A posting may occur several times in a document,
+ // so we remove duplicates for the docid index.
+ // If the count is needed for ranking it may be stored
+ // separately.
+ my::SortUnique(entry.second);
+ }
+}
+
+void MemTextIndex::BuildDictionary()
+{
+ vector<Token> tokens;
+ tokens.reserve(m_postingsByToken.size());
+ for (auto const & entry : m_postingsByToken)
+ tokens.emplace_back(entry.first);
+ m_dictionary.SetTokens(move(tokens));
+}
+} // namespace base
+} // namespace search
diff --git a/search/base/text_index/mem.hpp b/search/base/text_index/mem.hpp
new file mode 100644
index 0000000000..a0611c318a
--- /dev/null
+++ b/search/base/text_index/mem.hpp
@@ -0,0 +1,179 @@
+#pragma once
+
+#include "search/base/text_index/dictionary.hpp"
+#include "search/base/text_index/header.hpp"
+#include "search/base/text_index/text_index.hpp"
+
+#include "coding/reader.hpp"
+#include "coding/varint.hpp"
+#include "coding/write_to_sink.hpp"
+
+#include "base/assert.hpp"
+#include "base/checked_cast.hpp"
+#include "base/string_utils.hpp"
+
+#include <algorithm>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+namespace search
+{
+namespace base
+{
+class MemTextIndex
+{
+public:
+ MemTextIndex() = default;
+
+ void AddPosting(Token const & token, Posting const & posting);
+
+ // Executes |fn| on every posting associated with |token|.
+ // The order of postings is not specified.
+ template <typename Fn>
+ void ForEachPosting(Token const & token, Fn && fn) const
+ {
+ auto const it = m_postingsByToken.find(token);
+ if (it == m_postingsByToken.end())
+ return;
+ for (auto const p : it->second)
+ fn(p);
+ }
+
+ template <typename Fn>
+ void ForEachPosting(strings::UniString const & token, Fn && fn) const
+ {
+ auto const utf8s = strings::ToUtf8(token);
+ ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+ }
+
+ template <typename Sink>
+ void Serialize(Sink & sink)
+ {
+ SortPostings();
+ BuildDictionary();
+
+ TextIndexHeader header;
+
+ uint64_t const startPos = sink.Pos();
+ // Will be filled in later.
+ header.Serialize(sink);
+
+ SerializeDictionary(sink, header, startPos);
+ SerializePostingsLists(sink, header, startPos);
+
+ uint64_t const finishPos = sink.Pos();
+ sink.Seek(startPos);
+ header.Serialize(sink);
+ sink.Seek(finishPos);
+ }
+
+ template <typename Source>
+ void Deserialize(Source & source)
+ {
+ uint64_t startPos = source.Pos();
+
+ TextIndexHeader header;
+ header.Deserialize(source);
+
+ DeserializeDictionary(source, header, startPos);
+ DeserializePostingsLists(source, header, startPos);
+ }
+
+private:
+ void SortPostings();
+
+ void BuildDictionary();
+
+ template <typename Sink>
+ void SerializeDictionary(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
+ {
+ m_dictionary.Serialize(sink, header, startPos);
+ }
+
+ template <typename Source>
+ void DeserializeDictionary(Source & source, TextIndexHeader const & header, uint64_t startPos)
+ {
+ CHECK_EQUAL(source.Pos(), startPos + header.m_dictPositionsOffset, ());
+ m_dictionary.Deserialize(source, header);
+ }
+
+ template <typename Sink>
+ void SerializePostingsLists(Sink & sink, TextIndexHeader & header, uint64_t startPos) const
+ {
+ header.m_postingsStartsOffset = RelativePos(sink, startPos);
+ // An uint32_t for each 32-bit offset and an uint32_t for the dummy entry at the end.
+ WriteZeroesToSink(sink, sizeof(uint32_t) * (header.m_numTokens + 1));
+
+ header.m_postingsListsOffset = RelativePos(sink, startPos);
+
+ std::vector<uint32_t> postingsStarts;
+ postingsStarts.reserve(header.m_numTokens);
+ for (auto const & entry : m_postingsByToken)
+ {
+ auto const & postings = entry.second;
+
+ postingsStarts.emplace_back(RelativePos(sink, startPos));
+
+ uint32_t last = 0;
+ for (auto const p : postings)
+ {
+ CHECK(last == 0 || last < p, (last, p));
+ uint32_t const delta = p - last;
+ WriteVarUint(sink, delta);
+ last = p;
+ }
+ }
+ // One more for convenience.
+ postingsStarts.emplace_back(RelativePos(sink, startPos));
+
+ {
+ uint64_t const savedPos = sink.Pos();
+ sink.Seek(startPos + header.m_postingsStartsOffset);
+ for (uint32_t const s : postingsStarts)
+ WriteToSink(sink, s);
+
+ CHECK_EQUAL(sink.Pos(), startPos + header.m_postingsListsOffset, ());
+ sink.Seek(savedPos);
+ }
+ }
+
+ template <typename Source>
+ void DeserializePostingsLists(Source & source, TextIndexHeader const & header, uint64_t startPos)
+ {
+ CHECK_EQUAL(source.Pos(), startPos + header.m_postingsStartsOffset, ());
+ std::vector<uint32_t> postingsStarts(header.m_numTokens + 1);
+ for (uint32_t & start : postingsStarts)
+ start = ReadPrimitiveFromSource<uint32_t>(source);
+
+ auto const & tokens = m_dictionary.GetTokens();
+ CHECK_EQUAL(source.Pos(), startPos + header.m_postingsListsOffset, ());
+ m_postingsByToken.clear();
+ for (size_t i = 0; i < header.m_numTokens; ++i)
+ {
+ std::vector<uint32_t> postings;
+ uint32_t last = 0;
+ while (source.Pos() < startPos + postingsStarts[i + 1])
+ {
+ last += ReadVarUint<uint32_t>(source);
+ postings.emplace_back(last);
+ }
+ CHECK_EQUAL(source.Pos(), postingsStarts[i + 1], ());
+
+ m_postingsByToken.emplace(tokens[i], postings);
+ }
+ }
+
+ template <typename Sink>
+ static uint32_t RelativePos(Sink & sink, uint64_t startPos)
+ {
+ return ::base::checked_cast<uint32_t>(sink.Pos() - startPos);
+ }
+
+ std::map<Token, std::vector<Posting>> m_postingsByToken;
+ TextIndexDictionary m_dictionary;
+};
+} // namespace base
+} // namespace search
diff --git a/search/base/text_index/reader.hpp b/search/base/text_index/reader.hpp
new file mode 100644
index 0000000000..47609982f6
--- /dev/null
+++ b/search/base/text_index/reader.hpp
@@ -0,0 +1,77 @@
+#pragma once
+
+#include "search/base/text_index/text_index.hpp"
+
+#include "coding/file_reader.hpp"
+#include "coding/reader.hpp"
+#include "coding/varint.hpp"
+
+#include "base/assert.hpp"
+#include "base/string_utils.hpp"
+
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace search
+{
+namespace base
+{
+// A reader class for on-demand reading of postings lists from disk.
+class TextIndexReader
+{
+public:
+ explicit TextIndexReader(FileReader const & fileReader) : m_fileReader(fileReader)
+ {
+ ReaderSource<FileReader> headerSource(m_fileReader);
+ TextIndexHeader header;
+ header.Deserialize(headerSource);
+
+ uint64_t const dictStart = header.m_dictPositionsOffset;
+ uint64_t const dictEnd = header.m_postingsStartsOffset;
+ ReaderSource<FileReader> dictSource(m_fileReader.SubReader(dictStart, dictEnd - dictStart));
+ m_dictionary.Deserialize(dictSource, header);
+
+ uint64_t const postStart = header.m_postingsStartsOffset;
+ uint64_t const postEnd = header.m_postingsListsOffset;
+ ReaderSource<FileReader> postingsSource(m_fileReader.SubReader(postStart, postEnd - postStart));
+ m_postingsStarts.resize(header.m_numTokens + 1);
+ for (uint32_t & start : m_postingsStarts)
+ start = ReadPrimitiveFromSource<uint32_t>(postingsSource);
+ }
+
+ // Executes |fn| on every posting associated with |token|.
+ // The order of postings is not specified.
+ template <typename Fn>
+ void ForEachPosting(Token const & token, Fn && fn) const
+ {
+ size_t tokenId = 0;
+ if (!m_dictionary.GetTokenId(token, tokenId))
+ return;
+ CHECK_LESS(tokenId + 1, m_postingsStarts.size(), ());
+
+ ReaderSource<FileReader> source(m_fileReader.SubReader(
+ m_postingsStarts[tokenId], m_postingsStarts[tokenId + 1] - m_postingsStarts[tokenId]));
+
+ uint32_t last = 0;
+ while (source.Size() > 0)
+ {
+ last += ReadVarUint<uint32_t>(source);
+ fn(last);
+ }
+ }
+
+ template <typename Fn>
+ void ForEachPosting(strings::UniString const & token, Fn && fn) const
+ {
+ auto const utf8s = strings::ToUtf8(token);
+ ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+ }
+
+private:
+ FileReader m_fileReader;
+ TextIndexDictionary m_dictionary;
+ std::vector<uint32_t> m_postingsStarts;
+};
+} // namespace base
+} // namespace search
diff --git a/search/base/text_index.cpp b/search/base/text_index/text_index.cpp
index 9b66ea5b30..d9ea9ef562 100644
--- a/search/base/text_index.cpp
+++ b/search/base/text_index/text_index.cpp
@@ -1,4 +1,4 @@
-#include "search/base/text_index.hpp"
+#include "search/base/text_index/text_index.hpp"
#include "base/assert.hpp"
#include "base/string_utils.hpp"
@@ -9,9 +9,6 @@ namespace search
{
namespace base
{
-// static
-string const TextIndexHeader::kHeaderMagic = "mapsmetextidx";
-
string DebugPrint(TextIndexVersion const & version)
{
switch (version)
diff --git a/search/base/text_index/text_index.hpp b/search/base/text_index/text_index.hpp
new file mode 100644
index 0000000000..eeff695ca5
--- /dev/null
+++ b/search/base/text_index/text_index.hpp
@@ -0,0 +1,45 @@
+#pragma once
+
+#include <cstdint>
+#include <string>
+
+// This file contains the structures needed to store an
+// updatable text index on disk.
+//
+// The index maps tokens of string type (typically std::string or
+// strings::UniString) to postings lists, i.e. to lists of entities
+// called postings that encode the locations of the strings in the collection
+// of the text documents that is being indexed. An example of a posting
+// is a document id (docid). Another example is a pair of a document id and
+// a position within the corresponding document.
+//
+// The updates are performed by rebuilding the index, either as a result
+// of merging several indexes together, or as a result of clearing outdated
+// entries from an old index.
+//
+// For version 0, the postings lists are docid arrays, i.e. arrays of unsigned
+// 32-bit integers stored in increasing order.
+// The structure of the index is:
+// [header: version and offsets]
+// [array containing the starting positions of tokens]
+// [tokens, written without separators in the lexicographical order]
+// [array containing the offsets for the postings lists]
+// [postings lists, stored as delta-encoded varints]
+//
+// All offsets are measured relative to the start of the index.
+namespace search
+{
+namespace base
+{
+using Token = std::string;
+using Posting = uint32_t;
+
+enum class TextIndexVersion : uint8_t
+{
+ V0 = 0,
+ Latest = V0
+};
+
+std::string DebugPrint(TextIndexVersion const & version);
+} // namespace base
+} // namespace search
diff --git a/search/search_tests/text_index_tests.cpp b/search/search_tests/text_index_tests.cpp
index 3b29218b21..15cbaa70b3 100644
--- a/search/search_tests/text_index_tests.cpp
+++ b/search/search_tests/text_index_tests.cpp
@@ -1,6 +1,8 @@
#include "testing/testing.hpp"
-#include "search/base/text_index.hpp"
+#include "search/base/text_index/mem.hpp"
+#include "search/base/text_index/reader.hpp"
+#include "search/base/text_index/text_index.hpp"
#include "indexer/search_string_utils.hpp"