
github.com/mapsme/omim.git
author     Maxim Pimenov <m@maps.me>                   2018-06-29 16:26:21 +0300
committer  Tatiana Yan <tatiana.kondakova@gmail.com>   2018-07-02 15:45:54 +0300
commit     cb504d0504cc047f15b5e310092a44af7f9faa0e (patch)
tree       da33085f673a93c198f54d85d55084d859ee1cc0 /search
parent     7ec094b9f4451fd3ef393e35ceded580d18961ca (diff)
[search] Got rid of templated Token type in the text index.
The tokens are now stored as UTF8-encoded strings.
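
In practice this means that MemTextIndex, TextIndexDictionary and TextIndexReader lose their Token template parameter and fix the token type to a UTF-8 std::string; callers holding a strings::UniString go through a thin ForEachPosting overload that converts with strings::ToUtf8. A minimal usage sketch of the new interface (the main() driver and the include paths are illustrative assumptions, not part of the commit; the calls themselves are the ones visible in the diff below):

#include "search/base/text_index.hpp"  // path as shown in this diff
#include "base/string_utils.hpp"       // assumed location of strings::MakeUniString / strings::ToUtf8

#include <cstdint>
#include <iostream>
#include <string>

using namespace search::base;

int main()
{
  MemTextIndex index;  // previously MemTextIndex<Token>

  // Tokens are now plain UTF-8 std::strings.
  index.AddPosting(std::string("â"), 0 /* docId */);
  index.AddPosting(std::string("ç"), 1 /* docId */);

  // Query with a UTF-8 string...
  index.ForEachPosting("â", [](uint32_t docId) { std::cout << docId << '\n'; });

  // ...or with a strings::UniString; the new overload converts it via strings::ToUtf8.
  index.ForEachPosting(strings::MakeUniString("ç"),
                       [](uint32_t docId) { std::cout << docId << '\n'; });

  return 0;
}
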
Diffstat (limited to 'search')
-rw-r--r--  search/base/text_index.hpp                22
-rw-r--r--  search/search_tests/text_index_tests.cpp  24
2 files changed, 27 insertions, 19 deletions
diff --git a/search/base/text_index.hpp b/search/base/text_index.hpp
index 6f546610cf..3f69898389 100644
--- a/search/base/text_index.hpp
+++ b/search/base/text_index.hpp
@@ -45,6 +45,7 @@ namespace search
{
namespace base
{
+using Token = std::string;
using Posting = uint32_t;
enum class TextIndexVersion : uint8_t
@@ -97,7 +98,6 @@ struct TextIndexHeader
// The dictionary contains all tokens that are present
// in the text index.
-template <typename Token>
class TextIndexDictionary
{
public:
@@ -194,7 +194,6 @@ private:
std::vector<Token> m_tokens;
};
-template <typename Token>
class MemTextIndex
{
public:
@@ -217,6 +216,13 @@ public:
fn(p);
}
+ template <typename Fn>
+ void ForEachPosting(strings::UniString const & token, Fn && fn) const
+ {
+ auto const utf8s = strings::ToUtf8(token);
+ ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+ }
+
template <typename Sink>
void Serialize(Sink & sink)
{
@@ -358,11 +364,10 @@ private:
}
std::map<Token, std::vector<Posting>> m_postingsByToken;
- TextIndexDictionary<Token> m_dictionary;
+ TextIndexDictionary m_dictionary;
};
// A reader class for on-demand reading of postings lists from disk.
-template <typename Token>
class TextIndexReader
{
public:
@@ -406,9 +411,16 @@ public:
}
}
+ template <typename Fn>
+ void ForEachPosting(strings::UniString const & token, Fn && fn) const
+ {
+ auto const utf8s = strings::ToUtf8(token);
+ ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+ }
+
private:
FileReader m_fileReader;
- TextIndexDictionary<Token> m_dictionary;
+ TextIndexDictionary m_dictionary;
std::vector<uint32_t> m_postingsStarts;
};
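
The on-disk reader changes the same way: TextIndexReader is now constructed from a FileReader without a template argument and exposes the same pair of ForEachPosting overloads. A hedged caller sketch (the function, the index path and the coding/file_reader.hpp include are assumptions for illustration, not part of the commit):

#include "search/base/text_index.hpp"
#include "coding/file_reader.hpp"   // assumed location of FileReader
#include "base/string_utils.hpp"    // assumed location of strings::MakeUniString

#include <cstdint>
#include <iostream>
#include <string>

using namespace search::base;

void PrintPostings(std::string const & indexPath)
{
  FileReader fileReader(indexPath);     // same constructor the tests below use
  TextIndexReader reader(fileReader);   // previously TextIndexReader<Token>

  // Enumerate postings by UTF-8 token...
  reader.ForEachPosting("a", [](uint32_t docId) { std::cout << docId << '\n'; });

  // ...or by strings::UniString, converted internally with strings::ToUtf8.
  reader.ForEachPosting(strings::MakeUniString("ç"),
                        [](uint32_t docId) { std::cout << docId << '\n'; });
}
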
diff --git a/search/search_tests/text_index_tests.cpp b/search/search_tests/text_index_tests.cpp
index 4007d1cb9d..3b29218b21 100644
--- a/search/search_tests/text_index_tests.cpp
+++ b/search/search_tests/text_index_tests.cpp
@@ -31,9 +31,7 @@ namespace
// Prepend several bytes to serialized indexes in order to check the relative offsets.
size_t const kSkip = 10;
-template <typename Token>
-void Serdes(MemTextIndex<Token> & memIndex, MemTextIndex<Token> & deserializedMemIndex,
- vector<uint8_t> & buf)
+void Serdes(MemTextIndex & memIndex, MemTextIndex & deserializedMemIndex, vector<uint8_t> & buf)
{
buf.clear();
{
@@ -62,14 +60,14 @@ namespace search
{
UNIT_TEST(TextIndex_Smoke)
{
- using Token = string;
+ using Token = base::Token;
vector<Token> const docsCollection = {
"a b c",
"a c",
};
- MemTextIndex<Token> memIndex;
+ MemTextIndex memIndex;
for (size_t docId = 0; docId < docsCollection.size(); ++docId)
{
@@ -82,7 +80,7 @@ UNIT_TEST(TextIndex_Smoke)
}
vector<uint8_t> indexData;
- MemTextIndex<Token> deserializedMemIndex;
+ MemTextIndex deserializedMemIndex;
Serdes(memIndex, deserializedMemIndex, indexData);
for (auto const & index : {memIndex, deserializedMemIndex})
@@ -98,7 +96,7 @@ UNIT_TEST(TextIndex_Smoke)
copy_n(indexData.begin() + kSkip, indexData.size() - kSkip, back_inserter(contents));
ScopedFile file("text_index_tmp", contents);
FileReader fileReader(file.GetFullPath());
- TextIndexReader<Token> textIndexReader(fileReader);
+ TextIndexReader textIndexReader(fileReader);
TestForEach(textIndexReader, "a", {0, 1});
TestForEach(textIndexReader, "b", {0});
TestForEach(textIndexReader, "c", {0, 1});
@@ -108,29 +106,27 @@ UNIT_TEST(TextIndex_Smoke)
UNIT_TEST(TextIndex_UniString)
{
- using Token = strings::UniString;
-
vector<std::string> const docsCollectionUtf8s = {
"â b ç",
"â ç",
};
- vector<Token> const docsCollection(
+ vector<strings::UniString> const docsCollection(
make_transform_iterator(docsCollectionUtf8s.begin(), &strings::MakeUniString),
make_transform_iterator(docsCollectionUtf8s.end(), &strings::MakeUniString));
- MemTextIndex<Token> memIndex;
+ MemTextIndex memIndex;
for (size_t docId = 0; docId < docsCollection.size(); ++docId)
{
- auto addToIndex = [&](Token const & token) {
- memIndex.AddPosting(token, static_cast<uint32_t>(docId));
+ auto addToIndex = [&](strings::UniString const & token) {
+ memIndex.AddPosting(strings::ToUtf8(token), static_cast<uint32_t>(docId));
};
auto delims = [](strings::UniChar const & c) { return c == ' '; };
SplitUniString(docsCollection[docId], addToIndex, delims);
}
vector<uint8_t> indexData;
- MemTextIndex<Token> deserializedMemIndex;
+ MemTextIndex deserializedMemIndex;
Serdes(memIndex, deserializedMemIndex, indexData);
for (auto const & index : {memIndex, deserializedMemIndex})