
github.com/mapsme/omim.git
author     Maxim Pimenov <m@maps.me>                   2018-06-29 16:26:21 +0300
committer  Tatiana Yan <tatiana.kondakova@gmail.com>   2018-07-02 15:45:54 +0300
commit     cb504d0504cc047f15b5e310092a44af7f9faa0e (patch)
tree       da33085f673a93c198f54d85d55084d859ee1cc0 /search
parent     7ec094b9f4451fd3ef393e35ceded580d18961ca (diff)
[search] Got rid of templated Token type in the text index.
The tokens are now stored as UTF8-encoded strings.
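
In practice this means that MemTextIndex, TextIndexDictionary and TextIndexReader lose their Token template parameter and fix the token type to a UTF-8 std::string; callers holding a strings::UniString go through a thin ForEachPosting overload that converts with strings::ToUtf8. A minimal usage sketch of the new interface (the main() driver and the include paths are illustrative assumptions, not part of the commit; the calls themselves are the ones visible in the diff below):

#include "search/base/text_index.hpp"  // path as shown in this diff
#include "base/string_utils.hpp"       // assumed location of strings::MakeUniString / strings::ToUtf8

#include <cstdint>
#include <iostream>
#include <string>

using namespace search::base;

int main()
{
  MemTextIndex index;  // previously MemTextIndex<Token>

  // Tokens are now plain UTF-8 std::strings.
  index.AddPosting(std::string("â"), 0 /* docId */);
  index.AddPosting(std::string("ç"), 1 /* docId */);

  // Query with a UTF-8 string...
  index.ForEachPosting("â", [](uint32_t docId) { std::cout << docId << '\n'; });

  // ...or with a strings::UniString; the new overload converts it via strings::ToUtf8.
  index.ForEachPosting(strings::MakeUniString("ç"),
                       [](uint32_t docId) { std::cout << docId << '\n'; });

  return 0;
}
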
Diffstat (limited to 'search')
-rw-r--r--  search/base/text_index.hpp                22
-rw-r--r--  search/search_tests/text_index_tests.cpp  24
2 files changed, 27 insertions, 19 deletions
diff --git a/search/base/text_index.hpp b/search/base/text_index.hpp
index 6f546610cf..3f69898389 100644
--- a/search/base/text_index.hpp
+++ b/search/base/text_index.hpp
@@ -45,6 +45,7 @@ namespace search
{
namespace base
{
+using Token = std::string;
using Posting = uint32_t;
enum class TextIndexVersion : uint8_t
@@ -97,7 +98,6 @@ struct TextIndexHeader
// The dictionary contains all tokens that are present
// in the text index.
-template <typename Token>
class TextIndexDictionary
{
public:
@@ -194,7 +194,6 @@ private:
std::vector<Token> m_tokens;
};
-template <typename Token>
class MemTextIndex
{
public:
@@ -217,6 +216,13 @@ public:
fn(p);
}
+ template <typename Fn>
+ void ForEachPosting(strings::UniString const & token, Fn && fn) const
+ {
+ auto const utf8s = strings::ToUtf8(token);
+ ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+ }
+
template <typename Sink>
void Serialize(Sink & sink)
{
@@ -358,11 +364,10 @@ private:
}
std::map<Token, std::vector<Posting>> m_postingsByToken;
- TextIndexDictionary<Token> m_dictionary;
+ TextIndexDictionary m_dictionary;
};
// A reader class for on-demand reading of postings lists from disk.
-template <typename Token>
class TextIndexReader
{
public:
@@ -406,9 +411,16 @@ public:
}
}
+ template <typename Fn>
+ void ForEachPosting(strings::UniString const & token, Fn && fn) const
+ {
+ auto const utf8s = strings::ToUtf8(token);
+ ForEachPosting(std::move(utf8s), std::forward<Fn>(fn));
+ }
+
private:
FileReader m_fileReader;
- TextIndexDictionary<Token> m_dictionary;
+ TextIndexDictionary m_dictionary;
std::vector<uint32_t> m_postingsStarts;
};
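
The on-disk reader changes the same way: TextIndexReader is now constructed from a FileReader without a template argument and exposes the same pair of ForEachPosting overloads. A hedged caller sketch (the function, the index path and the coding/file_reader.hpp include are assumptions for illustration, not part of the commit):

#include "search/base/text_index.hpp"
#include "coding/file_reader.hpp"   // assumed location of FileReader
#include "base/string_utils.hpp"    // assumed location of strings::MakeUniString

#include <cstdint>
#include <iostream>
#include <string>

using namespace search::base;

void PrintPostings(std::string const & indexPath)
{
  FileReader fileReader(indexPath);     // same constructor the tests below use
  TextIndexReader reader(fileReader);   // previously TextIndexReader<Token>

  // Enumerate postings by UTF-8 token...
  reader.ForEachPosting("a", [](uint32_t docId) { std::cout << docId << '\n'; });

  // ...or by strings::UniString, converted internally with strings::ToUtf8.
  reader.ForEachPosting(strings::MakeUniString("ç"),
                        [](uint32_t docId) { std::cout << docId << '\n'; });
}
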
diff --git a/search/search_tests/text_index_tests.cpp b/search/search_tests/text_index_tests.cpp
index 4007d1cb9d..3b29218b21 100644
--- a/search/search_tests/text_index_tests.cpp
+++ b/search/search_tests/text_index_tests.cpp
@@ -31,9 +31,7 @@ namespace
// Prepend several bytes to serialized indexes in order to check the relative offsets.
size_t const kSkip = 10;
-template <typename Token>
-void Serdes(MemTextIndex<Token> & memIndex, MemTextIndex<Token> & deserializedMemIndex,
- vector<uint8_t> & buf)
+void Serdes(MemTextIndex & memIndex, MemTextIndex & deserializedMemIndex, vector<uint8_t> & buf)
{
buf.clear();
{
@@ -62,14 +60,14 @@ namespace search
{
UNIT_TEST(TextIndex_Smoke)
{
- using Token = string;
+ using Token = base::Token;
vector<Token> const docsCollection = {
"a b c",
"a c",
};
- MemTextIndex<Token> memIndex;
+ MemTextIndex memIndex;
for (size_t docId = 0; docId < docsCollection.size(); ++docId)
{
@@ -82,7 +80,7 @@ UNIT_TEST(TextIndex_Smoke)
}
vector<uint8_t> indexData;
- MemTextIndex<Token> deserializedMemIndex;
+ MemTextIndex deserializedMemIndex;
Serdes(memIndex, deserializedMemIndex, indexData);
for (auto const & index : {memIndex, deserializedMemIndex})
@@ -98,7 +96,7 @@ UNIT_TEST(TextIndex_Smoke)
copy_n(indexData.begin() + kSkip, indexData.size() - kSkip, back_inserter(contents));
ScopedFile file("text_index_tmp", contents);
FileReader fileReader(file.GetFullPath());
- TextIndexReader<Token> textIndexReader(fileReader);
+ TextIndexReader textIndexReader(fileReader);
TestForEach(textIndexReader, "a", {0, 1});
TestForEach(textIndexReader, "b", {0});
TestForEach(textIndexReader, "c", {0, 1});
@@ -108,29 +106,27 @@ UNIT_TEST(TextIndex_Smoke)
UNIT_TEST(TextIndex_UniString)
{
- using Token = strings::UniString;
-
vector<std::string> const docsCollectionUtf8s = {
"â b ç",
"â ç",
};
- vector<Token> const docsCollection(
+ vector<strings::UniString> const docsCollection(
make_transform_iterator(docsCollectionUtf8s.begin(), &strings::MakeUniString),
make_transform_iterator(docsCollectionUtf8s.end(), &strings::MakeUniString));
- MemTextIndex<Token> memIndex;
+ MemTextIndex memIndex;
for (size_t docId = 0; docId < docsCollection.size(); ++docId)
{
- auto addToIndex = [&](Token const & token) {
- memIndex.AddPosting(token, static_cast<uint32_t>(docId));
+ auto addToIndex = [&](strings::UniString const & token) {
+ memIndex.AddPosting(strings::ToUtf8(token), static_cast<uint32_t>(docId));
};
auto delims = [](strings::UniChar const & c) { return c == ' '; };
SplitUniString(docsCollection[docId], addToIndex, delims);
}
vector<uint8_t> indexData;
- MemTextIndex<Token> deserializedMemIndex;
+ MemTextIndex deserializedMemIndex;
Serdes(memIndex, deserializedMemIndex, indexData);
for (auto const & index : {memIndex, deserializedMemIndex})