Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--base/mem_trie.hpp66
-rw-r--r--indexer/categories_holder.hpp9
-rw-r--r--search/CMakeLists.txt1
-rw-r--r--search/common.hpp12
-rw-r--r--search/feature_offset_match.hpp2
-rw-r--r--search/geocoder.cpp12
-rw-r--r--search/keyword_matcher.cpp4
-rw-r--r--search/processor.cpp24
-rw-r--r--search/processor.hpp3
-rw-r--r--search/search.pro1
-rw-r--r--search/search_integration_tests/processor_test.cpp15
-rw-r--r--search/utils.hpp100
12 files changed, 197 insertions, 52 deletions
diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp
index fdd35af365..d5394f5bf3 100644
--- a/base/mem_trie.hpp
+++ b/base/mem_trie.hpp
@@ -15,7 +15,12 @@ namespace my
template <typename String, typename Value>
class MemTrie
{
+private:
+ struct Node;
+
public:
+ using Char = typename String::value_type;
+
MemTrie() = default;
MemTrie(MemTrie && rhs) { *this = std::move(rhs); }
@@ -27,6 +32,31 @@ public:
return *this;
}
+ // A read-only iterator. Any modification to the
+ // underlying trie is assumed to invalidate the iterator.
+ class Iterator
+ {
+ public:
+ Iterator(MemTrie::Node const & node) : m_node(node) {}
+
+ template <typename ToDo>
+ void ForEachMove(ToDo && todo) const
+ {
+ for (auto const & move : m_node.m_moves)
+ todo(move.first, Iterator(*move.second));
+ }
+
+ template <typename ToDo>
+ void ForEachInNode(ToDo && todo) const
+ {
+ for (auto const & value : m_node.m_values)
+ todo(value);
+ }
+
+ private:
+ MemTrie::Node const & m_node;
+ };
+
// Adds a key-value pair to the trie.
void Add(String const & key, Value const & value)
{
@@ -41,40 +71,42 @@ public:
cur->AddValue(value);
}
- // Traverses all key-value pairs in the trie and calls |toDo| on each of them.
+ // Traverses all key-value pairs in the trie and calls |todo| on each of them.
template <typename ToDo>
- void ForEachInTrie(ToDo && toDo) const
+ void ForEachInTrie(ToDo && todo) const
{
String prefix;
- ForEachInSubtree(m_root, prefix, std::forward<ToDo>(toDo));
+ ForEachInSubtree(m_root, prefix, std::forward<ToDo>(todo));
}
- // Calls |toDo| for each key-value pair in the node that is reachable
+ // Calls |todo| for each key-value pair in the node that is reachable
// by |prefix| from the trie root. Does nothing if such node does
// not exist.
template <typename ToDo>
- void ForEachInNode(String const & prefix, ToDo && toDo) const
+ void ForEachInNode(String const & prefix, ToDo && todo) const
{
if (auto const * root = MoveTo(prefix))
- ForEachInNode(*root, prefix, std::forward<ToDo>(toDo));
+ ForEachInNode(*root, prefix, std::forward<ToDo>(todo));
}
- // Calls |toDo| for each key-value pair in a subtree that is
+ // Calls |todo| for each key-value pair in a subtree that is
// reachable by |prefix| from the trie root. Does nothing if such
// subtree does not exist.
template <typename ToDo>
- void ForEachInSubtree(String prefix, ToDo && toDo) const
+ void ForEachInSubtree(String prefix, ToDo && todo) const
{
if (auto const * root = MoveTo(prefix))
- ForEachInSubtree(*root, prefix, std::forward<ToDo>(toDo));
+ ForEachInSubtree(*root, prefix, std::forward<ToDo>(todo));
}
size_t GetNumNodes() const { return m_numNodes; }
+ Iterator GetRootIterator() const { return Iterator(m_root); }
+ Node const & GetRoot() const { return m_root; }
private:
struct Node
{
- using Char = typename String::value_type;
+ friend class MemTrie<String, Value>::Iterator;
Node() = default;
Node(Node && /* rhs */) = default;
@@ -117,27 +149,27 @@ private:
return cur;
}
- // Calls |toDo| for each key-value pair in a |node| that is
+ // Calls |todo| for each key-value pair in a |node| that is
// reachable by |prefix| from the trie root.
template <typename ToDo>
- void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const
+ void ForEachInNode(Node const & node, String const & prefix, ToDo && todo) const
{
for (auto const & value : node.m_values)
- toDo(prefix, value);
+ todo(prefix, value);
}
- // Calls |toDo| for each key-value pair in subtree where |node| is a
+ // Calls |todo| for each key-value pair in subtree where |node| is a
// root of the subtree. |prefix| is a path from the trie root to the
// |node|.
template <typename ToDo>
- void ForEachInSubtree(Node const & node, String & prefix, ToDo && toDo) const
+ void ForEachInSubtree(Node const & node, String & prefix, ToDo && todo) const
{
- ForEachInNode(node, prefix, toDo);
+ ForEachInNode(node, prefix, todo);
for (auto const & move : node.m_moves)
{
prefix.push_back(move.first);
- ForEachInSubtree(*move.second, prefix, toDo);
+ ForEachInSubtree(*move.second, prefix, todo);
prefix.pop_back();
}
}
diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp
index ba40e43460..b40982e001 100644
--- a/indexer/categories_holder.hpp
+++ b/indexer/categories_holder.hpp
@@ -125,6 +125,15 @@ public:
/// @returns raw classificator type if it's not localized in categories.txt.
string GetReadableFeatureType(uint32_t type, int8_t locale) const;
+ // Exposes the tries that map category tokens to types.
+ Trie const * GetNameToTypesTrie(int8_t locale) const
+ {
+ auto const it = m_name2type.find(locale);
+ if (it == m_name2type.end())
+ return nullptr;
+ return it->second.get();
+ }
+
bool IsTypeExist(uint32_t type) const;
inline void Swap(CategoriesHolder & r)
diff --git a/search/CMakeLists.txt b/search/CMakeLists.txt
index 5b2c22322f..870853153e 100644
--- a/search/CMakeLists.txt
+++ b/search/CMakeLists.txt
@@ -123,6 +123,7 @@ set(
token_slice.hpp
types_skipper.cpp
types_skipper.hpp
+ utils.cpp
utils.hpp
viewport_search_callback.cpp
viewport_search_callback.hpp
diff --git a/search/common.hpp b/search/common.hpp
index 2bdc9300a6..abd8a88c71 100644
--- a/search/common.hpp
+++ b/search/common.hpp
@@ -5,16 +5,4 @@ namespace search
/// Upper bound for max count of tokens for indexing and scoring.
int constexpr MAX_TOKENS = 32;
int constexpr MAX_SUGGESTS_COUNT = 5;
-
-template <typename IterT1, typename IterT2>
-bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
-{
- while (beg != end && begPrefix != endPrefix && *beg == *begPrefix)
- {
- ++beg;
- ++begPrefix;
- }
- return begPrefix == endPrefix;
-}
-
} // namespace search
diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp
index 1c57683304..f05cf5eff9 100644
--- a/search/feature_offset_match.hpp
+++ b/search/feature_offset_match.hpp
@@ -221,7 +221,7 @@ struct SearchTrieRequest
QueryParams::Langs m_langs;
};
-// Calls |toDo| for each feature accepted but at least one DFA.
+// Calls |toDo| for each feature accepted by at least one DFA.
//
// *NOTE* |toDo| may be called several times for the same feature.
template <typename DFA, typename Value, typename ToDo>
diff --git a/search/geocoder.cpp b/search/geocoder.cpp
index 6a279ed36e..41a65e01ef 100644
--- a/search/geocoder.cpp
+++ b/search/geocoder.cpp
@@ -315,18 +315,6 @@ size_t OrderCountries(m2::RectD const & pivot, vector<shared_ptr<MwmInfo>> & inf
auto const sep = stable_partition(infos.begin(), infos.end(), intersects);
return distance(infos.begin(), sep);
}
-
-size_t GetMaxErrorsForToken(UniString const & token)
-{
- bool const digitsOnly = all_of(token.begin(), token.end(), isdigit);
- if (digitsOnly)
- return 0;
- if (token.size() < 4)
- return 0;
- if (token.size() < 8)
- return 1;
- return 2;
-}
} // namespace
// Geocoder::Params --------------------------------------------------------------------------------
diff --git a/search/keyword_matcher.cpp b/search/keyword_matcher.cpp
index 3b7ddcee60..42672f5119 100644
--- a/search/keyword_matcher.cpp
+++ b/search/keyword_matcher.cpp
@@ -1,4 +1,6 @@
-#include "keyword_matcher.hpp"
+#include "search/keyword_matcher.hpp"
+
+#include "search/utils.hpp"
#include "indexer/search_delimiters.hpp"
#include "indexer/search_string_utils.hpp"
diff --git a/search/processor.cpp b/search/processor.cpp
index ea1042eb60..b5e7c4e577 100644
--- a/search/processor.cpp
+++ b/search/processor.cpp
@@ -128,8 +128,8 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
}
-// Removes all full-token stop words from |params|, unless |params|
-// consists of all such tokens.
+// Removes all full-token stop words from |params|.
+// Does nothing if all tokens in |params| are stop words.
void RemoveStopWordsIfNeeded(QueryParams & params)
{
size_t numStopWords = 0;
@@ -331,6 +331,7 @@ int8_t Processor::GetLanguage(int id) const
{
return m_ranker.GetLanguage(GetLangIndex(id));
}
+
m2::PointD Processor::GetPivotPoint() const
{
bool const viewportSearch = m_mode == Mode::Viewport;
@@ -413,6 +414,13 @@ void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && todo)
::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(todo));
}
+template <typename ToDo>
+void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const
+{
+ ::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories,
+ forward<ToDo>(todo));
+}
+
void Processor::Search(SearchParams const & params, m2::RectD const & viewport)
{
SetMode(params.m_mode);
@@ -671,11 +679,9 @@ void Processor::InitParams(QueryParams & params)
}
}
};
- ForEachCategoryType(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
- auto & langs = params.GetLangs();
- for (int i = 0; i < LANG_COUNT; ++i)
- langs.Insert(GetLanguage(i));
+ // todo(@m, @y). Shall we match prefix tokens for categories?
+ ForEachCategoryTypeFuzzy(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms);
RemoveStopWordsIfNeeded(params);
@@ -687,6 +693,12 @@ void Processor::InitParams(QueryParams & params)
if (IsStreetSynonym(token.m_original))
params.GetTypeIndices(i).clear();
}
+
+ for (size_t i = 0; i < params.GetNumTokens(); ++i)
+ my::SortUnique(params.GetTypeIndices(i));
+
+ for (int i = 0; i < LANG_COUNT; ++i)
+ params.GetLangs().Insert(GetLanguage(i));
}
void Processor::InitGeocoder(Geocoder::Params & params)
diff --git a/search/processor.hpp b/search/processor.hpp
index 5e784954fd..c32d03f16d 100644
--- a/search/processor.hpp
+++ b/search/processor.hpp
@@ -141,6 +141,9 @@ protected:
template <typename ToDo>
void ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const;
+ template <typename ToDo>
+ void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const;
+
m2::PointD GetPivotPoint() const;
m2::RectD GetPivotRect() const;
diff --git a/search/search.pro b/search/search.pro
index abb050f3c0..2c11aa354f 100644
--- a/search/search.pro
+++ b/search/search.pro
@@ -135,4 +135,5 @@ SOURCES += \
streets_matcher.cpp \
token_slice.cpp \
types_skipper.cpp \
+ utils.cpp \
viewport_search_callback.cpp
diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp
index ef8fdc3bc4..63aa3b5342 100644
--- a/search/search_integration_tests/processor_test.cpp
+++ b/search/search_integration_tests/processor_test.cpp
@@ -754,6 +754,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
TestPOI bar(m2::PointD(0, 0), "Черчилль", "ru");
bar.SetTypes({{"amenity", "pub"}});
+ TestPOI metro(m2::PointD(5.0, 5.0), "Liceu", "es");
+ metro.SetTypes({{"railway", "subway_entrance"}});
+
BuildWorld([&](TestMwmBuilder & builder) {
builder.Add(country);
builder.Add(city);
@@ -762,6 +765,7 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
auto id = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
builder.Add(street);
builder.Add(bar);
+ builder.Add(metro);
});
SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
@@ -778,6 +782,17 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ());
TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ());
+
+ TEST(ResultsMatch("food", "ru", rules), ());
+ TEST(ResultsMatch("foood", "ru", rules), ());
+ TEST(ResultsMatch("fod", "ru", TRules{}), ());
+
+ TRules rulesMetro = {ExactMatch(id, metro)};
+ TEST(ResultsMatch("transporte", "es", rulesMetro), ());
+ TEST(ResultsMatch("transport", "es", rulesMetro), ());
+ TEST(ResultsMatch("transpurt", "en", rulesMetro), ());
+ TEST(ResultsMatch("transpurrt", "es", rulesMetro), ());
+ TEST(ResultsMatch("transportation", "en", TRules{}), ());
}
}
diff --git a/search/utils.hpp b/search/utils.hpp
index c36f74736a..919a18e907 100644
--- a/search/utils.hpp
+++ b/search/utils.hpp
@@ -3,14 +3,74 @@
#include "search/token_slice.hpp"
#include "indexer/categories_holder.hpp"
+#include "indexer/search_delimiters.hpp"
+#include "indexer/search_string_utils.hpp"
#include "base/buffer_vector.hpp"
+#include "base/levenshtein_dfa.hpp"
+#include "base/stl_helpers.hpp"
#include "base/string_utils.hpp"
+#include <algorithm>
+#include <cctype>
+#include <functional>
+#include <queue>
+
namespace search
{
+namespace
+{
+// my::MemTrie<strings::UniString, uint32_t>
+// todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp.
+template <typename Trie, typename DFA, typename ToDo>
+bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo)
+{
+ using Char = typename Trie::Char;
+ using TrieIt = typename Trie::Iterator;
+ using DFAIt = typename DFA::Iterator;
+ using State = pair<TrieIt, DFAIt>;
+
+ std::queue<State> q;
+
+ {
+ auto it = dfa.Begin();
+ if (it.Rejects())
+ return false;
+ q.emplace(trie.GetRootIterator(), it);
+ }
+
+ bool found = false;
+
+ while (!q.empty())
+ {
+ auto const p = q.front();
+ q.pop();
+
+ auto const & trieIt = p.first;
+ auto const & dfaIt = p.second;
+
+ if (dfaIt.Accepts())
+ {
+ trieIt.ForEachInNode(toDo);
+ found = true;
+ }
+
+ trieIt.ForEachMove([&](Char const & c, TrieIt const & nextTrieIt) {
+ auto nextDfaIt = dfaIt;
+ nextDfaIt.Move(c);
+ if (!nextDfaIt.Rejects())
+ q.emplace(nextTrieIt, nextDfaIt);
+ });
+ }
+
+ return found;
+}
+} // namespace
+
using TLocales = buffer_vector<int8_t, 3>;
+size_t GetMaxErrorsForToken(strings::UniString const & token);
+
template <typename ToDo>
void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales,
CategoriesHolder const & categories, ToDo && todo)
@@ -18,16 +78,50 @@ void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales
for (size_t i = 0; i < slice.Size(); ++i)
{
auto const & token = slice.Get(i);
- for (size_t j = 0; j < locales.size(); ++j)
- categories.ForEachTypeByName(locales[j], token, bind<void>(todo, i, _1));
+ for (int8_t const locale : locales)
+ categories.ForEachTypeByName(locale, token, std::bind<void>(todo, i, std::placeholders::_1));
// Special case processing of 2 codepoints emoji (e.g. black guy on a bike).
// Only emoji synonyms can have one codepoint.
if (token.size() > 1)
{
categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]),
- bind<void>(todo, i, _1));
+ std::bind<void>(todo, i, std::placeholders::_1));
}
}
}
+
+// Unlike ForEachCategoryType which extracts types by a token
+// from |slice| by exactly matching it to a token in the name
+// of a category, in the worst case this function has to loop through the tokens
+// in all category synonyms in all |locales| in order to find a token
+// whose edit distance is close enough to the required token from |slice|.
+template <typename ToDo>
+void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales,
+ CategoriesHolder const & categories, ToDo && todo)
+{
+ for (size_t i = 0; i < slice.Size(); ++i)
+ {
+ auto const & token = slice.Get(i);
+ auto const & dfa =
+ strings::LevenshteinDFA(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token));
+ for (int8_t const locale : locales)
+ {
+ auto const * trie = categories.GetNameToTypesTrie(locale);
+ if (trie != nullptr)
+ MatchInTrie(*trie, dfa, std::bind<void>(todo, i, std::placeholders::_1));
+ }
+ }
+}
+
+template <typename IterT1, typename IterT2>
+bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix)
+{
+ while (beg != end && begPrefix != endPrefix && *beg == *begPrefix)
+ {
+ ++beg;
+ ++begPrefix;
+ }
+ return begPrefix == endPrefix;
+}
} // namespace search