diff options
-rw-r--r-- | base/mem_trie.hpp | 66 | ||||
-rw-r--r-- | indexer/categories_holder.hpp | 9 | ||||
-rw-r--r-- | search/CMakeLists.txt | 1 | ||||
-rw-r--r-- | search/common.hpp | 12 | ||||
-rw-r--r-- | search/feature_offset_match.hpp | 2 | ||||
-rw-r--r-- | search/geocoder.cpp | 12 | ||||
-rw-r--r-- | search/keyword_matcher.cpp | 4 | ||||
-rw-r--r-- | search/processor.cpp | 24 | ||||
-rw-r--r-- | search/processor.hpp | 3 | ||||
-rw-r--r-- | search/search.pro | 1 | ||||
-rw-r--r-- | search/search_integration_tests/processor_test.cpp | 15 | ||||
-rw-r--r-- | search/utils.hpp | 100 |
12 files changed, 197 insertions, 52 deletions
diff --git a/base/mem_trie.hpp b/base/mem_trie.hpp index fdd35af365..d5394f5bf3 100644 --- a/base/mem_trie.hpp +++ b/base/mem_trie.hpp @@ -15,7 +15,12 @@ namespace my template <typename String, typename Value> class MemTrie { +private: + struct Node; + public: + using Char = typename String::value_type; + MemTrie() = default; MemTrie(MemTrie && rhs) { *this = std::move(rhs); } @@ -27,6 +32,31 @@ public: return *this; } + // A read-only iterator. Any modification to the + // underlying trie is assumed to invalidate the iterator. + class Iterator + { + public: + Iterator(MemTrie::Node const & node) : m_node(node) {} + + template <typename ToDo> + void ForEachMove(ToDo && todo) const + { + for (auto const & move : m_node.m_moves) + todo(move.first, Iterator(*move.second)); + } + + template <typename ToDo> + void ForEachInNode(ToDo && todo) const + { + for (auto const & value : m_node.m_values) + todo(value); + } + + private: + MemTrie::Node const & m_node; + }; + // Adds a key-value pair to the trie. void Add(String const & key, Value const & value) { @@ -41,40 +71,42 @@ public: cur->AddValue(value); } - // Traverses all key-value pairs in the trie and calls |toDo| on each of them. + // Traverses all key-value pairs in the trie and calls |todo| on each of them. template <typename ToDo> - void ForEachInTrie(ToDo && toDo) const + void ForEachInTrie(ToDo && todo) const { String prefix; - ForEachInSubtree(m_root, prefix, std::forward<ToDo>(toDo)); + ForEachInSubtree(m_root, prefix, std::forward<ToDo>(todo)); } - // Calls |toDo| for each key-value pair in the node that is reachable + // Calls |todo| for each key-value pair in the node that is reachable // by |prefix| from the trie root. Does nothing if such node does // not exist. template <typename ToDo> - void ForEachInNode(String const & prefix, ToDo && toDo) const + void ForEachInNode(String const & prefix, ToDo && todo) const { if (auto const * root = MoveTo(prefix)) - ForEachInNode(*root, prefix, std::forward<ToDo>(toDo)); + ForEachInNode(*root, prefix, std::forward<ToDo>(todo)); } - // Calls |toDo| for each key-value pair in a subtree that is + // Calls |todo| for each key-value pair in a subtree that is // reachable by |prefix| from the trie root. Does nothing if such // subtree does not exist. template <typename ToDo> - void ForEachInSubtree(String prefix, ToDo && toDo) const + void ForEachInSubtree(String prefix, ToDo && todo) const { if (auto const * root = MoveTo(prefix)) - ForEachInSubtree(*root, prefix, std::forward<ToDo>(toDo)); + ForEachInSubtree(*root, prefix, std::forward<ToDo>(todo)); } size_t GetNumNodes() const { return m_numNodes; } + Iterator GetRootIterator() const { return Iterator(m_root); } + Node const & GetRoot() const { return m_root; } private: struct Node { - using Char = typename String::value_type; + friend class MemTrie<String, Value>::Iterator; Node() = default; Node(Node && /* rhs */) = default; @@ -117,27 +149,27 @@ private: return cur; } - // Calls |toDo| for each key-value pair in a |node| that is + // Calls |todo| for each key-value pair in a |node| that is // reachable by |prefix| from the trie root. template <typename ToDo> - void ForEachInNode(Node const & node, String const & prefix, ToDo && toDo) const + void ForEachInNode(Node const & node, String const & prefix, ToDo && todo) const { for (auto const & value : node.m_values) - toDo(prefix, value); + todo(prefix, value); } - // Calls |toDo| for each key-value pair in subtree where |node| is a + // Calls |todo| for each key-value pair in subtree where |node| is a // root of the subtree. |prefix| is a path from the trie root to the // |node|. template <typename ToDo> - void ForEachInSubtree(Node const & node, String & prefix, ToDo && toDo) const + void ForEachInSubtree(Node const & node, String & prefix, ToDo && todo) const { - ForEachInNode(node, prefix, toDo); + ForEachInNode(node, prefix, todo); for (auto const & move : node.m_moves) { prefix.push_back(move.first); - ForEachInSubtree(*move.second, prefix, toDo); + ForEachInSubtree(*move.second, prefix, todo); prefix.pop_back(); } } diff --git a/indexer/categories_holder.hpp b/indexer/categories_holder.hpp index ba40e43460..b40982e001 100644 --- a/indexer/categories_holder.hpp +++ b/indexer/categories_holder.hpp @@ -125,6 +125,15 @@ public: /// @returns raw classificator type if it's not localized in categories.txt. string GetReadableFeatureType(uint32_t type, int8_t locale) const; + // Exposes the tries that map category tokens to types. + Trie const * GetNameToTypesTrie(int8_t locale) const + { + auto const it = m_name2type.find(locale); + if (it == m_name2type.end()) + return nullptr; + return it->second.get(); + } + bool IsTypeExist(uint32_t type) const; inline void Swap(CategoriesHolder & r) diff --git a/search/CMakeLists.txt b/search/CMakeLists.txt index 5b2c22322f..870853153e 100644 --- a/search/CMakeLists.txt +++ b/search/CMakeLists.txt @@ -123,6 +123,7 @@ set( token_slice.hpp types_skipper.cpp types_skipper.hpp + utils.cpp utils.hpp viewport_search_callback.cpp viewport_search_callback.hpp diff --git a/search/common.hpp b/search/common.hpp index 2bdc9300a6..abd8a88c71 100644 --- a/search/common.hpp +++ b/search/common.hpp @@ -5,16 +5,4 @@ namespace search /// Upper bound for max count of tokens for indexing and scoring. int constexpr MAX_TOKENS = 32; int constexpr MAX_SUGGESTS_COUNT = 5; - -template <typename IterT1, typename IterT2> -bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix) -{ - while (beg != end && begPrefix != endPrefix && *beg == *begPrefix) - { - ++beg; - ++begPrefix; - } - return begPrefix == endPrefix; -} - } // namespace search diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp index 1c57683304..f05cf5eff9 100644 --- a/search/feature_offset_match.hpp +++ b/search/feature_offset_match.hpp @@ -221,7 +221,7 @@ struct SearchTrieRequest QueryParams::Langs m_langs; }; -// Calls |toDo| for each feature accepted but at least one DFA. +// Calls |toDo| for each feature accepted by at least one DFA. // // *NOTE* |toDo| may be called several times for the same feature. template <typename DFA, typename Value, typename ToDo> diff --git a/search/geocoder.cpp b/search/geocoder.cpp index 6a279ed36e..41a65e01ef 100644 --- a/search/geocoder.cpp +++ b/search/geocoder.cpp @@ -315,18 +315,6 @@ size_t OrderCountries(m2::RectD const & pivot, vector<shared_ptr<MwmInfo>> & inf auto const sep = stable_partition(infos.begin(), infos.end(), intersects); return distance(infos.begin(), sep); } - -size_t GetMaxErrorsForToken(UniString const & token) -{ - bool const digitsOnly = all_of(token.begin(), token.end(), isdigit); - if (digitsOnly) - return 0; - if (token.size() < 4) - return 0; - if (token.size() < 8) - return 1; - return 2; -} } // namespace // Geocoder::Params -------------------------------------------------------------------------------- diff --git a/search/keyword_matcher.cpp b/search/keyword_matcher.cpp index 3b7ddcee60..42672f5119 100644 --- a/search/keyword_matcher.cpp +++ b/search/keyword_matcher.cpp @@ -1,4 +1,6 @@ -#include "keyword_matcher.hpp" +#include "search/keyword_matcher.hpp" + +#include "search/utils.hpp" #include "indexer/search_delimiters.hpp" #include "indexer/search_string_utils.hpp" diff --git a/search/processor.cpp b/search/processor.cpp index ea1042eb60..b5e7c4e577 100644 --- a/search/processor.cpp +++ b/search/processor.cpp @@ -128,8 +128,8 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {}); } -// Removes all full-token stop words from |params|, unless |params| -// consists of all such tokens. +// Removes all full-token stop words from |params|. +// Does nothing if all tokens in |params| are stop words. void RemoveStopWordsIfNeeded(QueryParams & params) { size_t numStopWords = 0; @@ -331,6 +331,7 @@ int8_t Processor::GetLanguage(int id) const { return m_ranker.GetLanguage(GetLangIndex(id)); } + m2::PointD Processor::GetPivotPoint() const { bool const viewportSearch = m_mode == Mode::Viewport; @@ -413,6 +414,13 @@ void Processor::ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) ::search::ForEachCategoryType(slice, GetCategoryLocales(), m_categories, forward<ToDo>(todo)); } +template <typename ToDo> +void Processor::ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const +{ + ::search::ForEachCategoryTypeFuzzy(slice, GetCategoryLocales(), m_categories, + forward<ToDo>(todo)); +} + void Processor::Search(SearchParams const & params, m2::RectD const & viewport) { SetMode(params.m_mode); @@ -671,11 +679,9 @@ void Processor::InitParams(QueryParams & params) } } }; - ForEachCategoryType(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms); - auto & langs = params.GetLangs(); - for (int i = 0; i < LANG_COUNT; ++i) - langs.Insert(GetLanguage(i)); + // todo(@m, @y). Shall we match prefix tokens for categories? + ForEachCategoryTypeFuzzy(QuerySliceOnRawStrings<decltype(m_tokens)>(m_tokens, m_prefix), addSyms); RemoveStopWordsIfNeeded(params); @@ -687,6 +693,12 @@ void Processor::InitParams(QueryParams & params) if (IsStreetSynonym(token.m_original)) params.GetTypeIndices(i).clear(); } + + for (size_t i = 0; i < params.GetNumTokens(); ++i) + my::SortUnique(params.GetTypeIndices(i)); + + for (int i = 0; i < LANG_COUNT; ++i) + params.GetLangs().Insert(GetLanguage(i)); } void Processor::InitGeocoder(Geocoder::Params & params) diff --git a/search/processor.hpp b/search/processor.hpp index 5e784954fd..c32d03f16d 100644 --- a/search/processor.hpp +++ b/search/processor.hpp @@ -141,6 +141,9 @@ protected: template <typename ToDo> void ForEachCategoryType(StringSliceBase const & slice, ToDo && todo) const; + template <typename ToDo> + void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, ToDo && todo) const; + m2::PointD GetPivotPoint() const; m2::RectD GetPivotRect() const; diff --git a/search/search.pro b/search/search.pro index abb050f3c0..2c11aa354f 100644 --- a/search/search.pro +++ b/search/search.pro @@ -135,4 +135,5 @@ SOURCES += \ streets_matcher.cpp \ token_slice.cpp \ types_skipper.cpp \ + utils.cpp \ viewport_search_callback.cpp diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp index ef8fdc3bc4..63aa3b5342 100644 --- a/search/search_integration_tests/processor_test.cpp +++ b/search/search_integration_tests/processor_test.cpp @@ -754,6 +754,9 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) TestPOI bar(m2::PointD(0, 0), "Черчилль", "ru"); bar.SetTypes({{"amenity", "pub"}}); + TestPOI metro(m2::PointD(5.0, 5.0), "Liceu", "es"); + metro.SetTypes({{"railway", "subway_entrance"}}); + BuildWorld([&](TestMwmBuilder & builder) { builder.Add(country); builder.Add(city); @@ -762,6 +765,7 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) auto id = BuildCountry(countryName, [&](TestMwmBuilder & builder) { builder.Add(street); builder.Add(bar); + builder.Add(metro); }); SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0))); @@ -778,6 +782,17 @@ UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch) TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ()); TEST(ResultsMatch("моксва ленинргадский черчиль", "ru", rules), ()); + + TEST(ResultsMatch("food", "ru", rules), ()); + TEST(ResultsMatch("foood", "ru", rules), ()); + TEST(ResultsMatch("fod", "ru", TRules{}), ()); + + TRules rulesMetro = {ExactMatch(id, metro)}; + TEST(ResultsMatch("transporte", "es", rulesMetro), ()); + TEST(ResultsMatch("transport", "es", rulesMetro), ()); + TEST(ResultsMatch("transpurt", "en", rulesMetro), ()); + TEST(ResultsMatch("transpurrt", "es", rulesMetro), ()); + TEST(ResultsMatch("transportation", "en", TRules{}), ()); } } diff --git a/search/utils.hpp b/search/utils.hpp index c36f74736a..919a18e907 100644 --- a/search/utils.hpp +++ b/search/utils.hpp @@ -3,14 +3,74 @@ #include "search/token_slice.hpp" #include "indexer/categories_holder.hpp" +#include "indexer/search_delimiters.hpp" +#include "indexer/search_string_utils.hpp" #include "base/buffer_vector.hpp" +#include "base/levenshtein_dfa.hpp" +#include "base/stl_helpers.hpp" #include "base/string_utils.hpp" +#include <algorithm> +#include <cctype> +#include <functional> +#include <queue> + namespace search { +namespace +{ +// my::MemTrie<strings::UniString, uint32_t> +// todo(@m, @y). Unite with the similar function in search/feature_offset_match.hpp. +template <typename Trie, typename DFA, typename ToDo> +bool MatchInTrie(Trie const & trie, DFA const & dfa, ToDo && toDo) +{ + using Char = typename Trie::Char; + using TrieIt = typename Trie::Iterator; + using DFAIt = typename DFA::Iterator; + using State = pair<TrieIt, DFAIt>; + + std::queue<State> q; + + { + auto it = dfa.Begin(); + if (it.Rejects()) + return false; + q.emplace(trie.GetRootIterator(), it); + } + + bool found = false; + + while (!q.empty()) + { + auto const p = q.front(); + q.pop(); + + auto const & trieIt = p.first; + auto const & dfaIt = p.second; + + if (dfaIt.Accepts()) + { + trieIt.ForEachInNode(toDo); + found = true; + } + + trieIt.ForEachMove([&](Char const & c, TrieIt const & nextTrieIt) { + auto nextDfaIt = dfaIt; + nextDfaIt.Move(c); + if (!nextDfaIt.Rejects()) + q.emplace(nextTrieIt, nextDfaIt); + }); + } + + return found; +} +} // namespace + using TLocales = buffer_vector<int8_t, 3>; +size_t GetMaxErrorsForToken(strings::UniString const & token); + template <typename ToDo> void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales, CategoriesHolder const & categories, ToDo && todo) @@ -18,16 +78,50 @@ void ForEachCategoryType(StringSliceBase const & slice, TLocales const & locales for (size_t i = 0; i < slice.Size(); ++i) { auto const & token = slice.Get(i); - for (size_t j = 0; j < locales.size(); ++j) - categories.ForEachTypeByName(locales[j], token, bind<void>(todo, i, _1)); + for (int8_t const locale : locales) + categories.ForEachTypeByName(locale, token, std::bind<void>(todo, i, std::placeholders::_1)); // Special case processing of 2 codepoints emoji (e.g. black guy on a bike). // Only emoji synonyms can have one codepoint. if (token.size() > 1) { categories.ForEachTypeByName(CategoriesHolder::kEnglishCode, strings::UniString(1, token[0]), - bind<void>(todo, i, _1)); + std::bind<void>(todo, i, std::placeholders::_1)); } } } + +// Unlike ForEachCategoryType which extracts types by a token +// from |slice| by exactly matching it to a token in the name +// of a category, in the worst case this function has to loop through the tokens +// in all category synonyms in all |locales| in order to find a token +// whose edit distance is close enough to the required token from |slice|. +template <typename ToDo> +void ForEachCategoryTypeFuzzy(StringSliceBase const & slice, TLocales const & locales, + CategoriesHolder const & categories, ToDo && todo) +{ + for (size_t i = 0; i < slice.Size(); ++i) + { + auto const & token = slice.Get(i); + auto const & dfa = + strings::LevenshteinDFA(token, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(token)); + for (int8_t const locale : locales) + { + auto const * trie = categories.GetNameToTypesTrie(locale); + if (trie != nullptr) + MatchInTrie(*trie, dfa, std::bind<void>(todo, i, std::placeholders::_1)); + } + } +} + +template <typename IterT1, typename IterT2> +bool StartsWith(IterT1 beg, IterT1 end, IterT2 begPrefix, IterT2 endPrefix) +{ + while (beg != end && begPrefix != endPrefix && *beg == *begPrefix) + { + ++beg; + ++begPrefix; + } + return begPrefix == endPrefix; +} } // namespace search |