Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--base/base_tests/levenshtein_dfa_test.cpp17
-rw-r--r--base/levenshtein_dfa.cpp30
-rw-r--r--base/levenshtein_dfa.hpp8
-rw-r--r--search/categories_cache.cpp46
-rw-r--r--search/categories_cache.hpp5
-rw-r--r--search/categories_set.hpp22
-rw-r--r--search/feature_offset_match.hpp20
-rw-r--r--search/geocoder.cpp119
-rw-r--r--search/geocoder.hpp10
-rw-r--r--search/processor.cpp14
-rw-r--r--search/query_params.cpp14
-rw-r--r--search/query_params.hpp6
-rw-r--r--search/retrieval.cpp203
-rw-r--r--search/retrieval.hpp17
-rw-r--r--search/search_integration_tests/processor_test.cpp36
15 files changed, 311 insertions, 256 deletions
diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp
index f3cd896fa2..cf8e776be2 100644
--- a/base/base_tests/levenshtein_dfa_test.cpp
+++ b/base/base_tests/levenshtein_dfa_test.cpp
@@ -93,4 +93,21 @@ UNIT_TEST(LevenshteinDFA_Smoke)
TEST(Rejects(dfa, "san"), ());
}
}
+
+UNIT_TEST(LevenshteinDFA_Prefix)
+{
+ {
+ LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 1 /* maxErrors */);
+ TEST(Accepts(dfa, "москва"), ());
+ TEST(Accepts(dfa, "масква"), ());
+ TEST(Accepts(dfa, "моска"), ());
+ TEST(Rejects(dfa, "иосква"), ());
+ }
+ {
+ LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */);
+ TEST(Accepts(dfa, "москва"), ());
+ TEST(Accepts(dfa, "иосква"), ());
+ TEST(Rejects(dfa, "моксва"), ());
+ }
+}
} // namespace
diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp
index 6c78c4f2a4..54c4bb07e0 100644
--- a/base/levenshtein_dfa.cpp
+++ b/base/levenshtein_dfa.cpp
@@ -21,16 +21,18 @@ public:
{
}
- void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t)
+ void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c,
+ LevenshteinDFA::State & t)
{
t.Clear();
for (auto const & p : s.m_positions)
- GetMoves(p, c, t);
+ GetMoves(p, prefixCharsToKeep, c, t);
t.Normalize();
}
private:
- void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t)
+ void GetMoves(LevenshteinDFA::Position const & p, size_t prefixCharsToKeep, UniChar c,
+ LevenshteinDFA::State & t)
{
auto & ps = t.m_positions;
@@ -43,6 +45,9 @@ private:
if (p.m_numErrors == m_maxErrors)
return;
+ if (p.m_offset < prefixCharsToKeep)
+ return;
+
ps.emplace_back(p.m_offset, p.m_numErrors + 1);
if (p.m_offset == m_size)
@@ -150,7 +155,7 @@ void LevenshteinDFA::State::Normalize()
// LevenshteinDFA ----------------------------------------------------------------------------------
// static
-LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
+LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, uint8_t maxErrors)
: m_size(s.size()), m_maxErrors(maxErrors)
{
m_alphabet.assign(s.begin(), s.end());
@@ -175,6 +180,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
states.emplace(state);
visited[state] = id;
m_transitions.emplace_back(m_alphabet.size());
+ m_accepting.push_back(false);
};
pushState(State::MakeStart(), kStartingState);
@@ -193,12 +199,12 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
ASSERT_LESS(id, m_transitions.size(), ());
if (IsAccepting(curr))
- m_accepting.insert(id);
+ m_accepting[id] = true;
for (size_t i = 0; i < m_alphabet.size(); ++i)
{
State next;
- table.Move(curr, m_alphabet[i], next);
+ table.Move(curr, prefixCharsToKeep, m_alphabet[i], next);
size_t nid;
@@ -218,8 +224,18 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
}
}
+LevenshteinDFA::LevenshteinDFA(string const & s, size_t prefixCharsToKeep, uint8_t maxErrors)
+ : LevenshteinDFA(MakeUniString(s), prefixCharsToKeep, maxErrors)
+{
+}
+
+LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors)
+ : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
+{
+}
+
LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors)
- : LevenshteinDFA(MakeUniString(s), maxErrors)
+ : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
{
}
diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp
index bdf5ca5b30..0e3236784b 100644
--- a/base/levenshtein_dfa.hpp
+++ b/base/levenshtein_dfa.hpp
@@ -3,7 +3,7 @@
#include "base/string_utils.hpp"
#include "std/cstdint.hpp"
-#include "std/unordered_set.hpp"
+#include "std/vector.hpp"
namespace strings
{
@@ -83,6 +83,8 @@ public:
LevenshteinDFA const & m_dfa;
};
+ LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, uint8_t maxErrors);
+ LevenshteinDFA(string const & s, size_t prefixCharsToKeep, uint8_t maxErrors);
LevenshteinDFA(UniString const & s, uint8_t maxErrors);
LevenshteinDFA(string const & s, uint8_t maxErrors);
@@ -101,7 +103,7 @@ private:
bool IsAccepting(Position const & p) const;
bool IsAccepting(State const & s) const;
- inline bool IsAccepting(size_t s) const { return m_accepting.count(s) != 0; }
+ inline bool IsAccepting(size_t s) const { return m_accepting[s]; }
inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); }
inline bool IsRejecting(size_t s) const { return s == kRejectingState; }
@@ -114,7 +116,7 @@ private:
vector<UniChar> m_alphabet;
vector<vector<size_t>> m_transitions;
- unordered_set<size_t> m_accepting;
+ vector<bool> m_accepting;
};
string DebugPrint(LevenshteinDFA::Position const & p);
diff --git a/search/categories_cache.cpp b/search/categories_cache.cpp
index 233b21b84f..a0e6b6e21b 100644
--- a/search/categories_cache.cpp
+++ b/search/categories_cache.cpp
@@ -4,33 +4,17 @@
#include "search/query_params.hpp"
#include "search/retrieval.hpp"
+#include "indexer/classificator.hpp"
#include "indexer/ftypes_matcher.hpp"
+#include "indexer/search_string_utils.hpp"
#include "base/assert.hpp"
+#include "base/levenshtein_dfa.hpp"
#include "std/vector.hpp"
namespace search
{
-namespace
-{
-// Performs pairwise union of adjacent bit vectors
-// until at most one bit vector is left.
-void UniteCBVs(vector<CBV> & cbvs)
-{
- while (cbvs.size() > 1)
- {
- size_t i = 0;
- size_t j = 0;
- for (; j + 1 < cbvs.size(); j += 2)
- cbvs[i++] = cbvs[j].Union(cbvs[j + 1]);
- for (; j < cbvs.size(); ++j)
- cbvs[i++] = move(cbvs[j]);
- cbvs.resize(i);
- }
-}
-} // namespace
-
// CategoriesCache ---------------------------------------------------------------------------------
CBV CategoriesCache::Get(MwmContext const & context)
{
@@ -52,29 +36,15 @@ CBV CategoriesCache::Load(MwmContext const & context)
ASSERT(context.m_handle.IsAlive(), ());
ASSERT(context.m_value.HasSearchIndex(), ());
- QueryParams params;
- params.m_tokens.resize(1);
- params.m_tokens[0].resize(1);
+ auto const & c = classif();
- params.m_types.resize(1);
- params.m_types[0].resize(1);
+ SearchTrieRequest<strings::LevenshteinDFA> request;
- vector<CBV> cbvs;
-
- m_categories.ForEach([&](strings::UniString const & key, uint32_t const type) {
- params.m_tokens[0][0] = key;
- params.m_types[0][0] = type;
-
- CBV cbv(RetrieveAddressFeatures(context, m_cancellable, params));
- if (!cbv.IsEmpty())
- cbvs.push_back(move(cbv));
+ m_categories.ForEach([&request, &c](uint32_t const type) {
+ request.m_categories.emplace_back(FeatureTypeToString(c.GetIndexForType(type)));
});
- UniteCBVs(cbvs);
- if (cbvs.empty())
- cbvs.emplace_back();
-
- return cbvs[0];
+ return CBV(RetrieveAddressFeatures(context, m_cancellable, request));
}
// StreetsCache ------------------------------------------------------------------------------------
diff --git a/search/categories_cache.hpp b/search/categories_cache.hpp
index 356ee2772d..64049e9a0c 100644
--- a/search/categories_cache.hpp
+++ b/search/categories_cache.hpp
@@ -27,11 +27,6 @@ public:
CBV Get(MwmContext const & context);
- inline bool HasCategory(strings::UniString const & category) const
- {
- return m_categories.HasKey(category);
- }
-
inline void Clear() { m_cache.clear(); }
private:
diff --git a/search/categories_set.hpp b/search/categories_set.hpp
index 1308f5caf0..7216a5a65f 100644
--- a/search/categories_set.hpp
+++ b/search/categories_set.hpp
@@ -8,7 +8,7 @@
#include "std/algorithm.hpp"
#include "std/cstdint.hpp"
-#include "std/map.hpp"
+#include "std/unordered_set.hpp"
namespace search
{
@@ -17,27 +17,17 @@ class CategoriesSet
public:
CategoriesSet() : m_classificator(classif()) {}
- void Add(uint32_t type)
- {
- uint32_t const index = m_classificator.GetIndexForType(type);
- m_categories[FeatureTypeToString(index)] = type;
- }
-
- template <typename TFn>
- void ForEach(TFn && fn) const
- {
- for (auto const & p : m_categories)
- fn(p.first, p.second);
- }
+ inline void Add(uint32_t type) { m_categories.insert(type); }
- bool HasKey(strings::UniString const & key) const
+ template <typename Fn>
+ void ForEach(Fn && fn) const
{
- return m_categories.count(key) != 0;
+ for_each(m_categories.begin(), m_categories.end(), forward<Fn>(fn));
}
private:
Classificator const & m_classificator;
- map<strings::UniString, uint32_t> m_categories;
+ unordered_set<uint32_t> m_categories;
DISALLOW_COPY_AND_MOVE(CategoriesSet);
};
diff --git a/search/feature_offset_match.hpp b/search/feature_offset_match.hpp
index 8ed6447b3e..1eb7409462 100644
--- a/search/feature_offset_match.hpp
+++ b/search/feature_offset_match.hpp
@@ -1,8 +1,8 @@
#pragma once
#include "search/common.hpp"
-#include "search/processor.hpp"
#include "search/query_params.hpp"
#include "search/search_index_values.hpp"
+#include "search/search_trie.hpp"
#include "search/token_slice.hpp"
#include "indexer/trie.hpp"
@@ -209,7 +209,15 @@ struct SearchTrieRequest
{
inline bool IsLangExist(int8_t lang) const { return m_langs.count(lang) != 0; }
- vector<DFA> m_dfas;
+ inline void Clear()
+ {
+ m_names.clear();
+ m_categories.clear();
+ m_langs.clear();
+ }
+
+ vector<DFA> m_names;
+ vector<strings::UniStringDFA> m_categories;
unordered_set<int8_t> m_langs;
};
@@ -238,7 +246,7 @@ bool MatchCategoriesInTrie(SearchTrieRequest<DFA> const & request,
ASSERT_GREATER_OR_EQUAL(edge.size(), 1, ());
auto const catRoot = trieRoot.GoToEdge(langIx);
- MatchInTrie(request.m_dfas, TrieRootPrefix<Value>(*catRoot, edge), toDo);
+ MatchInTrie(request.m_categories, TrieRootPrefix<Value>(*catRoot, edge), toDo);
return true;
}
@@ -279,10 +287,12 @@ void MatchFeaturesInTrie(SearchTrieRequest<DFA> const & request,
bool const categoriesMatched = MatchCategoriesInTrie(request, trieRoot, categoriesHolder);
impl::OffsetIntersector<Filter, Value> intersector(filter);
+
ForEachLangPrefix(request, trieRoot,
- [&request, &intersector](TrieRootPrefix<Value> & langRoot, int8_t lang) {
- MatchInTrie(request.m_dfas, langRoot, intersector);
+ [&request, &intersector](TrieRootPrefix<Value> & langRoot, int8_t /* lang */) {
+ MatchInTrie(request.m_names, langRoot, intersector);
});
+
if (categoriesMatched)
categoriesHolder.ForEachValue(intersector);
diff --git a/search/geocoder.cpp b/search/geocoder.cpp
index fb1562b1b5..f4cf69aa88 100644
--- a/search/geocoder.cpp
+++ b/search/geocoder.cpp
@@ -10,6 +10,7 @@
#include "search/processor.hpp"
#include "search/retrieval.hpp"
#include "search/token_slice.hpp"
+#include "search/utils.hpp"
#include "indexer/classificator.hpp"
#include "indexer/feature_decl.hpp"
@@ -53,6 +54,8 @@
#include <gperftools/profiler.h>
#endif
+using namespace strings;
+
namespace search
{
namespace
@@ -71,7 +74,7 @@ size_t const kMaxNumLocalities = LocalityScorer::kDefaultReadLimit;
size_t constexpr kPivotRectsCacheSize = 10;
size_t constexpr kLocalityRectsCacheSize = 10;
-strings::UniString const kUniSpace(strings::MakeUniString(" "));
+UniString const kUniSpace(MakeUniString(" "));
struct ScopedMarkTokens
{
@@ -171,7 +174,7 @@ private:
};
void JoinQueryTokens(QueryParams const & params, size_t curToken, size_t endToken,
- strings::UniString const & sep, strings::UniString & res)
+ UniString const & sep, UniString & res)
{
ASSERT_LESS_OR_EQUAL(curToken, endToken, ());
for (size_t i = curToken; i < endToken; ++i)
@@ -219,15 +222,15 @@ MwmSet::MwmHandle FindWorld(Index const & index, vector<shared_ptr<MwmInfo>> con
return handle;
}
-strings::UniString AsciiToUniString(char const * s) { return strings::UniString(s, s + strlen(s)); }
+UniString AsciiToUniString(char const * s) { return UniString(s, s + strlen(s)); }
-bool IsStopWord(strings::UniString const & s)
+bool IsStopWord(UniString const & s)
{
/// @todo Get all common used stop words and factor out this array into
/// search_string_utils.cpp module for example.
static char const * arr[] = {"a", "de", "da", "la"};
- static set<strings::UniString> const kStopWords(
+ static set<UniString> const kStopWords(
make_transform_iterator(arr, &AsciiToUniString),
make_transform_iterator(arr + ARRAY_SIZE(arr), &AsciiToUniString));
@@ -331,6 +334,18 @@ size_t OrderCountries(m2::RectD const & pivot, vector<shared_ptr<MwmInfo>> & inf
auto const sep = stable_partition(infos.begin(), infos.end(), intersects);
return distance(infos.begin(), sep);
}
+
+size_t GetMaxErrorsForToken(UniString const & token)
+{
+ bool const digitsOnly = all_of(token.begin(), token.end(), isdigit);
+ if (digitsOnly)
+ return 0;
+ if (token.size() < 4)
+ return 0;
+ if (token.size() < 8)
+ return 1;
+ return 2;
+}
} // namespace
// Geocoder::Params --------------------------------------------------------------------------------
@@ -370,16 +385,19 @@ void Geocoder::SetParams(Params const & params)
for (auto & v : m_params.m_tokens)
my::EraseIf(v, &IsStopWord);
- auto & v = m_params.m_tokens;
- my::EraseIf(v, mem_fn(&Params::TSynonymsVector::empty));
+ for (size_t i = 0; i < m_params.GetNumTokens();)
+ {
+ if (m_params.m_tokens[i].empty())
+ m_params.RemoveToken(i);
+ else
+ ++i;
+ }
// If all tokens are stop words - give up.
- if (m_params.m_tokens.empty())
+ if (m_params.GetNumTokens() == 0)
m_params = params;
}
- m_retrievalParams = m_params;
-
// Remove all category synonyms for streets, as they're extracted
// individually.
for (size_t i = 0; i < m_params.GetNumTokens(); ++i)
@@ -388,15 +406,48 @@ void Geocoder::SetParams(Params const & params)
ASSERT(!synonyms.empty(), ());
if (IsStreetSynonym(synonyms.front()))
+ m_params.m_typeIndices[i].clear();
+ }
+
+ m_tokenRequests.resize(m_params.m_tokens.size());
+ for (size_t i = 0; i < m_params.m_tokens.size(); ++i)
+ {
+ auto & request = m_tokenRequests[i];
+ request.Clear();
+ for (auto const & name : m_params.m_tokens[i])
{
- auto b = synonyms.begin();
- auto e = synonyms.end();
- synonyms.erase(remove_if(b + 1, e,
- [this](strings::UniString const & synonym) {
- return m_streetsCache.HasCategory(synonym);
- }),
- e);
+ // Here and below, we use LevenshteinDFAs for fuzzy
+ // matching. But due to performance reasons, we assume that the
+ // first letter is always correct.
+ if (!IsHashtagged(name))
+ request.m_names.emplace_back(name, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(name));
+ else
+ request.m_names.emplace_back(name, 0 /* maxErrors */);
}
+ for (auto const & index : m_params.m_typeIndices[i])
+ request.m_categories.emplace_back(FeatureTypeToString(index));
+ request.m_langs = m_params.m_langs;
+ }
+
+ if (m_params.LastTokenIsPrefix())
+ {
+ auto & request = m_prefixTokenRequest;
+ request.Clear();
+ for (auto const & name : m_params.m_prefixTokens)
+ {
+ if (!IsHashtagged(name))
+ {
+ request.m_names.emplace_back(
+ LevenshteinDFA(name, 1 /* prefixCharsToKeep */, GetMaxErrorsForToken(name)));
+ }
+ else
+ {
+ request.m_names.emplace_back(LevenshteinDFA(name, 0 /* maxErrors */));
+ }
+ }
+ for (auto const & index : m_params.m_typeIndices.back())
+ request.m_categories.emplace_back(FeatureTypeToString(index));
+ request.m_langs = m_params.m_langs;
}
LOG(LDEBUG, ("Tokens =", m_params.m_tokens));
@@ -447,6 +498,11 @@ void Geocoder::GoInViewport()
void Geocoder::GoImpl(vector<shared_ptr<MwmInfo>> & infos, bool inViewport)
{
+#if defined(USE_GOOGLE_PROFILER) && defined(OMIM_OS_LINUX)
+ ProfilerStart("/tmp/geocoder.prof");
+ MY_SCOPE_GUARD(profilerStop, []() { ProfilerStop(); });
+#endif
+
try
{
// Tries to find world and fill localities table.
@@ -565,29 +621,6 @@ void Geocoder::ClearCaches()
m_postcodes.Clear();
}
-void Geocoder::PrepareRetrievalParams(size_t curToken, size_t endToken)
-{
- ASSERT_LESS(curToken, endToken, ());
- ASSERT_LESS_OR_EQUAL(endToken, m_params.GetNumTokens(), ());
-
- m_retrievalParams.m_tokens.clear();
- m_retrievalParams.m_prefixTokens.clear();
- m_retrievalParams.m_types.clear();
-
- // TODO (@y): possibly it's not cheap to copy vectors of strings.
- // Profile it, and in case of serious performance loss, refactor
- // QueryParams to support subsets of tokens.
- for (size_t i = curToken; i < endToken; ++i)
- {
- if (i < m_params.m_tokens.size())
- m_retrievalParams.m_tokens.push_back(m_params.m_tokens[i]);
- else
- m_retrievalParams.m_prefixTokens = m_params.m_prefixTokens;
-
- m_retrievalParams.m_types.push_back(m_params.m_types[i]);
- }
-}
-
void Geocoder::InitBaseContext(BaseContext & ctx)
{
ctx.m_usedTokens.assign(m_params.GetNumTokens(), false);
@@ -595,8 +628,10 @@ void Geocoder::InitBaseContext(BaseContext & ctx)
ctx.m_features.resize(ctx.m_numTokens);
for (size_t i = 0; i < ctx.m_features.size(); ++i)
{
- PrepareRetrievalParams(i, i + 1);
- ctx.m_features[i] = RetrieveAddressFeatures(*m_context, m_cancellable, m_retrievalParams);
+ if (m_params.IsPrefixToken(i))
+ ctx.m_features[i] = RetrieveAddressFeatures(*m_context, m_cancellable, m_prefixTokenRequest);
+ else
+ ctx.m_features[i] = RetrieveAddressFeatures(*m_context, m_cancellable, m_tokenRequests[i]);
}
ctx.m_hotelsFilter = m_hotelsFilter.MakeScopedFilter(*m_context, m_params.m_hotelsFilter);
}
diff --git a/search/geocoder.hpp b/search/geocoder.hpp
index ed178ee6e2..f46af7d19b 100644
--- a/search/geocoder.hpp
+++ b/search/geocoder.hpp
@@ -3,6 +3,7 @@
#include "search/cancel_exception.hpp"
#include "search/categories_cache.hpp"
#include "search/cbv.hpp"
+#include "search/feature_offset_match.hpp"
#include "search/features_layer.hpp"
#include "search/features_layer_path_finder.hpp"
#include "search/geocoder_context.hpp"
@@ -28,6 +29,8 @@
#include "base/buffer_vector.hpp"
#include "base/cancellable.hpp"
+#include "base/dfa_helpers.hpp"
+#include "base/levenshtein_dfa.hpp"
#include "base/macros.hpp"
#include "base/string_utils.hpp"
@@ -183,10 +186,6 @@ private:
QueryParams::TSynonymsVector const & GetTokens(size_t i) const;
- // Fills |m_retrievalParams| with [curToken, endToken) subsequence
- // of search query tokens.
- void PrepareRetrievalParams(size_t curToken, size_t endToken);
-
// Creates a cache of posting lists corresponding to features in m_context
// for each token and saves it to m_addressFeatures.
void InitBaseContext(BaseContext & ctx);
@@ -324,7 +323,8 @@ private:
FeaturesLayerPathFinder m_finder;
// Search query params prepared for retrieval.
- QueryParams m_retrievalParams;
+ vector<SearchTrieRequest<strings::LevenshteinDFA>> m_tokenRequests;
+ SearchTrieRequest<strings::PrefixDFAModifier<strings::LevenshteinDFA>> m_prefixTokenRequest;
// Pointer to the most nested region filled during geocoding.
Region const * m_lastMatchedRegion;
diff --git a/search/processor.cpp b/search/processor.cpp
index b4037a98fa..80ce3d1fc4 100644
--- a/search/processor.cpp
+++ b/search/processor.cpp
@@ -626,20 +626,16 @@ void Processor::InitParams(QueryParams & params)
for (size_t i = 0; i < tokensCount; ++i)
params.m_tokens[i].push_back(m_tokens[i]);
- params.m_types.resize(tokensCount + (m_prefix.empty() ? 0 : 1));
- for (auto & types : params.m_types)
- types.clear();
+ params.m_typeIndices.resize(tokensCount + (m_prefix.empty() ? 0 : 1));
+ for (auto & indices : params.m_typeIndices)
+ indices.clear();
// Add names of categories (and synonyms).
Classificator const & c = classif();
auto addSyms = [&](size_t i, uint32_t t)
{
- QueryParams::TSynonymsVector & v = params.GetTokens(i);
-
- params.m_types[i].push_back(t);
-
uint32_t const index = c.GetIndexForType(t);
- v.push_back(FeatureTypeToString(index));
+ params.m_typeIndices[i].push_back(index);
// v2-version MWM has raw classificator types in search index prefix, so
// do the hack: add synonyms for old convention if needed.
@@ -649,7 +645,7 @@ void Processor::InitParams(QueryParams & params)
if (type >= 0)
{
ASSERT(type == 70 || type > 4000, (type));
- v.push_back(FeatureTypeToString(static_cast<uint32_t>(type)));
+ params.m_typeIndices[i].push_back(static_cast<uint32_t>(type));
}
}
};
diff --git a/search/query_params.cpp b/search/query_params.cpp
index c5aa8b9bee..7e65aedf08 100644
--- a/search/query_params.cpp
+++ b/search/query_params.cpp
@@ -59,7 +59,7 @@ void QueryParams::Clear()
{
m_tokens.clear();
m_prefixTokens.clear();
- m_types.clear();
+ m_typeIndices.clear();
m_langs.clear();
m_scale = scales::GetUpperScale();
}
@@ -67,7 +67,7 @@ void QueryParams::Clear()
bool QueryParams::IsCategorySynonym(size_t i) const
{
ASSERT_LESS(i, GetNumTokens(), ());
- return !m_types[i].empty();
+ return !m_typeIndices[i].empty();
}
bool QueryParams::IsPrefixToken(size_t i) const
@@ -111,6 +111,16 @@ bool QueryParams::IsNumberTokens(size_t start, size_t end) const
return true;
}
+void QueryParams::RemoveToken(size_t i)
+{
+ ASSERT_LESS(i, GetNumTokens(), ());
+ if (i == m_tokens.size())
+ m_prefixTokens.clear();
+ else
+ m_tokens.erase(m_tokens.begin() + i);
+ m_typeIndices.erase(m_typeIndices.begin() + i);
+}
+
string DebugPrint(search::QueryParams const & params)
{
ostringstream os;
diff --git a/search/query_params.hpp b/search/query_params.hpp
index 9a08185f21..f4a8836a0a 100644
--- a/search/query_params.hpp
+++ b/search/query_params.hpp
@@ -24,6 +24,8 @@ struct QueryParams
return m_prefixTokens.empty() ? m_tokens.size() : m_tokens.size() + 1;
}
+ inline bool LastTokenIsPrefix() const { return !m_prefixTokens.empty(); }
+
inline bool IsEmpty() const { return GetNumTokens() == 0; }
inline bool IsLangExist(int8_t lang) const { return m_langs.count(lang) != 0; }
@@ -38,9 +40,11 @@ struct QueryParams
// synonyms.
bool IsNumberTokens(size_t start, size_t end) const;
+ void RemoveToken(size_t i);
+
vector<TSynonymsVector> m_tokens;
TSynonymsVector m_prefixTokens;
- vector<vector<uint32_t>> m_types;
+ vector<vector<uint32_t>> m_typeIndices;
TLangsSet m_langs;
int m_scale = scales::GetUpperScale();
diff --git a/search/retrieval.cpp b/search/retrieval.cpp
index 77cee7c994..e68e420ed3 100644
--- a/search/retrieval.cpp
+++ b/search/retrieval.cpp
@@ -8,6 +8,7 @@
#include "search/search_trie.hpp"
#include "search/token_slice.hpp"
+#include "indexer/classificator.hpp"
#include "indexer/feature.hpp"
#include "indexer/feature_algo.hpp"
#include "indexer/feature_data.hpp"
@@ -24,9 +25,6 @@
#include "coding/compressed_bit_vector.hpp"
#include "coding/reader_wrapper.hpp"
-#include "base/dfa_helpers.hpp"
-#include "base/uni_string_dfa.hpp"
-
#include "std/algorithm.hpp"
using namespace strings;
@@ -44,8 +42,8 @@ public:
{
}
- template <typename TValue>
- void operator()(TValue const & value)
+ template <typename Value>
+ void operator()(Value const & value)
{
if ((++m_counter & 0xFF) == 0)
BailIfCancelled(m_cancellable);
@@ -79,16 +77,16 @@ public:
binary_search(m_modified.begin(), m_modified.end(), featureIndex);
}
- template <typename TFn>
- void ForEachModifiedOrCreated(TFn && fn)
+ template <typename Fn>
+ void ForEachModifiedOrCreated(Fn && fn)
{
ForEach(m_modified, fn);
ForEach(m_created, fn);
}
private:
- template <typename TFn>
- void ForEach(vector<uint32_t> const & features, TFn & fn)
+ template <typename Fn>
+ void ForEach(vector<uint32_t> const & features, Fn & fn)
{
auto & editor = Editor::Instance();
for (auto const index : features)
@@ -111,70 +109,62 @@ unique_ptr<coding::CompressedBitVector> SortFeaturesAndBuildCBV(vector<uint64_t>
return coding::CompressedBitVectorBuilder::FromBitPositions(move(features));
}
-// Checks that any from the |second| matches any from the |first|. In
-// ambiguous case when |second| is empty, returns true.
-template <class TComp, class T>
-bool IsFirstMatchesSecond(vector<T> const & first, vector<T> const & second, TComp const & comp)
+template <typename DFA>
+bool MatchesByName(vector<UniString> const & tokens, vector<DFA> const & dfas)
{
- if (second.empty())
- return true;
-
- for (auto const & s : second)
+ for (auto const & dfa : dfas)
{
- for (auto const & f : first)
+ for (auto const & token : tokens)
{
- if (comp(f, s))
+ auto it = dfa.Begin();
+ DFAMove(it, token);
+ if (it.Accepts())
return true;
}
}
+
return false;
}
-bool MatchFeatureByNameAndType(FeatureType const & ft, QueryParams const & params)
+template <typename DFA>
+bool MatchesByType(feature::TypesHolder const & types, vector<DFA> const & dfas)
{
- using namespace strings;
+ if (dfas.empty())
+ return false;
- auto const prefixMatch = [](UniString const & s1, UniString const & s2) {
- return StartsWith(s1, s2);
- };
+ auto const & c = classif();
+
+ for (auto const & type : types)
+ {
+ UniString const s = FeatureTypeToString(c.GetIndexForType(type));
+
+ for (auto const & dfa : dfas)
+ {
+ auto it = dfa.Begin();
+ DFAMove(it, s);
+ if (it.Accepts())
+ return true;
+ }
+ }
- auto const fullMatch = [](UniString const & s1, UniString const & s2) { return s1 == s2; };
+ return false;
+}
+template <typename DFA>
+bool MatchFeatureByNameAndType(FeatureType const & ft, SearchTrieRequest<DFA> const & request)
+{
feature::TypesHolder th(ft);
bool matched = false;
- ft.ForEachName([&](int8_t lang, string const & utf8Name)
+ ft.ForEachName([&](int8_t lang, string const & name)
{
- if (utf8Name.empty() || !params.IsLangExist(lang))
+ if (name.empty() || !request.IsLangExist(lang))
return true /* continue ForEachName */;
- vector<UniString> nameTokens;
- NormalizeAndTokenizeString(utf8Name, nameTokens, Delimiters());
-
- for (size_t i = 0; i < params.GetNumTokens(); ++i)
- {
- auto const isPrefix = params.IsPrefixToken(i);
- auto const & syms = params.GetTokens(i);
-
- if (!IsFirstMatchesSecond(nameTokens, syms, isPrefix ? prefixMatch : fullMatch))
- {
- // Checks types in case of names mismatch.
- auto const & types = params.m_types[i];
- auto typeMatched = false;
-
- for (auto const & type : types)
- {
- if (th.Has(type))
- {
- typeMatched = true;
- break;
- }
- }
-
- if (!typeMatched)
- return true /* continue ForEachName */;
- }
- }
+ vector<UniString> tokens;
+ NormalizeAndTokenizeString(name, tokens, Delimiters());
+ if (!MatchesByName(tokens, request.m_names) && !MatchesByType(th, request.m_categories))
+ return true /* continue ForEachName */;
matched = true;
return false /* break ForEachName */;
@@ -205,79 +195,48 @@ bool MatchFeatureByPostcode(FeatureType const & ft, TokenSlice const & slice)
return true;
}
-template<typename TValue>
-using TrieRoot = trie::Iterator<ValueList<TValue>>;
+template<typename Value>
+using TrieRoot = trie::Iterator<ValueList<Value>>;
-template <typename TValue, typename TFn>
-void WithSearchTrieRoot(MwmValue & value, TFn && fn)
+template <typename Value, typename Fn>
+void WithSearchTrieRoot(MwmValue & value, Fn && fn)
{
serial::CodingParams codingParams(trie::GetCodingParams(value.GetHeader().GetDefCodingParams()));
ModelReaderPtr searchReader = value.m_cont.GetReader(SEARCH_INDEX_FILE_TAG);
- auto const trieRoot = trie::ReadTrie<SubReaderWrapper<Reader>, ValueList<TValue>>(
- SubReaderWrapper<Reader>(searchReader.GetPtr()), SingleValueSerializer<TValue>(codingParams));
+ auto const trieRoot = trie::ReadTrie<SubReaderWrapper<Reader>, ValueList<Value>>(
+ SubReaderWrapper<Reader>(searchReader.GetPtr()), SingleValueSerializer<Value>(codingParams));
return fn(*trieRoot);
}
// Retrieves from the search index corresponding to |value| all
// features matching to |params|.
-template <typename TValue>
+template <typename Value, typename DFA>
unique_ptr<coding::CompressedBitVector> RetrieveAddressFeaturesImpl(
- MwmContext const & context, my::Cancellable const & cancellable, QueryParams const & params)
+ MwmContext const & context, my::Cancellable const & cancellable,
+ SearchTrieRequest<DFA> const & request)
{
EditedFeaturesHolder holder(context.GetId());
vector<uint64_t> features;
FeaturesCollector collector(cancellable, features);
- // TODO (@y): this code highly depends on the fact that the function
- // is called on a single-token query params. In any case, this code
- // must be fixed ASAP, as this is the wrong place for DFA creation.
- ASSERT_EQUAL(1, params.GetNumTokens(), ());
+ WithSearchTrieRoot<Value>(context.m_value, [&](TrieRoot<Value> const & root) {
+ MatchFeaturesInTrie(
+ request, root,
+ [&holder](uint32_t featureIndex) { return !holder.ModifiedOrDeleted(featureIndex); },
+ collector);
+ });
- for (size_t i = 0; i < params.GetNumTokens(); ++i)
- {
- if (params.IsPrefixToken(i))
- {
- using DFA = PrefixDFAModifier<UniStringDFA>;
-
- SearchTrieRequest<DFA> request;
- for (auto const & sym : params.GetTokens(i))
- request.m_dfas.emplace_back(UniStringDFA(sym));
- request.m_langs = params.m_langs;
-
- WithSearchTrieRoot<TValue>(context.m_value, [&](TrieRoot<TValue> const & root) {
- MatchFeaturesInTrie(
- request, root,
- [&holder](uint32_t featureIndex) { return !holder.ModifiedOrDeleted(featureIndex); },
- collector);
- });
- }
- else
- {
- using DFA = UniStringDFA;
-
- SearchTrieRequest<DFA> request;
- for (auto const & sym : params.GetTokens(i))
- request.m_dfas.emplace_back(sym);
- request.m_langs = params.m_langs;
-
- WithSearchTrieRoot<TValue>(context.m_value, [&](TrieRoot<TValue> const & root) {
- MatchFeaturesInTrie(
- request, root,
- [&holder](uint32_t featureIndex) { return !holder.ModifiedOrDeleted(featureIndex); },
- collector);
- });
- }
- }
holder.ForEachModifiedOrCreated([&](FeatureType & ft, uint64_t index) {
- if (MatchFeatureByNameAndType(ft, params))
+ if (MatchFeatureByNameAndType(ft, request))
features.push_back(index);
});
+
return SortFeaturesAndBuildCBV(move(features));
}
-template <typename TValue>
+template <typename Value>
unique_ptr<coding::CompressedBitVector> RetrievePostcodeFeaturesImpl(
MwmContext const & context, my::Cancellable const & cancellable, TokenSlice const & slice)
{
@@ -285,7 +244,7 @@ unique_ptr<coding::CompressedBitVector> RetrievePostcodeFeaturesImpl(
vector<uint64_t> features;
FeaturesCollector collector(cancellable, features);
- WithSearchTrieRoot<TValue>(context.m_value, [&](TrieRoot<TValue> const & root) {
+ WithSearchTrieRoot<Value>(context.m_value, [&](TrieRoot<Value> const & root) {
MatchPostcodesInTrie(
slice, root,
[&holder](uint32_t featureIndex) { return !holder.ModifiedOrDeleted(featureIndex); },
@@ -326,28 +285,28 @@ unique_ptr<coding::CompressedBitVector> RetrieveGeometryFeaturesImpl(
template <typename T>
struct RetrieveAddressFeaturesAdaptor
{
- template <typename... TArgs>
- unique_ptr<coding::CompressedBitVector> operator()(TArgs &&... args)
+ template <typename... Args>
+ unique_ptr<coding::CompressedBitVector> operator()(Args &&... args)
{
- return RetrieveAddressFeaturesImpl<T>(forward<TArgs>(args)...);
+ return RetrieveAddressFeaturesImpl<T>(forward<Args>(args)...);
}
};
template <typename T>
struct RetrievePostcodeFeaturesAdaptor
{
- template <typename... TArgs>
- unique_ptr<coding::CompressedBitVector> operator()(TArgs &&... args)
+ template <typename... Args>
+ unique_ptr<coding::CompressedBitVector> operator()(Args &&... args)
{
- return RetrievePostcodeFeaturesImpl<T>(forward<TArgs>(args)...);
+ return RetrievePostcodeFeaturesImpl<T>(forward<Args>(args)...);
}
};
template <template <typename> class T>
struct Selector
{
- template <typename... TArgs>
- unique_ptr<coding::CompressedBitVector> operator()(MwmContext const & context, TArgs &&... args)
+ template <typename... Args>
+ unique_ptr<coding::CompressedBitVector> operator()(MwmContext const & context, Args &&... args)
{
version::MwmTraits mwmTraits(context.m_value.GetMwmVersion().GetFormat());
@@ -355,25 +314,33 @@ struct Selector
version::MwmTraits::SearchIndexFormat::FeaturesWithRankAndCenter)
{
T<FeatureWithRankAndCenter> t;
- return t(context, forward<TArgs>(args)...);
+ return t(context, forward<Args>(args)...);
}
if (mwmTraits.GetSearchIndexFormat() ==
version::MwmTraits::SearchIndexFormat::CompressedBitVector)
{
T<FeatureIndexValue> t;
- return t(context, forward<TArgs>(args)...);
+ return t(context, forward<Args>(args)...);
}
return unique_ptr<coding::CompressedBitVector>();
}
};
} // namespace
-unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(MwmContext const & context,
- my::Cancellable const & cancellable,
- QueryParams const & params)
+unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(
+ MwmContext const & context, my::Cancellable const & cancellable,
+ SearchTrieRequest<LevenshteinDFA> const & request)
+{
+ Selector<RetrieveAddressFeaturesAdaptor> selector;
+ return selector(context, cancellable, request);
+}
+
+unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(
+ MwmContext const & context, my::Cancellable const & cancellable,
+ SearchTrieRequest<PrefixDFAModifier<LevenshteinDFA>> const & request)
{
Selector<RetrieveAddressFeaturesAdaptor> selector;
- return selector(context, cancellable, params);
+ return selector(context, cancellable, request);
}
unique_ptr<coding::CompressedBitVector> RetrievePostcodeFeatures(
diff --git a/search/retrieval.hpp b/search/retrieval.hpp
index 88a532528b..569f642f33 100644
--- a/search/retrieval.hpp
+++ b/search/retrieval.hpp
@@ -1,10 +1,13 @@
#pragma once
+#include "search/feature_offset_match.hpp"
#include "search/query_params.hpp"
#include "geometry/rect2d.hpp"
#include "base/cancellable.hpp"
+#include "base/dfa_helpers.hpp"
+#include "base/levenshtein_dfa.hpp"
#include "std/unique_ptr.hpp"
@@ -20,11 +23,15 @@ namespace search
class MwmContext;
class TokenSlice;
-// Retrieves from the search index corresponding to |value| all
-// features matching to |params|.
-unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(MwmContext const & context,
- my::Cancellable const & cancellable,
- QueryParams const & params);
+// Following functions retrieve from the search index corresponding to
+// |value| all features matching to |request|.
+unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(
+ MwmContext const & context, my::Cancellable const & cancellable,
+ SearchTrieRequest<strings::LevenshteinDFA> const & request);
+
+unique_ptr<coding::CompressedBitVector> RetrieveAddressFeatures(
+ MwmContext const & context, my::Cancellable const & cancellable,
+ SearchTrieRequest<strings::PrefixDFAModifier<strings::LevenshteinDFA>> const & request);
// Retrieves from the search index corresponding to |value| all
// postcodes matching to |slice|.
diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp
index 8af3dfbd3a..610e313d2e 100644
--- a/search/search_integration_tests/processor_test.cpp
+++ b/search/search_integration_tests/processor_test.cpp
@@ -750,5 +750,41 @@ UNIT_CLASS_TEST(ProcessorTest, HotelsFiltering)
TEST(MatchResults(rules, request.Results()), ());
}
}
+
+UNIT_CLASS_TEST(ProcessorTest, FuzzyMatch)
+{
+ string const countryName = "Wonderland";
+ TestCountry country(m2::PointD(10, 10), countryName, "en");
+
+ TestCity city(m2::PointD(0, 0), "Москва", "ru", 100 /* rank */);
+ TestStreet street(vector<m2::PointD>{m2::PointD(-0.001, -0.001), m2::PointD(0.001, 0.001)},
+ "Ленинградский", "ru");
+ TestPOI bar(m2::PointD(0, 0), "Черчилль", "ru");
+ bar.SetTypes({{"amenity", "pub"}});
+
+ BuildWorld([&](TestMwmBuilder & builder) {
+ builder.Add(country);
+ builder.Add(city);
+ });
+
+ auto id = BuildCountry(countryName, [&](TestMwmBuilder & builder) {
+ builder.Add(street);
+ builder.Add(bar);
+ });
+
+ SetViewport(m2::RectD(m2::PointD(-1.0, -1.0), m2::PointD(1.0, 1.0)));
+ {
+ TRules rules = {ExactMatch(id, bar)};
+ TEST(ResultsMatch("москва черчилль", "ru", rules), ());
+ TEST(ResultsMatch("москва ленинградский черчилль", "ru", rules), ());
+ TEST(ResultsMatch("москва ленинградский паб черчилль", "ru", rules), ());
+
+ TEST(ResultsMatch("масква лининградский черчиль", "ru", rules), ());
+ TEST(ResultsMatch("масква ленинргадский черчиль", "ru", rules), ());
+
+ // Too many errors, can't do anything.
+ TEST(ResultsMatch("масква ленинргадский чирчиль", "ru", TRules{}), ());
+ }
+}
} // namespace
} // namespace search