Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author mpimenov <mpimenov@users.noreply.github.com> 2017-02-06 17:11:04 +0300
committer GitHub <noreply@github.com> 2017-02-06 17:11:04 +0300
commit 21e0d54aadc4f11e846ed5dbc4b35e94d1add48a (patch)
tree 6539c01240893e319511cd12896404a110018d14
parent d719020c9cf0951001c499c3ece534a1c3da3355 (diff)
parent e09b14f438b37dbabbf530255537f2e1890a2cb5 (diff)
Merge pull request #5365 from ygorshenin/fix-name-score
[search] Fixed discrepancy in tokens between geocoder and ranker.
-rw-r--r-- search/geocoder.cpp 52
-rw-r--r-- search/processor.cpp 47
-rw-r--r-- search/ranker.cpp 2
-rw-r--r-- search/ranking_utils.cpp 33
-rw-r--r-- search/ranking_utils.hpp 20
-rw-r--r-- search/search_integration_tests/processor_test.cpp 28
6 files changed, 120 insertions, 62 deletions
diff --git a/search/geocoder.cpp b/search/geocoder.cpp
index 82a10dff1a..4585d44258 100644
--- a/search/geocoder.cpp
+++ b/search/geocoder.cpp
@@ -218,21 +218,6 @@ MwmSet::MwmHandle FindWorld(Index const & index, vector<shared_ptr<MwmInfo>> con
return handle;
}
-UniString AsciiToUniString(char const * s) { return UniString(s, s + strlen(s)); }
-
-bool IsStopWord(UniString const & s)
-{
- /// @todo Get all common used stop words and factor out this array into
- /// search_string_utils.cpp module for example.
- static char const * arr[] = {"a", "de", "da", "la"};
-
- static set<UniString> const kStopWords(
- make_transform_iterator(arr, &AsciiToUniString),
- make_transform_iterator(arr + ARRAY_SIZE(arr), &AsciiToUniString));
-
- return kStopWords.count(s) > 0;
-}
-
double Area(m2::RectD const & rect) { return rect.IsValid() ? rect.SizeX() * rect.SizeY() : 0; }
// Computes an average similaty between |rect| and |pivot|. By
@@ -375,43 +360,6 @@ void Geocoder::SetParams(Params const & params)
{
m_params = params;
- // Filter stop words.
- if (m_params.GetNumTokens() > 1)
- {
- for (size_t i = 0; i < m_params.GetNumTokens();)
- {
- if (m_params.IsPrefixToken(i))
- {
- ++i;
- continue;
- }
-
- auto & token = m_params.GetToken(i);
- if (IsStopWord(token.m_original))
- {
- m_params.RemoveToken(i);
- }
- else
- {
- my::EraseIf(token.m_synonyms, &IsStopWord);
- ++i;
- }
- }
-
- // If all tokens are stop words - give up.
- if (m_params.GetNumTokens() == 0)
- m_params = params;
- }
-
- // Remove all category synonyms for streets, as they're extracted
- // individually.
- for (size_t i = 0; i < m_params.GetNumTokens(); ++i)
- {
- auto & token = m_params.GetToken(i);
- if (IsStreetSynonym(token.m_original))
- m_params.GetTypeIndices(i).clear();
- }
-
m_tokenRequests.clear();
m_prefixTokenRequest.Clear();
for (size_t i = 0; i < m_params.GetNumTokens(); ++i)
diff --git a/search/processor.cpp b/search/processor.cpp
index 63d325e7e8..ab6caeb387 100644
--- a/search/processor.cpp
+++ b/search/processor.cpp
@@ -129,6 +129,42 @@ void SendStatistics(SearchParams const & params, m2::RectD const & viewport, Res
alohalytics::LogEvent("searchEmitResultsAndCoords", stats);
GetPlatform().GetMarketingService().SendMarketingEvent(marketing::kSearchEmitResultsAndCoords, {});
}
+
+// Removes all full-token stop words from |params|, unless |params|
+// consists of all such tokens.
+void RemoveStopWordsIfNeeded(QueryParams & params)
+{
+ size_t numStopWords = 0;
+ for (size_t i = 0; i < params.GetNumTokens(); ++i)
+ {
+ auto & token = params.GetToken(i);
+ if (!params.IsPrefixToken(i) && IsStopWord(token.m_original))
+ ++numStopWords;
+ }
+
+ if (numStopWords == params.GetNumTokens())
+ return;
+
+ for (size_t i = 0; i < params.GetNumTokens();)
+ {
+ if (params.IsPrefixToken(i))
+ {
+ ++i;
+ continue;
+ }
+
+ auto & token = params.GetToken(i);
+ if (IsStopWord(token.m_original))
+ {
+ params.RemoveToken(i);
+ }
+ else
+ {
+ my::EraseIf(token.m_synonyms, &IsStopWord);
+ ++i;
+ }
+ }
+}
} // namespace
// static
@@ -642,6 +678,17 @@ void Processor::InitParams(QueryParams & params)
auto & langs = params.GetLangs();
for (int i = 0; i < LANG_COUNT; ++i)
langs.insert(GetLanguage(i));
+
+ RemoveStopWordsIfNeeded(params);
+
+ // Remove all type indices for streets, as they're considered
+ // individually.
+ for (size_t i = 0; i < params.GetNumTokens(); ++i)
+ {
+ auto & token = params.GetToken(i);
+ if (IsStreetSynonym(token.m_original))
+ params.GetTypeIndices(i).clear();
+ }
}
void Processor::InitGeocoder(Geocoder::Params & params)
diff --git a/search/ranker.cpp b/search/ranker.cpp
index 93bdecc88f..84b4048f2c 100644
--- a/search/ranker.cpp
+++ b/search/ranker.cpp
@@ -188,7 +188,7 @@ class PreResult2Maker
if (!ft.GetName(lang, name))
continue;
vector<strings::UniString> tokens;
- SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
+ PrepareStringForMatching(name, tokens);
UpdateNameScore(tokens, slice, info.m_nameScore);
UpdateNameScore(tokens, sliceNoCategories, info.m_nameScore);
diff --git a/search/ranking_utils.cpp b/search/ranking_utils.cpp
index 9d8a97b978..16d6f4bcbc 100644
--- a/search/ranking_utils.cpp
+++ b/search/ranking_utils.cpp
@@ -1,11 +1,18 @@
#include "search/ranking_utils.hpp"
-#include "std/algorithm.hpp"
+#include "std/transform_iterator.hpp"
+
+#include <algorithm>
using namespace strings;
namespace search
{
+namespace
+{
+UniString AsciiToUniString(char const * s) { return UniString(s, s + strlen(s)); }
+} // namespace
+
namespace impl
{
bool FullMatch(QueryParams::Token const & token, UniString const & text)
@@ -30,6 +37,29 @@ bool PrefixMatch(QueryParams::Token const & token, UniString const & text)
}
} // namespace impl
+bool IsStopWord(UniString const & s)
+{
+ /// @todo Get all commonly used stop words and factor out this array into
+ /// search_string_utils.cpp module for example.
+ static char const * arr[] = {"a", "de", "da", "la"};
+
+ static std::set<UniString> const kStopWords(
+ make_transform_iterator(arr, &AsciiToUniString),
+ make_transform_iterator(arr + ARRAY_SIZE(arr), &AsciiToUniString));
+
+ return kStopWords.count(s) > 0;
+}
+
+void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens)
+{
+ auto filter = [&tokens](strings::UniString const & token)
+ {
+ if (!IsStopWord(token))
+ tokens.push_back(token);
+ };
+ SplitUniString(NormalizeAndSimplifyString(name), filter, Delimiters());
+}
+
string DebugPrint(NameScore score)
{
switch (score)
@@ -43,5 +73,4 @@ string DebugPrint(NameScore score)
}
return "Unknown";
}
-
} // namespace search
diff --git a/search/ranking_utils.hpp b/search/ranking_utils.hpp
index 0d6a58046b..bf359e42b2 100644
--- a/search/ranking_utils.hpp
+++ b/search/ranking_utils.hpp
@@ -9,10 +9,10 @@
#include "base/stl_add.hpp"
#include "base/string_utils.hpp"
-#include "std/cstdint.hpp"
-#include "std/limits.hpp"
-#include "std/string.hpp"
-#include "std/vector.hpp"
+#include <cstdint>
+#include <limits>
+#include <string>
+#include <vector>
namespace search
{
@@ -38,19 +38,25 @@ enum NameScore
NAME_SCORE_COUNT
};
+// Returns true when |s| is a stop-word and may be removed from a query.
+bool IsStopWord(strings::UniString const & s);
+
+// Normalizes, simplifies and splits string, removes stop-words.
+void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens);
+
template <typename TSlice>
-NameScore GetNameScore(string const & name, TSlice const & slice)
+NameScore GetNameScore(std::string const & name, TSlice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
- vector<strings::UniString> tokens;
+ std::vector<strings::UniString> tokens;
SplitUniString(NormalizeAndSimplifyString(name), MakeBackInsertFunctor(tokens), Delimiters());
return GetNameScore(tokens, slice);
}
template <typename TSlice>
-NameScore GetNameScore(vector<strings::UniString> const & tokens, TSlice const & slice)
+NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
diff --git a/search/search_integration_tests/processor_test.cpp b/search/search_integration_tests/processor_test.cpp
index 114205fbd5..ef8fdc3bc4 100644
--- a/search/search_integration_tests/processor_test.cpp
+++ b/search/search_integration_tests/processor_test.cpp
@@ -805,5 +805,33 @@ UNIT_CLASS_TEST(ProcessorTest, SpacesInCategories)
TEST(ResultsMatch("Москва ночной клуб", "ru", rules), ());
}
}
+
+UNIT_CLASS_TEST(ProcessorTest, StopWords)
+{
+ TestCountry country(m2::PointD(0, 0), "France", "en");
+ TestCity city(m2::PointD(0, 0), "Paris", "en", 100 /* rank */);
+ TestStreet street(
+ vector<m2::PointD>{m2::PointD(-0.001, -0.001), m2::PointD(0, 0), m2::PointD(0.001, 0.001)},
+ "Rue de la Paix", "en");
+
+ BuildWorld([&](TestMwmBuilder & builder) {
+ builder.Add(country);
+ builder.Add(city);
+ });
+
+ auto id = BuildCountry(country.GetName(), [&](TestMwmBuilder & builder) { builder.Add(street); });
+
+ {
+ auto request = MakeRequest("la France à Paris Rue de la Paix");
+
+ TRules rules = {ExactMatch(id, street)};
+
+ auto const & results = request->Results();
+ TEST(MatchResults(rules, results), ());
+
+ auto const & info = results[0].GetRankingInfo();
+ TEST_EQUAL(info.m_nameScore, NAME_SCORE_FULL_MATCH, ());
+ }
+}
} // namespace
} // namespace search