Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Yuri Gorshenin <mipt.vi002@gmail.com> 2017-05-25 20:46:17 +0300
committer GitHub <noreply@github.com> 2017-05-25 20:46:17 +0300
commit 849121e71f5c227370531b39017a9dc1f0363200 (patch)
tree 35cd0b9ffaa586849c74286c62340c6a050f76af
parent f87e622b33bdc265b64700cae88dea0f06593570 (diff)
parent 83dd94fd2bcb1b829d18287467b19e40c23b75ea (diff)
Merge pull request #6129 from mpimenov/search-name-scores (tag: beta-829)
[search] Changed the name scoring scheme.
-rw-r--r-- search/intermediate_result.cpp 14
-rw-r--r-- search/locality_scorer.cpp 2
-rw-r--r-- search/ranking_info.cpp 25
-rw-r--r-- search/ranking_utils.cpp 3
-rw-r--r-- search/ranking_utils.hpp 40
-rwxr-xr-x search/search_quality/scoring_model.py 9
-rw-r--r-- search/search_tests/ranking_tests.cpp 12
7 files changed, 54 insertions, 51 deletions
diff --git a/search/intermediate_result.cpp b/search/intermediate_result.cpp
index f0c37fef80..81f53e5bb6 100644
--- a/search/intermediate_result.cpp
+++ b/search/intermediate_result.cpp
@@ -64,7 +64,8 @@ void ProcessMetadata(FeatureType const & ft, Result::Metadata & meta)
meta.m_hotelRating = rating;
int pricing;
- strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing);
+ if (!strings::to_int(src.Get(feature::Metadata::FMD_PRICE_RATE), pricing))
+ pricing = 0;
string pricingStr;
CHECK_GREATER_OR_EQUAL(pricing, 0, ("Pricing must be positive!"));
for (auto i = 0; i < pricing; i++)
@@ -292,11 +293,12 @@ bool PreResult2::IsStreet() const
string PreResult2::DebugPrint() const
{
stringstream ss;
- ss << "{ IntermediateResult: " <<
- "Name: " << m_str <<
- "; Type: " << GetBestType() <<
- "; Rank: " << static_cast<int>(m_info.m_rank) <<
- "; Distance: " << m_distance << " }";
+ ss << "IntermediateResult [ "
+ << "Name: " << m_str
+ << "; Type: " << GetBestType()
+ << "; Ranking info: " << search::DebugPrint(m_info)
+ << "; Linear model rank: " << m_info.GetLinearModelRank()
+ << " ]";
return ss.str();
}
diff --git a/search/locality_scorer.cpp b/search/locality_scorer.cpp
index 3fecd7ec4b..06583618e9 100644
--- a/search/locality_scorer.cpp
+++ b/search/locality_scorer.cpp
@@ -19,7 +19,7 @@ namespace
{
bool IsAlmostFullMatch(NameScore score)
{
- return score == NAME_SCORE_FULL_MATCH_PREFIX || score == NAME_SCORE_FULL_MATCH;
+ return score == NAME_SCORE_PREFIX || score == NAME_SCORE_FULL_MATCH;
}
} // namespace
diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp
index 1d299d6a8c..05f1d3659a 100644
--- a/search/ranking_info.cpp
+++ b/search/ranking_info.cpp
@@ -11,19 +11,26 @@ namespace
{
// See search/search_quality/scoring_model.py for details. In short,
// these coeffs correspond to coeffs in a linear model.
-double const kDistanceToPivot = -1.0000000;
-double const kRank = 0.7165246;
-double const kFalseCats = -0.3833900;
+double const kDistanceToPivot = -0.37897824370302247;
+double const kRank = 1.0;
+double const kFalseCats = -0.05775625793967508;
+
double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
- -0.1069757 /* Zero */, -0.0250079 /* Substring Prefix */, 0.0447104 /* Substring */,
- 0.0872732 /* Full Match Prefix */, 0.0872732 /* Full Match */
+ -0.11436302557264734 /* Zero */
+ , 0.014295634567960331 /* Substring */
+ , 0.046219090910780115 /* Prefix */
+ , 0.05384830009390816 /* Full Match */
};
double const kSearchType[SearchModel::SEARCH_TYPE_COUNT] = {
- -0.3884116 /* POI */, -0.3884116 /* Building */,
- -0.3214653 /* Street */, -0.3357469 /* Unclassified */,
- -0.4341714 /* Village */, 0.2721947 /* City */,
- 0.4708555 /* State */, 0.7367450 /* Country */
+ -0.09164609318265761 /* POI */
+ , -0.09164609318265761 /* Building */
+ , -0.0805969548653964 /* Street */
+ , -0.030493728520630793 /* Unclassified */
+ , -0.19242203325862917 /* Village */
+ , -0.10945592241057521 /* City */
+ , 0.19250143015921584 /* State */
+ , 0.31211330207867427 /* Country */
};
double TransformDistance(double distance)
diff --git a/search/ranking_utils.cpp b/search/ranking_utils.cpp
index 16d6f4bcbc..97d0c5792f 100644
--- a/search/ranking_utils.cpp
+++ b/search/ranking_utils.cpp
@@ -65,9 +65,8 @@ string DebugPrint(NameScore score)
switch (score)
{
case NAME_SCORE_ZERO: return "Zero";
- case NAME_SCORE_SUBSTRING_PREFIX: return "Substring Prefix";
case NAME_SCORE_SUBSTRING: return "Substring";
- case NAME_SCORE_FULL_MATCH_PREFIX: return "Full Match Prefix";
+ case NAME_SCORE_PREFIX: return "Prefix";
case NAME_SCORE_FULL_MATCH: return "Full Match";
case NAME_SCORE_COUNT: return "Count";
}
diff --git a/search/ranking_utils.hpp b/search/ranking_utils.hpp
index bf359e42b2..29a6ae0102 100644
--- a/search/ranking_utils.hpp
+++ b/search/ranking_utils.hpp
@@ -30,10 +30,9 @@ bool PrefixMatch(QueryParams::Token const & token, strings::UniString const & te
enum NameScore
{
NAME_SCORE_ZERO = 0,
- NAME_SCORE_SUBSTRING_PREFIX = 1,
- NAME_SCORE_SUBSTRING = 2,
- NAME_SCORE_FULL_MATCH_PREFIX = 3,
- NAME_SCORE_FULL_MATCH = 4,
+ NAME_SCORE_SUBSTRING = 1,
+ NAME_SCORE_PREFIX = 2,
+ NAME_SCORE_FULL_MATCH = 3,
NAME_SCORE_COUNT
};
@@ -44,8 +43,8 @@ bool IsStopWord(strings::UniString const & s);
// Normalizes, simplifies and splits string, removes stop-words.
void PrepareStringForMatching(std::string const & name, std::vector<strings::UniString> & tokens);
-template <typename TSlice>
-NameScore GetNameScore(std::string const & name, TSlice const & slice)
+template <typename Slice>
+NameScore GetNameScore(std::string const & name, Slice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
@@ -55,8 +54,8 @@ NameScore GetNameScore(std::string const & name, TSlice const & slice)
return GetNameScore(tokens, slice);
}
-template <typename TSlice>
-NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice const & slice)
+template <typename Slice>
+NameScore GetNameScore(std::vector<strings::UniString> const & tokens, Slice const & slice)
{
if (slice.Empty())
return NAME_SCORE_ZERO;
@@ -75,18 +74,19 @@ NameScore GetNameScore(std::vector<strings::UniString> const & tokens, TSlice co
if (!match)
continue;
- if (impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]))
- {
- if (m == n)
- return NAME_SCORE_FULL_MATCH;
- score = max(score, NAME_SCORE_SUBSTRING);
- }
- if (lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]))
- {
- if (m == n)
- return NAME_SCORE_FULL_MATCH_PREFIX;
- score = max(score, NAME_SCORE_SUBSTRING_PREFIX);
- }
+ bool const fullMatch = impl::FullMatch(slice.Get(m - 1), tokens[offset + m - 1]);
+ bool const prefixMatch =
+ lastTokenIsPrefix && impl::PrefixMatch(slice.Get(m - 1), tokens[offset + m - 1]);
+ if (!fullMatch && !prefixMatch)
+ continue;
+
+ if (m == n && fullMatch)
+ return NAME_SCORE_FULL_MATCH;
+
+ if (offset == 0)
+ score = max(score, NAME_SCORE_PREFIX);
+
+ score = max(score, NAME_SCORE_SUBSTRING);
}
return score;
}
diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index 12cfa8d9de..3a6b259e56 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -16,7 +16,7 @@ import sys
MAX_DISTANCE_METERS = 2e6
MAX_RANK = 255
RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
-NAME_SCORES = ['Zero', 'Substring Prefix', 'Substring', 'Full Match Prefix', 'Full Match']
+NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
@@ -25,8 +25,6 @@ FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
def transform_name_score(value, categories_match):
if categories_match == 1:
return 'Zero'
- elif value == 'Full Match Prefix':
- return 'Full Match'
else:
return value
@@ -40,10 +38,6 @@ def normalize_data(data):
cats = data['PureCats'].combine(data['FalseCats'], max)
- # Full prefix match is unified with a full match as these features
- # are collinear. But we need both of them as they're also used in
- # locality sorting.
- #
# TODO (@y, @m): do forward/backward/subset selection of features
# instead of this merging. It would be great to conduct PCA on
# the features too.
@@ -277,7 +271,6 @@ def main(args):
# Following code restores coeffs for merged features.
ws[FEATURES.index('Building')] = ws[FEATURES.index('POI')]
- ws[FEATURES.index('Full Match Prefix')] = ws[FEATURES.index('Full Match')]
ndcgs = compute_ndcgs_for_ws(data, ws)
diff --git a/search/search_tests/ranking_tests.cpp b/search/search_tests/ranking_tests.cpp
index e9dbc4af09..2b10b73d51 100644
--- a/search/search_tests/ranking_tests.cpp
+++ b/search/search_tests/ranking_tests.cpp
@@ -45,12 +45,14 @@ UNIT_TEST(NameTest_Smoke)
TEST_EQUAL(GetScore("New York", "Central Park, New York, US", TokenRange(2, 4)),
NAME_SCORE_FULL_MATCH, ());
TEST_EQUAL(GetScore("New York", "York", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
- TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH_PREFIX,
- ());
+ TEST_EQUAL(GetScore("Moscow", "Red Square Mosc", TokenRange(2, 3)), NAME_SCORE_PREFIX, ());
TEST_EQUAL(GetScore("Moscow", "Red Square Moscow", TokenRange(2, 3)), NAME_SCORE_FULL_MATCH, ());
- TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING_PREFIX, ());
+ TEST_EQUAL(GetScore("San Francisco", "Fran", TokenRange(0, 1)), NAME_SCORE_SUBSTRING, ());
TEST_EQUAL(GetScore("San Francisco", "Fran ", TokenRange(0, 1)), NAME_SCORE_ZERO, ());
- TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_FULL_MATCH_PREFIX,
- ());
+ TEST_EQUAL(GetScore("San Francisco", "Sa", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
+ TEST_EQUAL(GetScore("San Francisco", "San ", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
+ TEST_EQUAL(GetScore("Лермонтовъ", "Лермонтов", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
+ TEST_EQUAL(GetScore("фото на документы", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
+ TEST_EQUAL(GetScore("фотоателье", "фото", TokenRange(0, 1)), NAME_SCORE_PREFIX, ());
}
} // namespace