[search] Add the ErrorsMade feature to the linear ranking model.

author: Yuri Gorshenin <y@maps.me> 2017-09-05 16:40:18 +0300
committer: mpimenov <mpimenov@users.noreply.github.com> 2017-09-05 16:45:55 +0300
commit: 62e3177639416107240544835f3a10c4365e15cd (patch)
tree: 43b6dfaf544a8a9df5b3014b02f2ed38e809423c
parent: fc351bfed9bd6e4494878f730c8971ccbb49cd06 (diff)
5 files changed, 54 insertions, 62 deletions
diff --git a/qt/main.cpp b/qt/main.cpp
index e53f4e208c..9e089b1025 100644
--- a/qt/main.cpp
+++ b/qt/main.cpp
@@ -25,8 +25,10 @@
 #include <QtWidgets/QApplication>
 #include <QFileDialog>
 
+DEFINE_string(data_path, "", "Path to data directory");
 DEFINE_string(log_abort_level, my::ToString(my::GetDefaultLogAbortLevel()),
               "Log messages severity that causes termination.");
+DEFINE_string(resources_path, "", "Path to resources directory");
 
 namespace
 {
@@ -94,6 +96,12 @@ int main(int argc, char * argv[])
   google::SetUsageMessage("Desktop application.");
   google::ParseCommandLineFlags(&argc, &argv, true);
 
+  Platform & platform = GetPlatform();
+  if (!FLAGS_resources_path.empty())
+    platform.SetResourceDir(FLAGS_resources_path);
+  if (!FLAGS_data_path.empty())
+    platform.SetWritableDirForTests(FLAGS_data_path);
+
   my::LogLevel level;
   CHECK(my::FromString(FLAGS_log_abort_level, level), ());
   my::g_LogAbortLevel = level;
@@ -115,7 +123,7 @@ int main(int argc, char * argv[])
   alohalytics::Stats::Instance().SetDebugMode(true);
 #endif
 
-  GetPlatform().SetupMeasurementSystem();
+  platform.SetupMeasurementSystem();
 
   // display EULA if needed
   char const * settingsEULA = "EulaAccepted";
@@ -127,7 +135,7 @@ int main(int argc, char * argv[])
 
     string buffer;
     {
-      ReaderPtr<Reader> reader = GetPlatform().GetReader("eula.html");
+      ReaderPtr<Reader> reader = platform.GetReader("eula.html");
       reader.ReadAsString(buffer);
     }
     qt::InfoDialog eulaDialog(qAppName() + QString(" End User Licensing Agreement"), buffer.c_str(), NULL, buttons);
@@ -146,7 +154,7 @@ int main(int argc, char * argv[])
     qt::MainWindow::SetDefaultSurfaceFormat(apiOpenGLES3);
 
 #ifdef BUILD_DESIGNER
-    if (argc >= 2 && GetPlatform().IsFileExistsByFullPath(argv[1]))
+    if (argc >= 2 && platform.IsFileExistsByFullPath(argv[1]))
         mapcssFilePath = argv[1];
     if (0 == mapcssFilePath.length())
     {
diff --git a/search/ranking_info.cpp b/search/ranking_info.cpp
index bd0a52230a..f6812b0daa 100644
--- a/search/ranking_info.cpp
+++ b/search/ranking_info.cpp
@@ -11,26 +11,25 @@ namespace
 {
 // See search/search_quality/scoring_model.py for details.  In short,
 // these coeffs correspond to coeffs in a linear model.
-double const kDistanceToPivot = -0.37897824370302247;
-double const kRank = 1.0;
-double const kFalseCats = -0.05775625793967508;
-
+double const kDistanceToPivot = -1.0000000;
+double const kRank = 0.5238890;
+double const kFalseCats = -0.7319971;
+double const kErrorsMade = -0.0238639;
 double const kNameScore[NameScore::NAME_SCORE_COUNT] = {
-     -0.11436302557264734 /* Zero */
-    , 0.014295634567960331 /* Substring */
-    , 0.046219090910780115 /* Prefix */
-    , 0.05384830009390816 /* Full Match */
+  -0.1683931 /* Zero */,
+  0.0268117 /* Substring */,
+  0.0599575 /* Prefix */,
+  0.0816240 /* Full Match */
 };
-
 double const kType[Model::TYPE_COUNT] = {
-      -0.09164609318265761 /* POI */
-    , -0.09164609318265761 /* Building */
-    , -0.0805969548653964 /* Street */
-    , -0.030493728520630793 /* Unclassified */
-    , -0.19242203325862917 /* Village */
-    , -0.10945592241057521 /* City */
-    , 0.19250143015921584 /* State */
-    , 0.31211330207867427 /* Country */
+  -0.4322325 /* POI */,
+  -0.4322325 /* Building */,
+  -0.3823704 /* Street */,
+  -0.3747346 /* Unclassified */,
+  -0.4453585 /* Village */,
+  0.3900264 /* City */,
+  0.5397572 /* State */,
+  0.7049124 /* Country */
 };
 
 double TransformDistance(double distance)
@@ -48,6 +47,7 @@ void RankingInfo::PrintCSVHeader(ostream & os)
   os << "DistanceToPivot"
      << ",Rank"
      << ",NameScore"
+     << ",ErrorsMade"
      << ",SearchType"
      << ",PureCats"
      << ",FalseCats";
@@ -71,8 +71,13 @@ string DebugPrint(RankingInfo const & info)
 void RankingInfo::ToCSV(ostream & os) const
 {
   os << fixed;
-  os << m_distanceToPivot << "," << static_cast<int>(m_rank) << "," << DebugPrint(m_nameScore)
-     << "," << DebugPrint(m_type) << "," << m_pureCats << "," << m_falseCats;
+  os << m_distanceToPivot << ",";
+  os << static_cast<int>(m_rank) << ",";
+  os << DebugPrint(m_nameScore) << ",";
+  os << GetErrorsMade() << ",";
+  os << DebugPrint(m_type) << ",";
+  os << m_pureCats << ",";
+  os << m_falseCats;
 }
 
 double RankingInfo::GetLinearModelRank() const
@@ -96,7 +101,12 @@ double RankingInfo::GetLinearModelRank() const
     nameScore = NAME_SCORE_ZERO;
   }
 
-  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] + kType[m_type] +
-         m_falseCats * kFalseCats;
+  return kDistanceToPivot * distanceToPivot + kRank * rank + kNameScore[nameScore] +
+         kErrorsMade * GetErrorsMade() + kType[m_type] + m_falseCats * kFalseCats;
+}
+
+size_t RankingInfo::GetErrorsMade() const
+{
+  return m_errorsMade.IsValid() ? m_errorsMade.m_errorsMade : 0;
 }
 }  // namespace search
diff --git a/search/ranking_info.hpp b/search/ranking_info.hpp
index fc9af152aa..abd600314b 100644
--- a/search/ranking_info.hpp
+++ b/search/ranking_info.hpp
@@ -45,6 +45,8 @@ struct RankingInfo
   // Returns rank calculated by a linear model. Large values
   // correspond to important features.
   double GetLinearModelRank() const;
+
+  size_t GetErrorsMade() const;
 };
 
 string DebugPrint(RankingInfo const & info);
diff --git a/search/search_quality/sample.cpp b/search/search_quality/sample.cpp
index 988149c270..51f1c51692 100644
--- a/search/search_quality/sample.cpp
+++ b/search/search_quality/sample.cpp
@@ -99,7 +99,7 @@ bool Sample::DeserializeFromJSON(string const & jsonStr)
   }
   catch (my::Json::Exception const & e)
   {
-    LOG(LDEBUG, ("Can't parse sample:", e.Msg(), jsonStr));
+    LOG(LWARNING, ("Can't parse sample:", e.Msg(), jsonStr));
   }
   return false;
 }
diff --git a/search/search_quality/scoring_model.py b/search/search_quality/scoring_model.py
index 3a6b259e56..9e599092e7 100755
--- a/search/search_quality/scoring_model.py
+++ b/search/search_quality/scoring_model.py
@@ -2,11 +2,11 @@
 
 from math import exp, log
 from scipy.stats import pearsonr
-from sklearn import cross_validation, grid_search, svm
+from sklearn import svm
+from sklearn.model_selection import GridSearchCV, KFold
 import argparse
 import collections
 import itertools
-import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import random
@@ -18,8 +18,7 @@ MAX_RANK = 255
 RELEVANCES = {'Irrelevant': 0, 'Relevant': 1, 'Vital': 3}
 NAME_SCORES = ['Zero', 'Substring', 'Prefix', 'Full Match']
 SEARCH_TYPES = ['POI', 'Building', 'Street', 'Unclassified', 'Village', 'City', 'State', 'Country']
-
-FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats'] + NAME_SCORES + SEARCH_TYPES
+FEATURES = ['DistanceToPivot', 'Rank', 'FalseCats', 'ErrorsMade'] + NAME_SCORES + SEARCH_TYPES
 
 
 def transform_name_score(value, categories_match):
@@ -157,29 +156,6 @@ def transform_data(data):
     return xs, ys
 
 
-def plot_diagrams(xs, ys, features):
-    """
-    For each feature, plots histagrams of x * sign(y), where x is a
-    slice on the feature of a list of pairwise differences between
-    input feature-vectors and y is a list of pairwise differences
-    between relevances of the input feature-vectors.  Stong bias
-    toward positive or negative values in histograms indicates that
-    the current feature is important for ranking, as there is a
-    correlation between difference between features values and
-    relevancy.
-    """
-    for i, f in enumerate(features):
-        x = [x[i] * np.sign(y) for x, y in zip(xs, ys)]
-
-        l, r = min(x), max(x)
-        d = max(abs(l), abs(r))
-
-        plt.subplot(4, 4, i + 1)
-        plt.hist(x, bins=8, range=(-d, d))
-        plt.title(f)
-    plt.show()
-
-
 def show_pearson_statistics(xs, ys, features):
     """
     Shows info about Pearson coefficient between features and
@@ -241,7 +217,7 @@ def cpp_output(features, ws):
         else:
             print_const(f, w)
     print_array('kNameScore', 'NameScore::NAME_SCORE_COUNT', ns)
-    print_array('kSearchType', 'SearchModel::SEARCH_TYPE_COUNT', st)
+    print_array('kType', 'Model::TYPE_COUNT', st)
 
 
 def main(args):
@@ -249,20 +225,17 @@ def main(args):
     normalize_data(data)
 
     ndcgs = compute_ndcgs_without_ws(data);
-    print('Current NDCG: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
+    print('Current NDCG: {:.3f}, std: {:.3f}'.format(np.mean(ndcgs), np.std(ndcgs)))
     print()
 
     xs, ys = transform_data(data)
 
-    if args.plot:
-        plot_diagrams(xs, ys, FEATURES)
-
     clf = svm.LinearSVC(random_state=args.seed)
-    cv = cross_validation.KFold(len(ys), n_folds=5, shuffle=True, random_state=args.seed)
+    cv = KFold(n_splits=5, shuffle=True, random_state=args.seed)
 
     # "C" stands for the regularizer constant.
     grid = {'C': np.power(10.0, np.arange(-5, 6))}
-    gs = grid_search.GridSearchCV(clf, grid, scoring='accuracy', cv=cv)
+    gs = GridSearchCV(clf, grid, scoring='roc_auc', cv=cv)
     gs.fit(xs, ys)
 
     ws = gs.best_estimator_.coef_[0]
@@ -274,8 +247,8 @@ def main(args):
 
     ndcgs = compute_ndcgs_for_ws(data, ws)
 
-    print('NDCG mean: {}, std: {}'.format(np.mean(ndcgs), np.std(ndcgs)))
-    print('Accuracy: {}'.format(gs.best_score_))
+    print('NDCG mean: {:.3f}, std: {:.3f}'.format(np.mean(ndcgs), np.std(ndcgs)))
+    print('ROC AUC: {:.3f}'.format(gs.best_score_))
 
     if args.pearson:
         print()
@@ -292,7 +265,6 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument('--seed', help='random seed', type=int)
-    parser.add_argument('--plot', help='plot diagrams', action='store_true')
     parser.add_argument('--pearson', help='show pearson statistics', action='store_true')
     parser.add_argument('--cpp', help='generate output in the C++ format', action='store_true')
     args = parser.parse_args()
author	Yuri Gorshenin <y@maps.me>	2017-09-05 16:40:18 +0300
committer	mpimenov <mpimenov@users.noreply.github.com>	2017-09-05 16:45:55 +0300
commit	62e3177639416107240544835f3a10c4365e15cd (patch)
tree	43b6dfaf544a8a9df5b3014b02f2ed38e809423c
parent	fc351bfed9bd6e4494878f730c8971ccbb49cd06 (diff)