Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorygorshenin <mipt.vi002@gmail.com>2016-07-14 12:23:40 +0300
committerGitHub <noreply@github.com>2016-07-14 12:23:40 +0300
commit99345452166d0896e52bd831a86bb56282fe3855 (patch)
tree8e45cbc40963560373871a8bbd3eb3c4b695c348 /generator
parentbc98ef2a672d55fd7fb2ddbf97c573488d4ca574 (diff)
parent61babee342ffe9da76e920aba8c61294607ab2f1 (diff)
Merge pull request #3761 from mgsergio/booking-name-matching
[booking] Booking name matching
Diffstat (limited to 'generator')
-rw-r--r--generator/booking_dataset.cpp58
-rw-r--r--generator/booking_dataset.hpp5
-rw-r--r--generator/booking_quality_check/booking_quality_check.cpp19
-rw-r--r--generator/booking_scoring.cpp141
-rw-r--r--generator/booking_scoring.hpp22
-rw-r--r--generator/osm_element.cpp13
-rw-r--r--generator/osm_element.hpp3
7 files changed, 195 insertions, 66 deletions
diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp
index e8c103ee07..60745a6364 100644
--- a/generator/booking_dataset.cpp
+++ b/generator/booking_dataset.cpp
@@ -1,11 +1,11 @@
#include "generator/booking_dataset.hpp"
+#include "generator/booking_scoring.hpp"
+
#include "platform/local_country_file_utils.hpp"
#include "platform/platform.hpp"
#include "indexer/ftypes_matcher.hpp"
-#include "indexer/search_delimiters.hpp"
-#include "indexer/search_string_utils.hpp"
#include "geometry/distance_on_sphere.hpp"
@@ -171,45 +171,6 @@ vector<size_t> BookingDataset::GetNearestHotels(double lat, double lon, size_t l
return indexes;
}
-bool BookingDataset::MatchByName(string const & osmName,
- vector<size_t> const & bookingIndexes) const
-{
- return false;
-
- // Match name.
- // vector<strings::UniString> osmTokens;
- // NormalizeAndTokenizeString(name, osmTokens, search::Delimiters());
- //
- // cout << "\n------------- " << name << endl;
- //
- // bool matched = false;
- // for (auto const & index : indexes)
- // {
- // vector<strings::UniString> bookingTokens;
- // NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters());
- //
- // map<size_t, vector<pair<size_t, size_t>>> weightPair;
- //
- // for (size_t j = 0; j < osmTokens.size(); ++j)
- // {
- // for (size_t i = 0; i < bookingTokens.size(); ++i)
- // {
- // size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(),
- // bookingTokens[i].begin(),
- // bookingTokens[i].end());
- // if (distance < 3)
- // weightPair[distance].emplace_back(i, j);
- // }
- // }
- //
- // if (!weightPair.empty())
- // {
- // cout << m_hotels[e.second] << endl;
- // matched = true;
- // }
- // }
-}
-
void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) const
{
for (auto const & hotel : m_hotels)
@@ -248,6 +209,8 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons
if (!hotel.houseNumber.empty())
e.AddTag("addr:housenumber", hotel.houseNumber);
+ // Matching booking.com hotel types to OpenStreetMap values.
+ // Booking types are listed in the closed API docs.
switch (hotel.type)
{
case 19:
@@ -301,13 +264,6 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons
}
}
-// static
-double BookingDataset::ScoreByLinearNormDistance(double distance)
-{
- distance = my::clamp(distance, 0, kDistanceLimitInMeters);
- return 1.0 - distance / kDistanceLimitInMeters;
-}
-
void BookingDataset::LoadHotels(istream & src, string const & addressReferencePath)
{
m_hotels.clear();
@@ -373,11 +329,7 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const
for (size_t const j : bookingIndexes)
{
- auto const & hotel = GetHotel(j);
- double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
- double score = ScoreByLinearNormDistance(distanceMeters);
- matched = score > kOptimalThreshold;
- if (matched)
+ if (booking_scoring::Match(GetHotel(j), e).IsMatched())
break;
}
diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp
index f56bba4d17..c37859cca6 100644
--- a/generator/booking_dataset.hpp
+++ b/generator/booking_dataset.hpp
@@ -22,9 +22,6 @@ public:
double static constexpr kDistanceLimitInMeters = 150;
size_t static constexpr kMaxSelectedElements = 3;
- // Calculated with tools/python/booking_hotels_quality.py
- double static constexpr kOptimalThreshold = 0.709283;
-
struct Hotel
{
enum class Fields
@@ -92,8 +89,6 @@ public:
void BuildFeatures(function<void(OsmElement *)> const & fn) const;
- static double ScoreByLinearNormDistance(double distance);
-
protected:
vector<Hotel> m_hotels;
diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp
index 1696a69d6c..b4e817fc18 100644
--- a/generator/booking_quality_check/booking_quality_check.cpp
+++ b/generator/booking_quality_check/booking_quality_check.cpp
@@ -1,4 +1,5 @@
#include "generator/booking_dataset.hpp"
+#include "generator/booking_scoring.hpp"
#include "generator/osm_source.hpp"
#include "geometry/distance_on_sphere.hpp"
@@ -6,6 +7,7 @@
#include "std/fstream.hpp"
#include "std/iostream.hpp"
#include "std/numeric.hpp"
+#include "std/random.hpp"
#include "3party/gflags/src/gflags/gflags.h"
@@ -15,6 +17,7 @@ DEFINE_string(osm_file_name, "", "Input .o5m file");
DEFINE_string(booking_data, "", "Path to booking data in .tsv format");
DEFINE_string(sample_data, "", "Sample output path");
DEFINE_uint64(selection_size, 1000, "Selection size");
+DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle");
using namespace generator;
@@ -57,9 +60,7 @@ int main(int argc, char * argv[])
vector<size_t> elementIndexes(elements.size());
iota(elementIndexes.begin(), elementIndexes.end(), 0);
- // In first implementation, we used random_shufle for reference dataset.
- // Next time we are going to replace random_shuffle by shuffle with defined seed.
- random_shuffle(elementIndexes.begin(), elementIndexes.end());
+ shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_seed));
if (FLAGS_selection_size < elementIndexes.size())
elementIndexes.resize(FLAGS_selection_size);
@@ -73,15 +74,19 @@ int main(int argc, char * argv[])
for (size_t const j : bookingIndexes)
{
auto const & hotel = bookingDataset.GetHotel(j);
- double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
- double score = BookingDataset::ScoreByLinearNormDistance(distanceMeters);
+ auto const score = booking_scoring::Match(hotel, e);
- bool matched = score > BookingDataset::kOptimalThreshold;
+ double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon);
+ bool matched = score.IsMatched();
outStream << "# ------------------------------------------" << fixed << setprecision(6)
<< endl;
outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j
- << " distance: " << distanceMeters << " score: " << score << endl;
+ << "\tdistance: " << distanceMeters
+ << "\tdistance score: " << score.m_linearNormDistanceScore
+ << "\tname score: " << score.m_nameSimilarityScore
+ << "\tresult score: " << score.GetMatchingScore()
+ << endl;
outStream << "# " << e << endl;
outStream << "# " << hotel << endl;
outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat
diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp
new file mode 100644
index 0000000000..2ddd071f6f
--- /dev/null
+++ b/generator/booking_scoring.cpp
@@ -0,0 +1,141 @@
+#include "generator/booking_scoring.hpp"
+
+#include "generator/booking_dataset.hpp"
+
+#include "indexer/search_delimiters.hpp"
+#include "indexer/search_string_utils.hpp"
+
+#include "geometry/distance_on_sphere.hpp"
+
+#include "base/collection_cast.hpp"
+#include "base/stl_iterator.hpp"
+
+#include "std/algorithm.hpp"
+#include "std/vector.hpp"
+
+namespace generator
+{
+namespace booking_scoring
+{
+namespace
+{
+// Calculated with tools/python/booking_hotels_quality.py.
+double constexpr kOptimalThreshold = 0.317324;
+
+template <typename T, typename U>
+struct decay_equiv :
+ std::is_same<typename std::decay<T>::type, U>::type
+{};
+
+using WeightedBagOfWords = vector<pair<strings::UniString, double>>;
+
+vector<strings::UniString> StringToSetOfWords(string const & str)
+{
+ vector<strings::UniString> result;
+ search::NormalizeAndTokenizeString(str, result, search::Delimiters{});
+ sort(begin(result), end(result));
+ return result;
+}
+
+WeightedBagOfWords MakeWeightedBagOfWords(vector<strings::UniString> const & words)
+{
+ // TODO(mgsergio): Calculate tf-idsf score for every word.
+ auto constexpr kTfIdfScorePlaceholder = 1;
+
+ WeightedBagOfWords result;
+ for (auto i = 0; i < words.size(); ++i)
+ {
+ result.emplace_back(words[i], kTfIdfScorePlaceholder);
+ while (i + 1 < words.size() && words[i] == words[i + 1])
+ {
+ result.back().second += kTfIdfScorePlaceholder; // TODO(mgsergio): tf-idf score for result[i].frist;
+ ++i;
+ }
+ }
+ return result;
+}
+
+double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
+{
+ double result{};
+
+ auto lhsIt = begin(lhs);
+ auto rhsIt = begin(rhs);
+
+ while (lhsIt != end(lhs) && rhsIt != end(rhs))
+ {
+ if (lhsIt->first == rhsIt->first)
+ {
+ result += lhsIt->second * rhsIt->second;
+ ++lhsIt;
+ ++rhsIt;
+ }
+ else if (lhsIt->first < rhsIt->first)
+ {
+ ++lhsIt;
+ }
+ else
+ {
+ ++rhsIt;
+ }
+ }
+
+ return result;
+}
+
+double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs)
+{
+ auto const product = WeightedBagsDotProduct(lhs, rhs);
+ auto const lhsLength = sqrt(WeightedBagsDotProduct(lhs, lhs));
+ auto const rhsLength = sqrt(WeightedBagsDotProduct(rhs, rhs));
+
+ if (product == 0.0)
+ return 0.0;
+
+ return product / (lhsLength * rhsLength);
+}
+
+double GetLinearNormDistanceScore(double distance)
+{
+ distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters);
+ return 1.0 - distance / BookingDataset::kDistanceLimitInMeters;
+}
+
+double GetNameSimilarityScore(string const & booking_name, string const & osm_name)
+{
+ auto const aws = MakeWeightedBagOfWords(StringToSetOfWords(booking_name));
+ auto const bws = MakeWeightedBagOfWords(StringToSetOfWords(osm_name));
+
+ if (aws.empty() && bws.empty())
+ return 1.0;
+ if (aws.empty() || bws.empty())
+ return 0.0;
+
+ return WeightedBagOfWordsCos(aws, bws);
+}
+} // namespace
+
+double BookingMatchScore::GetMatchingScore() const
+{
+ return m_linearNormDistanceScore * m_nameSimilarityScore;
+}
+
+bool BookingMatchScore::IsMatched() const
+{
+ return GetMatchingScore() > kOptimalThreshold;
+}
+
+BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e)
+{
+ BookingMatchScore score;
+
+ auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon);
+ score.m_linearNormDistanceScore = GetLinearNormDistanceScore(distance);
+
+ // TODO(mgsergio): Check all translations and use the best one.
+ score.m_nameSimilarityScore = GetNameSimilarityScore(h.name, e.GetTag("name"));
+
+ return score;
+}
+} // namespace booking_scoring
+} // namespace generator
diff --git a/generator/booking_scoring.hpp b/generator/booking_scoring.hpp
new file mode 100644
index 0000000000..e5516a4470
--- /dev/null
+++ b/generator/booking_scoring.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "generator/booking_dataset.hpp"
+
+struct OsmElement;
+
+namespace generator
+{
+namespace booking_scoring
+{
+struct BookingMatchScore
+{
+ double GetMatchingScore() const;
+ bool IsMatched() const;
+
+ double m_linearNormDistanceScore{};
+ double m_nameSimilarityScore{};
+};
+
+BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e);
+} // namespace booking_scoring
+} // namespace generator
diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp
index 1c65dd599d..5e54137283 100644
--- a/generator/osm_element.cpp
+++ b/generator/osm_element.cpp
@@ -121,6 +121,19 @@ string OsmElement::ToString(string const & shift) const
return ss.str();
}
+string OsmElement::GetTag(string const & key) const
+{
+ auto const it = find_if(begin(m_tags), end(m_tags), [&key](Tag const & tag)
+ {
+ return tag.key == key;
+ });
+
+ if (it == end(m_tags))
+ return {};
+
+ return it->value;
+}
+
string DebugPrint(OsmElement const & e)
{
return e.ToString();
diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp
index fc1187c6a7..6a89905b9b 100644
--- a/generator/osm_element.hpp
+++ b/generator/osm_element.hpp
@@ -152,7 +152,8 @@ struct OsmElement
if (!v.empty())
AddTag(k, v);
}
+
+ string GetTag(string const & key) const;
};
string DebugPrint(OsmElement const & e);
-