diff options
author | ygorshenin <mipt.vi002@gmail.com> | 2016-07-14 12:23:40 +0300 |
---|---|---|
committer | GitHub <noreply@github.com> | 2016-07-14 12:23:40 +0300 |
commit | 99345452166d0896e52bd831a86bb56282fe3855 (patch) | |
tree | 8e45cbc40963560373871a8bbd3eb3c4b695c348 /generator | |
parent | bc98ef2a672d55fd7fb2ddbf97c573488d4ca574 (diff) | |
parent | 61babee342ffe9da76e920aba8c61294607ab2f1 (diff) |
Merge pull request #3761 from mgsergio/booking-name-matching
[booking] Booking name matching
Diffstat (limited to 'generator')
-rw-r--r-- | generator/booking_dataset.cpp | 58 | ||||
-rw-r--r-- | generator/booking_dataset.hpp | 5 | ||||
-rw-r--r-- | generator/booking_quality_check/booking_quality_check.cpp | 19 | ||||
-rw-r--r-- | generator/booking_scoring.cpp | 141 | ||||
-rw-r--r-- | generator/booking_scoring.hpp | 22 | ||||
-rw-r--r-- | generator/osm_element.cpp | 13 | ||||
-rw-r--r-- | generator/osm_element.hpp | 3 |
7 files changed, 195 insertions, 66 deletions
diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index e8c103ee07..60745a6364 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -1,11 +1,11 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" + #include "platform/local_country_file_utils.hpp" #include "platform/platform.hpp" #include "indexer/ftypes_matcher.hpp" -#include "indexer/search_delimiters.hpp" -#include "indexer/search_string_utils.hpp" #include "geometry/distance_on_sphere.hpp" @@ -171,45 +171,6 @@ vector<size_t> BookingDataset::GetNearestHotels(double lat, double lon, size_t l return indexes; } -bool BookingDataset::MatchByName(string const & osmName, - vector<size_t> const & bookingIndexes) const -{ - return false; - - // Match name. - // vector<strings::UniString> osmTokens; - // NormalizeAndTokenizeString(name, osmTokens, search::Delimiters()); - // - // cout << "\n------------- " << name << endl; - // - // bool matched = false; - // for (auto const & index : indexes) - // { - // vector<strings::UniString> bookingTokens; - // NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters()); - // - // map<size_t, vector<pair<size_t, size_t>>> weightPair; - // - // for (size_t j = 0; j < osmTokens.size(); ++j) - // { - // for (size_t i = 0; i < bookingTokens.size(); ++i) - // { - // size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(), - // bookingTokens[i].begin(), - // bookingTokens[i].end()); - // if (distance < 3) - // weightPair[distance].emplace_back(i, j); - // } - // } - // - // if (!weightPair.empty()) - // { - // cout << m_hotels[e.second] << endl; - // matched = true; - // } - // } -} - void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) const { for (auto const & hotel : m_hotels) @@ -248,6 +209,8 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons if (!hotel.houseNumber.empty()) e.AddTag("addr:housenumber", hotel.houseNumber); + // Matching booking.com hotel types to OpenStreetMap values. + // Booking types are listed in the closed API docs. switch (hotel.type) { case 19: @@ -301,13 +264,6 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons } } -// static -double BookingDataset::ScoreByLinearNormDistance(double distance) -{ - distance = my::clamp(distance, 0, kDistanceLimitInMeters); - return 1.0 - distance / kDistanceLimitInMeters; -} - void BookingDataset::LoadHotels(istream & src, string const & addressReferencePath) { m_hotels.clear(); @@ -373,11 +329,7 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const for (size_t const j : bookingIndexes) { - auto const & hotel = GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double score = ScoreByLinearNormDistance(distanceMeters); - matched = score > kOptimalThreshold; - if (matched) + if (booking_scoring::Match(GetHotel(j), e).IsMatched()) break; } diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index f56bba4d17..c37859cca6 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -22,9 +22,6 @@ public: double static constexpr kDistanceLimitInMeters = 150; size_t static constexpr kMaxSelectedElements = 3; - // Calculated with tools/python/booking_hotels_quality.py - double static constexpr kOptimalThreshold = 0.709283; - struct Hotel { enum class Fields @@ -92,8 +89,6 @@ public: void BuildFeatures(function<void(OsmElement *)> const & fn) const; - static double ScoreByLinearNormDistance(double distance); - protected: vector<Hotel> m_hotels; diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 1696a69d6c..b4e817fc18 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -1,4 +1,5 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" #include "generator/osm_source.hpp" #include "geometry/distance_on_sphere.hpp" @@ -6,6 +7,7 @@ #include "std/fstream.hpp" #include "std/iostream.hpp" #include "std/numeric.hpp" +#include "std/random.hpp" #include "3party/gflags/src/gflags/gflags.h" @@ -15,6 +17,7 @@ DEFINE_string(osm_file_name, "", "Input .o5m file"); DEFINE_string(booking_data, "", "Path to booking data in .tsv format"); DEFINE_string(sample_data, "", "Sample output path"); DEFINE_uint64(selection_size, 1000, "Selection size"); +DEFINE_uint64(seed, minstd_rand::default_seed, "Seed for random shuffle"); using namespace generator; @@ -57,9 +60,7 @@ int main(int argc, char * argv[]) vector<size_t> elementIndexes(elements.size()); iota(elementIndexes.begin(), elementIndexes.end(), 0); - // In first implementation, we used random_shufle for reference dataset. - // Next time we are going to replace random_shuffle by shuffle with defined seed. - random_shuffle(elementIndexes.begin(), elementIndexes.end()); + shuffle(elementIndexes.begin(), elementIndexes.end(), minstd_rand(FLAGS_seed)); if (FLAGS_selection_size < elementIndexes.size()) elementIndexes.resize(FLAGS_selection_size); @@ -73,15 +74,19 @@ int main(int argc, char * argv[]) for (size_t const j : bookingIndexes) { auto const & hotel = bookingDataset.GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double score = BookingDataset::ScoreByLinearNormDistance(distanceMeters); + auto const score = booking_scoring::Match(hotel, e); - bool matched = score > BookingDataset::kOptimalThreshold; + double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); + bool matched = score.IsMatched(); outStream << "# ------------------------------------------" << fixed << setprecision(6) << endl; outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j - << " distance: " << distanceMeters << " score: " << score << endl; + << "\tdistance: " << distanceMeters + << "\tdistance score: " << score.m_linearNormDistanceScore + << "\tname score: " << score.m_nameSimilarityScore + << "\tresult score: " << score.GetMatchingScore() + << endl; outStream << "# " << e << endl; outStream << "# " << hotel << endl; outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp new file mode 100644 index 0000000000..2ddd071f6f --- /dev/null +++ b/generator/booking_scoring.cpp @@ -0,0 +1,141 @@ +#include "generator/booking_scoring.hpp" + +#include "generator/booking_dataset.hpp" + +#include "indexer/search_delimiters.hpp" +#include "indexer/search_string_utils.hpp" + +#include "geometry/distance_on_sphere.hpp" + +#include "base/collection_cast.hpp" +#include "base/stl_iterator.hpp" + +#include "std/algorithm.hpp" +#include "std/vector.hpp" + +namespace generator +{ +namespace booking_scoring +{ +namespace +{ +// Calculated with tools/python/booking_hotels_quality.py. +double constexpr kOptimalThreshold = 0.317324; + +template <typename T, typename U> +struct decay_equiv : + std::is_same<typename std::decay<T>::type, U>::type +{}; + +using WeightedBagOfWords = vector<pair<strings::UniString, double>>; + +vector<strings::UniString> StringToSetOfWords(string const & str) +{ + vector<strings::UniString> result; + search::NormalizeAndTokenizeString(str, result, search::Delimiters{}); + sort(begin(result), end(result)); + return result; +} + +WeightedBagOfWords MakeWeightedBagOfWords(vector<strings::UniString> const & words) +{ + // TODO(mgsergio): Calculate tf-idsf score for every word. + auto constexpr kTfIdfScorePlaceholder = 1; + + WeightedBagOfWords result; + for (auto i = 0; i < words.size(); ++i) + { + result.emplace_back(words[i], kTfIdfScorePlaceholder); + while (i + 1 < words.size() && words[i] == words[i + 1]) + { + result.back().second += kTfIdfScorePlaceholder; // TODO(mgsergio): tf-idf score for result[i].frist; + ++i; + } + } + return result; +} + +double WeightedBagsDotProduct(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + double result{}; + + auto lhsIt = begin(lhs); + auto rhsIt = begin(rhs); + + while (lhsIt != end(lhs) && rhsIt != end(rhs)) + { + if (lhsIt->first == rhsIt->first) + { + result += lhsIt->second * rhsIt->second; + ++lhsIt; + ++rhsIt; + } + else if (lhsIt->first < rhsIt->first) + { + ++lhsIt; + } + else + { + ++rhsIt; + } + } + + return result; +} + +double WeightedBagOfWordsCos(WeightedBagOfWords const & lhs, WeightedBagOfWords const & rhs) +{ + auto const product = WeightedBagsDotProduct(lhs, rhs); + auto const lhsLength = sqrt(WeightedBagsDotProduct(lhs, lhs)); + auto const rhsLength = sqrt(WeightedBagsDotProduct(rhs, rhs)); + + if (product == 0.0) + return 0.0; + + return product / (lhsLength * rhsLength); +} + +double GetLinearNormDistanceScore(double distance) +{ + distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters); + return 1.0 - distance / BookingDataset::kDistanceLimitInMeters; +} + +double GetNameSimilarityScore(string const & booking_name, string const & osm_name) +{ + auto const aws = MakeWeightedBagOfWords(StringToSetOfWords(booking_name)); + auto const bws = MakeWeightedBagOfWords(StringToSetOfWords(osm_name)); + + if (aws.empty() && bws.empty()) + return 1.0; + if (aws.empty() || bws.empty()) + return 0.0; + + return WeightedBagOfWordsCos(aws, bws); +} +} // namespace + +double BookingMatchScore::GetMatchingScore() const +{ + return m_linearNormDistanceScore * m_nameSimilarityScore; +} + +bool BookingMatchScore::IsMatched() const +{ + return GetMatchingScore() > kOptimalThreshold; +} + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e) +{ + BookingMatchScore score; + + auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon); + score.m_linearNormDistanceScore = GetLinearNormDistanceScore(distance); + + // TODO(mgsergio): Check all translations and use the best one. + score.m_nameSimilarityScore = GetNameSimilarityScore(h.name, e.GetTag("name")); + + return score; +} +} // namespace booking_scoring +} // namespace generator diff --git a/generator/booking_scoring.hpp b/generator/booking_scoring.hpp new file mode 100644 index 0000000000..e5516a4470 --- /dev/null +++ b/generator/booking_scoring.hpp @@ -0,0 +1,22 @@ +#pragma once + +#include "generator/booking_dataset.hpp" + +struct OsmElement; + +namespace generator +{ +namespace booking_scoring +{ +struct BookingMatchScore +{ + double GetMatchingScore() const; + bool IsMatched() const; + + double m_linearNormDistanceScore{}; + double m_nameSimilarityScore{}; +}; + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e); +} // namespace booking_scoring +} // namespace generator diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp index 1c65dd599d..5e54137283 100644 --- a/generator/osm_element.cpp +++ b/generator/osm_element.cpp @@ -121,6 +121,19 @@ string OsmElement::ToString(string const & shift) const return ss.str(); } +string OsmElement::GetTag(string const & key) const +{ + auto const it = find_if(begin(m_tags), end(m_tags), [&key](Tag const & tag) + { + return tag.key == key; + }); + + if (it == end(m_tags)) + return {}; + + return it->value; +} + string DebugPrint(OsmElement const & e) { return e.ToString(); diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp index fc1187c6a7..6a89905b9b 100644 --- a/generator/osm_element.hpp +++ b/generator/osm_element.hpp @@ -152,7 +152,8 @@ struct OsmElement if (!v.empty()) AddTag(k, v); } + + string GetTag(string const & key) const; }; string DebugPrint(OsmElement const & e); - |