diff options
author | Sergey Magidovich <mgsergio@mapswithme.com> | 2016-07-11 22:42:43 +0300 |
---|---|---|
committer | Sergey Magidovich <mgsergio@mapswithme.com> | 2016-07-12 10:31:59 +0300 |
commit | 88d5775c230e5ac2caa06ad2266b893b300a2211 (patch) | |
tree | 27515e79567c3f1883df47cdf068d246a5724584 /generator | |
parent | 6f423fbac4dc92d0bbca7f4d2a7c34c600a59633 (diff) |
Refactor. Add Matching by name.
Diffstat (limited to 'generator')
-rw-r--r-- | generator/booking_dataset.cpp | 56 | ||||
-rw-r--r-- | generator/booking_dataset.hpp | 5 | ||||
-rw-r--r-- | generator/booking_quality_check/booking_quality_check.cpp | 9 | ||||
-rw-r--r-- | generator/booking_scoring.cpp | 97 | ||||
-rw-r--r-- | generator/booking_scoring.hpp | 21 | ||||
-rw-r--r-- | generator/osm_element.cpp | 14 | ||||
-rw-r--r-- | generator/osm_element.hpp | 3 |
7 files changed, 142 insertions, 63 deletions
diff --git a/generator/booking_dataset.cpp b/generator/booking_dataset.cpp index 8239f68344..fec4af1689 100644 --- a/generator/booking_dataset.cpp +++ b/generator/booking_dataset.cpp @@ -1,11 +1,11 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" + #include "platform/local_country_file_utils.hpp" #include "platform/platform.hpp" #include "indexer/ftypes_matcher.hpp" -#include "indexer/search_delimiters.hpp" -#include "indexer/search_string_utils.hpp" #include "geometry/distance_on_sphere.hpp" @@ -171,45 +171,6 @@ vector<size_t> BookingDataset::GetNearestHotels(double lat, double lon, size_t l return indexes; } -bool BookingDataset::MatchByName(string const & osmName, - vector<size_t> const & bookingIndexes) const -{ - return false; - - // Match name. - // vector<strings::UniString> osmTokens; - // NormalizeAndTokenizeString(name, osmTokens, search::Delimiters()); - // - // cout << "\n------------- " << name << endl; - // - // bool matched = false; - // for (auto const & index : indexes) - // { - // vector<strings::UniString> bookingTokens; - // NormalizeAndTokenizeString(m_hotels[index].name, bookingTokens, search::Delimiters()); - // - // map<size_t, vector<pair<size_t, size_t>>> weightPair; - // - // for (size_t j = 0; j < osmTokens.size(); ++j) - // { - // for (size_t i = 0; i < bookingTokens.size(); ++i) - // { - // size_t distance = strings::EditDistance(osmTokens[j].begin(), osmTokens[j].end(), - // bookingTokens[i].begin(), - // bookingTokens[i].end()); - // if (distance < 3) - // weightPair[distance].emplace_back(i, j); - // } - // } - // - // if (!weightPair.empty()) - // { - // cout << m_hotels[e.second] << endl; - // matched = true; - // } - // } -} - void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) const { for (auto const & hotel : m_hotels) @@ -302,13 +263,6 @@ void BookingDataset::BuildFeatures(function<void(OsmElement *)> const & fn) cons } } -// static -double BookingDataset::ScoreByLinearNormDistance(double distance) -{ - distance = my::clamp(distance, 0, kDistanceLimitInMeters); - return 1.0 - distance / kDistanceLimitInMeters; -} - void BookingDataset::LoadHotels(istream & src, string const & addressReferencePath) { m_hotels.clear(); @@ -374,11 +328,7 @@ bool BookingDataset::MatchWithBooking(OsmElement const & e) const for (size_t const j : bookingIndexes) { - auto const & hotel = GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double score = ScoreByLinearNormDistance(distanceMeters); - matched = score > kOptimalThreshold; - if (matched) + if (booking_scoring::Match(GetHotel(j), e).IsMatched()) break; } diff --git a/generator/booking_dataset.hpp b/generator/booking_dataset.hpp index f56bba4d17..c37859cca6 100644 --- a/generator/booking_dataset.hpp +++ b/generator/booking_dataset.hpp @@ -22,9 +22,6 @@ public: double static constexpr kDistanceLimitInMeters = 150; size_t static constexpr kMaxSelectedElements = 3; - // Calculated with tools/python/booking_hotels_quality.py - double static constexpr kOptimalThreshold = 0.709283; - struct Hotel { enum class Fields @@ -92,8 +89,6 @@ public: void BuildFeatures(function<void(OsmElement *)> const & fn) const; - static double ScoreByLinearNormDistance(double distance); - protected: vector<Hotel> m_hotels; diff --git a/generator/booking_quality_check/booking_quality_check.cpp b/generator/booking_quality_check/booking_quality_check.cpp index 6602687ec4..0331a7f9d3 100644 --- a/generator/booking_quality_check/booking_quality_check.cpp +++ b/generator/booking_quality_check/booking_quality_check.cpp @@ -1,4 +1,5 @@ #include "generator/booking_dataset.hpp" +#include "generator/booking_scoring.hpp" #include "generator/osm_source.hpp" #include "geometry/distance_on_sphere.hpp" @@ -73,15 +74,15 @@ int main(int argc, char * argv[]) for (size_t const j : bookingIndexes) { auto const & hotel = bookingDataset.GetHotel(j); - double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); - double const score = BookingDataset::ScoreByLinearNormDistance(distanceMeters); + auto const score = booking_scoring::Match(hotel, e); - bool matched = score > BookingDataset::kOptimalThreshold; + double const distanceMeters = ms::DistanceOnEarth(e.lat, e.lon, hotel.lat, hotel.lon); + bool matched = score.IsMatched(); outStream << "# ------------------------------------------" << fixed << setprecision(6) << endl; outStream << (matched ? 'y' : 'n') << " \t" << i << "\t " << j - << " distance: " << distanceMeters << " score: " << score << endl; + << " distance: " << distanceMeters << " score: " << score.GetMatchingScore() << endl; outStream << "# " << e << endl; outStream << "# " << hotel << endl; outStream << "# URL: https://www.openstreetmap.org/?mlat=" << hotel.lat diff --git a/generator/booking_scoring.cpp b/generator/booking_scoring.cpp new file mode 100644 index 0000000000..ff2aaec177 --- /dev/null +++ b/generator/booking_scoring.cpp @@ -0,0 +1,97 @@ +#include "generator/booking_scoring.hpp" + +#include "generator/booking_dataset.hpp" + +#include "indexer/search_string_utils.hpp" +#include "indexer/search_delimiters.hpp" + +#include "geometry/distance_on_sphere.hpp" + +#include "base/collection_cast.hpp" + +namespace generator +{ +namespace booking_scoring +{ +namespace +{ +// Calculated with tools/python/booking_hotels_quality.py. +double constexpr kOptimalThreshold = 0.151001; + +template <typename T, typename U> +struct decay_equiv : + std::is_same<typename std::decay<T>::type, U>::type +{}; + +set<strings::UniString> StringToSetOfWords(string const & str) +{ + vector<strings::UniString> result; + search::NormalizeAndTokenizeString(str, result, search::Delimiters{}); + return my::collection_cast<set>(result); +} + +// TODO(mgsergio): Update existing one in base or wherever... +// Or just use one from boost. +struct CounterIterator +{ + template<typename T, typename = typename enable_if<!decay_equiv<T, CounterIterator>::value>::type> + CounterIterator & operator=(T const &) { ++m_count; return *this; } + CounterIterator & operator++() { return *this; } + CounterIterator & operator++(int) { return *this; } + CounterIterator & operator*() { return *this; } + uint32_t Count() const { return m_count; } + + uint32_t m_count = 0; +}; + +double StringSimilarityScore(string const & a, string const & b) +{ + auto const aWords = StringToSetOfWords(a); + auto const bWords = StringToSetOfWords(b); + + auto const intersectionCard = set_intersection(begin(aWords), end(aWords), + begin(bWords), end(bWords), + CounterIterator()).Count(); + auto const aLikeBScore = static_cast<double>(intersectionCard) / aWords.size(); + auto const bLikeAScore = static_cast<double>(intersectionCard) / bWords.size(); + + return aLikeBScore * bLikeAScore; +} + +double GetLinearNormDistanceScrore(double distance) +{ + distance = my::clamp(distance, 0, BookingDataset::kDistanceLimitInMeters); + return 1.0 - distance / BookingDataset::kDistanceLimitInMeters; +} + +double GetNameSimilarityScore(string const & booking_name, string const & osm_name) +{ + return StringSimilarityScore(booking_name, osm_name); +} +} // namespace + +double BookingMatchScore::GetMatchingScore() const +{ + return m_linearNormDistanceScore * m_nameSimilarityScore; +} + +bool BookingMatchScore::IsMatched() const +{ + return GetMatchingScore() > kOptimalThreshold; +} + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e) +{ + BookingMatchScore score; + + auto const distance = ms::DistanceOnEarth(e.lat, e.lon, h.lat, h.lon); + score.m_linearNormDistanceScore = GetLinearNormDistanceScrore(distance); + + string osmHotelName; + score.m_nameSimilarityScore = e.GetTag("name", osmHotelName) + ? GetNameSimilarityScore(h.name, osmHotelName) : 0; + + return score; +} +} // namespace booking_scoring +} // namespace generator diff --git a/generator/booking_scoring.hpp b/generator/booking_scoring.hpp new file mode 100644 index 0000000000..d92482cf35 --- /dev/null +++ b/generator/booking_scoring.hpp @@ -0,0 +1,21 @@ +#pragma once + +#include "generator/booking_dataset.hpp" +#include "generator/osm_element.hpp" + +namespace generator +{ +namespace booking_scoring +{ +struct BookingMatchScore +{ + double GetMatchingScore() const; + bool IsMatched() const; + + double m_linearNormDistanceScore{}; + double m_nameSimilarityScore{}; +}; + +BookingMatchScore Match(BookingDataset::Hotel const & h, OsmElement const & e); +} // namespace booking_scoring +} // namespace generator diff --git a/generator/osm_element.cpp b/generator/osm_element.cpp index 1c65dd599d..8ae3340132 100644 --- a/generator/osm_element.cpp +++ b/generator/osm_element.cpp @@ -121,6 +121,20 @@ string OsmElement::ToString(string const & shift) const return ss.str(); } +bool OsmElement::GetTag(string const & key, string & value) const +{ + auto const it = find_if(begin(m_tags), end(m_tags), [&key](Tag const & tag) + { + return tag.key == key; + }); + + if (it == end(m_tags)) + return false; + + value = it->value; + return true; +} + string DebugPrint(OsmElement const & e) { return e.ToString(); diff --git a/generator/osm_element.hpp b/generator/osm_element.hpp index fc1187c6a7..c473f7d175 100644 --- a/generator/osm_element.hpp +++ b/generator/osm_element.hpp @@ -152,7 +152,8 @@ struct OsmElement if (!v.empty()) AddTag(k, v); } + + bool GetTag(string const & key, string & value) const; }; string DebugPrint(OsmElement const & e); - |