From f48c3e6a6a55501908279b73c39089ca0e04f01e Mon Sep 17 00:00:00 2001 From: Yuri Gorshenin Date: Wed, 20 Apr 2016 15:46:11 +0300 Subject: [search] Postcodes are added to the search index. --- base/string_utils.cpp | 4 + base/string_utils.hpp | 15 ++ generator/search_index_builder.cpp | 16 +- indexer/search_string_utils.cpp | 6 + indexer/search_string_utils.hpp | 2 + search/retrieval.cpp | 51 +++++-- search/retrieval.hpp | 11 +- search/search.pro | 4 + search/search_integration_tests/helpers.cpp | 9 +- search/search_integration_tests/helpers.hpp | 15 ++ .../search_query_v2_test.cpp | 167 ++++++++++++--------- search/search_query.cpp | 5 +- search/search_tests/postcodes_matcher_tests.cpp | 72 +++++++++ search/search_tests/ranking_tests.cpp | 1 + search/search_tests/search_tests.pro | 1 + search/search_trie.hpp | 1 + search/v2/geocoder.cpp | 2 + search/v2/locality_scorer.cpp | 2 + search/v2/postcodes_matcher.cpp | 161 ++++++++++++++++++++ search/v2/postcodes_matcher.hpp | 15 ++ search/v2/ranking_utils.hpp | 69 --------- search/v2/tokens_slice.cpp | 27 ++++ search/v2/tokens_slice.hpp | 67 +++++++++ 23 files changed, 560 insertions(+), 163 deletions(-) create mode 100644 search/search_tests/postcodes_matcher_tests.cpp create mode 100644 search/v2/postcodes_matcher.cpp create mode 100644 search/v2/postcodes_matcher.hpp create mode 100644 search/v2/tokens_slice.cpp create mode 100644 search/v2/tokens_slice.hpp diff --git a/base/string_utils.cpp b/base/string_utils.cpp index 5f26059688..da13ecb282 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -220,6 +220,10 @@ bool IsASCIIString(string const & str) return true; } +bool IsASCIIDigit(UniChar c) { return c >= '0' && c <= '9'; } + +bool IsASCIILatin(UniChar c) { return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); } + bool StartsWith(UniString const & s, UniString const & p) { if (p.size() > s.size()) diff --git a/base/string_utils.hpp b/base/string_utils.hpp index 1c4634646a..f7c5b775ff 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -29,6 +29,19 @@ public: template UniString(IterT b, IterT e) : BaseT(b, e) {} bool IsEqualAscii(char const * s) const; + + UniString & operator+=(UniString const & rhs) + { + append(rhs); + return *this; + } + + UniString operator+(UniString const & rhs) const + { + UniString result(*this); + result += rhs; + return result; + } }; /// Performs full case folding for string to make it search-compatible according @@ -67,6 +80,8 @@ bool EqualNoCase(string const & s1, string const & s2); UniString MakeUniString(string const & utf8s); string ToUtf8(UniString const & s); bool IsASCIIString(string const & str); +bool IsASCIIDigit(UniChar c); +bool IsASCIILatin(UniChar c); inline string DebugPrint(UniString const & s) { diff --git a/generator/search_index_builder.cpp b/generator/search_index_builder.cpp index c008e0537c..9ba6112884 100644 --- a/generator/search_index_builder.cpp +++ b/generator/search_index_builder.cpp @@ -136,11 +136,11 @@ struct FeatureNameInserter { } - void AddToken(signed char lang, strings::UniString const & s) const + void AddToken(uint8_t lang, strings::UniString const & s) const { strings::UniString key; key.reserve(s.size() + 1); - key.push_back(static_cast(lang)); + key.push_back(lang); key.append(s.begin(), s.end()); m_keyValuePairs.emplace_back(key, m_val); @@ -278,6 +278,18 @@ public: skipIndex.IsCountryOrState(types) ? m_synonyms : nullptr, m_keyValuePairs, hasStreetType); m_valueBuilder.MakeValue(f, types, index, inserter.m_val); + string const postcode = f.GetMetadata().Get(feature::Metadata::FMD_POSTCODE); + if (!postcode.empty()) + { + // See OSM TagInfo or Wiki about modern postcodes format. The average number of tokens is less + // than two. + buffer_vector tokens; + SplitUniString(search::NormalizeAndSimplifyString(postcode), MakeBackInsertFunctor(tokens), + search::Delimiters()); + for (auto const & token : tokens) + inserter.AddToken(search::kCategoriesLang, search::PostcodeToString(token)); + } + // Skip types for features without names. if (!f.ForEachName(inserter)) skipIndex.SkipEmptyNameTypes(types); diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 24aaaa3764..0283191869 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -90,6 +90,12 @@ UniString FeatureTypeToString(uint32_t type) return UniString(s.begin(), s.end()); } +UniString PostcodeToString(strings::UniString const & postcode) +{ + static UniString const kPrefix = MakeUniString("!postcode:"); + return kPrefix + postcode; +} + namespace { char const * kStreetTokensSeparator = "\t -,."; diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp index 5d1365c084..1a8c5d0ed6 100644 --- a/indexer/search_string_utils.hpp +++ b/indexer/search_string_utils.hpp @@ -19,6 +19,8 @@ void SplitUniString(strings::UniString const & uniS, F f, DelimsT const & delims strings::UniString FeatureTypeToString(uint32_t type); +strings::UniString PostcodeToString(strings::UniString const & postcode); + template bool TokenizeStringAndCheckIfLastTokenIsPrefix(strings::UniString const & s, ContainerT & tokens, diff --git a/search/retrieval.cpp b/search/retrieval.cpp index 4afdb46e2b..3d4ac39a62 100644 --- a/search/retrieval.cpp +++ b/search/retrieval.cpp @@ -162,6 +162,40 @@ unique_ptr RetrieveGeometryFeaturesImpl( return SortFeaturesAndBuildCBV(move(features)); } +template +struct RetrieveAddressFeaturesAdaptor +{ + template + unique_ptr operator()(TArgs &&... args) + { + return RetrieveAddressFeaturesImpl(forward(args)...); + } +}; + +template