diff options
author | Alex Zolotarev <alex@maps.me> | 2016-02-15 21:07:25 +0300 |
---|---|---|
committer | Sergey Yershov <yershov@corp.mail.ru> | 2016-03-23 16:21:19 +0300 |
commit | 64d7f7694384e5fe12385a074884190f956ccd80 (patch) | |
tree | 4873880e6e0d469ff8b53c3b6352a161f2243377 /indexer/search_string_utils.hpp | |
parent | 0ac3f7c5b0b55a5ec344b656913e2c3218a9dc6f (diff) |
Moved CategoriesHolder from search into indexer library. Need to access it from Editor.
Diffstat (limited to 'indexer/search_string_utils.hpp')
-rw-r--r-- | indexer/search_string_utils.hpp | 118 |
1 files changed, 118 insertions, 0 deletions
diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp new file mode 100644 index 0000000000..a79851a92f --- /dev/null +++ b/indexer/search_string_utils.hpp @@ -0,0 +1,118 @@ +#pragma once +#include "base/string_utils.hpp" + +#include "std/algorithm.hpp" + +namespace search +{ + +// This function should be used for all search strings normalization. +// It does some magic text transformation which greatly helps us to improve our search. +inline strings::UniString NormalizeAndSimplifyString(string const & s) +{ + using namespace strings; + UniString uniString = MakeUniString(s); + for (size_t i = 0; i < uniString.size(); ++i) + { + UniChar & c = uniString[i]; + switch (c) + { + // Replace "d with stroke" to simple d letter. Used in Vietnamese. + // (unicode-compliant implementation leaves it unchanged) + case 0x0110: + case 0x0111: c = 'd'; break; + // Replace small turkish dotless 'ı' with dotted 'i'. + // Our own invented hack to avoid well-known Turkish I-letter bug. + case 0x0131: c = 'i'; break; + // Replace capital turkish dotted 'İ' with dotted lowercased 'i'. + // Here we need to handle this case manually too, because default unicode-compliant implementation + // of MakeLowerCase converts 'İ' to 'i' + 0x0307. + case 0x0130: c = 'i'; break; + // Some Danish-specific hacks. + case 0x00d8: // Ø + case 0x00f8: c = 'o'; break; // ø + case 0x0152: // Œ + case 0x0153: // œ + c = 'o'; + uniString.insert(uniString.begin() + (i++) + 1, 'e'); + break; + case 0x00c6: // Æ + case 0x00e6: // æ + c = 'a'; + uniString.insert(uniString.begin() + (i++) + 1, 'e'); + break; + } + } + + MakeLowerCaseInplace(uniString); + NormalizeInplace(uniString); + + // Remove accents that can appear after NFKD normalization. + uniString.erase_if([](UniChar const & c) + { + // ̀ COMBINING GRAVE ACCENT + // ́ COMBINING ACUTE ACCENT + return (c == 0x0300 || c == 0x0301); + }); + + return uniString; + + /// @todo Restore this logic to distinguish и-й in future. + /* + // Just after lower casing is a correct place to avoid normalization for specific chars. + static auto const isSpecificChar = [](UniChar c) -> bool + { + return c == 0x0439; // й + }; + UniString result; + result.reserve(uniString.size()); + for (auto i = uniString.begin(), end = uniString.end(); i != end;) + { + auto j = find_if(i, end, isSpecificChar); + // We don't check if (j != i) because UniString and Normalize handle it correctly. + UniString normString(i, j); + NormalizeInplace(normString); + result.insert(result.end(), normString.begin(), normString.end()); + if (j == end) + break; + result.push_back(*j); + i = j + 1; + } + return result; + */ +} + +template <class DelimsT, typename F> +void SplitUniString(strings::UniString const & uniS, F f, DelimsT const & delims) +{ + for (strings::TokenizeIterator<DelimsT> iter(uniS, delims); iter; ++iter) + f(iter.GetUniString()); +} + +strings::UniString FeatureTypeToString(uint32_t type); + +template <class ContainerT, class DelimsT> +bool TokenizeStringAndCheckIfLastTokenIsPrefix(strings::UniString const & s, + ContainerT & tokens, + DelimsT const & delimiter) +{ + SplitUniString(s, MakeBackInsertFunctor(tokens), delimiter); + return !s.empty() && !delimiter(s.back()); +} + + +template <class ContainerT, class DelimsT> +bool TokenizeStringAndCheckIfLastTokenIsPrefix(string const & s, + ContainerT & tokens, + DelimsT const & delimiter) +{ + return TokenizeStringAndCheckIfLastTokenIsPrefix(NormalizeAndSimplifyString(s), + tokens, + delimiter); +} + +void GetStreetName(strings::SimpleTokenizer iter, string & streetName); +void GetStreetNameAsKey(string const & name, string & res); + +bool IsStreetSynonym(strings::UniString const & s); +} // namespace search |