diff options
author | vng <viktor.govako@gmail.com> | 2016-03-24 03:41:01 +0300 |
---|---|---|
committer | vng <viktor.govako@gmail.com> | 2016-03-24 05:06:37 +0300 |
commit | c4cd806f652cc52b9a6c15ff20e021b493ba5322 (patch) | |
tree | b092d34fea53874680ac11f4d389b9019ae5b0da /indexer/search_string_utils.hpp | |
parent | b37167c1a6cca7e4487868fa1cb773ea9103417d (diff) |
[search] Treat street synonyms as prefix in search algos.
Diffstat (limited to 'indexer/search_string_utils.hpp')
-rw-r--r-- | indexer/search_string_utils.hpp | 77 |
1 files changed, 3 insertions, 74 deletions
diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp
index afa4459f7a..5d1365c084 100644
--- a/indexer/search_string_utils.hpp
+++ b/indexer/search_string_utils.hpp
@@ -8,78 +8,7 @@ namespace search
 
 // This function should be used for all search strings normalization.
 // It does some magic text transformation which greatly helps us to improve our search.
-inline strings::UniString NormalizeAndSimplifyString(string const & s)
-{
-  strings::UniString uniString = strings::MakeUniString(s);
-  for (size_t i = 0; i < uniString.size(); ++i)
-  {
-    strings::UniChar & c = uniString[i];
-    switch (c)
-    {
-    // Replace "d with stroke" to simple d letter. Used in Vietnamese.
-    // (unicode-compliant implementation leaves it unchanged)
-    case 0x0110:
-    case 0x0111: c = 'd'; break;
-    // Replace small turkish dotless 'ı' with dotted 'i'.
-    // Our own invented hack to avoid well-known Turkish I-letter bug.
-    case 0x0131: c = 'i'; break;
-    // Replace capital turkish dotted 'İ' with dotted lowercased 'i'.
-    // Here we need to handle this case manually too, because default unicode-compliant implementation
-    // of MakeLowerCase converts 'İ' to 'i' + 0x0307.
-    case 0x0130: c = 'i'; break;
-    // Some Danish-specific hacks.
-    case 0x00d8: // Ø
-    case 0x00f8: c = 'o'; break; // ø
-    case 0x0152: // Œ
-    case 0x0153: // œ
-      c = 'o';
-      uniString.insert(uniString.begin() + (i++) + 1, 'e');
-      break;
-    case 0x00c6: // Æ
-    case 0x00e6: // æ
-      c = 'a';
-      uniString.insert(uniString.begin() + (i++) + 1, 'e');
-      break;
-    }
-  }
-
-  MakeLowerCaseInplace(uniString);
-  NormalizeInplace(uniString);
-
-  // Remove accents that can appear after NFKD normalization.
-  uniString.erase_if([](strings::UniChar const & c)
-  {
-    // ̀ COMBINING GRAVE ACCENT
-    // ́ COMBINING ACUTE ACCENT
-    return (c == 0x0300 || c == 0x0301);
-  });
-
-  return uniString;
-
-  /// @todo Restore this logic to distinguish и-й in future.
-  /*
-  // Just after lower casing is a correct place to avoid normalization for specific chars.
-  static auto const isSpecificChar = [](UniChar c) -> bool
-  {
-    return c == 0x0439; // й
-  };
-  UniString result;
-  result.reserve(uniString.size());
-  for (auto i = uniString.begin(), end = uniString.end(); i != end;)
-  {
-    auto j = find_if(i, end, isSpecificChar);
-    // We don't check if (j != i) because UniString and Normalize handle it correctly.
-    UniString normString(i, j);
-    NormalizeInplace(normString);
-    result.insert(result.end(), normString.begin(), normString.end());
-    if (j == end)
-      break;
-    result.push_back(*j);
-    i = j + 1;
-  }
-  return result;
-  */
-}
+strings::UniString NormalizeAndSimplifyString(string const & s);
 
 template <class DelimsT, typename F>
 void SplitUniString(strings::UniString const & uniS, F f, DelimsT const & delims)
@@ -110,10 +39,10 @@ bool TokenizeStringAndCheckIfLastTokenIsPrefix(string const & s,
                                                delimiter);
 }
 
-void GetStreetName(strings::SimpleTokenizer iter, string & streetName);
-void GetStreetNameAsKey(string const & name, string & res);
+strings::UniString GetStreetNameAsKey(string const & name);
 
 bool IsStreetSynonym(strings::UniString const & s);
+bool IsStreetSynonymPrefix(strings::UniString const & s);
 
 /// Normalizes both str and substr, and then returns true if substr is found in str.
 /// Used in native platform code for search in localized strings (cuisines, categories, strings etc.).