diff options
author | Yuri Gorshenin <y@maps.me> | 2017-06-01 14:19:32 +0300 |
---|---|---|
committer | Yuri Gorshenin <y@maps.me> | 2017-06-01 15:29:06 +0300 |
commit | 08b0166a665ab98bfaf1273df9aa6911387abc09 (patch) | |
tree | 1040e186105cee29162c717c555ffe8d2da059ad /indexer/search_string_utils.cpp | |
parent | 7f836a82eb28a21430171a658e819598de143475 (diff) |
[search] Fixed numero signs in search queries and names.
Diffstat (limited to 'indexer/search_string_utils.cpp')
-rw-r--r-- | indexer/search_string_utils.cpp | 74 |
1 files changed, 58 insertions, 16 deletions
```diff
diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp
index eb73e314a0..88027211d6 100644
--- a/indexer/search_string_utils.cpp
+++ b/indexer/search_string_utils.cpp
@@ -5,12 +5,41 @@
 #include "base/macros.hpp"
 #include "base/mem_trie.hpp"
 
-#include "std/algorithm.hpp"
+#include <cctype>
 
+using namespace std;
 using namespace strings;
 
 namespace search
 {
+namespace
+{
+// Replaces '#' followed by an end-of-string or a digit with space.
+void RemoveNumeroSigns(UniString & s)
+{
+  size_t const n = s.size();
+
+  size_t i = 0;
+  while (i < n)
+  {
+    if (s[i] != '#')
+    {
+      ++i;
+      continue;
+    }
+
+    size_t j = i + 1;
+    while (j < n && isspace(s[j]))
+      ++j;
+
+    if (j == n || isdigit(s[j]))
+      s[i] = ' ';
+
+    i = j;
+  }
+}
+}  // namespace
+
 UniString NormalizeAndSimplifyString(string const & s)
 {
   UniString uniString = MakeUniString(s);
@@ -22,27 +51,39 @@ UniString NormalizeAndSimplifyString(string const & s)
     // Replace "d with stroke" to simple d letter. Used in Vietnamese.
     // (unicode-compliant implementation leaves it unchanged)
     case 0x0110:
-    case 0x0111: c = 'd'; break;
-    // Replace small turkish dotless 'ı' with dotted 'i'.
-    // Our own invented hack to avoid well-known Turkish I-letter bug.
-    case 0x0131: c = 'i'; break;
+    case 0x0111:
+      c = 'd';
+      break;
+    // Replace small turkish dotless 'ı' with dotted 'i'. Our own
+    // invented hack to avoid well-known Turkish I-letter bug.
+    case 0x0131:
+      c = 'i';
+      break;
     // Replace capital turkish dotted 'İ' with dotted lowercased 'i'.
-    // Here we need to handle this case manually too, because default unicode-compliant implementation
-    // of MakeLowerCase converts 'İ' to 'i' + 0x0307.
-    case 0x0130: c = 'i'; break;
+    // Here we need to handle this case manually too, because default
+    // unicode-compliant implementation of MakeLowerCase converts 'İ'
+    // to 'i' + 0x0307.
+    case 0x0130:
+      c = 'i';
+      break;
     // Some Danish-specific hacks.
-    case 0x00d8: // Ø
-    case 0x00f8: c = 'o'; break; // ø
-    case 0x0152: // Œ
-    case 0x0153: // œ
+    case 0x00d8:  // Ø
+    case 0x00f8:
+      c = 'o';
+      break;  // ø
+    case 0x0152:  // Œ
+    case 0x0153:  // œ
       c = 'o';
       uniString.insert(uniString.begin() + (i++) + 1, 'e');
       break;
-    case 0x00c6: // Æ
-    case 0x00e6: // æ
+    case 0x00c6:  // Æ
+    case 0x00e6:  // æ
       c = 'a';
       uniString.insert(uniString.begin() + (i++) + 1, 'e');
       break;
+    case 0x2116:  // №
+      c = '#';
+      break;
     }
   }
 
@@ -50,13 +91,14 @@ UniString NormalizeAndSimplifyString(string const & s)
   NormalizeInplace(uniString);
 
   // Remove accents that can appear after NFKD normalization.
-  uniString.erase_if([](UniChar const & c)
-  {
+  uniString.erase_if([](UniChar const & c) {
     // ̀ COMBINING GRAVE ACCENT
     // ́ COMBINING ACUTE ACCENT
     return (c == 0x0300 || c == 0x0301);
   });
 
+  RemoveNumeroSigns(uniString);
+
   return uniString;
 
   /// @todo Restore this logic to distinguish и-й in future.
```