diff options
author | tatiana-yan <tatiana.kondakova@gmail.com> | 2019-02-14 15:47:45 +0300 |
---|---|---|
committer | mpimenov <mpimenov@users.noreply.github.com> | 2019-02-18 13:42:14 +0300 |
commit | 6f1e6de8f75265f0bd4065c49212816ad864bfda (patch) | |
tree | dad3e36a1b9dd040551961baa24fc05da22bcde6 /indexer | |
parent | 0e658ee41115f405449b7e12b9440619173a91c1 (diff) |
[search] Add PreprocessBeforeTokenization to process abbreviations which can be split during tokenization
Diffstat (limited to 'indexer')
-rw-r--r-- | indexer/search_string_utils.cpp | 26 | ||||
-rw-r--r-- | indexer/search_string_utils.hpp | 4 |
2 files changed, 30 insertions, 0 deletions
diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp
index 9bc79abd07..9a59f00b7a 100644
--- a/indexer/search_string_utils.cpp
+++ b/indexer/search_string_utils.cpp
@@ -128,6 +128,32 @@ UniString NormalizeAndSimplifyString(string const & s)
   */
 }
 
+void PreprocessBeforeTokenization(strings::UniString & query)
+{
+  search::Delimiters const delims;
+  vector<pair<strings::UniString, strings::UniString>> const replacements = {
+      {MakeUniString("пр-т"), MakeUniString("проспект")},
+      {MakeUniString("пр-д"), MakeUniString("проезд")},
+      {MakeUniString("наб-я"), MakeUniString("набережная")}};
+
+  for (auto const & replacement : replacements)
+  {
+    auto start = query.begin();
+    while ((start = std::search(start, query.end(), replacement.first.begin(),
+                                replacement.first.end())) != query.end())
+    {
+      auto end = start + replacement.first.size();
+      if ((start == query.begin() || delims(*(start - 1))) && (end == query.end() || delims(*end)))
+      {
+        auto const dist = distance(query.begin(), start);
+        query.Replace(start, end, replacement.second.begin(), replacement.second.end());
+        start = query.begin() + dist;
+      }
+      start += 1;
+    }
+  }
+}
+
 UniString FeatureTypeToString(uint32_t type)
 {
   string const s = "!type:" + to_string(type);
diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp
index 45e65fab18..74de6a716b 100644
--- a/indexer/search_string_utils.hpp
+++ b/indexer/search_string_utils.hpp
@@ -16,6 +16,10 @@ namespace search
 // It does some magic text transformation which greatly helps us to improve our search.
 strings::UniString NormalizeAndSimplifyString(std::string const & s);
 
+// Replace abbreviations which can be split during tokenization with full form.
+// Eg. "пр-т" -> "проспект".
+void PreprocessBeforeTokenization(strings::UniString & query);
+
 template <class Delims, typename Fn>
 void SplitUniString(strings::UniString const & uniS, Fn && f, Delims const & delims)
 {