Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authortatiana-yan <tatiana.kondakova@gmail.com>2019-02-14 15:47:45 +0300
committermpimenov <mpimenov@users.noreply.github.com>2019-02-18 13:42:14 +0300
commit6f1e6de8f75265f0bd4065c49212816ad864bfda (patch)
treedad3e36a1b9dd040551961baa24fc05da22bcde6 /indexer
parent0e658ee41115f405449b7e12b9440619173a91c1 (diff)
[search] Add PreprocessBeforeTokenization to process abbreviations which can be split during tokenization
Diffstat (limited to 'indexer')
-rw-r--r--indexer/search_string_utils.cpp26
-rw-r--r--indexer/search_string_utils.hpp4
2 files changed, 30 insertions, 0 deletions
diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp
index 9bc79abd07..9a59f00b7a 100644
--- a/indexer/search_string_utils.cpp
+++ b/indexer/search_string_utils.cpp
@@ -128,6 +128,32 @@ UniString NormalizeAndSimplifyString(string const & s)
*/
}
+// Expands known abbreviations in |query| in place (e.g. "пр-т" -> "проспект") so that
+// tokenization does not split them. An abbreviation is expanded only when it is a standalone
+// word, i.e. it is bounded on each side either by a symbol accepted by search::Delimiters or by
+// the beginning/end of |query|. |query| is left untouched when nothing matches.
+void PreprocessBeforeTokenization(strings::UniString & query)
+{
+  using Replacement = std::pair<strings::UniString, strings::UniString>;
+
+  // The table is constant, so build it once instead of re-creating all the strings for every
+  // query. Initialization of a function-local static is thread-safe since C++11.
+  static std::vector<Replacement> const kReplacements = {
+      {MakeUniString("пр-т"), MakeUniString("проспект")},
+      {MakeUniString("пр-д"), MakeUniString("проезд")},
+      {MakeUniString("наб-я"), MakeUniString("набережная")}};
+
+  search::Delimiters const delims;
+
+  for (auto const & replacement : kReplacements)
+  {
+    auto const & abbreviation = replacement.first;
+    auto const & fullForm = replacement.second;
+
+    auto start = query.begin();
+    while ((start = std::search(start, query.end(), abbreviation.begin(), abbreviation.end())) !=
+           query.end())
+    {
+      auto const end = start + abbreviation.size();
+      // Do not touch occurrences that are a part of a longer word.
+      if ((start == query.begin() || delims(*(start - 1))) &&
+          (end == query.end() || delims(*end)))
+      {
+        // Re-derive |start| from its offset after Replace(): the old iterator is not assumed to
+        // stay valid once the contents of |query| have changed.
+        auto const offset = std::distance(query.begin(), start);
+        query.Replace(start, end, fullForm.begin(), fullForm.end());
+        start = query.begin() + offset;
+      }
+      // Resume the search one symbol after the current match position.
+      ++start;
+    }
+  }
+}
+
UniString FeatureTypeToString(uint32_t type)
{
string const s = "!type:" + to_string(type);
diff --git a/indexer/search_string_utils.hpp b/indexer/search_string_utils.hpp
index 45e65fab18..74de6a716b 100644
--- a/indexer/search_string_utils.hpp
+++ b/indexer/search_string_utils.hpp
@@ -16,6 +16,10 @@ namespace search
// It does some magic text transformation which greatly helps us to improve our search.
strings::UniString NormalizeAndSimplifyString(std::string const & s);
+// Replaces, in place, abbreviations which can be split during tokenization with their full form.
+// E.g. "пр-т" -> "проспект". An abbreviation is replaced only when it is a standalone word.
+void PreprocessBeforeTokenization(strings::UniString & query);
+
template <class Delims, typename Fn>
void SplitUniString(strings::UniString const & uniS, Fn && f, Delims const & delims)
{