diff options
author | vng <viktor.govako@gmail.com> | 2016-03-24 03:41:01 +0300 |
---|---|---|
committer | vng <viktor.govako@gmail.com> | 2016-03-24 05:06:37 +0300 |
commit | c4cd806f652cc52b9a6c15ff20e021b493ba5322 (patch) | |
tree | b092d34fea53874680ac11f4d389b9019ae5b0da /indexer/search_string_utils.cpp | |
parent | b37167c1a6cca7e4487868fa1cb773ea9103417d (diff) |
[search] Treat street synonyms as prefix in search algos.
Diffstat (limited to 'indexer/search_string_utils.cpp')
-rw-r--r-- | indexer/search_string_utils.cpp | 240 |
1 files changed, 173 insertions, 67 deletions
diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp index 11bf4e3e2c..25f2488a72 100644 --- a/indexer/search_string_utils.cpp +++ b/indexer/search_string_utils.cpp @@ -1,118 +1,224 @@ #include "search_string_utils.hpp" -#include "std/set.hpp" +#include "base/macros.hpp" +#include "base/stl_helpers.hpp" + +#include "std/algorithm.hpp" #include "std/transform_iterator.hpp" -#include "base/macros.hpp" +namespace search +{ +using namespace strings; + +UniString NormalizeAndSimplifyString(string const & s) +{ + UniString uniString = MakeUniString(s); + for (size_t i = 0; i < uniString.size(); ++i) + { + UniChar & c = uniString[i]; + switch (c) + { + // Replace "d with stroke" to simple d letter. Used in Vietnamese. + // (unicode-compliant implementation leaves it unchanged) + case 0x0110: + case 0x0111: c = 'd'; break; + // Replace small turkish dotless 'ı' with dotted 'i'. + // Our own invented hack to avoid well-known Turkish I-letter bug. + case 0x0131: c = 'i'; break; + // Replace capital turkish dotted 'İ' with dotted lowercased 'i'. + // Here we need to handle this case manually too, because default unicode-compliant implementation + // of MakeLowerCase converts 'İ' to 'i' + 0x0307. + case 0x0130: c = 'i'; break; + // Some Danish-specific hacks. + case 0x00d8: // Ø + case 0x00f8: c = 'o'; break; // ø + case 0x0152: // Œ + case 0x0153: // œ + c = 'o'; + uniString.insert(uniString.begin() + (i++) + 1, 'e'); + break; + case 0x00c6: // Æ + case 0x00e6: // æ + c = 'a'; + uniString.insert(uniString.begin() + (i++) + 1, 'e'); + break; + } + } + + MakeLowerCaseInplace(uniString); + NormalizeInplace(uniString); + + // Remove accents that can appear after NFKD normalization. + uniString.erase_if([](UniChar const & c) + { + // ̀ COMBINING GRAVE ACCENT + // ́ COMBINING ACUTE ACCENT + return (c == 0x0300 || c == 0x0301); + }); + + return uniString; + + /// @todo Restore this logic to distinguish и-й in future. + /* + // Just after lower casing is a correct place to avoid normalization for specific chars. + static auto const isSpecificChar = [](UniChar c) -> bool + { + return c == 0x0439; // й + }; + UniString result; + result.reserve(uniString.size()); + for (auto i = uniString.begin(), end = uniString.end(); i != end;) + { + auto j = find_if(i, end, isSpecificChar); + // We don't check if (j != i) because UniString and Normalize handle it correctly. + UniString normString(i, j); + NormalizeInplace(normString); + result.insert(result.end(), normString.begin(), normString.end()); + if (j == end) + break; + result.push_back(*j); + i = j + 1; + } + return result; + */ +} char const * STREET_TOKENS_SEPARATOR = "\t -,."; -strings::UniString search::FeatureTypeToString(uint32_t type) +UniString FeatureTypeToString(uint32_t type) { - string const s = "!type:" + strings::to_string(type); - return strings::UniString(s.begin(), s.end()); + string const s = "!type:" + to_string(type); + return UniString(s.begin(), s.end()); } -/// @todo Move prefixes, suffixes into separate file (autogenerated). -/// "Набережная" улица встречается в городах +namespace +{ -char const * affics[] = +class StreetsSynonymsHolder { - // Russian - "аллея", "бульвар", "набережная", "переулок", "площадь", "проезд", "проспект", "шоссе", "тупик", "улица", "тракт", "ал", "бул", "наб", "пер", "пл", "пр", "просп", "ш", "туп", "ул", "тр", + vector<UniString> m_synonyms; +public: + StreetsSynonymsHolder() + { + /// @todo Move prefixes, suffixes into separate file (autogenerated). + /// "Набережная" улица встречается в городах + + char const * affics[] = + { + // Russian + "аллея", "бульвар", "набережная", "переулок", "площадь", "проезд", "проспект", "шоссе", "тупик", "улица", "тракт", "ал", "бул", "наб", "пер", "пл", "пр", "просп", "ш", "туп", "ул", "тр", - // English - "street", "avenue", "square", "road", "boulevard", "drive", "highway", "lane", "way", "st", "av", "ave", "sq", "rd", "blvd", "dr", "hwy", "ln", + // English + "street", "avenue", "square", "road", "boulevard", "drive", "highway", "lane", "way", "circle", "st", "av", "ave", "sq", "rd", "blvd", "dr", "hwy", "ln", - // German - "strasse", "weg", "platz", + // German + "strasse", "weg", "platz", - // Lithuanian - "g", "pr", "pl", "kel", + // Lithuanian + "g", "pr", "pl", "kel", - // Български език - Bulgarian - "булевард", "бул", "площад", "пл", "улица", "ул", "квартал", "кв", + // Български език - Bulgarian + "булевард", "бул", "площад", "пл", "улица", "ул", "квартал", "кв", - // Canada - Canada - "allee", "alley", "autoroute", "aut", "bypass", "byway", "carrefour", "carref", "chemin", "côte", "crossing", "cross", "expressway", "freeway", "fwy", "line", "link", "loop", "parkway", "pky", "pkwy", "path", "pathway", "ptway", "route", "rte", "trail", "walk", + // Canada - Canada + "allee", "alley", "autoroute", "aut", "bypass", "byway", "carrefour", "carref", "chemin", "cercle", "circle", "côte", "crossing", "cross", "expressway", "freeway", "fwy", "line", "link", "loop", "parkway", "pky", "pkwy", "path", "pathway", "ptway", "route", "rue", "rte", "trail", "walk", - // Cesky - Czech - "ulice", "ul", "náměstí", "nám", + // Cesky - Czech + "ulice", "ul", "náměstí", "nám", - // Deutsch - German - "allee", "al", "brücke", "br", "chaussee", "gasse", "gr", "pfad", "straße", "str", + // Deutsch - German + "allee", "al", "brücke", "br", "chaussee", "gasse", "gr", "pfad", "straße", "str", - // Español - Spanish - "avenida", "avd", "avda", "bulevar", "bulev", "calle", "calleja", "cllja", "callejón", "callej", "cjon", "cllon", "callejuela", "cjla", "callizo", "cllzo", "calzada", "czada", "costera", "coste", "plza", "pza", "plazoleta", "pzta", "plazuela", "plzla", "tránsito", "trans", "transversal", "trval", "trasera", "tras", "travesía", "trva", + // Español - Spanish + "avenida", "avd", "avda", "bulevar", "bulev", "calle", "calleja", "cllja", "callejón", "callej", "cjon", "cllon", "callejuela", "cjla", "callizo", "cllzo", "calzada", "czada", "costera", "coste", "plza", "pza", "plazoleta", "pzta", "plazuela", "plzla", "tránsito", "trans", "transversal", "trval", "trasera", "tras", "travesía", "trva", - // Français - French - "rue", "avenue", "carré", "route", "boulevard", "drive", "autoroute", "lane", "chemin", + // Français - French + "rue", "avenue", "carré", "cercle", "route", "boulevard", "drive", "autoroute", "lane", "chemin", - // Nederlands - Dutch - "laan", "ln.", "straat", "steenweg", "stwg", "st", + // Nederlands - Dutch + "laan", "ln.", "straat", "steenweg", "stwg", "st", - // Norsk - Norwegian - "vei", "veien", "vn", "gaten", "gata", "gt", "plass", "plassen", "sving", "svingen", "sv", + // Norsk - Norwegian + "vei", "veien", "vn", "gaten", "gata", "gt", "plass", "plassen", "sving", "svingen", "sv", - // Polski - Polish - "aleja", "aleje", "aleji", "alejach", "aleją", "plac", "placu", "placem", "ulica", "ulicy", + // Polski - Polish + "aleja", "aleje", "aleji", "alejach", "aleją", "plac", "placu", "placem", "ulica", "ulicy", - // Português - Portuguese - "street", "avenida", "quadrado", "estrada", "boulevard", "carro", "auto-estrada", "lane", "caminho", + // Português - Portuguese + "street", "avenida", "quadrado", "estrada", "boulevard", "carro", "auto-estrada", "lane", "caminho", - // Română - Romanian - "bul", "bdul", "blv", "bulevard", "bulevardu", "calea", "cal", "piața", "pţa", "pța", "strada", "stra", "stradela", "sdla", "stradă", "unitate", "autostradă", "lane", + // Română - Romanian + "bul", "bdul", "blv", "bulevard", "bulevardu", "calea", "cal", "piața", "pţa", "pța", "strada", "stra", "stradela", "sdla", "stradă", "unitate", "autostradă", "lane", - // Slovenščina - Slovenian - "cesta", + // Slovenščina - Slovenian + "cesta", - // Suomi - Finnish - "kaari", "kri", "katu", "kuja", "kj", "kylä", "polku", "tie", "t", "tori", "väylä", "vlä", + // Suomi - Finnish + "kaari", "kri", "katu", "kuja", "kj", "kylä", "polku", "tie", "t", "tori", "väylä", "vlä", - // Svenska - Swedish - "väg", "vägen", "gatan", "gränd", "gränden", "stig", "stigen", "plats", "platsen", + // Svenska - Swedish + "väg", "vägen", "gatan", "gränd", "gränden", "stig", "stigen", "plats", "platsen", - // Türkçe - Turkish - "sokak", "sk", "sok", "sokağı", "cadde", "cd", "caddesi", "bulvar", "bulvarı", + // Türkçe - Turkish + "sokak", "sk", "sok", "sokağı", "cadde", "cd", "caddesi", "bulvar", "bulvarı", - // Tiếng Việt – Vietnamese - "quốc lộ", "ql", "tỉnh lộ", "tl", "Đại lộ", "Đl", "Đường", "Đ", "Đường sắt", "Đs", "Đường phố", "Đp", "vuông", "con Đường", "Đại lộ", "Đường cao tốc", + // Tiếng Việt – Vietnamese + "quốc lộ", "ql", "tỉnh lộ", "tl", "Đại lộ", "Đl", "Đường", "Đ", "Đường sắt", "Đs", "Đường phố", "Đp", "vuông", "con Đường", "Đại lộ", "Đường cao tốc", - // Українська - Ukrainian - "дорога", "провулок", "площа", "шосе", "вулиция", "дор", "пров", "вул" + // Українська - Ukrainian + "дорога", "провулок", "площа", "шосе", "вулиция", "дор", "пров", "вул" + }; + + m_synonyms.assign(make_transform_iterator(affics, &NormalizeAndSimplifyString), + make_transform_iterator(affics + ARRAY_SIZE(affics), &NormalizeAndSimplifyString)); + my::SortUnique(m_synonyms); + } + + bool MatchPrefix(UniString const & prefix) const + { + auto const it = lower_bound(m_synonyms.begin(), m_synonyms.end(), prefix); + return (it != m_synonyms.end() && StartsWith(*it, prefix)); + } + + bool MatchEqual(UniString const & prefix) const + { + return binary_search(m_synonyms.begin(), m_synonyms.end(), prefix); + } }; -void search::GetStreetName(strings::SimpleTokenizer iter, string & streetName) +StreetsSynonymsHolder g_streets; + +} // namespace + +UniString GetStreetNameAsKey(string const & name) { + UniString res; + SimpleTokenizer iter(name, STREET_TOKENS_SEPARATOR); while (iter) { - string const s = strings::MakeLowerCase(*iter); + UniString const s = NormalizeAndSimplifyString(*iter); ++iter; - char const ** end = affics + ARRAY_SIZE(affics); - - if (find(affics, end, s) == end) - streetName += s; + if (!g_streets.MatchEqual(s)) + res.append(s); } + return res; } -void search::GetStreetNameAsKey(string const & name, string & res) +bool IsStreetSynonym(UniString const & s) { - strings::SimpleTokenizer iter(name, STREET_TOKENS_SEPARATOR); - GetStreetName(iter, res); + return g_streets.MatchEqual(s); } -bool search::IsStreetSynonym(strings::UniString const & s) +bool IsStreetSynonymPrefix(UniString const & s) { - static set<strings::UniString> const kSynonyms( - make_transform_iterator(affics, &search::NormalizeAndSimplifyString), - make_transform_iterator(affics + ARRAY_SIZE(affics), &search::NormalizeAndSimplifyString)); - return kSynonyms.count(s) != 0; + return g_streets.MatchPrefix(s); } -bool search::ContainsNormalized(string const & str, string const & substr) +bool ContainsNormalized(string const & str, string const & substr) { - strings::UniString const ustr = search::NormalizeAndSimplifyString(str); - strings::UniString const usubstr = search::NormalizeAndSimplifyString(substr); + UniString const ustr = NormalizeAndSimplifyString(str); + UniString const usubstr = NormalizeAndSimplifyString(substr); return std::search(ustr.begin(), ustr.end(), usubstr.begin(), usubstr.end()) != ustr.end(); } +} // namespace search |