Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: vng <viktor.govako@gmail.com>, 2016-03-24 03:41:01 +0300
committer: vng <viktor.govako@gmail.com>, 2016-03-24 05:06:37 +0300
commit: c4cd806f652cc52b9a6c15ff20e021b493ba5322 (patch)
tree: b092d34fea53874680ac11f4d389b9019ae5b0da /indexer/search_string_utils.cpp
parent: b37167c1a6cca7e4487868fa1cb773ea9103417d (diff)
[search] Treat street synonyms as prefix in search algos.
Diffstat (limited to 'indexer/search_string_utils.cpp')
-rw-r--r-- indexer/search_string_utils.cpp 240
1 file changed, 173 insertions, 67 deletions
diff --git a/indexer/search_string_utils.cpp b/indexer/search_string_utils.cpp
index 11bf4e3e2c..25f2488a72 100644
--- a/indexer/search_string_utils.cpp
+++ b/indexer/search_string_utils.cpp
@@ -1,118 +1,224 @@
#include "search_string_utils.hpp"
-#include "std/set.hpp"
+#include "base/macros.hpp"
+#include "base/stl_helpers.hpp"
+
+#include "std/algorithm.hpp"
#include "std/transform_iterator.hpp"
-#include "base/macros.hpp"
+namespace search
+{
+using namespace strings;
+
+UniString NormalizeAndSimplifyString(string const & s)
+{
+ UniString uniString = MakeUniString(s);
+ for (size_t i = 0; i < uniString.size(); ++i)
+ {
+ UniChar & c = uniString[i];
+ switch (c)
+ {
+ // Replace "d with stroke" to simple d letter. Used in Vietnamese.
+ // (unicode-compliant implementation leaves it unchanged)
+ case 0x0110:
+ case 0x0111: c = 'd'; break;
+ // Replace small turkish dotless 'ı' with dotted 'i'.
+ // Our own invented hack to avoid well-known Turkish I-letter bug.
+ case 0x0131: c = 'i'; break;
+ // Replace capital turkish dotted 'İ' with dotted lowercased 'i'.
+ // Here we need to handle this case manually too, because default unicode-compliant implementation
+ // of MakeLowerCase converts 'İ' to 'i' + 0x0307.
+ case 0x0130: c = 'i'; break;
+ // Some Danish-specific hacks.
+ case 0x00d8: // Ø
+ case 0x00f8: c = 'o'; break; // ø
+ case 0x0152: // Œ
+ case 0x0153: // œ
+ c = 'o';
+ uniString.insert(uniString.begin() + (i++) + 1, 'e');
+ break;
+ case 0x00c6: // Æ
+ case 0x00e6: // æ
+ c = 'a';
+ uniString.insert(uniString.begin() + (i++) + 1, 'e');
+ break;
+ }
+ }
+
+ MakeLowerCaseInplace(uniString);
+ NormalizeInplace(uniString);
+
+ // Remove accents that can appear after NFKD normalization.
+ uniString.erase_if([](UniChar const & c)
+ {
+ // ̀ COMBINING GRAVE ACCENT
+ // ́ COMBINING ACUTE ACCENT
+ return (c == 0x0300 || c == 0x0301);
+ });
+
+ return uniString;
+
+ /// @todo Restore this logic to distinguish и-й in future.
+ /*
+ // Just after lower casing is a correct place to avoid normalization for specific chars.
+ static auto const isSpecificChar = [](UniChar c) -> bool
+ {
+ return c == 0x0439; // й
+ };
+ UniString result;
+ result.reserve(uniString.size());
+ for (auto i = uniString.begin(), end = uniString.end(); i != end;)
+ {
+ auto j = find_if(i, end, isSpecificChar);
+ // We don't check if (j != i) because UniString and Normalize handle it correctly.
+ UniString normString(i, j);
+ NormalizeInplace(normString);
+ result.insert(result.end(), normString.begin(), normString.end());
+ if (j == end)
+ break;
+ result.push_back(*j);
+ i = j + 1;
+ }
+ return result;
+ */
+}
char const * STREET_TOKENS_SEPARATOR = "\t -,.";
-strings::UniString search::FeatureTypeToString(uint32_t type)
+UniString FeatureTypeToString(uint32_t type)
{
- string const s = "!type:" + strings::to_string(type);
- return strings::UniString(s.begin(), s.end());
+ string const s = "!type:" + to_string(type);
+ return UniString(s.begin(), s.end());
}
-/// @todo Move prefixes, suffixes into separate file (autogenerated).
-/// "Набережная" улица встречается в городах
+namespace
+{
-char const * affics[] =
+class StreetsSynonymsHolder
{
- // Russian
- "аллея", "бульвар", "набережная", "переулок", "площадь", "проезд", "проспект", "шоссе", "тупик", "улица", "тракт", "ал", "бул", "наб", "пер", "пл", "пр", "просп", "ш", "туп", "ул", "тр",
+ vector<UniString> m_synonyms;
+public:
+ StreetsSynonymsHolder()
+ {
+ /// @todo Move prefixes, suffixes into separate file (autogenerated).
+ /// "Набережная" улица встречается в городах
+
+ char const * affics[] =
+ {
+ // Russian
+ "аллея", "бульвар", "набережная", "переулок", "площадь", "проезд", "проспект", "шоссе", "тупик", "улица", "тракт", "ал", "бул", "наб", "пер", "пл", "пр", "просп", "ш", "туп", "ул", "тр",
- // English
- "street", "avenue", "square", "road", "boulevard", "drive", "highway", "lane", "way", "st", "av", "ave", "sq", "rd", "blvd", "dr", "hwy", "ln",
+ // English
+ "street", "avenue", "square", "road", "boulevard", "drive", "highway", "lane", "way", "circle", "st", "av", "ave", "sq", "rd", "blvd", "dr", "hwy", "ln",
- // German
- "strasse", "weg", "platz",
+ // German
+ "strasse", "weg", "platz",
- // Lithuanian
- "g", "pr", "pl", "kel",
+ // Lithuanian
+ "g", "pr", "pl", "kel",
- // Български език - Bulgarian
- "булевард", "бул", "площад", "пл", "улица", "ул", "квартал", "кв",
+ // Български език - Bulgarian
+ "булевард", "бул", "площад", "пл", "улица", "ул", "квартал", "кв",
- // Canada - Canada
- "allee", "alley", "autoroute", "aut", "bypass", "byway", "carrefour", "carref", "chemin", "côte", "crossing", "cross", "expressway", "freeway", "fwy", "line", "link", "loop", "parkway", "pky", "pkwy", "path", "pathway", "ptway", "route", "rte", "trail", "walk",
+ // Canada - Canada
+ "allee", "alley", "autoroute", "aut", "bypass", "byway", "carrefour", "carref", "chemin", "cercle", "circle", "côte", "crossing", "cross", "expressway", "freeway", "fwy", "line", "link", "loop", "parkway", "pky", "pkwy", "path", "pathway", "ptway", "route", "rue", "rte", "trail", "walk",
- // Cesky - Czech
- "ulice", "ul", "náměstí", "nám",
+ // Cesky - Czech
+ "ulice", "ul", "náměstí", "nám",
- // Deutsch - German
- "allee", "al", "brücke", "br", "chaussee", "gasse", "gr", "pfad", "straße", "str",
+ // Deutsch - German
+ "allee", "al", "brücke", "br", "chaussee", "gasse", "gr", "pfad", "straße", "str",
- // Español - Spanish
- "avenida", "avd", "avda", "bulevar", "bulev", "calle", "calleja", "cllja", "callejón", "callej", "cjon", "cllon", "callejuela", "cjla", "callizo", "cllzo", "calzada", "czada", "costera", "coste", "plza", "pza", "plazoleta", "pzta", "plazuela", "plzla", "tránsito", "trans", "transversal", "trval", "trasera", "tras", "travesía", "trva",
+ // Español - Spanish
+ "avenida", "avd", "avda", "bulevar", "bulev", "calle", "calleja", "cllja", "callejón", "callej", "cjon", "cllon", "callejuela", "cjla", "callizo", "cllzo", "calzada", "czada", "costera", "coste", "plza", "pza", "plazoleta", "pzta", "plazuela", "plzla", "tránsito", "trans", "transversal", "trval", "trasera", "tras", "travesía", "trva",
- // Français - French
- "rue", "avenue", "carré", "route", "boulevard", "drive", "autoroute", "lane", "chemin",
+ // Français - French
+ "rue", "avenue", "carré", "cercle", "route", "boulevard", "drive", "autoroute", "lane", "chemin",
- // Nederlands - Dutch
- "laan", "ln.", "straat", "steenweg", "stwg", "st",
+ // Nederlands - Dutch
+ "laan", "ln.", "straat", "steenweg", "stwg", "st",
- // Norsk - Norwegian
- "vei", "veien", "vn", "gaten", "gata", "gt", "plass", "plassen", "sving", "svingen", "sv",
+ // Norsk - Norwegian
+ "vei", "veien", "vn", "gaten", "gata", "gt", "plass", "plassen", "sving", "svingen", "sv",
- // Polski - Polish
- "aleja", "aleje", "aleji", "alejach", "aleją", "plac", "placu", "placem", "ulica", "ulicy",
+ // Polski - Polish
+ "aleja", "aleje", "aleji", "alejach", "aleją", "plac", "placu", "placem", "ulica", "ulicy",
- // Português - Portuguese
- "street", "avenida", "quadrado", "estrada", "boulevard", "carro", "auto-estrada", "lane", "caminho",
+ // Português - Portuguese
+ "street", "avenida", "quadrado", "estrada", "boulevard", "carro", "auto-estrada", "lane", "caminho",
- // Română - Romanian
- "bul", "bdul", "blv", "bulevard", "bulevardu", "calea", "cal", "piața", "pţa", "pța", "strada", "stra", "stradela", "sdla", "stradă", "unitate", "autostradă", "lane",
+ // Română - Romanian
+ "bul", "bdul", "blv", "bulevard", "bulevardu", "calea", "cal", "piața", "pţa", "pța", "strada", "stra", "stradela", "sdla", "stradă", "unitate", "autostradă", "lane",
- // Slovenščina - Slovenian
- "cesta",
+ // Slovenščina - Slovenian
+ "cesta",
- // Suomi - Finnish
- "kaari", "kri", "katu", "kuja", "kj", "kylä", "polku", "tie", "t", "tori", "väylä", "vlä",
+ // Suomi - Finnish
+ "kaari", "kri", "katu", "kuja", "kj", "kylä", "polku", "tie", "t", "tori", "väylä", "vlä",
- // Svenska - Swedish
- "väg", "vägen", "gatan", "gränd", "gränden", "stig", "stigen", "plats", "platsen",
+ // Svenska - Swedish
+ "väg", "vägen", "gatan", "gränd", "gränden", "stig", "stigen", "plats", "platsen",
- // Türkçe - Turkish
- "sokak", "sk", "sok", "sokağı", "cadde", "cd", "caddesi", "bulvar", "bulvarı",
+ // Türkçe - Turkish
+ "sokak", "sk", "sok", "sokağı", "cadde", "cd", "caddesi", "bulvar", "bulvarı",
- // Tiếng Việt – Vietnamese
- "quốc lộ", "ql", "tỉnh lộ", "tl", "Đại lộ", "Đl", "Đường", "Đ", "Đường sắt", "Đs", "Đường phố", "Đp", "vuông", "con Đường", "Đại lộ", "Đường cao tốc",
+ // Tiếng Việt – Vietnamese
+ "quốc lộ", "ql", "tỉnh lộ", "tl", "Đại lộ", "Đl", "Đường", "Đ", "Đường sắt", "Đs", "Đường phố", "Đp", "vuông", "con Đường", "Đại lộ", "Đường cao tốc",
- // Українська - Ukrainian
- "дорога", "провулок", "площа", "шосе", "вулиция", "дор", "пров", "вул"
+ // Українська - Ukrainian
+ "дорога", "провулок", "площа", "шосе", "вулиция", "дор", "пров", "вул"
+ };
+
+ m_synonyms.assign(make_transform_iterator(affics, &NormalizeAndSimplifyString),
+ make_transform_iterator(affics + ARRAY_SIZE(affics), &NormalizeAndSimplifyString));
+ my::SortUnique(m_synonyms);
+ }
+
+ bool MatchPrefix(UniString const & prefix) const
+ {
+ auto const it = lower_bound(m_synonyms.begin(), m_synonyms.end(), prefix);
+ return (it != m_synonyms.end() && StartsWith(*it, prefix));
+ }
+
+ bool MatchEqual(UniString const & prefix) const
+ {
+ return binary_search(m_synonyms.begin(), m_synonyms.end(), prefix);
+ }
};
-void search::GetStreetName(strings::SimpleTokenizer iter, string & streetName)
+StreetsSynonymsHolder g_streets;
+
+} // namespace
+
+UniString GetStreetNameAsKey(string const & name)
{
+ UniString res;
+ SimpleTokenizer iter(name, STREET_TOKENS_SEPARATOR);
while (iter)
{
- string const s = strings::MakeLowerCase(*iter);
+ UniString const s = NormalizeAndSimplifyString(*iter);
++iter;
- char const ** end = affics + ARRAY_SIZE(affics);
-
- if (find(affics, end, s) == end)
- streetName += s;
+ if (!g_streets.MatchEqual(s))
+ res.append(s);
}
+ return res;
}
-void search::GetStreetNameAsKey(string const & name, string & res)
+bool IsStreetSynonym(UniString const & s)
{
- strings::SimpleTokenizer iter(name, STREET_TOKENS_SEPARATOR);
- GetStreetName(iter, res);
+ return g_streets.MatchEqual(s);
}
-bool search::IsStreetSynonym(strings::UniString const & s)
+bool IsStreetSynonymPrefix(UniString const & s)
{
- static set<strings::UniString> const kSynonyms(
- make_transform_iterator(affics, &search::NormalizeAndSimplifyString),
- make_transform_iterator(affics + ARRAY_SIZE(affics), &search::NormalizeAndSimplifyString));
- return kSynonyms.count(s) != 0;
+ return g_streets.MatchPrefix(s);
}
-bool search::ContainsNormalized(string const & str, string const & substr)
+bool ContainsNormalized(string const & str, string const & substr)
{
- strings::UniString const ustr = search::NormalizeAndSimplifyString(str);
- strings::UniString const usubstr = search::NormalizeAndSimplifyString(substr);
+ UniString const ustr = NormalizeAndSimplifyString(str);
+ UniString const usubstr = NormalizeAndSimplifyString(substr);
return std::search(ustr.begin(), ustr.end(), usubstr.begin(), usubstr.end()) != ustr.end();
}
+} // namespace search