Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/search
diff options
context:
space:
mode:
author Yuri Gorshenin <y@maps.me> 2016-06-07 18:04:07 +0300
committer Yuri Gorshenin <y@maps.me> 2016-06-15 17:43:46 +0300
commit 83a097a846e1debe6fc80cc02aed415a743ebbdd (patch)
tree c333a26c8a1fa09038e3e9c9f38cf1f96da3499f /search
parent 9201925db05acf6b490c217752fb111810b351ea (diff)
[search] House number parser revisited.
Diffstat (limited to 'search')
-rw-r--r-- search/features_layer_matcher.hpp 15
-rw-r--r-- search/features_layer_path_finder.cpp 21
-rw-r--r-- search/geocoder.cpp 4
-rw-r--r-- search/house_numbers_matcher.cpp 657
-rw-r--r-- search/house_numbers_matcher.hpp 86
-rwxr-xr-x search/search_quality/clusterize-tag-values.lisp 18
-rw-r--r-- search/search_tests/house_numbers_matcher_test.cpp 139
7 files changed, 620 insertions, 320 deletions
diff --git a/search/features_layer_matcher.hpp b/search/features_layer_matcher.hpp
index b6ff10cd23..edeced6064 100644
--- a/search/features_layer_matcher.hpp
+++ b/search/features_layer_matcher.hpp
@@ -154,9 +154,9 @@ private:
// |buildings| doesn't contain buildings matching by house number,
// so following code reads buildings in POIs vicinities and checks
// house numbers.
- vector<Parse> queryParses;
- ParseQuery(parent.m_subQuery, parent.m_lastTokenIsPrefix, queryParses);
- if (queryParses.empty())
+ vector<house_numbers::Token> queryParse;
+ ParseQuery(parent.m_subQuery, parent.m_lastTokenIsPrefix, queryParse);
+ if (queryParse.empty())
return;
for (size_t i = 0; i < pois.size(); ++i)
@@ -167,7 +167,8 @@ private:
{
if (m_postcodes && !m_postcodes->GetBit(ft.GetID().m_index))
return;
- if (HouseNumbersMatch(strings::MakeUniString(ft.GetHouseNumber()), queryParses))
+ if (house_numbers::HouseNumbersMatch(strings::MakeUniString(ft.GetHouseNumber()),
+ queryParse))
{
double const distanceM =
MercatorBounds::DistanceOnEarth(feature::GetCenter(ft), poiCenters[i]);
@@ -238,8 +239,8 @@ private:
return;
}
- vector<Parse> queryParses;
- ParseQuery(child.m_subQuery, child.m_lastTokenIsPrefix, queryParses);
+ vector<house_numbers::Token> queryParse;
+ ParseQuery(child.m_subQuery, child.m_lastTokenIsPrefix, queryParse);
uint32_t numFilterInvocations = 0;
auto houseNumberFilter = [&](uint32_t id, FeatureType & feature, bool & loaded) -> bool
@@ -272,7 +273,7 @@ private:
strings::UniString const houseNumber(strings::MakeUniString(feature.GetHouseNumber()));
if (!feature::IsHouseNumber(houseNumber))
return false;
- return HouseNumbersMatch(houseNumber, queryParses);
+ return house_numbers::HouseNumbersMatch(houseNumber, queryParse);
};
unordered_map<uint32_t, bool> cache;
diff --git a/search/features_layer_path_finder.cpp b/search/features_layer_path_finder.cpp
index 91db4ef27f..b767413daf 100644
--- a/search/features_layer_path_finder.cpp
+++ b/search/features_layer_path_finder.cpp
@@ -44,20 +44,6 @@ uint64_t CalcBottomUpPassCost(vector<FeaturesLayer const *> const & layers)
return CalcPassCost(layers.begin(), layers.end());
}
-bool LooksLikeHouseNumber(strings::UniString const & query, bool queryIsPrefix)
-{
- vector<Parse> parses;
- ParseQuery(query, queryIsPrefix, parses);
- for (auto const & parse : parses)
- {
- if (parse.IsEmpty())
- continue;
- if (feature::IsHouseNumber(parse.m_parts.front()))
- return true;
- }
- return false;
-}
-
bool GetPath(uint32_t id, vector<FeaturesLayer const *> const & layers, TParentGraph const & parent,
IntersectionResult & result)
{
@@ -129,8 +115,9 @@ void FeaturesLayerPathFinder::FindReachableVerticesTopDown(
parent.m_hasDelayedFeatures = false;
FeaturesLayer child(*layers[i - 1]);
- child.m_hasDelayedFeatures = child.m_type == SearchModel::SEARCH_TYPE_BUILDING &&
- LooksLikeHouseNumber(child.m_subQuery, child.m_lastTokenIsPrefix);
+ child.m_hasDelayedFeatures =
+ child.m_type == SearchModel::SEARCH_TYPE_BUILDING &&
+ house_numbers::LooksLikeHouseNumber(child.m_subQuery, child.m_lastTokenIsPrefix);
buffer.clear();
matcher.Match(child, parent, addEdge);
@@ -178,7 +165,7 @@ void FeaturesLayerPathFinder::FindReachableVerticesBottomUp(
FeaturesLayer parent(*layers[i + 1]);
parent.m_hasDelayedFeatures =
parent.m_type == SearchModel::SEARCH_TYPE_BUILDING &&
- LooksLikeHouseNumber(parent.m_subQuery, parent.m_lastTokenIsPrefix);
+ house_numbers::LooksLikeHouseNumber(parent.m_subQuery, parent.m_lastTokenIsPrefix);
buffer.clear();
matcher.Match(child, parent, addEdge);
diff --git a/search/geocoder.cpp b/search/geocoder.cpp
index 5157445e91..c635eb0779 100644
--- a/search/geocoder.cpp
+++ b/search/geocoder.cpp
@@ -4,6 +4,7 @@
#include "search/dummy_rank_table.hpp"
#include "search/features_filter.hpp"
#include "search/features_layer_matcher.hpp"
+#include "search/house_numbers_matcher.hpp"
#include "search/locality_scorer.hpp"
#include "search/processor.hpp"
#include "search/retrieval.hpp"
@@ -1244,7 +1245,8 @@ void Geocoder::MatchPOIsAndBuildings(size_t curToken)
filtered.Set(features.Get(), false /* isOwner */);
ASSERT(filtered.Get(), ());
- bool const looksLikeHouseNumber = feature::IsHouseNumber(m_layers.back().m_subQuery);
+ bool const looksLikeHouseNumber = house_numbers::LooksLikeHouseNumber(
+ m_layers.back().m_subQuery, m_layers.back().m_lastTokenIsPrefix);
if (filtered.IsEmpty() && !looksLikeHouseNumber)
break;
diff --git a/search/house_numbers_matcher.cpp b/search/house_numbers_matcher.cpp
index 504b75d859..42e1fa153b 100644
--- a/search/house_numbers_matcher.cpp
+++ b/search/house_numbers_matcher.cpp
@@ -1,249 +1,506 @@
#include "search/house_numbers_matcher.hpp"
+#include "indexer/string_set.hpp"
+
+#include "base/logging.hpp"
+
#include "std/algorithm.hpp"
#include "std/iterator.hpp"
#include "std/limits.hpp"
#include "std/sstream.hpp"
-
-#include "base/logging.hpp"
+#include "std/transform_iterator.hpp"
using namespace strings;
namespace search
{
-namespace
+namespace house_numbers
{
-size_t constexpr kInvalidNum = numeric_limits<size_t>::max();
-
-HouseNumberTokenizer::CharClass GetCharClass(UniChar c)
-{
- static UniString const kSeps = MakeUniString("\"\\/(),. \t№#-");
- if (c >= '0' && c <= '9')
- return HouseNumberTokenizer::CharClass::Digit;
- if (find(kSeps.begin(), kSeps.end(), c) != kSeps.end())
- return HouseNumberTokenizer::CharClass::Separator;
- return HouseNumberTokenizer::CharClass::Other;
-}
-
-bool IsShortWord(HouseNumberTokenizer::Token const & t)
-{
- return t.m_klass == HouseNumberTokenizer::CharClass::Other && t.m_token.size() <= 3;
-}
-
-bool IsNumber(HouseNumberTokenizer::Token const & t)
-{
- return t.m_klass == HouseNumberTokenizer::CharClass::Digit;
-}
-
-bool IsNumberOrShortWord(HouseNumberTokenizer::Token const & t)
+namespace
{
- return IsNumber(t) || IsShortWord(t);
-}
-
-bool IsBuildingSynonymPrefix(UniString const & p)
+// Common strings in house numbers.
+// To get this list, just run:
+//
+// ./clusterize-tag-values.lisp house-number-strings path-to-taginfo-db.db > strings.txt
+// cat strings.txt |
+// awk '{ if ($1 >= 100 && length($3) != 0) { printf("\"%s\",\n", $3) } }' |
+// sort | uniq
+//
+// *NOTE* there is a list of exceptions at the end.
+char const * const g_strings[] = {
+ "a", "aa", "ab", "abc", "ac", "ad", "ae", "af", "ag",
+ "ah", "ai", "aj", "ak", "al", "am", "an", "ao", "ap",
+ "aq", "ar", "are", "as", "at", "au", "av", "avenida", "aw",
+ "ax", "ay", "az", "azm", "b", "ba", "bab", "bah", "bak",
+ "bb", "bc", "bd", "be", "bedr", "ben", "bf", "bg", "bh",
+ "bij", "bis", "bk", "bl", "bldg", "blk", "bloc", "block", "bloco",
+ "blok", "bm", "bmn", "bn", "bo", "boe", "bol", "bor", "bov",
+ "box", "bp", "br", "bra", "brc", "bs", "bsa", "bu", "building",
+ "bus", "bv", "bwn", "bx", "by", "c", "ca", "cab", "cal",
+ "calle", "carrera", "cat", "cbi", "cbu", "cc", "ccz", "cd", "ce",
+ "centre", "cfn", "cgc", "cjg", "cl", "club", "cottage", "cottages", "court",
+ "cso", "cum", "d", "da", "db", "dd", "de", "df", "di",
+ "dia", "dvu", "e", "ec", "ee", "eh", "em", "en", "esm",
+ "ev", "f", "farm", "fdo", "fer", "ff", "fixme", "flat", "flats",
+ "floor", "g", "ga", "gar", "gara", "gas", "gb", "gg", "gr",
+ "grg", "h", "ha", "haus", "hh", "hl", "ho", "house", "hr",
+ "hs", "hv", "i", "ii", "iii", "int", "iv", "ix", "j",
+ "jab", "jf", "jj", "jms", "jtg", "k", "ka", "kab", "kk",
+ "km", "kmb", "kmk", "knn", "koy", "kp", "kra", "ksn", "kud",
+ "l", "ł", "la", "ldo", "ll", "local", "loja", "lot", "lote",
+ "lsb", "lt", "m", "mac", "mad", "mah", "mak", "mat", "mb",
+ "mbb", "mbn", "mch", "mei", "mks", "mm", "mny", "mo", "mok",
+ "monica", "mor", "morocco", "msb", "mtj", "mtk", "mvd", "n", "na",
+ "ncc", "ne", "nij", "nn", "no", "nr", "nst", "nu", "nut",
+ "o", "of", "ofof", "old", "one", "oo", "opl", "p", "pa",
+ "pap", "par", "park", "pav", "pb", "pch", "pg", "ph", "phd",
+ "pkf", "plaza", "plot", "po", "pos", "pp", "pr", "pra", "pya",
+ "q", "qq", "quater", "r", "ra", "rbo", "rd", "rear", "reisach",
+ "rk", "rm", "ro", "road", "rood", "rosso", "rs", "rw", "s",
+ "sab", "sal", "sav", "sb", "sba", "sbb", "sbl", "sbn", "sbx",
+ "sc", "sch", "sco", "seb", "sep", "sf", "sgr", "shop", "sir",
+ "sj", "sl", "sm", "sn", "snc", "so", "som", "south", "sp",
+ "spi", "spn", "ss", "st", "sta", "stc", "std", "stiege", "street",
+ "suite", "sur", "t", "tam", "ter", "terrace", "tf", "th", "the",
+ "tl", "to", "torre", "tr", "traf", "trd", "ts", "tt", "tu",
+ "u", "uhm", "unit", "utc", "v", "vi", "vii", "w", "wa",
+ "way", "we", "west", "wf", "wink", "wrh", "ws", "wsb", "x",
+ "xx", "y", "z", "za", "zh", "zona", "zu", "zw", "א",
+ "ב", "ג", "α", "а", "б", "бб", "бл", "в", "вл",
+ "вх", "г", "д", "е", "ж", "з", "и", "к", "л",
+ "лит", "м", "магазин", "н", "о", "п", "р", "разр", "с",
+ "стр", "т", "тп", "у", "уч", "участок", "ф", "ц", "ა",
+ "丁目", "之", "号", "號",
+
+ // List of exceptions
+ "владение"
+};
+
+// Common strings in house numbers.
+// To get this list, just run:
+//
+// ./clusterize-tag-values.lisp house-number path-to-taginfo-db.db > numbers.txt
+// tail -n +2 numbers.txt | head -78 | sed 's/^.*) \(.*\) \[.*$/"\1"/g;s/[ -/]//g;s/$/,/' |
+// sort | uniq
+const char * const g_patterns[] = {
+ "BL", "BLN", "BLNSL", "BN", "BNL", "BNSL", "L", "LL", "LN", "LNL", "LNLN", "LNN",
+ "N", "NBL", "NBLN", "NBN", "NBNBN", "NBNL", "NL", "NLBN", "NLL", "NLLN", "NLN", "NLNL",
+ "NLS", "NLSN", "NN", "NNBN", "NNL", "NNLN", "NNN", "NNS", "NS", "NSN", "NSS", "S",
+ "SL", "SLL", "SLN", "SN", "SNBNSS", "SNL", "SNN", "SS", "SSN", "SSS", "SSSS",
+};
+
+// List of common synonyms for building parts. Constructed by hand.
+const char * const g_buildingPartSynonyms[] = {
+ "building", "bldg", "bld", "bl", "unit", "block", "blk", "корпус",
+ "корп", "кор", "литер", "лит", "строение", "стр", "блок", "бл"};
+
+// List of common stop words for buildings. Constructed by hand.
+UniString const g_stopWords[] = {MakeUniString("дом"), MakeUniString("house")};
+
+bool IsStopWord(UniString const & s, bool isPrefix)
{
- static UniString kSynonyms[] = {
- MakeUniString("building"), MakeUniString("bld"), MakeUniString("unit"),
- MakeUniString("block"), MakeUniString("blk"), MakeUniString("корпус"),
- MakeUniString("литер"), MakeUniString("строение"), MakeUniString("блок")};
-
- for (UniString const & s : kSynonyms)
+ for (auto const & p : g_stopWords)
{
- if (StartsWith(s, p))
+ if ((isPrefix && StartsWith(p, s)) || (!isPrefix && p == s))
return true;
}
return false;
}
-size_t GetNumTokensForBuildingPart(vector<HouseNumberTokenizer::Token> const & ts, size_t i,
- vector<size_t> & memory);
-
-size_t GetNumTokensForBuildingPartImpl(vector<HouseNumberTokenizer::Token> const & ts, size_t i,
- vector<size_t> & memory)
+class BuildingPartSynonymsMatcher
{
- ASSERT_LESS(i, ts.size(), ());
+public:
+ using TSynonyms = StringSet<UniChar, 4>;
- auto const & token = ts[i];
- if (token.m_klass != HouseNumberTokenizer::CharClass::Other)
- return 0;
+ BuildingPartSynonymsMatcher()
+ {
+ for (auto const * s : g_buildingPartSynonyms)
+ {
+ UniString const us = MakeUniString(s);
+ m_synonyms.Add(us.begin(), us.end());
+ }
+ }
- if (!IsBuildingSynonymPrefix(token.m_token))
- return 0;
+ // Returns true if |s| looks like a building synonym.
+ inline bool Has(UniString const & s) const
+ {
+ return m_synonyms.Has(s.begin(), s.end()) == TSynonyms::Status::Full;
+ }
- // No sense in single "корпус" or "литер".
- if (i + 1 >= ts.size())
- return 0;
+private:
+ TSynonyms m_synonyms;
+};
- if (!IsNumberOrShortWord(ts[i + 1]))
- return 0;
+class StringsMatcher
+{
+public:
+ using TStrings = StringSet<UniChar, 8>;
- // No sense in "корпус корпус" or "литер литер".
- if (ts[i + 1].m_token == token.m_token)
- return 0;
+ StringsMatcher()
+ {
+ for (auto const * s : g_strings)
+ {
+ UniString const us = MakeUniString(s);
+ m_strings.Add(us.begin(), us.end());
+ }
- // Consume next token, either number or short word.
- size_t j = i + 2;
+ for (auto const * s : g_buildingPartSynonyms)
+ {
+ UniString const us = MakeUniString(s);
+ m_strings.Add(us.begin(), us.end());
+ }
+ }
- // Consume one more number of short word, if possible.
- if (j < ts.size() && IsNumberOrShortWord(ts[j]) && ts[j].m_klass != ts[j - 1].m_klass &&
- GetNumTokensForBuildingPart(ts, j, memory) == 0)
+ // Returns true when |s| may be a full substring of a house number,
+ // or a prefix of some valid substring of a house number, when
+ // |isPrefix| is true.
+ bool Has(UniString const & s, bool isPrefix) const
{
- ++j;
+ auto const status = m_strings.Has(s.begin(), s.end());
+ switch (status)
+ {
+ case TStrings::Status::Absent: return false;
+ case TStrings::Status::Prefix: return isPrefix;
+ case TStrings::Status::Full: return true;
+ }
}
- return j - i;
-}
-
-// Returns number of tokens starting at position |i|, where the first
-// token is some way of writing of "корпус", or "building", second
-// token is a number or a letter, and (possibly) third token which can
-// be a letter when second token is a number. |memory| is used here to
-// store results of previous calls and prevents degradation to
-// non-linear time.
-//
-// TODO (@y, @m): the parser is quite complex now. Consider to just
-// throw out all prefixes of "building" or "литер" and sort rest
-// tokens. Number of false positives will be higher but the parser
-// will be more robust, simple and faster.
-size_t GetNumTokensForBuildingPart(vector<HouseNumberTokenizer::Token> const & ts, size_t i,
- vector<size_t> & memory)
-{
- if (i >= ts.size())
- return 0;
- if (memory[i] == kInvalidNum)
- memory[i] = GetNumTokensForBuildingPartImpl(ts, i, memory);
- return memory[i];
-}
+private:
+ TStrings m_strings;
+};
-void MergeTokens(vector<HouseNumberTokenizer::Token> const & ts, vector<UniString> & rs)
+class HouseNumberClassifier
{
- vector<size_t> memory(ts.size(), kInvalidNum);
+public:
+ using TPatterns = StringSet<Token::Type, 4>;
- size_t i = 0;
- while (i < ts.size())
+ HouseNumberClassifier()
{
- switch (ts[i].m_klass)
+ for (auto const * p : g_patterns)
{
- case HouseNumberTokenizer::CharClass::Digit:
+ m_patterns.Add(make_transform_iterator(p, &CharToType),
+ make_transform_iterator(p + strlen(p), &CharToType));
+ }
+ }
+
+ // Returns true when the string |s| looks like a valid house number,
+ // (or a prefix of some valid house number, when |isPrefix| is
+ // true).
+ bool LooksGood(UniString const & s, bool isPrefix) const
+ {
+ vector<Token> parse;
+ Tokenize(s, isPrefix, parse);
+
+ size_t i = 0;
+ for (size_t j = 0; j != parse.size(); ++j)
{
- UniString token = ts[i].m_token;
- ++i;
- // Process cases like "123 б" or "9PQ".
- if (i < ts.size() && IsShortWord(ts[i]) && GetNumTokensForBuildingPart(ts, i, memory) == 0)
+ auto const & token = parse[j];
+ auto const type = token.m_type;
+ switch (type)
+ {
+ case Token::TYPE_SEPARATOR: break;
+ case Token::TYPE_GROUP_SEPARATOR: break;
+ case Token::TYPE_HYPHEN: break;
+ case Token::TYPE_SLASH: break;
+ case Token::TYPE_STRING:
{
- token.append(ts[i].m_token.begin(), ts[i].m_token.end());
+ if (IsStopWord(token.m_value, token.m_prefix))
+ break;
+ if (!m_matcher.Has(token.m_value, token.m_prefix))
+ return false;
+ // fallthrough
+ }
+ case Token::TYPE_NUMBER: // fallthrough
+ case Token::TYPE_BUILDING_PART: // fallthrough
+ case Token::TYPE_LETTER: // fallthrough
+ case Token::TYPE_BUILDING_PART_OR_LETTER:
+ parse[i] = move(parse[j]);
++i;
}
- rs.push_back(move(token));
- break;
}
- case HouseNumberTokenizer::CharClass::Separator:
+ parse.resize(i);
+
+ auto const status = m_patterns.Has(make_transform_iterator(parse.begin(), &TokenToType),
+ make_transform_iterator(parse.end(), &TokenToType));
+ switch (status)
{
- ASSERT(false, ("Seps can't be merged."));
- ++i;
- break;
+ case TPatterns::Status::Absent: return false;
+ case TPatterns::Status::Prefix: return true;
+ case TPatterns::Status::Full: return true;
}
- case HouseNumberTokenizer::CharClass::Other:
- {
- if (size_t numTokens = GetNumTokensForBuildingPart(ts, i, memory))
- {
- UniString token;
- ++i;
- for (size_t j = 1; j < numTokens; ++j, ++i)
- token.append(ts[i].m_token.begin(), ts[i].m_token.end());
- rs.push_back(move(token));
- break;
- }
+ }
- rs.push_back(ts[i].m_token);
- ++i;
- break;
- }
+private:
+ static Token::Type CharToType(char c)
+ {
+ switch (c)
+ {
+ case 'N': return Token::TYPE_NUMBER;
+ case 'S': return Token::TYPE_STRING;
+ case 'B': return Token::TYPE_BUILDING_PART;
+ case 'L': return Token::TYPE_LETTER;
+ case 'U': return Token::TYPE_BUILDING_PART_OR_LETTER;
+ default: CHECK(false, ("Unexpected character:", c)); return Token::TYPE_SEPARATOR;
}
}
- if (!rs.empty())
- sort(rs.begin() + 1, rs.end());
+ static Token::Type TokenToType(Token const & token) { return token.m_type; }
+
+ StringsMatcher m_matcher;
+ TPatterns m_patterns;
+};
+
+Token::Type GetCharType(UniChar c)
+{
+ static UniString const kSeps = MakeUniString(" \t\"\\().#~");
+ static UniString const kGroupSeps = MakeUniString(",|;+");
+
+ if (IsASCIIDigit(c))
+ return Token::TYPE_NUMBER;
+ if (find(kSeps.begin(), kSeps.end(), c) != kSeps.end())
+ return Token::TYPE_SEPARATOR;
+ if (find(kGroupSeps.begin(), kGroupSeps.end(), c) != kGroupSeps.end())
+ return Token::TYPE_GROUP_SEPARATOR;
+ if (c == '-')
+ return Token::TYPE_HYPHEN;
+ if (c == '/')
+ return Token::TYPE_SLASH;
+ return Token::TYPE_STRING;
}
-bool ParsesMatch(Parse const & houseNumberParse, Parse const & queryParse)
+bool IsLiteralType(Token::Type type)
{
- if (houseNumberParse.IsEmpty() || queryParse.IsEmpty())
- return false;
+ return type == Token::TYPE_STRING || type == Token::TYPE_LETTER ||
+ type == Token::TYPE_BUILDING_PART_OR_LETTER;
+}
- auto const & h = houseNumberParse.m_parts;
- auto const & q = queryParse.m_parts;
+// Leaves only numbers and letters, removes all trailing prefix
+// tokens. Then, does following:
+//
+// * when there is at least one number, drops all tokens until the
+// number and sorts the rest
+// * when there are no numbers at all, sorts tokens
+void SimplifyParse(vector<Token> & tokens)
+{
+ if (!tokens.empty() && tokens.back().m_prefix)
+ tokens.pop_back();
- // Check first tokens, hope, house numbers.
- if (h[0] != q[0])
- return false;
+ size_t i = 0;
+ size_t j = 0;
+ while (j != tokens.size() && tokens[j].m_type != Token::TYPE_NUMBER)
+ ++j;
+ for (; j != tokens.size(); ++j)
+ {
+ auto const type = tokens[j].m_type;
+ if (type == Token::TYPE_NUMBER || type == Token::TYPE_LETTER)
+ tokens[i++] = tokens[j];
+ }
- size_t i = 1, j = 1;
- while (i != h.size() && j != q.size())
+ if (i != 0)
{
- while (i != h.size() && h[i] < q[j])
- ++i;
- if (i == h.size() || h[i] != q[j])
+ tokens.resize(i);
+ sort(tokens.begin() + 1, tokens.end());
+ }
+ else
+ {
+ sort(tokens.begin(), tokens.end());
+ }
+}
+
+// Returns true when a sequence denoted by [b2, e2) is a subsequence
+// of [b1, e1).
+template <typename T1, typename T2>
+bool IsSubsequence(T1 b1, T1 e1, T2 b2, T2 e2)
+{
+ for (; b2 != e2; ++b1, ++b2)
+ {
+ while (b1 != e1 && *b1 < *b2)
+ ++b1;
+ if (b1 == e1 || *b1 != *b2)
return false;
- ++i;
- ++j;
}
+ return true;
+}
- if (queryParse.m_hasTrailingBuildingPrefixSynonym)
+bool IsBuildingPartSynonym(UniString const & s)
+{
+ static BuildingPartSynonymsMatcher const kMatcher;
+ return kMatcher.Has(s);
+}
+
+bool IsShortBuildingSynonym(UniString const & t)
+{
+ static UniString const kSynonyms[] = {MakeUniString("к"), MakeUniString("с")};
+ for (auto const & s : kSynonyms)
{
- // In this case, at least one more unmatched part must be in a
- // house number.
- return j == q.size() && h.size() > q.size();
+ if (t == s)
+ return true;
}
+ return false;
+}
- return j == q.size();
+template <typename TFn>
+void ForEachGroup(vector<Token> const & ts, TFn && fn)
+{
+ size_t i = 0;
+ while (i < ts.size())
+ {
+ while (i < ts.size() && ts[i].m_type == Token::TYPE_GROUP_SEPARATOR)
+ ++i;
+
+ size_t j = i;
+ while (j < ts.size() && ts[j].m_type != Token::TYPE_GROUP_SEPARATOR)
+ ++j;
+
+ if (i != j)
+ fn(i, j);
+
+ i = j;
+ }
+}
+
+template <typename TFn>
+void TransformString(UniString && token, TFn && fn)
+{
+ static UniString const kLiter = MakeUniString("лит");
+
+ size_t const size = token.size();
+
+ if (IsBuildingPartSynonym(token))
+ {
+ fn(move(token), Token::TYPE_BUILDING_PART);
+ }
+ else if (size == 4 && StartsWith(token, kLiter))
+ {
+ fn(UniString(token.begin(), token.begin() + 3), Token::TYPE_BUILDING_PART);
+ fn(UniString(token.begin() + 3, token.end()), Token::TYPE_LETTER);
+ }
+ else if (size == 2)
+ {
+ UniString firstLetter(token.begin(), token.begin() + 1);
+ if (IsShortBuildingSynonym(firstLetter))
+ {
+ fn(move(firstLetter), Token::TYPE_BUILDING_PART);
+ fn(UniString(token.begin() + 1, token.end()), Token::TYPE_LETTER);
+ }
+ else
+ {
+ fn(move(token), Token::TYPE_STRING);
+ }
+ }
+ else if (size == 1)
+ {
+ if (IsShortBuildingSynonym(token))
+ fn(move(token), Token::TYPE_BUILDING_PART_OR_LETTER);
+ else
+ fn(move(token), Token::TYPE_LETTER);
+ }
+ else
+ {
+ fn(move(token), Token::TYPE_STRING);
+ }
}
} // namespace
-// static
-void HouseNumberTokenizer::Tokenize(UniString const & s, vector<Token> & ts)
+void Tokenize(UniString s, bool isPrefix, vector<Token> & ts)
{
+ MakeLowerCaseInplace(s);
+ auto addToken = [&ts](UniString && value, Token::Type type)
+ {
+ ts.emplace_back(move(value), type);
+ };
+
size_t i = 0;
while (i < s.size())
{
- CharClass klass = GetCharClass(s[i]);
+ Token::Type const type = GetCharType(s[i]);
- size_t j = i;
- while (j < s.size() && GetCharClass(s[j]) == klass)
+ size_t j = i + 1;
+ while (j < s.size() && GetCharType(s[j]) == type)
++j;
- if (klass != CharClass::Separator)
+ if (type != Token::TYPE_SEPARATOR)
{
UniString token(s.begin() + i, s.begin() + j);
- ts.emplace_back(move(token), klass);
+ if (type == Token::TYPE_STRING)
+ {
+ if (j != s.size() || !isPrefix)
+ {
+ TransformString(move(token), addToken);
+ }
+ else
+ {
+ ts.emplace_back(move(token), Token::TYPE_STRING);
+ ts.back().m_prefix = true;
+ }
+ }
+ else
+ {
+ addToken(move(token), type);
+ }
}
i = j;
}
+
+ // Quite hacky loop from ts.size() - 1 towards 0.
+ for (size_t i = ts.size() - 1; i < ts.size(); --i)
+ {
+ if (ts[i].m_type != Token::TYPE_BUILDING_PART_OR_LETTER)
+ continue;
+ if (i + 1 == ts.size() || ts[i + 1].m_type == Token::TYPE_BUILDING_PART)
+ ts[i].m_type = Token::TYPE_LETTER;
+ else if (ts[i + 1].m_type == Token::TYPE_NUMBER)
+ ts[i].m_type = Token::TYPE_BUILDING_PART;
+ }
}
-void ParseQuery(strings::UniString const & query, bool queryIsPrefix, vector<Parse> & ps)
+void ParseHouseNumber(strings::UniString const & s, vector<vector<Token>> & parses)
{
- vector<HouseNumberTokenizer::Token> tokens;
- HouseNumberTokenizer::Tokenize(MakeLowerCase(query), tokens);
-
+ vector<Token> tokens;
+ Tokenize(s, false /* isPrefix */, tokens);
+
+ bool numbersSequence = true;
+ ForEachGroup(tokens, [&tokens, &numbersSequence](size_t i, size_t j)
+ {
+ switch (j - i)
+ {
+ case 0: break;
+ case 1:
+ numbersSequence = numbersSequence && tokens[i].m_type == Token::TYPE_NUMBER;
+ break;
+ case 2:
+ numbersSequence = numbersSequence && tokens[i].m_type == Token::TYPE_NUMBER &&
+ IsLiteralType(tokens[i + 1].m_type);
+ break;
+ default: numbersSequence = false; break;
+ }
+ });
+
+ size_t const oldSize = parses.size();
+ if (numbersSequence)
{
- ps.emplace_back();
- Parse & p = ps.back();
- MergeTokens(tokens, p.m_parts);
+ ForEachGroup(tokens, [&tokens, &parses](size_t i, size_t j)
+ {
+ parses.emplace_back();
+ auto & parse = parses.back();
+ for (size_t k = i; k < j; ++k)
+ parse.emplace_back(move(tokens[k]));
+ });
}
-
- // *NOTE* |tokens| is modified in the following block.
- if (queryIsPrefix && !tokens.empty() &&
- tokens.back().m_klass == HouseNumberTokenizer::CharClass::Other &&
- IsBuildingSynonymPrefix(tokens.back().m_token))
+ else
{
- tokens.pop_back();
- ps.emplace_back();
- Parse & p = ps.back();
- MergeTokens(tokens, p.m_parts);
- p.m_hasTrailingBuildingPrefixSynonym = true;
+ parses.emplace_back(move(tokens));
}
+
+ for (size_t i = oldSize; i < parses.size(); ++i)
+ SimplifyParse(parses[i]);
+}
+
+void ParseQuery(strings::UniString const & query, bool queryIsPrefix, vector<Token> & parse)
+{
+ Tokenize(query, queryIsPrefix, parse);
+ SimplifyParse(parse);
}
bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query,
@@ -252,69 +509,69 @@ bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniStrin
if (houseNumber == query)
return true;
- vector<Parse> queryParses;
- ParseQuery(query, queryIsPrefix, queryParses);
+ vector<Token> queryParse;
+ ParseQuery(query, queryIsPrefix, queryParse);
- return HouseNumbersMatch(houseNumber, queryParses);
+ return HouseNumbersMatch(houseNumber, queryParse);
}
-bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Parse> const & queryParses)
+bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> const & queryParse)
{
- if (houseNumber.empty() || queryParses.empty())
+ if (houseNumber.empty() || queryParse.empty())
return false;
// Fast pre-check, helps to early exit without complex house number
// parsing.
- bool good = false;
- for (auto const & queryParse : queryParses)
+ if (IsASCIIDigit(houseNumber[0]) && IsASCIIDigit(queryParse[0].m_value[0]) &&
+ houseNumber[0] != queryParse[0].m_value[0])
{
- if (!queryParse.IsEmpty() && houseNumber[0] == queryParse.m_parts.front()[0])
- {
- good = true;
- break;
- }
- }
- if (!good)
return false;
-
- Parse houseNumberParse;
- {
- vector<HouseNumberTokenizer::Token> tokens;
- HouseNumberTokenizer::Tokenize(MakeLowerCase(houseNumber), tokens);
- MergeTokens(tokens, houseNumberParse.m_parts);
}
- for (auto const & queryParse : queryParses)
+ vector<vector<Token>> houseNumberParses;
+ ParseHouseNumber(houseNumber, houseNumberParses);
+
+ for (auto & parse : houseNumberParses)
{
- if (ParsesMatch(houseNumberParse, queryParse))
+ if (parse.empty())
+ continue;
+ if (parse[0] == queryParse[0] &&
+ IsSubsequence(parse.begin() + 1, parse.end(), queryParse.begin() + 1, queryParse.end()))
+ {
return true;
+ }
}
return false;
}
-string DebugPrint(HouseNumberTokenizer::CharClass charClass)
+bool LooksLikeHouseNumber(strings::UniString const & s, bool isPrefix)
{
- switch (charClass)
- {
- case HouseNumberTokenizer::CharClass::Separator: return "Separator";
- case HouseNumberTokenizer::CharClass::Digit: return "Digit";
- case HouseNumberTokenizer::CharClass::Other: return "Other";
- }
- return "Unknown";
+ static HouseNumberClassifier const classifier;
+ return classifier.LooksGood(s, isPrefix);
}
-string DebugPrint(HouseNumberTokenizer::Token const & token)
+string DebugPrint(Token::Type type)
{
- ostringstream os;
- os << "Token [" << DebugPrint(token.m_token) << ", " << DebugPrint(token.m_klass) << "]";
- return os.str();
+ switch (type)
+ {
+ case Token::TYPE_NUMBER: return "Number";
+ case Token::TYPE_SEPARATOR: return "Separator";
+ case Token::TYPE_GROUP_SEPARATOR: return "GroupSeparator";
+ case Token::TYPE_HYPHEN: return "Hyphen";
+ case Token::TYPE_SLASH: return "Slash";
+ case Token::TYPE_STRING: return "String";
+ case Token::TYPE_BUILDING_PART: return "BuildingPart";
+ case Token::TYPE_LETTER: return "Letter";
+ case Token::TYPE_BUILDING_PART_OR_LETTER: return "BuildingPartOrLetter";
+ }
+ return "Unknown";
}
-string DebugPrint(Parse const & parse)
+string DebugPrint(Token const & token)
{
ostringstream os;
- os << "Parse [" << DebugPrint(parse.m_parts) << "]";
+ os << "Token [" << DebugPrint(token.m_value) << ", " << DebugPrint(token.m_type) << "]";
return os.str();
}
-
+} // namespace house_numbers
} // namespace search
diff --git a/search/house_numbers_matcher.hpp b/search/house_numbers_matcher.hpp
index 72299a1777..55ab8eaa38 100644
--- a/search/house_numbers_matcher.hpp
+++ b/search/house_numbers_matcher.hpp
@@ -7,58 +7,72 @@
namespace search
{
-// This class splits a string representing a house number to groups of
-// symbols from the same class (separators, digits or other symbols,
-// hope, letters).
-class HouseNumberTokenizer
+namespace house_numbers
{
-public:
- enum class CharClass
+struct Token
+{
+ enum Type
{
- Separator,
- Digit,
- Other,
+ TYPE_NUMBER,
+ TYPE_SEPARATOR,
+ TYPE_GROUP_SEPARATOR,
+ TYPE_HYPHEN,
+ TYPE_SLASH,
+ TYPE_STRING,
+ TYPE_BUILDING_PART,
+ TYPE_LETTER,
+ TYPE_BUILDING_PART_OR_LETTER
};
- struct Token
+ Token() = default;
+ Token(strings::UniString const & value, Type type) : m_value(value), m_type(type) {}
+ Token(strings::UniString && value, Type type) : m_value(move(value)), m_type(type) {}
+ Token(Token &&) = default;
+
+ Token & operator=(Token &&) = default;
+ Token & operator=(Token const &) = default;
+
+ bool operator==(Token const & rhs) const
{
- Token() : m_klass(CharClass::Separator) {}
- Token(strings::UniString const & token, CharClass klass) : m_token(token), m_klass(klass) {}
- Token(strings::UniString && token, CharClass klass) : m_token(move(token)), m_klass(klass) {}
+ return m_type == rhs.m_type && m_value == rhs.m_value;
+ }
- strings::UniString m_token;
- CharClass m_klass;
- };
+ bool operator!=(Token const & rhs) const { return !(*this == rhs); }
- // Performs greedy split of |s| by character classes. Note that this
- // function never emits Tokens corresponding to Separator classes.
- static void Tokenize(strings::UniString const & s, vector<Token> & ts);
+ bool operator<(Token const & rhs) const
+ {
+ if (m_type != rhs.m_type)
+ return m_type < rhs.m_type;
+ return m_value < rhs.m_value;
+ }
+
+ strings::UniString m_value;
+ Type m_type = TYPE_SEPARATOR;
+ bool m_prefix = false;
};
-struct Parse
-{
- inline bool IsEmpty() const { return m_parts.empty(); }
+// Tokenizes |s| that may be a house number.
+void Tokenize(strings::UniString s, bool isPrefix, vector<Token> & ts);
- vector<strings::UniString> m_parts;
- bool m_hasTrailingBuildingPrefixSynonym = false;
-};
+// Parses a string that can be one or more house numbers. This method
+// can be used to parse addr:housenumber fields.
+void ParseHouseNumber(strings::UniString const & s, vector<vector<Token>> & parses);
-// Parses query for later faster processing, when multiple buildings
-// are matched against the query.
-void ParseQuery(strings::UniString const & query, bool queryIsPrefix, vector<Parse> & ps);
+// Parses a part of search query that can be a house number.
+void ParseQuery(strings::UniString const & query, bool queryIsPrefix, vector<Token> & parse);
-// Returns true when |query| matches to |houseNumber|.
+// Returns true if house number matches to a given query.
bool HouseNumbersMatch(strings::UniString const & houseNumber, strings::UniString const & query,
bool queryIsPrefix);
-// Returns true when at least one parse of the query matches to
-// |houseNumber|.
-bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Parse> const & queryParses);
-
-string DebugPrint(HouseNumberTokenizer::CharClass charClass);
+// Returns true if |houseNumber| matches the given parsed query |queryParse|.
+bool HouseNumbersMatch(strings::UniString const & houseNumber, vector<Token> const & queryParse);
-string DebugPrint(HouseNumberTokenizer::Token const & token);
+// Returns true if |s| looks like a house number.
+bool LooksLikeHouseNumber(strings::UniString const & s, bool isPrefix);
-string DebugPrint(Parse const & parse);
+string DebugPrint(Token::Type type);
+string DebugPrint(Token const & token);
+} // namespace house_numbers
} // namespace search
diff --git a/search/search_quality/clusterize-tag-values.lisp b/search/search_quality/clusterize-tag-values.lisp
index aea2e04c6b..941597b6fb 100755
--- a/search/search_quality/clusterize-tag-values.lisp
+++ b/search/search_quality/clusterize-tag-values.lisp
@@ -103,7 +103,7 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
(defparameter *building-synonyms*
'("building" "bldg" "bld" "bl" "unit" "block" "blk"
- "корпус" "корп" "литер" "лит" "строение" "блок" "бл"))
+ "корпус" "корп" "кор" "литер" "лит" "строение" "стр" "блок" "бл"))
(defparameter *house-number-seps* '(#\Space #\Tab #\" #\\ #\( #\) #\. #\# #\~))
(defparameter *house-number-groups-seps* '(#\, #\| #\; #\+))
@@ -172,9 +172,12 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
(defun house-number-with-optional-suffix-p (tokens)
(case (length tokens)
(1 (eq (token-type (first tokens)) :number))
- (2 (and (eq (token-type (first tokens)) :number)
- (or (eq (token-type (second tokens)) :string)
- (eq (token-type (second tokens)) :letter))))
+ (2 (let ((first-type (token-type (first tokens)))
+ (second-type (token-type (second tokens))))
+ (and (eq first-type :number)
+ (or (eq second-type :string)
+ (eq second-type :letter)
+ (eq second-type :letter-or-building-part)))))
(otherwise nil)))
(defun get-house-number-sub-numbers (house-number)
@@ -207,7 +210,7 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
(:number "N")
(:building-part "B")
(:letter "L")
- (:letter-or-building-part "LB")
+ (:letter-or-building-part "U")
(:string "S")
((:hyphen :slash :group-separator) token-value)
(otherwise (assert NIL NIL (format nil "Unknown token type: ~a"
@@ -226,7 +229,10 @@ exec /usr/bin/env sbcl --noinform --quit --eval "(defparameter *script-name* \"$
(dolist (string (mapcar #'token-value
(remove-if-not #'(lambda (token)
(case (token-type token)
- ((:string :letter :letter-or-building-part) T)
+ ((:string
+ :letter
+ :letter-or-building-part
+ :building-part) T)
(otherwise nil)))
number)))
(funcall fn string string))))
diff --git a/search/search_tests/house_numbers_matcher_test.cpp b/search/search_tests/house_numbers_matcher_test.cpp
index 90485b0755..d5540e6fe5 100644
--- a/search/search_tests/house_numbers_matcher_test.cpp
+++ b/search/search_tests/house_numbers_matcher_test.cpp
@@ -6,40 +6,27 @@
#include "base/string_utils.hpp"
-using namespace strings;
+using namespace search::house_numbers;
using namespace search;
+using namespace strings;
namespace
{
-void ParseHouseNumber(string const & s, vector<vector<string>> & ts)
-{
- vector<Parse> parses;
- ParseQuery(MakeUniString(s), false /* queryIsPrefix */, parses);
- for (auto const & parse : parses)
- {
- ts.emplace_back();
- auto & tsb = ts.back();
- for (auto const & part : parse.m_parts)
- tsb.push_back(ToUtf8(part));
- }
-}
-
bool HouseNumbersMatch(string const & houseNumber, string const & query, bool queryIsPrefix = false)
{
- vector<Parse> queryParses;
- ParseQuery(MakeUniString(query), queryIsPrefix, queryParses);
- return search::HouseNumbersMatch(MakeUniString(houseNumber), queryParses);
+ return search::house_numbers::HouseNumbersMatch(MakeUniString(houseNumber), MakeUniString(query),
+ queryIsPrefix);
}
bool CheckTokenizer(string const & utf8s, vector<string> const & expected)
{
UniString utf32s = MakeUniString(utf8s);
- vector<HouseNumberTokenizer::Token> tokens;
- HouseNumberTokenizer::Tokenize(utf32s, tokens);
+ vector<Token> tokens;
+ Tokenize(utf32s, false /* isPrefix */, tokens);
vector<string> actual;
for (auto const & token : tokens)
- actual.push_back(ToUtf8(token.m_token));
+ actual.push_back(ToUtf8(token.m_value));
if (actual != expected)
{
LOG(LINFO, ("actual:", actual, "expected:", expected));
@@ -50,44 +37,59 @@ bool CheckTokenizer(string const & utf8s, vector<string> const & expected)
bool CheckParser(string const & utf8s, string const & expected)
{
- vector<vector<string>> parses;
- ParseHouseNumber(utf8s, parses);
+ vector<vector<Token>> parses;
+ ParseHouseNumber(MakeUniString(utf8s), parses);
+
+ if (parses.size() != 1)
+ {
+ LOG(LINFO, ("Actual:", parses, "expected:", expected));
+ return false;
+ }
- for (auto const & parse : parses)
+ auto const & parse = parses[0];
+ string actual;
+ for (size_t i = 0; i < parse.size(); ++i)
{
- string actual;
- for (size_t i = 0; i < parse.size(); ++i)
- {
- actual.append(parse[i]);
- if (i + 1 != parse.size())
- actual.push_back(' ');
- }
- if (actual == expected)
- return true;
+ actual.append(ToUtf8(parse[i].m_value));
+ if (i + 1 != parse.size())
+ actual.push_back(' ');
}
- LOG(LINFO, ("actual:", parses, "expected:", expected));
- return false;
+ if (actual != expected)
+ {
+ LOG(LINFO, ("Actual:", parses, "expected:", expected));
+ return false;
+ }
+
+ return true;
+}
+
+bool LooksLikeHouseNumber(string const & s, bool isPrefix)
+{
+ return house_numbers::LooksLikeHouseNumber(MakeUniString(s), isPrefix);
}
} // namespace
UNIT_TEST(HouseNumberTokenizer_Smoke)
{
- TEST(CheckTokenizer("123Б", {"123", "Б"}), ());
- TEST(CheckTokenizer("123/Б", {"123", "Б"}), ());
- TEST(CheckTokenizer("123/34 корп. 4 стр1", {"123", "34", "корп", "4", "стр", "1"}), ());
+ TEST(CheckTokenizer("123Б", {"123", "б"}), ());
+ TEST(CheckTokenizer("123/Б", {"123", "/", "б"}), ());
+ TEST(CheckTokenizer("123/34 корп. 4 стр1", {"123", "/", "34", "корп", "4", "стр", "1"}), ());
+ TEST(CheckTokenizer("1-100", {"1", "-", "100"}), ());
+ TEST(CheckTokenizer("19/1А литБ", {"19", "/", "1", "а", "лит", "б"}), ());
+ TEST(CheckTokenizer("9 литер аб1", {"9", "литер", "аб", "1"}), ());
}
-UNIT_TEST(HouseNumberNormalizer_Smoke)
+UNIT_TEST(HouseNumberParser_Smoke)
{
- TEST(CheckParser("123Б", "123б"), ());
+ TEST(CheckParser("123Б", "123 б"), ());
TEST(CheckParser("123/4 Литер А", "123 4 а"), ());
- TEST(CheckParser("123а корп. 2б", "123а 2б"), ());
+ TEST(CheckParser("123а корп. 2б", "123 2 а б"), ());
TEST(CheckParser("123к4", "123 4"), ());
- TEST(CheckParser("123к Корпус 2", "123к 2"), ());
+ TEST(CheckParser("123к Корпус 2", "123 2 к"), ());
TEST(CheckParser("9 литер А корпус 2", "9 2 а"), ());
TEST(CheckParser("39с79", "39 79"), ());
- TEST(CheckParser("9 литер аб1", "9 аб1"), ());
+ TEST(CheckParser("9 литер аб1", "9 1"), ());
}
UNIT_TEST(HouseNumbersMatcher_Smoke)
@@ -111,22 +113,53 @@ UNIT_TEST(HouseNumbersMatcher_Smoke)
TEST(HouseNumbersMatch("22к корпус 2а строение 7", "22к к 2а стр 7"), ());
TEST(HouseNumbersMatch("22к к 2а с 7", "22к корпус 2а"), ());
TEST(HouseNumbersMatch("124к корпус к", "124к к"), ());
-
- TEST(!HouseNumbersMatch("39", "39 с 79"), ());
- TEST(!HouseNumbersMatch("127а корпус 2", "127"), ());
- TEST(!HouseNumbersMatch("6 корпус 2", "7"), ());
- TEST(!HouseNumbersMatch("10/42 корпус 2", "42"), ());
- TEST(!HouseNumbersMatch("--...--.-", "--.....-"), ());
- TEST(!HouseNumbersMatch("22к", "22 корпус"), ());
- TEST(!HouseNumbersMatch("22к", "22я"), ());
- TEST(!HouseNumbersMatch("22к", "22л"), ());
+ TEST(HouseNumbersMatch("127а корпус 2", "127"), ());
+ TEST(HouseNumbersMatch("22к", "22 корпус"), ());
TEST(HouseNumbersMatch("39 корпус 79", "39", true /* queryIsPrefix */), ());
TEST(HouseNumbersMatch("39 корпус 79", "39 кор", true /* queryIsPrefix */), ());
- TEST(!HouseNumbersMatch("39", "39 корп", true /* queryIsPrefix */), ());
+ TEST(HouseNumbersMatch("39", "39 корп", true /* queryIsPrefix */), ());
TEST(HouseNumbersMatch("39 корпус 7", "39", true /* queryIsPrefix */), ());
TEST(HouseNumbersMatch("39К корпус 7", "39 к", true /* queryIsPrefix */), ());
TEST(HouseNumbersMatch("39К корпус 7", "39к", true /* queryIsPrefix */), ());
TEST(HouseNumbersMatch("39 К корпус 7", "39 к", false /* queryIsPrefix */), ());
- TEST(!HouseNumbersMatch("39 К корпус 7", "39", false /* queryIsPrefix */), ());
+ TEST(HouseNumbersMatch("39 К корпус 7", "39", false /* queryIsPrefix */), ());
+
+ TEST(!HouseNumbersMatch("39", "39 с 79"), ());
+ TEST(!HouseNumbersMatch("6 корпус 2", "7"), ());
+ TEST(!HouseNumbersMatch("10/42 корпус 2", "42"), ());
+ TEST(!HouseNumbersMatch("22к", "22я"), ());
+ TEST(!HouseNumbersMatch("22к", "22л"), ());
+}
+
+UNIT_TEST(LooksLikeHouseNumber_Smoke)
+{
+ TEST(LooksLikeHouseNumber("1", false /* isPrefix */), ());
+ TEST(LooksLikeHouseNumber("ev 10", false /* isPrefix */), ());
+
+ TEST(LooksLikeHouseNumber("14 к", true /* isPrefix */), ());
+ TEST(LooksLikeHouseNumber("14 кор", true /* isPrefix */), ());
+ TEST(LooksLikeHouseNumber("14 корпус", true /*isPrefix */), ());
+ TEST(LooksLikeHouseNumber("14 корпус 1", true /* isPrefix */), ());
+ TEST(LooksLikeHouseNumber("14 корпус 1", false /* isPrefix */), ());
+
+ TEST(LooksLikeHouseNumber("39 c 79", false /* isPrefix */), ());
+ TEST(LooksLikeHouseNumber("владение 14", false /* isPrefix */), ());
+
+ TEST(LooksLikeHouseNumber("4", false /* isPrefix */), ());
+ TEST(LooksLikeHouseNumber("4 2", false /* isPrefix */), ());
+ TEST(!LooksLikeHouseNumber("4 2 останкинская", false /* isPrefix */), ());
+
+ TEST(!LooksLikeHouseNumber("39 c 79 ленинградский", false /* isPrefix */), ());
+ TEST(!LooksLikeHouseNumber("каптерка 1", false /* isPrefix */), ());
+ TEST(!LooksLikeHouseNumber("1 канал", false /* isPrefix */), ());
+ TEST(!LooksLikeHouseNumber("2 останкинская", false /* isPrefix */), ());
+
+ TEST(LooksLikeHouseNumber("39 строе", true /* isPrefix */), ());
+ TEST(!LooksLikeHouseNumber("39 строе", false /* isPrefix */), ());
+
+ TEST(LooksLikeHouseNumber("дом ", true /* isPrefix */), ());
+ TEST(LooksLikeHouseNumber("дом ", false /* isPrefix */), ());
+
+ TEST(LooksLikeHouseNumber("дом 39 строение 79", false /* isPrefix */), ());
}