Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/base
diff options
context:
space:
mode:
authortatiana-kondakova <tatiana.kondakova@gmail.com>2018-03-23 20:09:23 +0300
committermpimenov <mpimenov@users.noreply.github.com>2018-04-03 16:43:41 +0300
commitf76abf201cda99075d6e05f95e8b35848b0d1e19 (patch)
tree140105904db31dbf0b5841e98e99e492a72ca193 /base
parent48635d9780f8803a2d8ad788802a588a78fd6b2d (diff)
Allow predefined set of misprints in prefix.
Diffstat (limited to 'base')
-rw-r--r--base/base_tests/levenshtein_dfa_test.cpp36
-rw-r--r--base/levenshtein_dfa.cpp67
-rw-r--r--base/levenshtein_dfa.hpp6
3 files changed, 83 insertions, 26 deletions
diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp
index d1f88423b7..2d7cab01b1 100644
--- a/base/base_tests/levenshtein_dfa_test.cpp
+++ b/base/base_tests/levenshtein_dfa_test.cpp
@@ -5,6 +5,7 @@
#include <sstream>
#include <string>
+#include <vector>
using namespace std;
using namespace strings;
@@ -21,11 +22,12 @@ enum class Status
struct Result
{
Result() = default;
- Result(Status status, size_t errorsMade) : m_status(status), m_errorsMade(errorsMade) {}
+ Result(Status status, size_t errorsMade = 0) : m_status(status), m_errorsMade(errorsMade) {}
bool operator==(Result const & rhs) const
{
- return m_status == rhs.m_status && m_errorsMade == rhs.m_errorsMade;
+ return m_status == rhs.m_status &&
+ (m_errorsMade == rhs.m_errorsMade || m_status == Status::Rejects);
}
Status m_status = Status::Accepts;
@@ -132,14 +134,14 @@ UNIT_TEST(LevenshteinDFA_Smoke)
UNIT_TEST(LevenshteinDFA_Prefix)
{
{
- LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 1 /* maxErrors */);
+ LevenshteinDFA dfa("москва", 1 /* prefixSize */, 1 /* maxErrors */);
TEST(Accepts(dfa, "москва"), ());
TEST(Accepts(dfa, "масква"), ());
TEST(Accepts(dfa, "моска"), ());
TEST(Rejects(dfa, "иосква"), ());
}
{
- LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */);
+ LevenshteinDFA dfa("москва", 0 /* prefixSize */, 1 /* maxErrors */);
TEST(Accepts(dfa, "москва"), ());
TEST(Accepts(dfa, "иосква"), ());
TEST(Accepts(dfa, "моксва"), ());
@@ -149,7 +151,7 @@ UNIT_TEST(LevenshteinDFA_Prefix)
UNIT_TEST(LevenshteinDFA_ErrorsMade)
{
{
- LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 2 /* maxErrors */);
+ LevenshteinDFA dfa("москва", 1 /* prefixSize */, 2 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "москва"), Result(Status::Accepts, 0 /* errorsMade */), ());
TEST_EQUAL(GetResult(dfa, "москв"), Result(Status::Accepts, 1 /* errorsMade */), ());
@@ -165,19 +167,37 @@ UNIT_TEST(LevenshteinDFA_ErrorsMade)
}
{
- LevenshteinDFA dfa("aa", 0 /* prefixCharsToKeep */, 2 /* maxErrors */);
+ LevenshteinDFA dfa("aa", 0 /* prefixSize */, 2 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "abab"), Result(Status::Accepts, 2 /* errorsMade */), ());
}
{
- LevenshteinDFA dfa("mississippi", 0 /* prefixCharsToKeep */, 0 /* maxErrors */);
+ LevenshteinDFA dfa("mississippi", 0 /* prefixSize */, 0 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "misisipi").m_status, Status::Rejects, ());
TEST_EQUAL(GetResult(dfa, "mississipp").m_status, Status::Intermediate, ());
TEST_EQUAL(GetResult(dfa, "mississippi"), Result(Status::Accepts, 0 /* errorsMade */), ());
}
{
- LevenshteinDFA dfa("кафе", 1 /* prefixCharsToKeep */, 1 /* maxErrors */);
+ vector<UniString> const allowedMisprints = {MakeUniString("yj")};
+ size_t const prefixSize = 1;
+ size_t const maxErrors = 1;
+ string const str = "yekaterinburg";
+ vector<pair<string, Result>> const queries = {
+ {"yekaterinburg", Result(Status::Accepts, 0 /* errorsMade */)},
+ {"ekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)},
+ {"jekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)},
+ {"iekaterinburg", Result(Status::Rejects)}};
+
+ for (auto const & q : queries)
+ {
+ LevenshteinDFA dfa(MakeUniString(q.first), prefixSize, allowedMisprints, maxErrors);
+ TEST_EQUAL(GetResult(dfa, str), q.second, ("Query:", q.first, "string:", str));
+ }
+ }
+
+ {
+ LevenshteinDFA dfa("кафе", 1 /* prefixSize */, 1 /* maxErrors */);
TEST_EQUAL(GetResult(dfa, "кафе"), Result(Status::Accepts, 0 /* errorsMade */), ());
TEST_EQUAL(GetResult(dfa, "кафер"), Result(Status::Accepts, 1 /* errorsMade */), ());
}
diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp
index 40454192f8..9c0b09b00c 100644
--- a/base/levenshtein_dfa.cpp
+++ b/base/levenshtein_dfa.cpp
@@ -18,20 +18,22 @@ inline size_t AbsDiff(size_t a, size_t b) { return a > b ? a - b : b - a; }
class TransitionTable
{
public:
- TransitionTable(UniString const & s) : m_s(s), m_size(s.size()) {}
+ TransitionTable(UniString const & s, std::vector<UniString> const & prefixMisprints,
+ size_t prefixSize)
+ : m_s(s), m_size(s.size()), m_prefixMisprints(prefixMisprints), m_prefixSize(prefixSize)
+ {
+ }
- void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c,
- LevenshteinDFA::State & t)
+ void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t)
{
t.Clear();
for (auto const & p : s.m_positions)
- GetMoves(p, prefixCharsToKeep, c, t);
+ GetMoves(p, c, t);
t.Normalize();
}
private:
- void GetMoves(LevenshteinDFA::Position const & p, size_t prefixCharsToKeep, UniChar c,
- LevenshteinDFA::State & t)
+ void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t)
{
auto & ps = t.m_positions;
@@ -53,11 +55,16 @@ private:
if (p.m_errorsLeft == 0)
return;
- if (p.m_offset < prefixCharsToKeep)
- return;
-
ps.emplace_back(p.m_offset, p.m_errorsLeft - 1, false /* transposed */);
+ if (p.m_offset < m_prefixSize)
+ {
+ // Allow only prefixMisprints for prefix.
+ if (IsAllowedPrefixMisprint(c, p.m_offset))
+ ps.emplace_back(p.m_offset + 1, p.m_errorsLeft - 1, false /* transposed */);
+ return;
+ }
+
if (p.m_offset == m_size)
return;
@@ -87,8 +94,25 @@ private:
return false;
}
+ bool IsAllowedPrefixMisprint(UniChar c, size_t position) const
+ {
+ CHECK_LESS(position, m_prefixSize, ());
+
+ for (auto const & misprints : m_prefixMisprints)
+ {
+ if (std::find(misprints.begin(), misprints.end(), c) != misprints.end() &&
+ std::find(misprints.begin(), misprints.end(), m_s[position]) != misprints.end())
+ {
+ return true;
+ }
+ }
+ return false;
+ }
+
UniString const & m_s;
size_t const m_size;
+ std::vector<UniString> const m_prefixMisprints;
+ size_t const m_prefixSize;
};
} // namespace
@@ -169,10 +193,21 @@ void LevenshteinDFA::State::Normalize()
// LevenshteinDFA ----------------------------------------------------------------------------------
// static
-LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors)
+LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixSize,
+ std::vector<UniString> const & prefixMisprints, size_t maxErrors)
: m_size(s.size()), m_maxErrors(maxErrors)
{
m_alphabet.assign(s.begin(), s.end());
+ CHECK_LESS_OR_EQUAL(prefixSize, s.size(), ());
+
+ for (auto it = s.begin(); std::distance(it, s.begin()) < prefixSize; ++it)
+ {
+ for (auto const & misprints : prefixMisprints)
+ {
+ if (std::find(misprints.begin(), misprints.end(), *it) != misprints.end())
+ m_alphabet.insert(m_alphabet.end(), misprints.begin(), misprints.end());
+ }
+ }
my::SortUnique(m_alphabet);
UniChar missed = 0;
@@ -204,7 +239,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
pushState(MakeStart(), kStartingState);
pushState(MakeRejecting(), kRejectingState);
- TransitionTable table(s);
+ TransitionTable table(s, prefixMisprints, prefixSize);
while (!states.empty())
{
@@ -222,7 +257,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
for (size_t i = 0; i < m_alphabet.size(); ++i)
{
State next;
- table.Move(curr, prefixCharsToKeep, m_alphabet[i], next);
+ table.Move(curr, m_alphabet[i], next);
size_t nid;
@@ -242,18 +277,18 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si
}
}
-LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors)
- : LevenshteinDFA(MakeUniString(s), prefixCharsToKeep, maxErrors)
+LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors)
+ : LevenshteinDFA(MakeUniString(s), prefixSize, {} /* prefixMisprints */, maxErrors)
{
}
LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t maxErrors)
- : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
+ : LevenshteinDFA(s, 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors)
{
}
LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t maxErrors)
- : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors)
+ : LevenshteinDFA(MakeUniString(s), 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors)
{
}
diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp
index 88569d0b65..5505c94b23 100644
--- a/base/levenshtein_dfa.hpp
+++ b/base/levenshtein_dfa.hpp
@@ -3,6 +3,7 @@
#include "base/string_utils.hpp"
#include <cstddef>
+#include <string>
#include <vector>
namespace strings
@@ -94,8 +95,9 @@ public:
LevenshteinDFA(LevenshteinDFA const &) = default;
LevenshteinDFA(LevenshteinDFA &&) = default;
- LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors);
- LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors);
+ LevenshteinDFA(UniString const & s, size_t prefixSize,
+ std::vector<UniString> const & prefixMisprints, size_t maxErrors);
+ LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors);
LevenshteinDFA(UniString const & s, size_t maxErrors);
LevenshteinDFA(std::string const & s, size_t maxErrors);