diff options
author | tatiana-kondakova <tatiana.kondakova@gmail.com> | 2018-03-23 20:09:23 +0300 |
---|---|---|
committer | mpimenov <mpimenov@users.noreply.github.com> | 2018-04-03 16:43:41 +0300 |
commit | f76abf201cda99075d6e05f95e8b35848b0d1e19 (patch) | |
tree | 140105904db31dbf0b5841e98e99e492a72ca193 /base | |
parent | 48635d9780f8803a2d8ad788802a588a78fd6b2d (diff) |
Allow predefined set of misprints in prefix.
Diffstat (limited to 'base')
-rw-r--r-- | base/base_tests/levenshtein_dfa_test.cpp | 36 | ||||
-rw-r--r-- | base/levenshtein_dfa.cpp | 67 | ||||
-rw-r--r-- | base/levenshtein_dfa.hpp | 6 |
3 files changed, 83 insertions, 26 deletions
diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp index d1f88423b7..2d7cab01b1 100644 --- a/base/base_tests/levenshtein_dfa_test.cpp +++ b/base/base_tests/levenshtein_dfa_test.cpp @@ -5,6 +5,7 @@ #include <sstream> #include <string> +#include <vector> using namespace std; using namespace strings; @@ -21,11 +22,12 @@ enum class Status struct Result { Result() = default; - Result(Status status, size_t errorsMade) : m_status(status), m_errorsMade(errorsMade) {} + Result(Status status, size_t errorsMade = 0) : m_status(status), m_errorsMade(errorsMade) {} bool operator==(Result const & rhs) const { - return m_status == rhs.m_status && m_errorsMade == rhs.m_errorsMade; + return m_status == rhs.m_status && + (m_errorsMade == rhs.m_errorsMade || m_status == Status::Rejects); } Status m_status = Status::Accepts; @@ -132,14 +134,14 @@ UNIT_TEST(LevenshteinDFA_Smoke) UNIT_TEST(LevenshteinDFA_Prefix) { { - LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 1 /* maxErrors */); + LevenshteinDFA dfa("москва", 1 /* prefixSize */, 1 /* maxErrors */); TEST(Accepts(dfa, "москва"), ()); TEST(Accepts(dfa, "масква"), ()); TEST(Accepts(dfa, "моска"), ()); TEST(Rejects(dfa, "иосква"), ()); } { - LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */); + LevenshteinDFA dfa("москва", 0 /* prefixSize */, 1 /* maxErrors */); TEST(Accepts(dfa, "москва"), ()); TEST(Accepts(dfa, "иосква"), ()); TEST(Accepts(dfa, "моксва"), ()); @@ -149,7 +151,7 @@ UNIT_TEST(LevenshteinDFA_Prefix) UNIT_TEST(LevenshteinDFA_ErrorsMade) { { - LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 2 /* maxErrors */); + LevenshteinDFA dfa("москва", 1 /* prefixSize */, 2 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "москва"), Result(Status::Accepts, 0 /* errorsMade */), ()); TEST_EQUAL(GetResult(dfa, "москв"), Result(Status::Accepts, 1 /* errorsMade */), ()); @@ -165,19 +167,37 @@ UNIT_TEST(LevenshteinDFA_ErrorsMade) } { - LevenshteinDFA dfa("aa", 0 /* prefixCharsToKeep */, 2 /* maxErrors */); + LevenshteinDFA dfa("aa", 0 /* prefixSize */, 2 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "abab"), Result(Status::Accepts, 2 /* errorsMade */), ()); } { - LevenshteinDFA dfa("mississippi", 0 /* prefixCharsToKeep */, 0 /* maxErrors */); + LevenshteinDFA dfa("mississippi", 0 /* prefixSize */, 0 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "misisipi").m_status, Status::Rejects, ()); TEST_EQUAL(GetResult(dfa, "mississipp").m_status, Status::Intermediate, ()); TEST_EQUAL(GetResult(dfa, "mississippi"), Result(Status::Accepts, 0 /* errorsMade */), ()); } { - LevenshteinDFA dfa("кафе", 1 /* prefixCharsToKeep */, 1 /* maxErrors */); + vector<UniString> const allowedMisprints = {MakeUniString("yj")}; + size_t const prefixSize = 1; + size_t const maxErrors = 1; + string const str = "yekaterinburg"; + vector<pair<string, Result>> const queries = { + {"yekaterinburg", Result(Status::Accepts, 0 /* errorsMade */)}, + {"ekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)}, + {"jekaterinburg", Result(Status::Accepts, 1 /* errorsMade */)}, + {"iekaterinburg", Result(Status::Rejects)}}; + + for (auto const & q : queries) + { + LevenshteinDFA dfa(MakeUniString(q.first), prefixSize, allowedMisprints, maxErrors); + TEST_EQUAL(GetResult(dfa, str), q.second, ("Query:", q.first, "string:", str)); + } + } + + { + LevenshteinDFA dfa("кафе", 1 /* prefixSize */, 1 /* maxErrors */); TEST_EQUAL(GetResult(dfa, "кафе"), Result(Status::Accepts, 0 /* errorsMade */), ()); TEST_EQUAL(GetResult(dfa, "кафер"), Result(Status::Accepts, 1 /* errorsMade */), ()); } diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp index 40454192f8..9c0b09b00c 100644 --- a/base/levenshtein_dfa.cpp +++ b/base/levenshtein_dfa.cpp @@ -18,20 +18,22 @@ inline size_t AbsDiff(size_t a, size_t b) { return a > b ? a - b : b - a; } class TransitionTable { public: - TransitionTable(UniString const & s) : m_s(s), m_size(s.size()) {} + TransitionTable(UniString const & s, std::vector<UniString> const & prefixMisprints, + size_t prefixSize) + : m_s(s), m_size(s.size()), m_prefixMisprints(prefixMisprints), m_prefixSize(prefixSize) + { + } - void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c, - LevenshteinDFA::State & t) + void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t) { t.Clear(); for (auto const & p : s.m_positions) - GetMoves(p, prefixCharsToKeep, c, t); + GetMoves(p, c, t); t.Normalize(); } private: - void GetMoves(LevenshteinDFA::Position const & p, size_t prefixCharsToKeep, UniChar c, - LevenshteinDFA::State & t) + void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t) { auto & ps = t.m_positions; @@ -53,11 +55,16 @@ private: if (p.m_errorsLeft == 0) return; - if (p.m_offset < prefixCharsToKeep) - return; - ps.emplace_back(p.m_offset, p.m_errorsLeft - 1, false /* transposed */); + if (p.m_offset < m_prefixSize) + { + // Allow only prefixMisprints for prefix. + if (IsAllowedPrefixMisprint(c, p.m_offset)) + ps.emplace_back(p.m_offset + 1, p.m_errorsLeft - 1, false /* transposed */); + return; + } + if (p.m_offset == m_size) return; @@ -87,8 +94,25 @@ private: return false; } + bool IsAllowedPrefixMisprint(UniChar c, size_t position) const + { + CHECK_LESS(position, m_prefixSize, ()); + + for (auto const & misprints : m_prefixMisprints) + { + if (std::find(misprints.begin(), misprints.end(), c) != misprints.end() && + std::find(misprints.begin(), misprints.end(), m_s[position]) != misprints.end()) + { + return true; + } + } + return false; + } + UniString const & m_s; size_t const m_size; + std::vector<UniString> const m_prefixMisprints; + size_t const m_prefixSize; }; } // namespace @@ -169,10 +193,21 @@ void LevenshteinDFA::State::Normalize() // LevenshteinDFA ---------------------------------------------------------------------------------- // static -LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors) +LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixSize, + std::vector<UniString> const & prefixMisprints, size_t maxErrors) : m_size(s.size()), m_maxErrors(maxErrors) { m_alphabet.assign(s.begin(), s.end()); + CHECK_LESS_OR_EQUAL(prefixSize, s.size(), ()); + + for (auto it = s.begin(); std::distance(it, s.begin()) < prefixSize; ++it) + { + for (auto const & misprints : prefixMisprints) + { + if (std::find(misprints.begin(), misprints.end(), *it) != misprints.end()) + m_alphabet.insert(m_alphabet.end(), misprints.begin(), misprints.end()); + } + } my::SortUnique(m_alphabet); UniChar missed = 0; @@ -204,7 +239,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si pushState(MakeStart(), kStartingState); pushState(MakeRejecting(), kRejectingState); - TransitionTable table(s); + TransitionTable table(s, prefixMisprints, prefixSize); while (!states.empty()) { @@ -222,7 +257,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si for (size_t i = 0; i < m_alphabet.size(); ++i) { State next; - table.Move(curr, prefixCharsToKeep, m_alphabet[i], next); + table.Move(curr, m_alphabet[i], next); size_t nid; @@ -242,18 +277,18 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, si } } -LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors) - : LevenshteinDFA(MakeUniString(s), prefixCharsToKeep, maxErrors) +LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors) + : LevenshteinDFA(MakeUniString(s), prefixSize, {} /* prefixMisprints */, maxErrors) { } LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t maxErrors) - : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors) + : LevenshteinDFA(s, 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors) { } LevenshteinDFA::LevenshteinDFA(std::string const & s, size_t maxErrors) - : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors) + : LevenshteinDFA(MakeUniString(s), 0 /* prefixSize */, {} /* prefixMisprints */, maxErrors) { } diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp index 88569d0b65..5505c94b23 100644 --- a/base/levenshtein_dfa.hpp +++ b/base/levenshtein_dfa.hpp @@ -3,6 +3,7 @@ #include "base/string_utils.hpp" #include <cstddef> +#include <string> #include <vector> namespace strings @@ -94,8 +95,9 @@ public: LevenshteinDFA(LevenshteinDFA const &) = default; LevenshteinDFA(LevenshteinDFA &&) = default; - LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, size_t maxErrors); - LevenshteinDFA(std::string const & s, size_t prefixCharsToKeep, size_t maxErrors); + LevenshteinDFA(UniString const & s, size_t prefixSize, + std::vector<UniString> const & prefixMisprints, size_t maxErrors); + LevenshteinDFA(std::string const & s, size_t prefixSize, size_t maxErrors); LevenshteinDFA(UniString const & s, size_t maxErrors); LevenshteinDFA(std::string const & s, size_t maxErrors); |