diff options
author | Yuri Gorshenin <y@maps.me> | 2016-11-15 17:35:41 +0300 |
---|---|---|
committer | Yuri Gorshenin <y@maps.me> | 2016-11-16 13:36:10 +0300 |
commit | e336eb4b6e571df71cc68166c7a09ad98f0609c3 (patch) | |
tree | dc1cc716f5db5037cea7e2729a55e3322cbdf7c7 /base | |
parent | cf1f3d1fccbb11c6a1bf14450f43ffd3e4d1852f (diff) |
[search] Added fuzzy search (up to 2 errors).
Diffstat (limited to 'base')
-rw-r--r-- | base/base_tests/levenshtein_dfa_test.cpp | 17 | ||||
-rw-r--r-- | base/levenshtein_dfa.cpp | 30 | ||||
-rw-r--r-- | base/levenshtein_dfa.hpp | 8 |
3 files changed, 45 insertions, 10 deletions
diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp index f3cd896fa2..cf8e776be2 100644 --- a/base/base_tests/levenshtein_dfa_test.cpp +++ b/base/base_tests/levenshtein_dfa_test.cpp @@ -93,4 +93,21 @@ UNIT_TEST(LevenshteinDFA_Smoke) TEST(Rejects(dfa, "san"), ()); } } + +UNIT_TEST(LevenshteinDFA_Prefix) +{ + { + LevenshteinDFA dfa("москва", 1 /* prefixCharsToKeep */, 1 /* maxErrors */); + TEST(Accepts(dfa, "москва"), ()); + TEST(Accepts(dfa, "масква"), ()); + TEST(Accepts(dfa, "моска"), ()); + TEST(Rejects(dfa, "иосква"), ()); + } + { + LevenshteinDFA dfa("москва", 0 /* prefixCharsToKeep */, 1 /* maxErrors */); + TEST(Accepts(dfa, "москва"), ()); + TEST(Accepts(dfa, "иосква"), ()); + TEST(Rejects(dfa, "моксва"), ()); + } +} } // namespace diff --git a/base/levenshtein_dfa.cpp b/base/levenshtein_dfa.cpp index 6c78c4f2a4..54c4bb07e0 100644 --- a/base/levenshtein_dfa.cpp +++ b/base/levenshtein_dfa.cpp @@ -21,16 +21,18 @@ public: { } - void Move(LevenshteinDFA::State const & s, UniChar c, LevenshteinDFA::State & t) + void Move(LevenshteinDFA::State const & s, size_t prefixCharsToKeep, UniChar c, + LevenshteinDFA::State & t) { t.Clear(); for (auto const & p : s.m_positions) - GetMoves(p, c, t); + GetMoves(p, prefixCharsToKeep, c, t); t.Normalize(); } private: - void GetMoves(LevenshteinDFA::Position const & p, UniChar c, LevenshteinDFA::State & t) + void GetMoves(LevenshteinDFA::Position const & p, size_t prefixCharsToKeep, UniChar c, + LevenshteinDFA::State & t) { auto & ps = t.m_positions; @@ -43,6 +45,9 @@ private: if (p.m_numErrors == m_maxErrors) return; + if (p.m_offset < prefixCharsToKeep) + return; + ps.emplace_back(p.m_offset, p.m_numErrors + 1); if (p.m_offset == m_size) @@ -150,7 +155,7 @@ void LevenshteinDFA::State::Normalize() // LevenshteinDFA ---------------------------------------------------------------------------------- // static -LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) +LevenshteinDFA::LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, uint8_t maxErrors) : m_size(s.size()), m_maxErrors(maxErrors) { m_alphabet.assign(s.begin(), s.end()); @@ -175,6 +180,7 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) states.emplace(state); visited[state] = id; m_transitions.emplace_back(m_alphabet.size()); + m_accepting.push_back(false); }; pushState(State::MakeStart(), kStartingState); @@ -193,12 +199,12 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) ASSERT_LESS(id, m_transitions.size(), ()); if (IsAccepting(curr)) - m_accepting.insert(id); + m_accepting[id] = true; for (size_t i = 0; i < m_alphabet.size(); ++i) { State next; - table.Move(curr, m_alphabet[i], next); + table.Move(curr, prefixCharsToKeep, m_alphabet[i], next); size_t nid; @@ -218,8 +224,18 @@ LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) } } +LevenshteinDFA::LevenshteinDFA(string const & s, size_t prefixCharsToKeep, uint8_t maxErrors) + : LevenshteinDFA(MakeUniString(s), prefixCharsToKeep, maxErrors) +{ +} + +LevenshteinDFA::LevenshteinDFA(UniString const & s, uint8_t maxErrors) + : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors) +{ +} + LevenshteinDFA::LevenshteinDFA(string const & s, uint8_t maxErrors) - : LevenshteinDFA(MakeUniString(s), maxErrors) + : LevenshteinDFA(s, 0 /* prefixCharsToKeep */, maxErrors) { } diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp index bdf5ca5b30..0e3236784b 100644 --- a/base/levenshtein_dfa.hpp +++ b/base/levenshtein_dfa.hpp @@ -3,7 +3,7 @@ #include "base/string_utils.hpp" #include "std/cstdint.hpp" -#include "std/unordered_set.hpp" +#include "std/vector.hpp" namespace strings { @@ -83,6 +83,8 @@ public: LevenshteinDFA const & m_dfa; }; + LevenshteinDFA(UniString const & s, size_t prefixCharsToKeep, uint8_t maxErrors); + LevenshteinDFA(string const & s, size_t prefixCharsToKeep, uint8_t maxErrors); LevenshteinDFA(UniString const & s, uint8_t maxErrors); LevenshteinDFA(string const & s, uint8_t maxErrors); @@ -101,7 +103,7 @@ private: bool IsAccepting(Position const & p) const; bool IsAccepting(State const & s) const; - inline bool IsAccepting(size_t s) const { return m_accepting.count(s) != 0; } + inline bool IsAccepting(size_t s) const { return m_accepting[s]; } inline bool IsRejecting(State const & s) const { return s.m_positions.empty(); } inline bool IsRejecting(size_t s) const { return s == kRejectingState; } @@ -114,7 +116,7 @@ private: vector<UniChar> m_alphabet; vector<vector<size_t>> m_transitions; - unordered_set<size_t> m_accepting; + vector<bool> m_accepting; }; string DebugPrint(LevenshteinDFA::Position const & p); |