diff options
Diffstat (limited to 'base')
-rw-r--r-- | base/CMakeLists.txt | 8 | ||||
-rw-r--r-- | base/base.pro | 5 | ||||
-rw-r--r-- | base/base_tests/CMakeLists.txt | 2 | ||||
-rw-r--r-- | base/base_tests/base_tests.pro | 1 | ||||
-rw-r--r-- | base/base_tests/levenshtein_dfa_test.cpp | 8 | ||||
-rw-r--r-- | base/base_tests/uni_string_dfa_test.cpp | 90 | ||||
-rw-r--r-- | base/dfa_helpers.hpp | 66 | ||||
-rw-r--r-- | base/levenshtein_dfa.hpp | 18 | ||||
-rw-r--r-- | base/string_utils.hpp | 33 | ||||
-rw-r--r-- | base/uni_string_dfa.cpp | 41 | ||||
-rw-r--r-- | base/uni_string_dfa.hpp | 39 |
11 files changed, 277 insertions, 34 deletions
diff --git a/base/CMakeLists.txt b/base/CMakeLists.txt index 89acc66ef7..e71d7f4a6b 100644 --- a/base/CMakeLists.txt +++ b/base/CMakeLists.txt @@ -20,10 +20,13 @@ set( const_helper.hpp deferred_task.cpp deferred_task.hpp + dfa_helpers.hpp exception.cpp exception.hpp gmtime.cpp gmtime.hpp + levenshtein_dfa.cpp + levenshtein_dfa.hpp limited_priority_queue.hpp logging.cpp logging.hpp @@ -71,12 +74,15 @@ set( threaded_container.hpp threaded_list.hpp threaded_priority_queue.hpp + internal/message.cpp + internal/message.hpp timegm.cpp timegm.hpp timer.cpp timer.hpp + uni_string_dfa.cpp + uni_string_dfa.hpp worker_thread.hpp - internal/message.cpp ) add_library(${PROJECT_NAME} ${SRC}) diff --git a/base/base.pro b/base/base.pro index 9f3fe820b7..31f3f5bd88 100644 --- a/base/base.pro +++ b/base/base.pro @@ -32,6 +32,7 @@ SOURCES += \ time_samples.cpp \ timegm.cpp \ timer.cpp \ + uni_string_dfa.cpp \ HEADERS += \ SRC_FIRST.hpp \ @@ -46,9 +47,10 @@ HEADERS += \ condition.hpp \ const_helper.hpp \ deferred_task.hpp \ + dfa_helpers.hpp \ exception.hpp \ gmtime.hpp \ - internal/messagex.hpp \ + internal/message.hpp \ levenshtein_dfa.hpp \ limited_priority_queue.hpp \ logging.hpp \ @@ -87,4 +89,5 @@ HEADERS += \ time_samples.hpp \ timegm.hpp \ timer.hpp \ + uni_string_dfa.hpp \ worker_thread.hpp \ diff --git a/base/base_tests/CMakeLists.txt b/base/base_tests/CMakeLists.txt index 3120b52052..0fa0ec6b3e 100644 --- a/base/base_tests/CMakeLists.txt +++ b/base/base_tests/CMakeLists.txt @@ -12,6 +12,7 @@ set( condition_test.cpp const_helper.cpp containers_test.cpp + levenshtein_dfa_test.cpp logging_test.cpp math_test.cpp matrix_test.cpp @@ -33,6 +34,7 @@ set( threads_test.cpp timegm_test.cpp timer_test.cpp + uni_string_dfa_test.cpp worker_thread_test.cpp ) diff --git a/base/base_tests/base_tests.pro b/base/base_tests/base_tests.pro index 1a4f2bb9ec..c8708b84d4 100644 --- a/base/base_tests/base_tests.pro +++ b/base/base_tests/base_tests.pro @@ -44,6 +44,7 @@ SOURCES += \ threads_test.cpp \ timegm_test.cpp \ timer_test.cpp \ + uni_string_dfa_test.cpp \ worker_thread_test.cpp \ HEADERS += diff --git a/base/base_tests/levenshtein_dfa_test.cpp b/base/base_tests/levenshtein_dfa_test.cpp index c671b1aa1e..f3cd896fa2 100644 --- a/base/base_tests/levenshtein_dfa_test.cpp +++ b/base/base_tests/levenshtein_dfa_test.cpp @@ -1,5 +1,6 @@ #include "testing/testing.hpp" +#include "base/dfa_helpers.hpp" #include "base/levenshtein_dfa.hpp" #include "std/string.hpp" @@ -18,7 +19,7 @@ enum class Status Status GetStatus(LevenshteinDFA const & dfa, string const & s) { auto it = dfa.Begin(); - it.Move(s); + DFAMove(it, s); if (it.Accepts()) return Status::Accepts; if (it.Rejects()) @@ -86,5 +87,10 @@ UNIT_TEST(LevenshteinDFA_Smoke) TEST(Accepts(dfa, "ленигнрадский"), ()); TEST(Rejects(dfa, "ленинский"), ()); } + + { + LevenshteinDFA dfa("atm", 1 /* maxErrors */); + TEST(Rejects(dfa, "san"), ()); + } } } // namespace diff --git a/base/base_tests/uni_string_dfa_test.cpp b/base/base_tests/uni_string_dfa_test.cpp new file mode 100644 index 0000000000..33191e18dd --- /dev/null +++ b/base/base_tests/uni_string_dfa_test.cpp @@ -0,0 +1,90 @@ +#include "testing/testing.hpp" + +#include "base/dfa_helpers.hpp" +#include "base/string_utils.hpp" +#include "base/uni_string_dfa.hpp" + +using namespace strings; + +namespace +{ +UNIT_TEST(UniStringDFA_Smoke) +{ + { + UniStringDFA dfa(""); + + auto it = dfa.Begin(); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "a"); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + } + + { + UniStringDFA dfa("абв"); + + auto it = dfa.Begin(); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "а"); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "б"); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "в"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "г"); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + } + + { + UniStringDFA dfa("абв"); + + auto it = dfa.Begin(); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "а"); + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "г"); + TEST(!it.Accepts(), ()); + TEST(it.Rejects(), ()); + } +} + +UNIT_TEST(UniStringDFA_Prefix) +{ + { + PrefixDFAModifier<UniStringDFA> dfa(UniStringDFA("abc")); + + auto it = dfa.Begin(); + DFAMove(it, "ab"); + + TEST(!it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "c"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "d"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + + DFAMove(it, "efghijk"); + TEST(it.Accepts(), ()); + TEST(!it.Rejects(), ()); + } +} +} // namespace diff --git a/base/dfa_helpers.hpp b/base/dfa_helpers.hpp new file mode 100644 index 0000000000..5815f380f9 --- /dev/null +++ b/base/dfa_helpers.hpp @@ -0,0 +1,66 @@ +#pragma once + +#include "base/string_utils.hpp" + +#include "std/string.hpp" + +namespace strings +{ +template <typename DFA> +class PrefixDFAModifier +{ +public: + class Iterator + { + public: + Iterator & Move(strings::UniChar c) + { + if (Accepts() || Rejects()) + return *this; + + m_it.Move(c); + if (m_it.Accepts()) + m_accepts = true; + + return *this; + } + + bool Accepts() const { return m_accepts; } + bool Rejects() const { return !Accepts() && m_it.Rejects(); } + + private: + friend class PrefixDFAModifier; + + Iterator(typename DFA::Iterator it) : m_it(it), m_accepts(m_it.Accepts()) {} + + typename DFA::Iterator m_it; + bool m_accepts; + }; + + explicit PrefixDFAModifier(DFA const & dfa) : m_dfa(dfa) {} + + Iterator Begin() const { return Iterator(m_dfa.Begin()); } + +private: + DFA const m_dfa; +}; + +template <typename DFAIt, typename It> +void DFAMove(DFAIt & it, It begin, It end) +{ + for (; begin != end; ++begin) + it.Move(*begin); +} + +template <typename DFAIt> +void DFAMove(DFAIt & it, UniString const & s) +{ + DFAMove(it, s.begin(), s.end()); +} + +template <typename DFAIt> +void DFAMove(DFAIt & it, string const & s) +{ + DFAMove(it, MakeUniString(s)); +} +} // namespace strings diff --git a/base/levenshtein_dfa.hpp b/base/levenshtein_dfa.hpp index cef354c458..bdf5ca5b30 100644 --- a/base/levenshtein_dfa.hpp +++ b/base/levenshtein_dfa.hpp @@ -71,29 +71,13 @@ public: return *this; } - inline Iterator & Move(UniString const & s) { return Move(s.begin(), s.end()); } - - inline Iterator & Move(string const & s) - { - UniString us = MakeUniString(s); - return Move(us); - } - - template <typename It> - Iterator & Move(It begin, It end) - { - for (; begin != end; ++begin) - Move(*begin); - return *this; - } - bool Accepts() const { return m_dfa.IsAccepting(m_s); } bool Rejects() const { return m_dfa.IsRejecting(m_s); } private: friend class LevenshteinDFA; - Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {} + explicit Iterator(LevenshteinDFA const & dfa) : m_s(kStartingState), m_dfa(dfa) {} size_t m_s; LevenshteinDFA const & m_dfa; diff --git a/base/string_utils.hpp b/base/string_utils.hpp index 114ce0f5d8..7875c55890 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -484,40 +484,45 @@ void ForEachMatched(string const & s, regex const & regex, TFn && fn) fn(*cur); } -// Computes the minimum number of insertions, deletions and alterations -// of characters needed to transform one string into another. -// The function works in O(length1 * length2) time and memory -// where length1 and length2 are the lengths of the argument strings. -// See https://en.wikipedia.org/wiki/Levenshtein_distance. -// One string is [b1, e1) and the other is [b2, e2). The iterator -// form is chosen to fit both std::string and strings::UniString. -// This function does not normalize either of the strings. +// Computes the minimum number of insertions, deletions and +// alterations of characters needed to transform one string into +// another. The function works in O(length1 * length2) time and +// O(min(length1, length2)) memory where length1 and length2 are the +// lengths of the argument strings. See +// https://en.wikipedia.org/wiki/Levenshtein_distance. One string is +// [b1, e1) and the other is [b2, e2). The iterator form is chosen to +// fit both std::string and strings::UniString. This function does +// not normalize either of the strings. template <typename TIter> size_t EditDistance(TIter const & b1, TIter const & e1, TIter const & b2, TIter const & e2) { size_t const n = distance(b1, e1); size_t const m = distance(b2, e2); - // |cur| and |prev| are current and previous rows of the + + if (m > n) + return EditDistance(b2, e2, b1, e1); + + // |curr| and |prev| are current and previous rows of the // dynamic programming table. vector<size_t> prev(m + 1); - vector<size_t> cur(m + 1); + vector<size_t> curr(m + 1); for (size_t j = 0; j <= m; j++) prev[j] = j; auto it1 = b1; // 1-based to avoid corner cases. for (size_t i = 1; i <= n; ++i, ++it1) { - cur[0] = i; + curr[0] = i; auto const & c1 = *it1; auto it2 = b2; for (size_t j = 1; j <= m; ++j, ++it2) { auto const & c2 = *it2; - cur[j] = min(cur[j - 1], prev[j]) + 1; - cur[j] = min(cur[j], prev[j - 1] + (c1 == c2 ? 0 : 1)); + curr[j] = min(curr[j - 1], prev[j]) + 1; + curr[j] = min(curr[j], prev[j - 1] + (c1 == c2 ? 0 : 1)); } - prev.swap(cur); + prev.swap(curr); } return prev[m]; } diff --git a/base/uni_string_dfa.cpp b/base/uni_string_dfa.cpp new file mode 100644 index 0000000000..4106bb8717 --- /dev/null +++ b/base/uni_string_dfa.cpp @@ -0,0 +1,41 @@ +#include "base/uni_string_dfa.hpp" + +#include "base/assert.hpp" + +namespace strings +{ +// UniStringDFA::Iterator -------------------------------------------------------------------------- +UniStringDFA::Iterator::Iterator(UniString const & s) : m_s(s), m_pos(0), m_rejected(false) {} + +UniStringDFA::Iterator & UniStringDFA::Iterator::Move(UniChar c) +{ + if (Accepts()) + { + m_rejected = true; + return *this; + } + + if (Rejects()) + return *this; + + ASSERT_LESS(m_pos, m_s.size(), ()); + if (m_s[m_pos] != c) + { + m_rejected = true; + return *this; + } + + ++m_pos; + return *this; +} + +// UniStringDFA::UniStringDFA ---------------------------------------------------------------------- +UniStringDFA::UniStringDFA(UniString const & s) : m_s(s) {} + +UniStringDFA::UniStringDFA(string const & s): UniStringDFA(MakeUniString(s)) {} + +UniStringDFA::Iterator UniStringDFA::Begin() const +{ + return Iterator(m_s); +} +} // namespace strings diff --git a/base/uni_string_dfa.hpp b/base/uni_string_dfa.hpp new file mode 100644 index 0000000000..f7e75085a1 --- /dev/null +++ b/base/uni_string_dfa.hpp @@ -0,0 +1,39 @@ +#pragma once + +#include "base/logging.hpp" +#include "base/string_utils.hpp" + +#include "std/string.hpp" + +namespace strings +{ +class UniStringDFA +{ +public: + class Iterator + { + public: + Iterator & Move(UniChar c); + + inline bool Accepts() const { return !Rejects() && m_pos == m_s.size(); } + inline bool Rejects() const { return m_rejected; } + + private: + friend class UniStringDFA; + + Iterator(UniString const & s); + + UniString const & m_s; + size_t m_pos; + bool m_rejected; + }; + + explicit UniStringDFA(UniString const & s); + explicit UniStringDFA(string const & s); + + Iterator Begin() const; + +private: + UniString const m_s; +}; +} // namespace strings |