diff options
author | Yuri Gorshenin <y@maps.me> | 2016-06-15 14:43:38 +0300 |
---|---|---|
committer | Yuri Gorshenin <y@maps.me> | 2016-06-15 14:43:38 +0300 |
commit | 90d01a652f231684cd48b1ea9bde382cf562c990 (patch) | |
tree | e4107970cd7f3316deeae63166dd7ec418045d2d /base | |
parent | 275f23853c854fef34bb19145d3f160ad85b6e58 (diff) |
[base] Implemented empty-tokens-support to tokenizer.
Diffstat (limited to 'base')
-rw-r--r-- | base/base_tests/string_utils_test.cpp | 69 | ||||
-rw-r--r-- | base/string_utils.cpp | 4 | ||||
-rw-r--r-- | base/string_utils.hpp | 192 |
3 files changed, 188 insertions, 77 deletions
diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp index aa7e79074d..6e8c36a6b7 100644 --- a/base/base_tests/string_utils_test.cpp +++ b/base/base_tests/string_utils_test.cpp @@ -332,32 +332,47 @@ struct FunctorTester size_t & m_index; vector<string> const & m_tokens; - explicit FunctorTester(size_t & counter, vector<string> const & tokens) - : m_index(counter), m_tokens(tokens) {} + FunctorTester(size_t & counter, vector<string> const & tokens) + : m_index(counter), m_tokens(tokens) + { + } + void operator()(string const & s) { TEST_EQUAL(s, m_tokens[m_index++], ()); } }; -void TestIter(string const & str, char const * delims, vector<string> const & tokens) +void TestIter(string const & s, char const * delims, vector<string> const & tokens) { - strings::SimpleTokenizer it(str, delims); + strings::SimpleTokenizer it(s, delims); for (size_t i = 0; i < tokens.size(); ++i) { - TEST_EQUAL(true, it, (str, delims, i)); - TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ()); - TEST_EQUAL(*it, tokens[i], (str, delims, i)); + TEST_EQUAL(true, it, (s, delims, i)); + TEST_EQUAL(*it, tokens[i], (s, delims, i)); ++it; } - TEST_EQUAL(false, it, (str, delims)); + TEST_EQUAL(false, it, (s, delims)); size_t counter = 0; FunctorTester f = FunctorTester(counter, tokens); - strings::Tokenize(str, delims, f); + strings::Tokenize(s, delims, f); TEST_EQUAL(counter, tokens.size(), ()); } +void TestIterWithEmptyTokens(string const & s, char const * delims, vector<string> const & tokens) +{ + strings::SimpleTokenizerWithEmptyTokens it(s, delims); + + for (size_t i = 0; i < tokens.size(); ++i) + { + TEST_EQUAL(true, it, (s, delims, i)); + TEST_EQUAL(*it, tokens[i], (s, delims, i)); + ++it; + } + TEST_EQUAL(false, it, (s, delims)); +} + UNIT_TEST(SimpleTokenizer) { vector<string> tokens; @@ -402,6 +417,42 @@ UNIT_TEST(SimpleTokenizer) TEST_EQUAL(vector<string>(SimpleTokenizer(str, ","), SimpleTokenizer()), (vector<string>{"a", "b", "c"}), ()); } + + { + string const s = ""; + vector<string> tokens = {""}; + TestIterWithEmptyTokens(s, ",", tokens); + } + + { + string const s = ","; + vector<string> tokens = {"", ""}; + TestIterWithEmptyTokens(s, ",", tokens); + } + + { + string const s = ",,"; + vector<string> tokens = {"", "", ""}; + TestIterWithEmptyTokens(s, ",", tokens); + } + + { + string const s = "Hello, World!"; + vector<string> tokens = {s}; + TestIterWithEmptyTokens(s, "", tokens); + } + + { + string const s = "Hello, World!"; + vector<string> tokens = {"Hello", " World", ""}; + TestIterWithEmptyTokens(s, ",!", tokens); + } + + { + string const s = ",a,b,,c,d,"; + vector<string> tokens = {"", "a", "b", "", "c", "d", ""}; + TestIterWithEmptyTokens(s, ",", tokens); + } } UNIT_TEST(LastUniChar) diff --git a/base/string_utils.cpp b/base/string_utils.cpp index d5fe4d7c43..d36b33a853 100644 --- a/base/string_utils.cpp +++ b/base/string_utils.cpp @@ -16,9 +16,9 @@ bool UniString::IsEqualAscii(char const * s) const return (size() == strlen(s) && equal(begin(), end(), s)); } -SimpleDelimiter::SimpleDelimiter(char const * delimChars) +SimpleDelimiter::SimpleDelimiter(char const * delims) { - string const s(delimChars); + string const s(delims); string::const_iterator it = s.begin(); while (it != s.end()) m_delims.push_back(utf8::unchecked::next(it)); diff --git a/base/string_utils.hpp b/base/string_utils.hpp index 85396b7f19..33b7af7d2d 100644 --- a/base/string_utils.hpp +++ b/base/string_utils.hpp @@ -89,82 +89,150 @@ bool IsASCIILatin(UniChar c); inline string DebugPrint(UniString const & s) { return ToUtf8(s); } -template <typename DelimFuncT, typename UniCharIterT = UniString::const_iterator> +template <typename TDelimFn, typename TIt = UniString::const_iterator, bool KeepEmptyTokens = false> class TokenizeIterator { - UniCharIterT m_beg, m_end, m_finish; - DelimFuncT m_delimFunc; +public: + using difference_type = std::ptrdiff_t; + using value_type = string; + using pointer = void; + using reference = string; + using iterator_category = std::input_iterator_tag; - void move() + // *NOTE* |s| must be not temporary! + TokenizeIterator(string const & s, TDelimFn const & delimFn) + : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn) { - m_beg = m_end; - while (m_beg != m_finish) - { - if (m_delimFunc(*m_beg)) - ++m_beg; - else - break; - } - m_end = m_beg; - while (m_end != m_finish) - { - if (m_delimFunc(*m_end)) - break; - else - ++m_end; - } + Move(); } -public: - /// @warning string S must be not temporary! - TokenizeIterator(string const & s, DelimFuncT const & delimFunc) - : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc) + // *NOTE* |s| must be not temporary! + TokenizeIterator(UniString const & s, TDelimFn const & delimFn) + : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn) { - move(); + Move(); } - /// @warning unistring S must be not temporary! - TokenizeIterator(UniString const & s, DelimFuncT const & delimFunc) - : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc) + // Use default-constructed iterator for operator == to determine an + // end of a token stream. + TokenizeIterator() = default; + + string operator*() const { - move(); + ASSERT(m_start != m_finish, ("Dereferencing of empty iterator.")); + return string(m_start.base(), m_end.base()); } - /// Use default-constructed iterator for operator == to determine an end of a token stream. - TokenizeIterator() = default; + UniString GetUniString() const + { + ASSERT(m_start != m_finish, ("Dereferencing of empty iterator.")); + return UniString(m_start, m_end); + } - /// Explicitly disabled, because we're storing iterators for string - TokenizeIterator(char const *, DelimFuncT const &) = delete; + operator bool() const { return m_start != m_finish; } - string operator*() const + TokenizeIterator & operator++() { - ASSERT(m_beg != m_finish, ("dereferencing of empty iterator")); - return string(m_beg.base(), m_end.base()); + Move(); + return *this; } - operator bool() const { return m_beg != m_finish; } - - TokenizeIterator & operator++() + // Same as operator bool() in expression it == end(...). + bool operator==(TokenizeIterator const &) { return !(*this); } + + // Same as operator bool() in expression it != end(...). + bool operator!=(TokenizeIterator const &) { return (*this); } + +private: + void Move() { - move(); - return (*this); + m_start = m_end; + while (m_start != m_finish && m_delimFn(*m_start)) + ++m_start; + + m_end = m_start; + while (m_end != m_finish && !m_delimFn(*m_end)) + ++m_end; } - bool IsLast() const + TIt m_start; + TIt m_end; + TIt m_finish; + TDelimFn m_delimFn; +}; + +template <typename TDelimFn, typename TIt> +class TokenizeIterator<TDelimFn, TIt, true /* KeepEmptyTokens */> +{ +public: + using difference_type = std::ptrdiff_t; + using value_type = string; + using pointer = void; + using reference = string; + using iterator_category = std::input_iterator_tag; + + // *NOTE* |s| must be not temporary! + TokenizeIterator(string const & s, TDelimFn const & delimFn) + : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn), m_finished(false) { - if (!*this) - return false; + while (m_end != m_finish && !m_delimFn(*m_end)) + ++m_end; + } - TokenizeIterator<DelimFuncT, UniCharIterT> copy(*this); - ++copy; - return !copy; + // Use default-constructed iterator for operator == to determine an + // end of a token stream. + TokenizeIterator() = default; + + string operator*() const + { + ASSERT(!m_finished, ("Dereferencing of empty iterator.")); + return string(m_start.base(), m_end.base()); + } + + UniString GetUniString() const + { + ASSERT(!m_finished, ("Dereferencing of empty iterator.")); + return UniString(m_start, m_end); } - UniString GetUniString() const { return UniString(m_beg, m_end); } - /// Same as operator bool() in expression it == end(...) + operator bool() const { return !m_finished; } + + TokenizeIterator & operator++() + { + Move(); + return *this; + } + + // Same as operator bool() in expression it == end(...). bool operator==(TokenizeIterator const &) { return !(*this); } - /// Same as operator bool() in expression it != end(...) + + // Same as operator bool() in expression it != end(...). bool operator!=(TokenizeIterator const &) { return (*this); } + +private: + void Move() + { + if (m_end == m_finish) + { + ASSERT(!m_finished, ()); + m_start = m_end = m_finish; + m_finished = true; + return; + } + + m_start = m_end; + ++m_start; + + m_end = m_start; + while (m_end != m_finish && !m_delimFn(*m_end)) + ++m_end; + } + + TIt m_start; + TIt m_end; + TIt m_finish; + TDelimFn m_delimFn; + bool m_finished; }; class SimpleDelimiter @@ -172,15 +240,20 @@ class SimpleDelimiter UniString m_delims; public: - SimpleDelimiter(char const * delimChars); + SimpleDelimiter(char const * delims); + // Used in TokenizeIterator to allow past the end iterator construction. SimpleDelimiter() = default; /// @return true if c is delimiter bool operator()(UniChar c) const; }; -typedef TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>> - SimpleTokenizer; +using SimpleTokenizer = + TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>, + false /* KeepEmptyTokens */>; +using SimpleTokenizerWithEmptyTokens = + TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>, + true /* KeepEmptyTokens */>; template <typename TFunctor> void Tokenize(string const & str, char const * delims, TFunctor && f) @@ -389,16 +462,3 @@ size_t EditDistance(TIter const & b1, TIter const & e1, TIter const & b2, TIter return prev[m]; } } // namespace strings - -namespace std -{ -template <typename... Args> -struct iterator_traits<strings::TokenizeIterator<Args...>> -{ - using difference_type = std::ptrdiff_t; - using value_type = string; - using pointer = void; - using reference = string; - using iterator_category = std::input_iterator_tag; -}; -} // namespace std |