Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/mapsme/omim.git - Unnamed repository; edit this file 'description' to name the repository.
summary | refs | log | tree | commit | diff
path: root/base
diff options
context:
space:
mode:
author Yuri Gorshenin <y@maps.me> 2016-06-15 14:43:38 +0300
committer Yuri Gorshenin <y@maps.me> 2016-06-15 14:43:38 +0300
commit 90d01a652f231684cd48b1ea9bde382cf562c990 (patch)
tree e4107970cd7f3316deeae63166dd7ec418045d2d /base
parent 275f23853c854fef34bb19145d3f160ad85b6e58 (diff)
[base] Implemented empty-tokens-support to tokenizer.
Diffstat (limited to 'base')
-rw-r--r-- base/base_tests/string_utils_test.cpp 69
-rw-r--r-- base/string_utils.cpp 4
-rw-r--r-- base/string_utils.hpp 192
3 files changed, 188 insertions, 77 deletions
diff --git a/base/base_tests/string_utils_test.cpp b/base/base_tests/string_utils_test.cpp
index aa7e79074d..6e8c36a6b7 100644
--- a/base/base_tests/string_utils_test.cpp
+++ b/base/base_tests/string_utils_test.cpp
@@ -332,32 +332,47 @@ struct FunctorTester
size_t & m_index;
vector<string> const & m_tokens;
- explicit FunctorTester(size_t & counter, vector<string> const & tokens)
- : m_index(counter), m_tokens(tokens) {}
+ FunctorTester(size_t & counter, vector<string> const & tokens)
+ : m_index(counter), m_tokens(tokens)
+ {
+ }
+
void operator()(string const & s)
{
TEST_EQUAL(s, m_tokens[m_index++], ());
}
};
-void TestIter(string const & str, char const * delims, vector<string> const & tokens)
+void TestIter(string const & s, char const * delims, vector<string> const & tokens)
{
- strings::SimpleTokenizer it(str, delims);
+ strings::SimpleTokenizer it(s, delims);
for (size_t i = 0; i < tokens.size(); ++i)
{
- TEST_EQUAL(true, it, (str, delims, i));
- TEST_EQUAL(i == tokens.size() - 1, it.IsLast(), ());
- TEST_EQUAL(*it, tokens[i], (str, delims, i));
+ TEST_EQUAL(true, it, (s, delims, i));
+ TEST_EQUAL(*it, tokens[i], (s, delims, i));
++it;
}
- TEST_EQUAL(false, it, (str, delims));
+ TEST_EQUAL(false, it, (s, delims));
size_t counter = 0;
FunctorTester f = FunctorTester(counter, tokens);
- strings::Tokenize(str, delims, f);
+ strings::Tokenize(s, delims, f);
TEST_EQUAL(counter, tokens.size(), ());
}
+void TestIterWithEmptyTokens(string const & s, char const * delims, vector<string> const & tokens)
+{
+ strings::SimpleTokenizerWithEmptyTokens it(s, delims);
+
+ for (size_t i = 0; i < tokens.size(); ++i)
+ {
+ TEST_EQUAL(true, it, (s, delims, i));
+ TEST_EQUAL(*it, tokens[i], (s, delims, i));
+ ++it;
+ }
+ TEST_EQUAL(false, it, (s, delims));
+}
+
UNIT_TEST(SimpleTokenizer)
{
vector<string> tokens;
@@ -402,6 +417,42 @@ UNIT_TEST(SimpleTokenizer)
TEST_EQUAL(vector<string>(SimpleTokenizer(str, ","), SimpleTokenizer()),
(vector<string>{"a", "b", "c"}), ());
}
+
+ {
+ string const s = "";
+ vector<string> tokens = {""};
+ TestIterWithEmptyTokens(s, ",", tokens);
+ }
+
+ {
+ string const s = ",";
+ vector<string> tokens = {"", ""};
+ TestIterWithEmptyTokens(s, ",", tokens);
+ }
+
+ {
+ string const s = ",,";
+ vector<string> tokens = {"", "", ""};
+ TestIterWithEmptyTokens(s, ",", tokens);
+ }
+
+ {
+ string const s = "Hello, World!";
+ vector<string> tokens = {s};
+ TestIterWithEmptyTokens(s, "", tokens);
+ }
+
+ {
+ string const s = "Hello, World!";
+ vector<string> tokens = {"Hello", " World", ""};
+ TestIterWithEmptyTokens(s, ",!", tokens);
+ }
+
+ {
+ string const s = ",a,b,,c,d,";
+ vector<string> tokens = {"", "a", "b", "", "c", "d", ""};
+ TestIterWithEmptyTokens(s, ",", tokens);
+ }
}
UNIT_TEST(LastUniChar)
diff --git a/base/string_utils.cpp b/base/string_utils.cpp
index d5fe4d7c43..d36b33a853 100644
--- a/base/string_utils.cpp
+++ b/base/string_utils.cpp
@@ -16,9 +16,9 @@ bool UniString::IsEqualAscii(char const * s) const
return (size() == strlen(s) && equal(begin(), end(), s));
}
-SimpleDelimiter::SimpleDelimiter(char const * delimChars)
+SimpleDelimiter::SimpleDelimiter(char const * delims)
{
- string const s(delimChars);
+ string const s(delims);
string::const_iterator it = s.begin();
while (it != s.end())
m_delims.push_back(utf8::unchecked::next(it));
diff --git a/base/string_utils.hpp b/base/string_utils.hpp
index 85396b7f19..33b7af7d2d 100644
--- a/base/string_utils.hpp
+++ b/base/string_utils.hpp
@@ -89,82 +89,150 @@ bool IsASCIILatin(UniChar c);
inline string DebugPrint(UniString const & s) { return ToUtf8(s); }
-template <typename DelimFuncT, typename UniCharIterT = UniString::const_iterator>
+template <typename TDelimFn, typename TIt = UniString::const_iterator, bool KeepEmptyTokens = false>
class TokenizeIterator
{
- UniCharIterT m_beg, m_end, m_finish;
- DelimFuncT m_delimFunc;
+public:
+ using difference_type = std::ptrdiff_t;
+ using value_type = string;
+ using pointer = void;
+ using reference = string;
+ using iterator_category = std::input_iterator_tag;
- void move()
+ // *NOTE* |s| must be not temporary!
+ TokenizeIterator(string const & s, TDelimFn const & delimFn)
+ : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn)
{
- m_beg = m_end;
- while (m_beg != m_finish)
- {
- if (m_delimFunc(*m_beg))
- ++m_beg;
- else
- break;
- }
- m_end = m_beg;
- while (m_end != m_finish)
- {
- if (m_delimFunc(*m_end))
- break;
- else
- ++m_end;
- }
+ Move();
}
-public:
- /// @warning string S must be not temporary!
- TokenizeIterator(string const & s, DelimFuncT const & delimFunc)
- : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
+ // *NOTE* |s| must be not temporary!
+ TokenizeIterator(UniString const & s, TDelimFn const & delimFn)
+ : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn)
{
- move();
+ Move();
}
- /// @warning unistring S must be not temporary!
- TokenizeIterator(UniString const & s, DelimFuncT const & delimFunc)
- : m_beg(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFunc(delimFunc)
+ // Use default-constructed iterator for operator == to determine an
+ // end of a token stream.
+ TokenizeIterator() = default;
+
+ string operator*() const
{
- move();
+ ASSERT(m_start != m_finish, ("Dereferencing of empty iterator."));
+ return string(m_start.base(), m_end.base());
}
- /// Use default-constructed iterator for operator == to determine an end of a token stream.
- TokenizeIterator() = default;
+ UniString GetUniString() const
+ {
+ ASSERT(m_start != m_finish, ("Dereferencing of empty iterator."));
+ return UniString(m_start, m_end);
+ }
- /// Explicitly disabled, because we're storing iterators for string
- TokenizeIterator(char const *, DelimFuncT const &) = delete;
+ operator bool() const { return m_start != m_finish; }
- string operator*() const
+ TokenizeIterator & operator++()
{
- ASSERT(m_beg != m_finish, ("dereferencing of empty iterator"));
- return string(m_beg.base(), m_end.base());
+ Move();
+ return *this;
}
- operator bool() const { return m_beg != m_finish; }
-
- TokenizeIterator & operator++()
+ // Same as operator bool() in expression it == end(...).
+ bool operator==(TokenizeIterator const &) { return !(*this); }
+
+ // Same as operator bool() in expression it != end(...).
+ bool operator!=(TokenizeIterator const &) { return (*this); }
+
+private:
+ void Move()
{
- move();
- return (*this);
+ m_start = m_end;
+ while (m_start != m_finish && m_delimFn(*m_start))
+ ++m_start;
+
+ m_end = m_start;
+ while (m_end != m_finish && !m_delimFn(*m_end))
+ ++m_end;
}
- bool IsLast() const
+ TIt m_start;
+ TIt m_end;
+ TIt m_finish;
+ TDelimFn m_delimFn;
+};
+
+template <typename TDelimFn, typename TIt>
+class TokenizeIterator<TDelimFn, TIt, true /* KeepEmptyTokens */>
+{
+public:
+ using difference_type = std::ptrdiff_t;
+ using value_type = string;
+ using pointer = void;
+ using reference = string;
+ using iterator_category = std::input_iterator_tag;
+
+ // *NOTE* |s| must be not temporary!
+ TokenizeIterator(string const & s, TDelimFn const & delimFn)
+ : m_start(s.begin()), m_end(s.begin()), m_finish(s.end()), m_delimFn(delimFn), m_finished(false)
{
- if (!*this)
- return false;
+ while (m_end != m_finish && !m_delimFn(*m_end))
+ ++m_end;
+ }
- TokenizeIterator<DelimFuncT, UniCharIterT> copy(*this);
- ++copy;
- return !copy;
+ // Use default-constructed iterator for operator == to determine an
+ // end of a token stream.
+ TokenizeIterator() = default;
+
+ string operator*() const
+ {
+ ASSERT(!m_finished, ("Dereferencing of empty iterator."));
+ return string(m_start.base(), m_end.base());
+ }
+
+ UniString GetUniString() const
+ {
+ ASSERT(!m_finished, ("Dereferencing of empty iterator."));
+ return UniString(m_start, m_end);
}
- UniString GetUniString() const { return UniString(m_beg, m_end); }
- /// Same as operator bool() in expression it == end(...)
+ operator bool() const { return !m_finished; }
+
+ TokenizeIterator & operator++()
+ {
+ Move();
+ return *this;
+ }
+
+ // Same as operator bool() in expression it == end(...).
bool operator==(TokenizeIterator const &) { return !(*this); }
- /// Same as operator bool() in expression it != end(...)
+
+ // Same as operator bool() in expression it != end(...).
bool operator!=(TokenizeIterator const &) { return (*this); }
+
+private:
+ void Move()
+ {
+ if (m_end == m_finish)
+ {
+ ASSERT(!m_finished, ());
+ m_start = m_end = m_finish;
+ m_finished = true;
+ return;
+ }
+
+ m_start = m_end;
+ ++m_start;
+
+ m_end = m_start;
+ while (m_end != m_finish && !m_delimFn(*m_end))
+ ++m_end;
+ }
+
+ TIt m_start;
+ TIt m_end;
+ TIt m_finish;
+ TDelimFn m_delimFn;
+ bool m_finished;
};
class SimpleDelimiter
@@ -172,15 +240,20 @@ class SimpleDelimiter
UniString m_delims;
public:
- SimpleDelimiter(char const * delimChars);
+ SimpleDelimiter(char const * delims);
+
// Used in TokenizeIterator to allow past the end iterator construction.
SimpleDelimiter() = default;
/// @return true if c is delimiter
bool operator()(UniChar c) const;
};
-typedef TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>>
- SimpleTokenizer;
+using SimpleTokenizer =
+ TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>,
+ false /* KeepEmptyTokens */>;
+using SimpleTokenizerWithEmptyTokens =
+ TokenizeIterator<SimpleDelimiter, ::utf8::unchecked::iterator<string::const_iterator>,
+ true /* KeepEmptyTokens */>;
template <typename TFunctor>
void Tokenize(string const & str, char const * delims, TFunctor && f)
@@ -389,16 +462,3 @@ size_t EditDistance(TIter const & b1, TIter const & e1, TIter const & b2, TIter
return prev[m];
}
} // namespace strings
-
-namespace std
-{
-template <typename... Args>
-struct iterator_traits<strings::TokenizeIterator<Args...>>
-{
- using difference_type = std::ptrdiff_t;
- using value_type = string;
- using pointer = void;
- using reference = string;
- using iterator_category = std::input_iterator_tag;
-};
-} // namespace std