diff options
author | Jeroen Vermeulen <jtv@precisiontranslationtools.com> | 2015-04-22 06:18:02 +0300 |
---|---|---|
committer | Jeroen Vermeulen <jtv@precisiontranslationtools.com> | 2015-04-22 06:18:02 +0300 |
commit | 10a0a7b05a190c10af9a926160fed06915691af5 (patch) | |
tree | 07c01727ca442dd213364c203253350d95195a21 /util | |
parent | b2d821a14176d37467375e671ebe9fc83260e9a8 (diff) |
Add new files.
Oops. Forgot these in my previous commit. Sorry!
Diffstat (limited to 'util')
-rw-r--r-- | util/tokenize.hh | 42 | ||||
-rw-r--r-- | util/tokenize_test.cc | 69 |
2 files changed, 111 insertions, 0 deletions
diff --git a/util/tokenize.hh b/util/tokenize.hh new file mode 100644 index 000000000..f4f3289bc --- /dev/null +++ b/util/tokenize.hh @@ -0,0 +1,42 @@ +#ifndef TOKENIZE_H +#define TOKENIZE_H + +#include <string> +#include <vector> + +namespace util +{ + +/** Split input text into a series of tokens. + * + * Splits on spaces and tabs, no other whitespace characters, and is not + * locale-sensitive. + * + * The spaces themselves are not included. A sequence of consecutive space/tab + * characters count as one. + */ +inline std::vector<std::string> tokenize(const char input[]) +{ + std::vector<std::string> token; + bool betweenWords = true; + int start = 0; + int i; + for(i = 0; input[i] != '\0'; i++) { + const bool isSpace = (input[i] == ' ' || input[i] == '\t'); + + if (!isSpace && betweenWords) { + start = i; + betweenWords = false; + } else if (isSpace && !betweenWords) { + token.push_back( std::string( input+start, i-start ) ); + betweenWords = true; + } + } + if (!betweenWords) + token.push_back( std::string( input+start, i-start ) ); + return token; +} + +} // namespace util + +#endif diff --git a/util/tokenize_test.cc b/util/tokenize_test.cc new file mode 100644 index 000000000..d879fa97f --- /dev/null +++ b/util/tokenize_test.cc @@ -0,0 +1,69 @@ +#include "util/tokenize.hh" + +#define BOOST_TEST_MODULE TokenizeTest +#include <boost/test/unit_test.hpp> + +namespace util +{ +namespace +{ + +BOOST_AUTO_TEST_CASE(empty_text_yields_empty_vector) +{ + const std::vector<std::string> tokens = util::tokenize(""); + BOOST_CHECK_EQUAL(tokens.size(), 0); +} + +BOOST_AUTO_TEST_CASE(whitespace_only_yields_empty_vector) +{ + const std::vector<std::string> tokens = util::tokenize(" "); + BOOST_CHECK_EQUAL(tokens.size(), 0); +} + +BOOST_AUTO_TEST_CASE(parses_single_token) +{ + const std::vector<std::string> tokens = util::tokenize("mytoken"); + BOOST_CHECK_EQUAL(tokens.size(), 1); + BOOST_CHECK_EQUAL(tokens[0], "mytoken"); +} + +BOOST_AUTO_TEST_CASE(ignores_leading_whitespace) +{ + const std::vector<std::string> tokens = util::tokenize(" \t mytoken"); + BOOST_CHECK_EQUAL(tokens.size(), 1); + BOOST_CHECK_EQUAL(tokens[0], "mytoken"); +} + +BOOST_AUTO_TEST_CASE(ignores_trailing_whitespace) +{ + const std::vector<std::string> tokens = util::tokenize("mytoken \t "); + BOOST_CHECK_EQUAL(tokens.size(), 1); + BOOST_CHECK_EQUAL(tokens[0], "mytoken"); +} + +BOOST_AUTO_TEST_CASE(splits_tokens_on_tabs) +{ + const std::vector<std::string> tokens = util::tokenize("one\ttwo"); + BOOST_CHECK_EQUAL(tokens.size(), 2); + BOOST_CHECK_EQUAL(tokens[0], "one"); + BOOST_CHECK_EQUAL(tokens[1], "two"); +} + +BOOST_AUTO_TEST_CASE(splits_tokens_on_spaces) +{ + const std::vector<std::string> tokens = util::tokenize("one two"); + BOOST_CHECK_EQUAL(tokens.size(), 2); + BOOST_CHECK_EQUAL(tokens[0], "one"); + BOOST_CHECK_EQUAL(tokens[1], "two"); +} + +BOOST_AUTO_TEST_CASE(treats_sequence_of_space_as_one_space) +{ + const std::vector<std::string> tokens = util::tokenize("one\t \ttwo"); + BOOST_CHECK_EQUAL(tokens.size(), 2); + BOOST_CHECK_EQUAL(tokens[0], "one"); + BOOST_CHECK_EQUAL(tokens[1], "two"); +} + +} // namespace +} // namespace util |