Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/moses-smt/mosesdecoder.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/util
diff options
context:
space:
mode:
authorJeroen Vermeulen <jtv@precisiontranslationtools.com>2015-04-22 06:18:02 +0300
committerJeroen Vermeulen <jtv@precisiontranslationtools.com>2015-04-22 06:18:02 +0300
commit10a0a7b05a190c10af9a926160fed06915691af5 (patch)
tree07c01727ca442dd213364c203253350d95195a21 /util
parentb2d821a14176d37467375e671ebe9fc83260e9a8 (diff)
Add new files.
Oops. Forgot these in my previous commit. Sorry!
Diffstat (limited to 'util')
-rw-r--r--util/tokenize.hh42
-rw-r--r--util/tokenize_test.cc69
2 files changed, 111 insertions, 0 deletions
diff --git a/util/tokenize.hh b/util/tokenize.hh
new file mode 100644
index 000000000..f4f3289bc
--- /dev/null
+++ b/util/tokenize.hh
@@ -0,0 +1,42 @@
+#ifndef TOKENIZE_H
+#define TOKENIZE_H
+
+#include <string>
+#include <vector>
+
+namespace util
+{
+
/** Split input text into a series of tokens.
 *
 * Splits on spaces and tabs only — no other whitespace characters — and is
 * not locale-sensitive.
 *
 * The separator characters themselves are not included.  A run of
 * consecutive space/tab characters counts as a single separator, so the
 * result never contains empty tokens.
 *
 * @param input NUL-terminated C string to split; must not be null.
 * @return Tokens in order of appearance; empty if input is empty or
 *         consists entirely of spaces/tabs.
 */
inline std::vector<std::string> tokenize(const char input[])
{
  std::vector<std::string> token;
  bool betweenWords = true;
  // Unsigned index type: a plain int would overflow (undefined behaviour)
  // on inputs longer than INT_MAX characters.
  std::string::size_type start = 0;
  std::string::size_type i;
  for(i = 0; input[i] != '\0'; i++) {
    const bool isSpace = (input[i] == ' ' || input[i] == '\t');

    if (!isSpace && betweenWords) {
      // First character of a new token: remember where it starts.
      start = i;
      betweenWords = false;
    } else if (isSpace && !betweenWords) {
      // Token just ended: copy the half-open range [start, i).
      token.push_back( std::string( input+start, i-start ) );
      betweenWords = true;
    }
  }
  // Flush a final token that runs up to the end of the string.
  if (!betweenWords)
    token.push_back( std::string( input+start, i-start ) );
  return token;
}
+
+} // namespace util
+
+#endif
diff --git a/util/tokenize_test.cc b/util/tokenize_test.cc
new file mode 100644
index 000000000..d879fa97f
--- /dev/null
+++ b/util/tokenize_test.cc
@@ -0,0 +1,69 @@
+#include "util/tokenize.hh"
+
+#define BOOST_TEST_MODULE TokenizeTest
+#include <boost/test/unit_test.hpp>
+
+namespace util
+{
+namespace
+{
+
+BOOST_AUTO_TEST_CASE(empty_text_yields_empty_vector)
+{
+ const std::vector<std::string> tokens = util::tokenize("");
+ BOOST_CHECK_EQUAL(tokens.size(), 0);
+}
+
+BOOST_AUTO_TEST_CASE(whitespace_only_yields_empty_vector)
+{
+ const std::vector<std::string> tokens = util::tokenize(" ");
+ BOOST_CHECK_EQUAL(tokens.size(), 0);
+}
+
+BOOST_AUTO_TEST_CASE(parses_single_token)
+{
+ const std::vector<std::string> tokens = util::tokenize("mytoken");
+ BOOST_CHECK_EQUAL(tokens.size(), 1);
+ BOOST_CHECK_EQUAL(tokens[0], "mytoken");
+}
+
+BOOST_AUTO_TEST_CASE(ignores_leading_whitespace)
+{
+ const std::vector<std::string> tokens = util::tokenize(" \t mytoken");
+ BOOST_CHECK_EQUAL(tokens.size(), 1);
+ BOOST_CHECK_EQUAL(tokens[0], "mytoken");
+}
+
+BOOST_AUTO_TEST_CASE(ignores_trailing_whitespace)
+{
+ const std::vector<std::string> tokens = util::tokenize("mytoken \t ");
+ BOOST_CHECK_EQUAL(tokens.size(), 1);
+ BOOST_CHECK_EQUAL(tokens[0], "mytoken");
+}
+
+BOOST_AUTO_TEST_CASE(splits_tokens_on_tabs)
+{
+ const std::vector<std::string> tokens = util::tokenize("one\ttwo");
+ BOOST_CHECK_EQUAL(tokens.size(), 2);
+ BOOST_CHECK_EQUAL(tokens[0], "one");
+ BOOST_CHECK_EQUAL(tokens[1], "two");
+}
+
+BOOST_AUTO_TEST_CASE(splits_tokens_on_spaces)
+{
+ const std::vector<std::string> tokens = util::tokenize("one two");
+ BOOST_CHECK_EQUAL(tokens.size(), 2);
+ BOOST_CHECK_EQUAL(tokens[0], "one");
+ BOOST_CHECK_EQUAL(tokens[1], "two");
+}
+
+BOOST_AUTO_TEST_CASE(treats_sequence_of_space_as_one_space)
+{
+ const std::vector<std::string> tokens = util::tokenize("one\t \ttwo");
+ BOOST_CHECK_EQUAL(tokens.size(), 2);
+ BOOST_CHECK_EQUAL(tokens[0], "one");
+ BOOST_CHECK_EQUAL(tokens[1], "two");
+}
+
+} // namespace
+} // namespace util