Tolerate odd characters in corpora.

author: Kenneth Heafield <github@kheafield.com> 2013-01-19 17:35:00 +0400
committer: Kenneth Heafield <github@kheafield.com> 2013-01-19 17:35:00 +0400
commit: a35299497fd9b0e289fd16bc08f22ac5af5a0d7c (patch)
tree: 5be768b39fc76482c8d38837e7da16a335c721db /lm
parent: 10d7f3654e9b15c7dd4f83de22c8b31b6b3d5aec (diff)
1 file changed, 2 insertions, 1 deletion
diff --git a/lm/builder/corpus_count.cc b/lm/builder/corpus_count.cc
index 8c3de57dd..abea4ed06 100644
--- a/lm/builder/corpus_count.cc
+++ b/lm/builder/corpus_count.cc
@@ -202,11 +202,12 @@ void CorpusCount::Run(const util::stream::ChainPosition &position) {
   const WordIndex end_sentence = vocab.Lookup("</s>");
   Writer writer(NGram::OrderFromSize(position.GetChain().EntrySize()), position, dedupe_mem_.get(), dedupe_mem_size_);
   uint64_t count = 0;
+  StringPiece delimiters("\0\t\r ", 4);
   try {
     while(true) {
       StringPiece line(from_.ReadLine());
       writer.StartSentence();
-      for (util::TokenIter<util::AnyCharacter, true> w(line, " \t"); w; ++w) {
+      for (util::TokenIter<util::AnyCharacter, true> w(line, delimiters); w; ++w) {
         WordIndex word = vocab.Lookup(*w);
         UTIL_THROW_IF(word <= 2, FormatLoadException, "Special word " << *w << " is not allowed in the corpus.  I plan to support models containing <unk> in the future.");
         writer.Append(word);
author	Kenneth Heafield <github@kheafield.com>	2013-01-19 17:35:00 +0400
committer	Kenneth Heafield <github@kheafield.com>	2013-01-19 17:35:00 +0400
commit	a35299497fd9b0e289fd16bc08f22ac5af5a0d7c (patch)
tree	5be768b39fc76482c8d38837e7da16a335c721db /lm
parent	10d7f3654e9b15c7dd4f83de22c8b31b6b3d5aec (diff)