Initialize repository

author: Taku Kudo <taku@google.com> 2017-03-07 13:43:50 +0300
committer: Taku Kudo <taku@google.com> 2017-03-07 13:43:50 +0300
commit: 2928ce5307224ea4c012fc6cbd7a098c486590b6 (patch)
tree: 38b679886855a7a6b80fdc61f2f62c952cf3bfb7 /src/unigram_model_trainer_test.cc
1 files changed, 75 insertions, 0 deletions
diff --git a/src/unigram_model_trainer_test.cc b/src/unigram_model_trainer_test.cc
new file mode 100644
index 0000000..ba86bfa
--- /dev/null
+++ b/src/unigram_model_trainer_test.cc
@@ -0,0 +1,75 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.!
+
+#include "unigram_model_trainer.h"
+#include "builder.h"
+#include "normalizer.h"
+#include "sentencepiece_model.pb.h"
+#include "sentencepiece_processor.h"
+#include "testharness.h"
+#include "util.h"
+
+namespace sentencepiece {
+namespace unigram {
+
+// Space symbol
+#define WS "\xe2\x96\x81"
+
+TEST(UnigramTrainerTest, EndToEndTest) {
+  TrainerSpec trainer_spec;
+  NormalizerSpec normalizer_spec;
+  normalizer_spec = normalizer::Builder::GetNormalizerSpec("nfkc");
+  trainer_spec.add_input("../data/wagahaiwa_nekodearu.txt");
+
+  constexpr int kVocabSize = 8000;
+  trainer_spec.set_vocab_size(kVocabSize);
+  trainer_spec.set_model_type(TrainerSpec::UNIGRAM);
+
+  trainer_spec.add_control_symbols("<ctrl>");
+  trainer_spec.add_user_defined_symbols("<user>");
+
+  test::ScopedTempFile sf("tmp_model");
+  trainer_spec.set_model_prefix(sf.filename());
+  unigram::Trainer trainer(trainer_spec, normalizer_spec);
+  trainer.Train();
+
+  SentencePieceProcessor sp;
+  EXPECT_TRUE(sp.Load(std::string(sf.filename()) + ".model"));
+  EXPECT_EQ(kVocabSize, sp.GetPieceSize());
+
+  const int cid = sp.PieceToId("<ctrl>");
+  const int uid = sp.PieceToId("<user>");
+  EXPECT_TRUE(sp.IsControl(cid));
+  EXPECT_FALSE(sp.IsUnknown(uid));
+
+  std::vector<std::string> tok;
+
+  sp.Encode("", &tok);
+  EXPECT_TRUE(tok.empty());
+
+  sp.Encode(
+      "吾輩《わがはい》は猫である。名前はまだ無い。"
+      "どこで生れたかとんと見当《けんとう》がつかぬ。"
+      "何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している"
+      "。",
+      &tok);
+  EXPECT_EQ(WS
+            " 吾輩 《 わが はい 》 は 猫 である 。 名前 はまだ 無い 。 "
+            "どこ で 生 れた か とん と 見当 《 けん とう 》 が つか ぬ 。 "
+            "何でも 薄 暗 い じめ じめ した 所で ニャーニャー "
+            "泣 い ていた 事 だけは 記憶 している 。",
+            string_util::Join(tok, " "));
+}
+}  // namespace unigram
+}  // namespace sentencepiece
author	Taku Kudo <taku@google.com>	2017-03-07 13:43:50 +0300
committer	Taku Kudo <taku@google.com>	2017-03-07 13:43:50 +0300
commit	2928ce5307224ea4c012fc6cbd7a098c486590b6 (patch)
tree	38b679886855a7a6b80fdc61f2f62c952cf3bfb7 /src/unigram_model_trainer_test.cc