Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summary refs log tree commit diff
diff options
context:
space:
mode:
author Taku Kudo <taku@google.com> 2017-03-07 13:43:50 +0300
committer Taku Kudo <taku@google.com> 2017-03-07 13:43:50 +0300
commit 2928ce5307224ea4c012fc6cbd7a098c486590b6 (patch)
tree 38b679886855a7a6b80fdc61f2f62c952cf3bfb7 /src/bpe_model_trainer_test.cc
Initialize repository
Diffstat (limited to 'src/bpe_model_trainer_test.cc')
-rw-r--r-- src/bpe_model_trainer_test.cc 122
1 files changed, 122 insertions, 0 deletions
diff --git a/src/bpe_model_trainer_test.cc b/src/bpe_model_trainer_test.cc
new file mode 100644
index 0000000..3ba99e5
--- /dev/null
+++ b/src/bpe_model_trainer_test.cc
@@ -0,0 +1,122 @@
+// Copyright 2016 Google Inc.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "bpe_model_trainer.h"
+
+#include "builder.h"
+#include "sentencepiece_processor.h"
+#include "testharness.h"
+#include "util.h"
+
+namespace sentencepiece {
+namespace bpe {
+
+// Space symbol: U+2581 written as its UTF-8 byte sequence.
+#define WS "\xe2\x96\x81"
+
+namespace {
+std::string RunTrainer(const std::vector<std::string> &input, int size) {  // Trains a BPE model on the `input` lines and returns the learned pieces joined by " ".
+ test::ScopedTempFile input_scoped_file("input");  // temp file that receives the training text
+ test::ScopedTempFile model_scoped_file("model");  // temp file whose name is used as the model prefix
+ const std::string input_file = input_scoped_file.filename();
+ const std::string model_prefix = model_scoped_file.filename();
+ {  // inner scope: `output` is destroyed before training starts (presumably flushing the file -- TODO confirm)
+ io::OutputBuffer output(input_file);
+ for (const auto &line : input) {
+ output.WriteLine(line);
+ }
+ }
+ // Configure a BPE training run that reads the temp input file.
+ TrainerSpec trainer_spec;
+ trainer_spec.set_model_type(TrainerSpec::BPE);
+ trainer_spec.add_input(input_file);
+ trainer_spec.set_vocab_size(size - 3); // remove <unk>, <s>, </s>
+ trainer_spec.set_model_prefix(model_prefix);
+ // Use the "identity" normalizer spec with the dummy prefix disabled.
+ auto normalizer_spec = normalizer::Builder::GetNormalizerSpec("identity");
+ normalizer_spec.set_add_dummy_prefix(false);
+ // Run training; the result is loaded from model_prefix + ".model" below.
+ Trainer trainer(trainer_spec, normalizer_spec);
+ trainer.Train();
+ // NOTE(review): the result of Load() is not checked here, unlike in EndToEndTest.
+ SentencePieceProcessor processor;
+ processor.Load(model_prefix + ".model");
+
+ const auto &model = processor.model_proto();
+ std::vector<std::string> pieces;
+
+ // Skip the first three pieces (<unk>, <s>, </s>) and collect the rest in model order.
+ for (int i = 3; i < model.pieces_size(); ++i) {
+ pieces.emplace_back(model.pieces(i).piece());
+ }
+ // Return the collected pieces as one space-separated string.
+ return string_util::Join(pieces, " ");
+}
+} // namespace
+
+TEST(BPETrainerTest, BasicTest) {  // Pins the exact piece sequence RunTrainer returns for tiny corpora with size = 20.
+ EXPECT_EQ("ab ra abra ad cad abracad abracadabra ac br a b r c d",
+ RunTrainer({"abracadabra"}, 20));  // one-line corpus
+ EXPECT_EQ("ap le app apple en in ine pen p e a l n i",
+ RunTrainer({"pen", "pineapple", "apple"}, 20));  // three-line corpus
+ EXPECT_EQ("he ll llo hello hellohe el lo oh hel ohe e h l o",
+ RunTrainer({"hellohe"}, 20));  // one-line corpus
+}
+
+TEST(BPETrainerTest, EndToEndTest) {  // Trains on a corpus file, reloads the model, and checks size, control symbol, and a sample encoding.
+ TrainerSpec trainer_spec;
+ NormalizerSpec normalizer_spec;
+ normalizer_spec = normalizer::Builder::GetNormalizerSpec("nfkc");
+ trainer_spec.add_input("../data/wagahaiwa_nekodearu.txt");  // relative path: depends on the directory the test runs from
+ // Request a BPE model with kVocabSize pieces.
+ constexpr int kVocabSize = 8000;
+ trainer_spec.set_vocab_size(kVocabSize);
+ trainer_spec.set_model_type(TrainerSpec::BPE);
+ // Register "<ctrl>" as a control symbol; the user-defined symbol case is commented out.
+ trainer_spec.add_control_symbols("<ctrl>");
+ // trainer_spec.add_user_defined_symbols("<user>");
+ // Train using a temporary file name as the model prefix.
+ test::ScopedTempFile sf("tmp_model");
+ trainer_spec.set_model_prefix(sf.filename());
+ bpe::Trainer trainer(trainer_spec, normalizer_spec);
+ trainer.Train();
+ // The trained model must load and report exactly kVocabSize pieces.
+ SentencePieceProcessor sp;
+ EXPECT_TRUE(sp.Load(std::string(sf.filename()) + ".model"));
+ EXPECT_EQ(kVocabSize, sp.GetPieceSize());
+ // "<ctrl>" must map to an id that is flagged as a control piece.
+ const int cid = sp.PieceToId("<ctrl>");
+ // const int uid = sp.PieceToId("<user>");
+ EXPECT_TRUE(sp.IsControl(cid));
+ // EXPECT_FALSE(sp.IsUnknown(uid));
+ // Encoding an empty string must produce no tokens.
+ std::vector<std::string> tok;
+ sp.Encode("", &tok);
+ EXPECT_TRUE(tok.empty());
+ // Encode a Japanese sample and compare the space-joined pieces with the expected segmentation.
+ sp.Encode(
+ "吾輩《わがはい》は猫である。名前はまだ無い。"
+ "どこで生れたかとんと見当《けんとう》がつかぬ。"
+ "何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している"
+ "。",
+ &tok);
+ EXPECT_EQ(WS
+ " 吾輩 《 わが はい 》 は猫 である 。 名前 はまだ 無い 。 "
+ "どこで 生 れた か とん と見 当 《 けんとう 》 が つかぬ 。 "
+ "何でも 薄 暗 いじ め じ め した 所で ニャー ニャー 泣 いていた "
+ "事 だけは 記憶 している 。",
+ string_util::Join(tok, " "));
+}
+} // namespace bpe
+} // namespace sentencepiece