diff options
author | Taku Kudo <taku@google.com> | 2018-06-04 19:44:15 +0300 |
---|---|---|
committer | Taku Kudo <taku@google.com> | 2018-06-04 19:44:15 +0300 |
commit | 2ff4267b91998297c02a9762051d75c0fb1c3011 (patch) | |
tree | b1821b84ce650828ff05d8d00090a8e97ba65d8c /src/bpe_model_trainer_test.cc | |
parent | 506aa4595ad3c819d800dc52187832033f104bc7 (diff) |
Minor style fixes
Diffstat (limited to 'src/bpe_model_trainer_test.cc')
-rw-r--r-- | src/bpe_model_trainer_test.cc | 44 |
1 files changed, 19 insertions, 25 deletions
diff --git a/src/bpe_model_trainer_test.cc b/src/bpe_model_trainer_test.cc index 56a44d6..222dd11 100644 --- a/src/bpe_model_trainer_test.cc +++ b/src/bpe_model_trainer_test.cc @@ -14,17 +14,20 @@ #include "bpe_model_trainer.h" +#include <string> +#include <vector> #include "sentencepiece_processor.h" +#include "sentencepiece_trainer.h" #include "testharness.h" #include "util.h" namespace sentencepiece { namespace bpe { +namespace { // Space symbol #define WS "\xe2\x96\x81" -namespace { std::string RunTrainer(const std::vector<std::string> &input, int size) { test::ScopedTempFile input_scoped_file("input"); test::ScopedTempFile model_scoped_file("model"); @@ -48,7 +51,7 @@ std::string RunTrainer(const std::vector<std::string> &input, int size) { normalizer_spec.set_add_dummy_prefix(false); Trainer trainer(trainer_spec, normalizer_spec); - trainer.Train(); + EXPECT_OK(trainer.Train()); SentencePieceProcessor processor; EXPECT_OK(processor.Load(model_prefix + ".model")); @@ -63,7 +66,6 @@ std::string RunTrainer(const std::vector<std::string> &input, int size) { return string_util::Join(pieces, " "); } -} // namespace TEST(BPETrainerTest, BasicTest) { EXPECT_EQ("ab ra abra ad cad abracad abracadabra ac br a b r c d", @@ -75,43 +77,33 @@ TEST(BPETrainerTest, BasicTest) { } TEST(BPETrainerTest, EndToEndTest) { - TrainerSpec trainer_spec; - trainer_spec.add_input("../data/wagahaiwa_nekodearu.txt"); + const test::ScopedTempFile sf("tmp_model"); - NormalizerSpec normalizer_spec; - normalizer_spec.set_name("nfkc"); - - constexpr int kVocabSize = 8000; - trainer_spec.set_vocab_size(kVocabSize); - trainer_spec.set_model_type(TrainerSpec::BPE); - - trainer_spec.add_control_symbols("<ctrl>"); - // trainer_spec.add_user_defined_symbols("<user>"); - - test::ScopedTempFile sf("tmp_model"); - trainer_spec.set_model_prefix(sf.filename()); - bpe::Trainer trainer(trainer_spec, normalizer_spec); - trainer.Train(); + EXPECT_OK(SentencePieceTrainer::Train( + std::string("--model_prefix=") + sf.filename() + + " --input=../data/wagahaiwa_nekodearu.txt" + " --vocab_size=8000" + " --normalization_rule_name=identity" + " --model_type=bpe" + " --control_symbols=<ctrl>")); SentencePieceProcessor sp; EXPECT_OK(sp.Load(std::string(sf.filename()) + ".model")); - EXPECT_EQ(kVocabSize, sp.GetPieceSize()); + EXPECT_EQ(8000, sp.GetPieceSize()); const int cid = sp.PieceToId("<ctrl>"); - // const int uid = sp.PieceToId("<user>"); EXPECT_TRUE(sp.IsControl(cid)); - // EXPECT_FALSE(sp.IsUnknown(uid)); std::vector<std::string> tok; - sp.Encode("", &tok); + EXPECT_OK(sp.Encode("", &tok)); EXPECT_TRUE(tok.empty()); - sp.Encode( + EXPECT_OK(sp.Encode( "吾輩《わがはい》は猫である。名前はまだ無い。" "どこで生れたかとんと見当《けんとう》がつかぬ。" "何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している" "。", - &tok); + &tok)); EXPECT_EQ(WS " 吾輩 《 わが はい 》 は猫 である 。 名前 はまだ 無い 。 " "どこで 生 れた か とん と見 当 《 けんとう 》 が つかぬ 。 " @@ -119,5 +111,7 @@ TEST(BPETrainerTest, EndToEndTest) { "事 だけは 記憶 している 。", string_util::Join(tok, " ")); } + +} // namespace } // namespace bpe } // namespace sentencepiece |