Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaku Kudo <taku@google.com>2018-06-04 19:44:15 +0300
committerTaku Kudo <taku@google.com>2018-06-04 19:44:15 +0300
commit2ff4267b91998297c02a9762051d75c0fb1c3011 (patch)
treeb1821b84ce650828ff05d8d00090a8e97ba65d8c /src/bpe_model_trainer_test.cc
parent506aa4595ad3c819d800dc52187832033f104bc7 (diff)
Minor style fixes
Diffstat (limited to 'src/bpe_model_trainer_test.cc')
-rw-r--r--src/bpe_model_trainer_test.cc44
1 files changed, 19 insertions, 25 deletions
diff --git a/src/bpe_model_trainer_test.cc b/src/bpe_model_trainer_test.cc
index 56a44d6..222dd11 100644
--- a/src/bpe_model_trainer_test.cc
+++ b/src/bpe_model_trainer_test.cc
@@ -14,17 +14,20 @@
#include "bpe_model_trainer.h"
+#include <string>
+#include <vector>
#include "sentencepiece_processor.h"
+#include "sentencepiece_trainer.h"
#include "testharness.h"
#include "util.h"
namespace sentencepiece {
namespace bpe {
+namespace {
// Space symbol
#define WS "\xe2\x96\x81"
-namespace {
std::string RunTrainer(const std::vector<std::string> &input, int size) {
test::ScopedTempFile input_scoped_file("input");
test::ScopedTempFile model_scoped_file("model");
@@ -48,7 +51,7 @@ std::string RunTrainer(const std::vector<std::string> &input, int size) {
normalizer_spec.set_add_dummy_prefix(false);
Trainer trainer(trainer_spec, normalizer_spec);
- trainer.Train();
+ EXPECT_OK(trainer.Train());
SentencePieceProcessor processor;
EXPECT_OK(processor.Load(model_prefix + ".model"));
@@ -63,7 +66,6 @@ std::string RunTrainer(const std::vector<std::string> &input, int size) {
return string_util::Join(pieces, " ");
}
-} // namespace
TEST(BPETrainerTest, BasicTest) {
EXPECT_EQ("ab ra abra ad cad abracad abracadabra ac br a b r c d",
@@ -75,43 +77,33 @@ TEST(BPETrainerTest, BasicTest) {
}
TEST(BPETrainerTest, EndToEndTest) {
- TrainerSpec trainer_spec;
- trainer_spec.add_input("../data/wagahaiwa_nekodearu.txt");
+ const test::ScopedTempFile sf("tmp_model");
- NormalizerSpec normalizer_spec;
- normalizer_spec.set_name("nfkc");
-
- constexpr int kVocabSize = 8000;
- trainer_spec.set_vocab_size(kVocabSize);
- trainer_spec.set_model_type(TrainerSpec::BPE);
-
- trainer_spec.add_control_symbols("<ctrl>");
- // trainer_spec.add_user_defined_symbols("<user>");
-
- test::ScopedTempFile sf("tmp_model");
- trainer_spec.set_model_prefix(sf.filename());
- bpe::Trainer trainer(trainer_spec, normalizer_spec);
- trainer.Train();
+ EXPECT_OK(SentencePieceTrainer::Train(
+ std::string("--model_prefix=") + sf.filename() +
+ " --input=../data/wagahaiwa_nekodearu.txt"
+ " --vocab_size=8000"
+ " --normalization_rule_name=identity"
+ " --model_type=bpe"
+ " --control_symbols=<ctrl>"));
SentencePieceProcessor sp;
EXPECT_OK(sp.Load(std::string(sf.filename()) + ".model"));
- EXPECT_EQ(kVocabSize, sp.GetPieceSize());
+ EXPECT_EQ(8000, sp.GetPieceSize());
const int cid = sp.PieceToId("<ctrl>");
- // const int uid = sp.PieceToId("<user>");
EXPECT_TRUE(sp.IsControl(cid));
- // EXPECT_FALSE(sp.IsUnknown(uid));
std::vector<std::string> tok;
- sp.Encode("", &tok);
+ EXPECT_OK(sp.Encode("", &tok));
EXPECT_TRUE(tok.empty());
- sp.Encode(
+ EXPECT_OK(sp.Encode(
"吾輩《わがはい》は猫である。名前はまだ無い。"
"どこで生れたかとんと見当《けんとう》がつかぬ。"
"何でも薄暗いじめじめした所でニャーニャー泣いていた事だけは記憶している"
"。",
- &tok);
+ &tok));
EXPECT_EQ(WS
" 吾輩 《 わが はい 》 は猫 である 。 名前 はまだ 無い 。 "
"どこで 生 れた か とん と見 当 《 けんとう 》 が つかぬ 。 "
@@ -119,5 +111,7 @@ TEST(BPETrainerTest, EndToEndTest) {
"事 だけは 記憶 している 。",
string_util::Join(tok, " "));
}
+
+} // namespace
} // namespace bpe
} // namespace sentencepiece