diff options
author | Taku Kudo <taku@google.com> | 2018-07-26 10:28:32 +0300 |
---|---|---|
committer | Taku Kudo <taku@google.com> | 2018-07-26 10:28:32 +0300 |
commit | 5dac4835fa47b2510e07ea0e7bd205b1c99a3835 (patch) | |
tree | 11fccfaafbc159d9bc6ae68e6801c678c879cb96 /src/sentencepiece_processor_test.cc | |
parent | 9c3ea57a98642249259bcdffa57374954ff8e4ae (diff) |
Added --unk_surface option to allow users to change the unknown surface string.
Diffstat (limited to 'src/sentencepiece_processor_test.cc')
-rw-r--r-- | src/sentencepiece_processor_test.cc | 137 |
1 file changed, 98 insertions, 39 deletions
diff --git a/src/sentencepiece_processor_test.cc b/src/sentencepiece_processor_test.cc index b0f1eb9..3a78c87 100644 --- a/src/sentencepiece_processor_test.cc +++ b/src/sentencepiece_processor_test.cc @@ -465,51 +465,110 @@ TEST(SentencepieceProcessorTest, DecodeTest) { float GetScore(int id) const override { return 0.0; } }; - SentencePieceProcessor sp; - auto mock = MakeUnique<DecodeMockModel>(); - // std::unique_ptr<ModelInterface> mock(new DecodeMockModel); - sp.SetModel(std::move(mock)); - - const auto normalizaiton_spec = MakeDefaultNormalizerSpec(); - sp.SetNormalizer(MakeUnique<normalizer::Normalizer>(normalizaiton_spec)); - const std::vector<std::string> input = {"<s>", WS "ABC", "<unk>", WS "DE", "F", "G" WS "H", "I", "</s>"}; - SentencePieceText spt; - sp.Decode(input, &spt); - EXPECT_EQ("ABC \xE2\x81\x87 DEFG HI", spt.text()); - EXPECT_EQ(8, spt.pieces_size()); + { + SentencePieceProcessor sp; + auto mock = MakeUnique<DecodeMockModel>(); + sp.SetModel(std::move(mock)); + + const auto normalizaiton_spec = MakeDefaultNormalizerSpec(); + sp.SetNormalizer(MakeUnique<normalizer::Normalizer>(normalizaiton_spec)); + + SentencePieceText spt; + + sp.Decode(input, &spt); + EXPECT_EQ("ABC \xE2\x81\x87 DEFG HI", spt.text()); + EXPECT_EQ(8, spt.pieces_size()); + + for (int i = 0; i < 6; ++i) { + EXPECT_EQ(input[i], spt.pieces(i).piece()); + } + + EXPECT_EQ("", spt.pieces(0).surface()); + EXPECT_EQ("ABC", spt.pieces(1).surface()); + EXPECT_EQ(" \xE2\x81\x87 ", spt.pieces(2).surface()); + EXPECT_EQ(" DE", spt.pieces(3).surface()); + EXPECT_EQ("F", spt.pieces(4).surface()); + EXPECT_EQ("G H", spt.pieces(5).surface()); + EXPECT_EQ("I", spt.pieces(6).surface()); + EXPECT_EQ("", spt.pieces(7).surface()); + + EXPECT_EQ(0, spt.pieces(0).begin()); + EXPECT_EQ(0, spt.pieces(0).end()); + EXPECT_EQ(0, spt.pieces(1).begin()); + EXPECT_EQ(3, spt.pieces(1).end()); + EXPECT_EQ(3, spt.pieces(2).begin()); + EXPECT_EQ(8, spt.pieces(2).end()); + EXPECT_EQ(8, spt.pieces(3).begin()); + 
EXPECT_EQ(11, spt.pieces(3).end()); + EXPECT_EQ(11, spt.pieces(4).begin()); + EXPECT_EQ(12, spt.pieces(4).end()); + EXPECT_EQ(12, spt.pieces(5).begin()); + EXPECT_EQ(15, spt.pieces(5).end()); + EXPECT_EQ(15, spt.pieces(6).begin()); + EXPECT_EQ(16, spt.pieces(6).end()); + EXPECT_EQ(16, spt.pieces(7).begin()); + EXPECT_EQ(16, spt.pieces(7).end()); + } + + // unk_surface is not defined. + { + SentencePieceProcessor sp; + auto proto = MakeUnique<ModelProto>(); + sp.Load(std::move(proto)); + + auto mock = MakeUnique<DecodeMockModel>(); + sp.SetModel(std::move(mock)); + + const auto normalizaiton_spec = MakeDefaultNormalizerSpec(); + sp.SetNormalizer(MakeUnique<normalizer::Normalizer>(normalizaiton_spec)); + + SentencePieceText spt; + + sp.Decode(input, &spt); + EXPECT_EQ("ABC \xE2\x81\x87 DEFG HI", spt.text()); + EXPECT_EQ(8, spt.pieces_size()); + } + + { + SentencePieceProcessor sp; + auto proto = MakeUnique<ModelProto>(); + proto->mutable_trainer_spec()->set_unk_surface(""); + sp.Load(std::move(proto)); + + auto mock = MakeUnique<DecodeMockModel>(); + sp.SetModel(std::move(mock)); + + const auto normalizaiton_spec = MakeDefaultNormalizerSpec(); + sp.SetNormalizer(MakeUnique<normalizer::Normalizer>(normalizaiton_spec)); + + SentencePieceText spt; - for (int i = 0; i < 6; ++i) { - EXPECT_EQ(input[i], spt.pieces(i).piece()); + sp.Decode(input, &spt); + EXPECT_EQ("ABC DEFG HI", spt.text()); + EXPECT_EQ(8, spt.pieces_size()); } - EXPECT_EQ("", spt.pieces(0).surface()); - EXPECT_EQ("ABC", spt.pieces(1).surface()); - EXPECT_EQ(" \xE2\x81\x87 ", spt.pieces(2).surface()); - EXPECT_EQ(" DE", spt.pieces(3).surface()); - EXPECT_EQ("F", spt.pieces(4).surface()); - EXPECT_EQ("G H", spt.pieces(5).surface()); - EXPECT_EQ("I", spt.pieces(6).surface()); - EXPECT_EQ("", spt.pieces(7).surface()); - - EXPECT_EQ(0, spt.pieces(0).begin()); - EXPECT_EQ(0, spt.pieces(0).end()); - EXPECT_EQ(0, spt.pieces(1).begin()); - EXPECT_EQ(3, spt.pieces(1).end()); - EXPECT_EQ(3, spt.pieces(2).begin()); - 
EXPECT_EQ(8, spt.pieces(2).end()); - EXPECT_EQ(8, spt.pieces(3).begin()); - EXPECT_EQ(11, spt.pieces(3).end()); - EXPECT_EQ(11, spt.pieces(4).begin()); - EXPECT_EQ(12, spt.pieces(4).end()); - EXPECT_EQ(12, spt.pieces(5).begin()); - EXPECT_EQ(15, spt.pieces(5).end()); - EXPECT_EQ(15, spt.pieces(6).begin()); - EXPECT_EQ(16, spt.pieces(6).end()); - EXPECT_EQ(16, spt.pieces(7).begin()); - EXPECT_EQ(16, spt.pieces(7).end()); + { + SentencePieceProcessor sp; + auto proto = MakeUnique<ModelProto>(); + proto->mutable_trainer_spec()->set_unk_surface("<UNK>"); + sp.Load(std::move(proto)); + + auto mock = MakeUnique<DecodeMockModel>(); + sp.SetModel(std::move(mock)); + + const auto normalizaiton_spec = MakeDefaultNormalizerSpec(); + sp.SetNormalizer(MakeUnique<normalizer::Normalizer>(normalizaiton_spec)); + + SentencePieceText spt; + + sp.Decode(input, &spt); + EXPECT_EQ("ABC<UNK> DEFG HI", spt.text()); + EXPECT_EQ(8, spt.pieces_size()); + } } void AddPiece(ModelProto *model_proto, absl::string_view piece, |