// Copyright 2016 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.! #include "normalizer.h" #include "builder.h" #include "testharness.h" #include "util.h" namespace sentencepiece { namespace normalizer { namespace { // Space symbol #define WS "\xe2\x96\x81" NormalizerSpec MakeDefaultSpec() { NormalizerSpec normalizer_spec; normalizer_spec.set_name("nfkc"); EXPECT_OK(normalizer::Builder::PopulateNormalizerSpec(&normalizer_spec)); return normalizer_spec; } } // namespace TEST(NormalizerTest, NormalizeErrorTest) { NormalizerSpec spec; Normalizer normalizer(spec); EXPECT_NOT_OK(normalizer.Normalize("test", nullptr, nullptr)); } TEST(NormalizerTest, NormalizeTest) { auto spec = MakeDefaultSpec(); const Normalizer normalizer(spec); // Empty strings. EXPECT_EQ("", normalizer.Normalize("")); EXPECT_EQ("", normalizer.Normalize(" ")); EXPECT_EQ("", normalizer.Normalize(" ")); // Sentence with heading/tailing/redundant spaces. EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC")); EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC ")); EXPECT_EQ(WS "A" WS "B" WS "C", normalizer.Normalize(" A B C ")); EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC ")); EXPECT_EQ(WS "ABC", normalizer.Normalize(" ABC ")); EXPECT_EQ(WS "ABC", normalizer.Normalize("  ABC")); EXPECT_EQ(WS "ABC", normalizer.Normalize("  ABC  ")); // NFKC char to char normalization. EXPECT_EQ(WS "123", normalizer.Normalize("①②③")); // NFKC char to multi-char normalization. EXPECT_EQ(WS "株式会社", normalizer.Normalize("㍿")); // Half width katakana, character composition happens. EXPECT_EQ(WS "グーグル", normalizer.Normalize(" グーグル ")); EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl", normalizer.Normalize(" I saw a   girl  ")); } TEST(NormalizerTest, NormalizeWithoutDummyPrefixTest) { auto spec = MakeDefaultSpec(); spec.set_add_dummy_prefix(false); const Normalizer normalizer(spec); // Empty strings. EXPECT_EQ("", normalizer.Normalize("")); EXPECT_EQ("", normalizer.Normalize(" ")); EXPECT_EQ("", normalizer.Normalize(" ")); // Sentence with heading/tailing/redundant spaces. EXPECT_EQ("ABC", normalizer.Normalize("ABC")); EXPECT_EQ("ABC", normalizer.Normalize(" ABC ")); EXPECT_EQ("A" WS "B" WS "C", normalizer.Normalize(" A B C ")); EXPECT_EQ("ABC", normalizer.Normalize(" ABC ")); EXPECT_EQ("ABC", normalizer.Normalize(" ABC ")); EXPECT_EQ("ABC", normalizer.Normalize("  ABC")); EXPECT_EQ("ABC", normalizer.Normalize("  ABC  ")); } TEST(NormalizerTest, NormalizeWithoutRemoveExtraWhitespacesTest) { auto spec = MakeDefaultSpec(); spec.set_remove_extra_whitespaces(false); const Normalizer normalizer(spec); // Empty strings. EXPECT_EQ("", normalizer.Normalize("")); EXPECT_EQ(WS WS WS WS WS WS WS, normalizer.Normalize(" ")); EXPECT_EQ(WS WS, normalizer.Normalize(" ")); // Sentence with heading/tailing/redundant spaces. EXPECT_EQ(WS "ABC", normalizer.Normalize("ABC")); EXPECT_EQ(WS WS "ABC" WS, normalizer.Normalize(" ABC ")); EXPECT_EQ(WS WS WS "A" WS WS "B" WS WS "C" WS WS, normalizer.Normalize(" A B C ")); } TEST(NormalizerTest, NormalizeWithoutEscapeWhitespacesTest) { auto spec = MakeDefaultSpec(); spec.set_add_dummy_prefix(false); spec.set_remove_extra_whitespaces(true); spec.set_escape_whitespaces(false); const Normalizer normalizer(spec); // Empty strings. EXPECT_EQ("", normalizer.Normalize("")); EXPECT_EQ("", normalizer.Normalize(" ")); EXPECT_EQ("", normalizer.Normalize(" ")); // Sentence with heading/tailing/redundant spaces. EXPECT_EQ("ABC", normalizer.Normalize("ABC")); EXPECT_EQ("ABC", normalizer.Normalize(" ABC ")); EXPECT_EQ("A B C", normalizer.Normalize(" A B C ")); EXPECT_EQ("A B C", normalizer.Normalize("A  B  C")); } TEST(NormalizeTest, NomalizeWithSpaceContainedRules) { Builder::CharsMap charsmap; auto AddRule = [&](const std::string &src, const std::string &trg) { Builder::Chars src_chars, trg_chars; for (const char32 c : string_util::UTF8ToUnicodeText(src)) { src_chars.push_back(c); } for (const char32 c : string_util::UTF8ToUnicodeText(trg)) { trg_chars.push_back(c); } charsmap[src_chars] = trg_chars; }; // Adds rules containing whitespaes. AddRule("a", " A"); AddRule("b", "B"); AddRule("c", "D E"); AddRule("d", " F G "); NormalizerSpec spec; EXPECT_OK( Builder::CompileCharsMap(charsmap, spec.mutable_precompiled_charsmap())); // Test default behavior { const Normalizer normalizer(spec); EXPECT_EQ(WS "A", normalizer.Normalize("a")); EXPECT_EQ(WS "B" WS "A", normalizer.Normalize("ba")); EXPECT_EQ(WS "D" WS "E", normalizer.Normalize("c")); EXPECT_EQ(WS "F" WS "G" WS "A", normalizer.Normalize("da")); EXPECT_EQ(WS "A" WS "F" WS "G", normalizer.Normalize("ad")); EXPECT_EQ(WS "A" WS "F" WS "G" WS "B", normalizer.Normalize("adb")); } spec.set_escape_whitespaces(false); { spec.set_add_dummy_prefix(false); spec.set_remove_extra_whitespaces(true); const Normalizer normalizer(spec); EXPECT_EQ("A", normalizer.Normalize("a")); EXPECT_EQ("B A", normalizer.Normalize("ba")); EXPECT_EQ("D E", normalizer.Normalize("c")); EXPECT_EQ("F G A", normalizer.Normalize("da")); EXPECT_EQ("A F G", normalizer.Normalize("ad")); EXPECT_EQ("A F G B", normalizer.Normalize("adb")); } { spec.set_add_dummy_prefix(false); spec.set_remove_extra_whitespaces(false); const Normalizer normalizer(spec); EXPECT_EQ(" A", normalizer.Normalize("a")); EXPECT_EQ("B A", normalizer.Normalize("ba")); EXPECT_EQ("D E", normalizer.Normalize("c")); EXPECT_EQ(" F G A", normalizer.Normalize("da")); EXPECT_EQ(" A F G ", normalizer.Normalize("ad")); EXPECT_EQ(" A F G B", normalizer.Normalize("adb")); } { spec.set_add_dummy_prefix(true); spec.set_remove_extra_whitespaces(true); const Normalizer normalizer(spec); EXPECT_EQ(" A", normalizer.Normalize("a")); EXPECT_EQ(" B A", normalizer.Normalize("ba")); EXPECT_EQ(" D E", normalizer.Normalize("c")); EXPECT_EQ(" F G A", normalizer.Normalize("da")); EXPECT_EQ(" A F G", normalizer.Normalize("ad")); EXPECT_EQ(" A F G B", normalizer.Normalize("adb")); } { spec.set_add_dummy_prefix(true); spec.set_remove_extra_whitespaces(false); const Normalizer normalizer(spec); EXPECT_EQ(" A", normalizer.Normalize("a")); EXPECT_EQ(" B A", normalizer.Normalize("ba")); EXPECT_EQ(" D E", normalizer.Normalize("c")); EXPECT_EQ(" F G A", normalizer.Normalize("da")); EXPECT_EQ(" A F G ", normalizer.Normalize("ad")); EXPECT_EQ(" A F G B", normalizer.Normalize("adb")); } } TEST(NormalizerTest, NormalizeFullTest) { std::vector n2i; std::string output; auto spec = MakeDefaultSpec(); const Normalizer normalizer(spec); { const std::string input = "I saw a girl"; normalizer.Normalize(input, &output, &n2i); EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl", output); const std::vector expected = {0, 0, 0, // WS (3byte) 0, // I 1, 1, 1, // WS 2, 3, 4, // saw 5, 5, 5, // WS 6, // a 7, 7, 7, // WS 8, 9, 10, 11, // girl 12}; EXPECT_EQ(expected, n2i); } { const std::string input = " I saw a   girl  "; EXPECT_OK(normalizer.Normalize(input, &output, &n2i)); LOG(INFO) << output; EXPECT_EQ(WS "I" WS "saw" WS "a" WS "girl", output); const std::vector expected = {1, 1, 1, // WS (3byte) 1, // I 2, 2, 2, // WS 5, 6, 7, // saw 8, 8, 8, // WS 9, // a 10, 10, 10, // WS 17, 18, 19, 20, // girl 21}; EXPECT_EQ(expected, n2i); } { const std::string input = " グーグル "; // halfwidth katakana normalizer.Normalize(input, &output, &n2i); EXPECT_EQ(WS "グーグル", output); const std::vector expected = {1, 1, 1, // WS (3byte) 1, 1, 1, // グ 7, 7, 7, // ー 10, 10, 10, // グ 16, 16, 16, // ル 19}; EXPECT_EQ(expected, n2i); } { const std::string input = "①②③"; normalizer.Normalize(input, &output, &n2i); EXPECT_EQ(WS "123", output); const std::vector expected = {0, 0, 0, // WS (3byte) 0, // 1 3, // 2 6, // 3 9}; EXPECT_EQ(expected, n2i); } { const std::string input = "㍿"; normalizer.Normalize(input, &output, &n2i); EXPECT_EQ(WS "株式会社", output); const std::vector expected = {0, 0, 0, // WS (3byte) 0, 0, 0, // 株 0, 0, 0, // 式 0, 0, 0, // 会 0, 0, 0, // 社 3}; // When "株式" is one piece, this has no alignment to the input. // Sentencepieces which includes the last character ("会社" or "社") // have the alignment to the input. EXPECT_EQ(expected, n2i); } } TEST(NormalizerTest, EncodeDecodePrecompiledCharsMapTest) { const std::string blob = Normalizer::EncodePrecompiledCharsMap("foo", "bar"); StringPiece trie_blob, normalized_blob; EXPECT_OK(Normalizer::DecodePrecompiledCharsMap(blob, &trie_blob, &normalized_blob)); EXPECT_EQ("foo", trie_blob); EXPECT_EQ("bar", normalized_blob); EXPECT_NOT_OK( Normalizer::DecodePrecompiledCharsMap("", &trie_blob, &normalized_blob)); } TEST(NormalizerTest, StatusTest) { NormalizerSpec spec; { const Normalizer normalizer(spec); EXPECT_FALSE(normalizer.status().ok()); } spec = MakeDefaultSpec(); { const Normalizer normalizer(spec); EXPECT_TRUE(normalizer.status().ok()); } } } // namespace normalizer } // namespace sentencepiece