// Copyright 2016 Google Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License.! #include #include #include #include #include #include "builder.h" #include "filesystem.h" #include "init.h" #include "sentencepiece_processor.h" #include "third_party/absl/flags/flag.h" #include "third_party/absl/strings/string_view.h" using sentencepiece::normalizer::Builder; ABSL_FLAG(bool, output_precompiled_header, false, "make normalization_rule.h file"); namespace sentencepiece { namespace { std::string ToHexUInt64Array( const std::vector> &data, std::vector *offset) { std::stringstream os; os.setf(std::ios_base::hex, std::ios_base::basefield); os.setf(std::ios_base::uppercase); os.setf(std::ios_base::right); os.fill('0'); os.unsetf(std::ios_base::showbase); size_t num = 0; for (const auto &p : data) { const char *begin = p.second.data(); const char *end = p.second.data() + p.second.size(); offset->push_back(num); while (begin < end) { unsigned long long int n = 0; unsigned char *buf = reinterpret_cast(&n); const size_t size = std::min(end - begin, sizeof(n)); for (size_t i = 0; i < size; ++i) { buf[i] = static_cast(begin[i]); } begin += sizeof(n); os << "0x" << std::setw(2 * sizeof(n)) << n << ", "; if (++num % 8 == 0) { os << "\n"; } } } return os.str(); } std::string ToHexData(absl::string_view data) { const char *begin = data.data(); const char *end = data.data() + data.size(); constexpr char kHex[] = "0123456789ABCDEF"; constexpr size_t kNumOfBytesOnOneLine = 20; size_t output_count = 0; std::stringstream os; while (begin < end) { const size_t bucket_size = std::min(end - begin, kNumOfBytesOnOneLine - output_count % kNumOfBytesOnOneLine); if (output_count % kNumOfBytesOnOneLine == 0 && bucket_size > 0) { os << "\""; } for (size_t i = 0; i < bucket_size; ++i) { os << "\\x" << kHex[(*begin & 0xF0) >> 4] << kHex[(*begin & 0x0F) >> 0]; ++begin; } output_count += bucket_size; if (output_count % kNumOfBytesOnOneLine == 0 && bucket_size > 0 && begin < end) { os << "\"\n"; } } os << "\"\n"; return os.str(); } std::string MakeHeader( const std::vector> &data) { constexpr char kHeader[] = R"(#ifndef NORMALIZATION_RULE_H_ #define NORMALIZATION_RULE_H_ #include namespace sentencepiece { namespace { struct BinaryBlob { const char *name; size_t size; const char *data; }; )"; constexpr char kFooter[] = R"( } // namespace } // namespace sentencepiece #endif // NORMALIZATION_RULE_H_ )"; std::stringstream os; os << kHeader; os << "#if defined(_WIN32) && !defined(__CYGWIN__)\n"; os << "constexpr unsigned long long int kNormalizationRules_blob_uint64[] = " "{\n"; std::vector offset; os << ToHexUInt64Array(data, &offset); CHECK_EQ(offset.size(), data.size()); os << "};\n\n"; os << "const BinaryBlob kNormalizationRules_blob[] = {\n"; for (size_t i = 0; i < data.size(); ++i) { os << "{ \"" << data[i].first << "\", " << data[i].second.size() << ", "; os << "reinterpret_cast(kNormalizationRules_blob_uint64 + " << offset[i] << ") },\n"; } os << "};\n"; os << "#else\n"; os << "constexpr BinaryBlob kNormalizationRules_blob[] = {\n"; for (size_t i = 0; i < data.size(); ++i) { os << "{ \"" << data[i].first << "\", " << data[i].second.size() << ", "; os << ToHexData(data[i].second) << "},\n"; } os << "};\n"; os << "#endif\n"; os << "constexpr size_t kNormalizationRules_size = " << data.size() << ";\n"; os << kFooter; return os.str(); } } // namespace } // namespace sentencepiece int main(int argc, char **argv) { sentencepiece::ParseCommandLineFlags(argv[0], &argc, &argv, true); const std::vector>> kRuleList = {{"nfkc", Builder::BuildNFKCMap}, {"nmt_nfkc", Builder::BuildNmtNFKCMap}, {"nfkc_cf", Builder::BuildNFKC_CFMap}, {"nmt_nfkc_cf", Builder::BuildNmtNFKC_CFMap}, {"case_uncaser", Builder::BuildUncaserMap}, {"case_recaser", Builder::BuildRecaserMap}}; std::vector> data; for (const auto &p : kRuleList) { Builder::CharsMap normalized_map; CHECK_OK(p.second(&normalized_map)); // Write Header. std::string index; CHECK_OK(Builder::CompileCharsMap(normalized_map, &index)); data.emplace_back(p.first, index); // Write TSV file. CHECK_OK(Builder::SaveCharsMap(p.first + ".tsv", normalized_map)); } if (absl::GetFlag(FLAGS_output_precompiled_header)) { constexpr char kPrecompiledHeaderFileName[] = "normalization_rule.h"; auto output = sentencepiece::filesystem::NewWritableFile(kPrecompiledHeaderFileName); CHECK_OK(output->status()); output->Write(sentencepiece::MakeHeader(data)); } return 0; }