Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaku Kudo <taku@google.com>2018-06-04 14:32:37 +0300
committerTaku Kudo <taku@google.com>2018-06-04 14:32:37 +0300
commit4e3bcf1373fb7c8ddca151ddf2a6f10914057cfa (patch)
tree71ea36b440216b25cfa7290be2a09d3ac82b7acb /src/normalizer.cc
parent4f7af0dfadbf547264296d46924055842c901b60 (diff)
Updated normalizer
Diffstat (limited to 'src/normalizer.cc')
-rw-r--r--src/normalizer.cc80
1 files changed, 39 insertions, 41 deletions
diff --git a/src/normalizer.cc b/src/normalizer.cc
index ddb701a..77fb3f8 100644
--- a/src/normalizer.cc
+++ b/src/normalizer.cc
@@ -23,36 +23,32 @@ namespace normalizer {
constexpr int Normalizer::kMaxTrieResultsSize;
-Normalizer::Normalizer(const NormalizerSpec &spec) : spec_(&spec) {
+Normalizer::Normalizer(const NormalizerSpec &spec)
+ : spec_(&spec), status_(util::OkStatus()) {
StringPiece index = spec.precompiled_charsmap();
if (index.empty()) {
- status_ = util::InvalidArgumentError("precompiled_charsmap is empty.");
- return;
- }
-
- StringPiece trie_blob, normalized;
- status_ = DecodePrecompiledCharsMap(index, &trie_blob, &normalized);
- if (!status_.ok()) return;
+ LOG(INFO) << "precompiled_charsmap is empty. use identity normalization.";
+ } else {
+ StringPiece trie_blob, normalized;
+ status_ = DecodePrecompiledCharsMap(index, &trie_blob, &normalized);
+ if (!status_.ok()) return;
- // Reads the body of double array.
- trie_ = port::MakeUnique<Darts::DoubleArray>();
+ // Reads the body of double array.
+ trie_ = port::MakeUnique<Darts::DoubleArray>();
- // The second arg of set_array is not the size of blob,
- // but the number of double array units.
- trie_->set_array(const_cast<char *>(trie_blob.data()),
- trie_blob.size() / trie_->unit_size());
+ // The second arg of set_array is not the size of blob,
+ // but the number of double array units.
+ trie_->set_array(const_cast<char *>(trie_blob.data()),
+ trie_blob.size() / trie_->unit_size());
- normalized_ = normalized.data();
+ normalized_ = normalized.data();
+ }
}
Normalizer::~Normalizer() {}
util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
std::vector<size_t> *norm_to_orig) const {
- if (trie_ == nullptr || normalized_ == nullptr) {
- return util::InternalError("Normalizer model is not available.");
- }
-
norm_to_orig->clear();
normalized->clear();
@@ -60,6 +56,8 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
return util::OkStatus();
}
+ RETURN_IF_ERROR(status());
+
int consumed = 0;
// Ignores heading space.
@@ -144,7 +142,7 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
const StringPiece space = spec_->escape_whitespaces() ? kSpaceSymbol : " ";
while (string_util::EndsWith(*normalized, space)) {
const int length = normalized->size() - space.size();
- if (length < 0) return util::InternalError("length < 0");
+ CHECK_GE_OR_RETURN(length, 0);
consumed = (*norm_to_orig)[length];
normalized->resize(length);
norm_to_orig->resize(length);
@@ -153,9 +151,7 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
norm_to_orig->push_back(consumed);
- if (norm_to_orig->size() != normalized->size() + 1) {
- return util::InternalError("norm_to_org and normalized are inconsistent");
- }
+ CHECK_EQ_OR_RETURN(norm_to_orig->size(), normalized->size() + 1);
return util::OkStatus();
}
@@ -173,25 +169,27 @@ std::pair<StringPiece, int> Normalizer::NormalizePrefix(
if (input.empty()) return result;
- // Allocates trie_results in stack, which makes the encoding speed 36% faster.
- // (38k sentences/sec => 60k sentences/sec).
- // Builder checks that the result size never exceeds kMaxTrieResultsSize.
- // This array consumes 0.5kByte in stack, which is less than
- // default stack frames (16kByte).
- Darts::DoubleArray::result_pair_type
- trie_results[Normalizer::kMaxTrieResultsSize];
-
- const size_t num_nodes =
- trie_->commonPrefixSearch(input.data(), trie_results,
- Normalizer::kMaxTrieResultsSize, input.size());
-
- // Finds the longest rule.
size_t longest_length = 0;
int longest_value = 0;
- for (size_t k = 0; k < num_nodes; ++k) {
- if (longest_length == 0 || trie_results[k].length > longest_length) {
- longest_length = trie_results[k].length; // length of prefix
- longest_value = trie_results[k].value; // pointer to |normalized_|.
+
+ if (trie_ != nullptr) {
+ // Allocates trie_results in stack, which makes the encoding speed 36%
+ // faster. (38k sentences/sec => 60k sentences/sec). Builder checks that the
+ // result size never exceeds kMaxTrieResultsSize. This array consumes
+ // 0.5kByte in stack, which is less than default stack frames (16kByte).
+ Darts::DoubleArray::result_pair_type
+ trie_results[Normalizer::kMaxTrieResultsSize];
+
+ const size_t num_nodes = trie_->commonPrefixSearch(
+ input.data(), trie_results, Normalizer::kMaxTrieResultsSize,
+ input.size());
+
+ // Finds the longest rule.
+ for (size_t k = 0; k < num_nodes; ++k) {
+ if (longest_length == 0 || trie_results[k].length > longest_length) {
+ longest_length = trie_results[k].length; // length of prefix
+ longest_value = trie_results[k].value; // pointer to |normalized_|.
+ }
}
}
@@ -239,7 +237,7 @@ util::Status Normalizer::DecodePrecompiledCharsMap(StringPiece blob,
!string_util::DecodePOD<uint32>(
StringPiece(blob.data(), sizeof(trie_blob_size)), &trie_blob_size) ||
trie_blob_size >= blob.size()) {
- return util::InternalError("Trie blob is broken.");
+ return util::InternalError("Blob for normalization rule is broken.");
}
blob.remove_prefix(sizeof(trie_blob_size));