Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
author: Taku Kudo <taku@google.com> 2018-06-18 03:31:16 +0300
committer: Taku Kudo <taku@google.com> 2018-06-18 03:31:16 +0300
commit: 75c18c6e0467b32371309862259dcf6be55ace72 (patch)
tree: 347445d637136dd20b23d6c184c24ba1ba1df52c /src
parent: 6884e10ccd24d5192b0da010ecfdb5081b1a4dc6 (diff)
Uses absl::string_view instead of StringPiece
Diffstat (limited to 'src')
-rw-r--r--src/Makefile.am6
-rw-r--r--src/bpe_model.cc23
-rw-r--r--src/bpe_model.h2
-rw-r--r--src/builder.cc19
-rw-r--r--src/builder.h8
-rw-r--r--src/char_model.cc4
-rw-r--r--src/char_model.h2
-rw-r--r--src/compile_charsmap_main.cc4
-rw-r--r--src/model_interface.cc22
-rw-r--r--src/model_interface.h25
-rw-r--r--src/normalizer.cc51
-rw-r--r--src/normalizer.h20
-rw-r--r--src/normalizer_test.cc2
-rw-r--r--src/sentencepiece_processor.cc23
-rw-r--r--src/sentencepiece_processor.h7
-rw-r--r--src/sentencepiece_processor_test.cc43
-rw-r--r--src/sentencepiece_trainer.cc10
-rw-r--r--src/stringpiece.h46
-rw-r--r--src/trainer_interface.cc8
-rw-r--r--src/trainer_interface.h6
-rw-r--r--src/unicode_script_test.cc4
-rw-r--r--src/unigram_model.cc22
-rw-r--r--src/unigram_model.h16
-rw-r--r--src/unigram_model_test.cc2
-rw-r--r--src/unigram_model_trainer.cc4
-rw-r--r--src/unigram_model_trainer.h4
-rw-r--r--src/util.cc39
-rw-r--r--src/util.h100
-rw-r--r--src/util_test.cc8
-rw-r--r--src/word_model.cc2
-rw-r--r--src/word_model.h2
-rw-r--r--src/word_model_trainer.cc4
32 files changed, 294 insertions, 244 deletions
diff --git a/src/Makefile.am b/src/Makefile.am
index 37f53d0..d815a59 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -1,6 +1,7 @@
lib_LTLIBRARIES = libsentencepiece.la libsentencepiece_train.la
AM_CXXFLAS = -I($srcdir)
+AUTOMAKE_OPTIONS = subdir-objects
libsentencepiece_la_SOURCES = \
error.cc \
@@ -8,7 +9,7 @@ libsentencepiece_la_SOURCES = \
sentencepiece_processor.cc \
util.cc \
normalizer.cc \
- stringpiece.h unicode_script_map.h util.h \
+ unicode_script_map.h util.h \
common.h \
flags.h normalizer.h sentencepiece_processor.h \
model_factory.h model_factory.cc \
@@ -16,7 +17,8 @@ libsentencepiece_la_SOURCES = \
unigram_model.h unigram_model.cc \
word_model.h word_model.cc \
char_model.h char_model.cc \
- bpe_model.h bpe_model.cc
+ bpe_model.h bpe_model.cc \
+ ../third_party/absl/strings/string_view.cc
include_HEADERS = sentencepiece_processor.h sentencepiece_trainer.h
# noinst_LIBRARIES = libsentencepiecetrain.a
diff --git a/src/bpe_model.cc b/src/bpe_model.cc
index 8b66f4b..e97fb67 100644
--- a/src/bpe_model.cc
+++ b/src/bpe_model.cc
@@ -32,8 +32,8 @@ Model::Model(const ModelProto &model_proto) {
Model::~Model() {}
-std::vector<std::pair<StringPiece, int>> Model::Encode(
- StringPiece normalized) const {
+std::vector<std::pair<absl::string_view, int>> Model::Encode(
+ absl::string_view normalized) const {
if (!status().ok() || normalized.empty()) {
return {};
}
@@ -57,7 +57,7 @@ std::vector<std::pair<StringPiece, int>> Model::Encode(
int prev; // prev index of this symbol. -1 for BOS.
int next; // next index of tihs symbol. -1 for EOS.
bool freeze; // this symbol is never be merged.
- StringPiece piece;
+ absl::string_view piece;
};
using Agenda = std::priority_queue<SymbolPair *, std::vector<SymbolPair *>,
@@ -68,8 +68,9 @@ std::vector<std::pair<StringPiece, int>> Model::Encode(
// Reverse merge rules.
// key: merged symbol, value: pair of original symbols.
- std::unordered_map<StringPiece, std::pair<StringPiece, StringPiece>,
- StringPieceHash>
+ std::unordered_map<absl::string_view,
+ std::pair<absl::string_view, absl::string_view>,
+ string_util::string_view_hash>
rev_merge;
// Lookup new symbol pair at [left, right] and inserts it to agenda.
@@ -78,7 +79,7 @@ std::vector<std::pair<StringPiece, int>> Model::Encode(
if (left == -1 || right == -1 || symbols[left].freeze ||
symbols[right].freeze)
return;
- const StringPiece piece(
+ const absl::string_view piece(
symbols[left].piece.data(),
symbols[left].piece.size() + symbols[right].piece.size());
const auto it = pieces_.find(piece);
@@ -104,7 +105,7 @@ std::vector<std::pair<StringPiece, int>> Model::Encode(
while (!normalized.empty()) {
Symbol s;
const int mblen = matcher_->PrefixMatch(normalized, &s.freeze);
- s.piece = StringPiece(normalized.data(), mblen);
+ s.piece = absl::string_view(normalized.data(), mblen);
s.prev = index == 0 ? -1 : index - 1;
normalized.remove_prefix(mblen);
s.next = normalized.empty() ? -1 : index + 1;
@@ -134,7 +135,7 @@ std::vector<std::pair<StringPiece, int>> Model::Encode(
}
// Replaces symbols with `top` rule.
- symbols[top->left].piece = StringPiece(
+ symbols[top->left].piece = absl::string_view(
symbols[top->left].piece.data(),
symbols[top->left].piece.size() + symbols[top->right].piece.size());
@@ -143,15 +144,15 @@ std::vector<std::pair<StringPiece, int>> Model::Encode(
if (symbols[top->right].next >= 0) {
symbols[symbols[top->right].next].prev = top->left;
}
- symbols[top->right].piece = StringPiece("");
+ symbols[top->right].piece = absl::string_view("");
// Adds new symbol pairs which are newly added after symbol replacement.
MaybeAddNewSymbolPair(symbols[top->left].prev, top->left);
MaybeAddNewSymbolPair(top->left, symbols[top->left].next);
}
- std::function<void(StringPiece, EncodeResult *)> resegment;
- resegment = [this, &resegment, &rev_merge](StringPiece w,
+ std::function<void(absl::string_view, EncodeResult *)> resegment;
+ resegment = [this, &resegment, &rev_merge](absl::string_view w,
EncodeResult *output) -> void {
const int id = PieceToId(w);
if (id == -1 || !IsUnused(id)) {
diff --git a/src/bpe_model.h b/src/bpe_model.h
index 826bf6b..c73563b 100644
--- a/src/bpe_model.h
+++ b/src/bpe_model.h
@@ -32,7 +32,7 @@ class Model : public ModelInterface {
explicit Model(const ModelProto &model_proto);
~Model() override;
- EncodeResult Encode(StringPiece normalized) const override;
+ EncodeResult Encode(absl::string_view normalized) const override;
};
} // namespace bpe
} // namespace sentencepiece
diff --git a/src/builder.cc b/src/builder.cc
index e4f9d6a..3cbc535 100644
--- a/src/builder.cc
+++ b/src/builder.cc
@@ -204,8 +204,8 @@ util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
<< "The number of shared prefix must be less than "
<< Normalizer::kMaxTrieResultsSize;
- StringPiece trie_blob(static_cast<const char *>(trie.array()),
- trie.size() * trie.unit_size());
+ absl::string_view trie_blob(static_cast<const char *>(trie.array()),
+ trie.size() * trie.unit_size());
*output = Normalizer::EncodePrecompiledCharsMap(trie_blob, normalized);
LOG(INFO) << "Generated normalizer blob. size=" << output->size();
@@ -214,12 +214,12 @@ util::Status Builder::CompileCharsMap(const CharsMap &chars_map,
}
// static
-util::Status Builder::DecompileCharsMap(StringPiece blob,
+util::Status Builder::DecompileCharsMap(absl::string_view blob,
Builder::CharsMap *chars_map) {
CHECK_OR_RETURN(chars_map);
chars_map->clear();
- StringPiece trie_blob, normalized;
+ absl::string_view trie_blob, normalized;
RETURN_IF_ERROR(
Normalizer::DecodePrecompiledCharsMap(blob, &trie_blob, &normalized));
@@ -245,7 +245,7 @@ util::Status Builder::DecompileCharsMap(StringPiece blob,
key.data(), copied_node_pos, copied_key_pos, key.size());
if (result >= -1) { // node exists.
if (result >= 0) { // has a value after transition.
- const StringPiece value = normalized.data() + result;
+ const absl::string_view value = normalized.data() + result;
Chars key_chars, value_chars;
for (const auto c : string_util::UTF8ToUnicodeText(key))
key_chars.push_back(c);
@@ -384,7 +384,8 @@ util::Status Builder::BuildNmtNFKCMap(CharsMap *chars_map) {
}
// static
-util::Status Builder::LoadCharsMap(StringPiece filename, CharsMap *chars_map) {
+util::Status Builder::LoadCharsMap(absl::string_view filename,
+ CharsMap *chars_map) {
LOG(INFO) << "Loading maping file: " << filename.data();
CHECK_OR_RETURN(chars_map);
@@ -400,12 +401,12 @@ util::Status Builder::LoadCharsMap(StringPiece filename, CharsMap *chars_map) {
std::vector<char32> src, trg;
for (auto &s : string_util::SplitPiece(fields[0], " ")) {
if (s.empty()) continue;
- s.Consume("U+");
+ string_util::ConsumePrefix(&s, "U+");
src.push_back(string_util::HexToInt<char32>(s));
}
for (auto &s : string_util::SplitPiece(fields[1], " ")) {
if (s.empty()) continue;
- s.Consume("U+");
+ string_util::ConsumePrefix(&s, "U+");
trg.push_back(string_util::HexToInt<char32>(s));
}
CHECK_OR_RETURN(!src.empty());
@@ -416,7 +417,7 @@ util::Status Builder::LoadCharsMap(StringPiece filename, CharsMap *chars_map) {
}
// static
-util::Status Builder::SaveCharsMap(StringPiece filename,
+util::Status Builder::SaveCharsMap(absl::string_view filename,
const Builder::CharsMap &chars_map) {
io::OutputBuffer output(filename);
RETURN_IF_ERROR(output.status());
diff --git a/src/builder.h b/src/builder.h
index 5685fa5..219a965 100644
--- a/src/builder.h
+++ b/src/builder.h
@@ -21,7 +21,7 @@
#include "common.h"
#include "sentencepiece_model.pb.h"
#include "sentencepiece_processor.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
namespace sentencepiece {
namespace normalizer {
@@ -46,7 +46,7 @@ class Builder {
std::string *output);
// Decompiles `blob` into `chars_map`.
- static util::Status DecompileCharsMap(StringPiece blob, CharsMap *chars_map);
+ static util::Status DecompileCharsMap(absl::string_view blob, CharsMap *chars_map);
// Returns a pre-compiled binary index with `name`.
static util::Status GetPrecompiledCharsMap(const std::string &name,
@@ -97,10 +97,10 @@ class Builder {
// Format:
// src_uchar1 src_uchar2 ... <tab> trg_uchar1 trg_uchar2...
// (src|trg)_ucharX must be a hex of Unicode code point.
- static util::Status LoadCharsMap(StringPiece filename, CharsMap *chars_map);
+ static util::Status LoadCharsMap(absl::string_view filename, CharsMap *chars_map);
// Saves Chars map to `filename` as TSV.
- static util::Status SaveCharsMap(StringPiece filename,
+ static util::Status SaveCharsMap(absl::string_view filename,
const CharsMap &chars_map);
private:
diff --git a/src/char_model.cc b/src/char_model.cc
index ec3d244..0b410bb 100644
--- a/src/char_model.cc
+++ b/src/char_model.cc
@@ -25,7 +25,7 @@ Model::Model(const ModelProto &model_proto) {
Model::~Model() {}
-EncodeResult Model::Encode(StringPiece normalized) const {
+EncodeResult Model::Encode(absl::string_view normalized) const {
if (!status().ok() || normalized.empty()) {
return {};
}
@@ -34,7 +34,7 @@ EncodeResult Model::Encode(StringPiece normalized) const {
EncodeResult output;
while (!normalized.empty()) {
const int mblen = matcher_->PrefixMatch(normalized);
- StringPiece w(normalized.data(), mblen);
+ absl::string_view w(normalized.data(), mblen);
output.emplace_back(w, PieceToId(w));
normalized.remove_prefix(mblen);
}
diff --git a/src/char_model.h b/src/char_model.h
index a7c53d3..cd32875 100644
--- a/src/char_model.h
+++ b/src/char_model.h
@@ -27,7 +27,7 @@ class Model : public ModelInterface {
explicit Model(const ModelProto &model_proto);
~Model() override;
- EncodeResult Encode(StringPiece normalized) const override;
+ EncodeResult Encode(absl::string_view normalized) const override;
};
} // namespace character
} // namespace sentencepiece
diff --git a/src/compile_charsmap_main.cc b/src/compile_charsmap_main.cc
index 6a31fc9..6c20830 100644
--- a/src/compile_charsmap_main.cc
+++ b/src/compile_charsmap_main.cc
@@ -19,7 +19,7 @@
#include "builder.h"
#include "flags.h"
#include "sentencepiece_processor.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "util.h"
using sentencepiece::normalizer::Builder;
@@ -30,7 +30,7 @@ DEFINE_bool(output_precompiled_header, false, "make normalization_rule.h file");
namespace sentencepiece {
namespace {
-std::string ToHexData(StringPiece data) {
+std::string ToHexData(absl::string_view data) {
const char *begin = data.data();
const char *end = data.data() + data.size();
constexpr char kHex[] = "0123456789ABCDEF";
diff --git a/src/model_interface.cc b/src/model_interface.cc
index 255d1be..5cbb1a5 100644
--- a/src/model_interface.cc
+++ b/src/model_interface.cc
@@ -20,7 +20,7 @@
namespace sentencepiece {
-PrefixMatcher::PrefixMatcher(const std::set<StringPiece> &dic) {
+PrefixMatcher::PrefixMatcher(const std::set<absl::string_view> &dic) {
if (dic.empty()) return;
std::vector<const char *> key;
key.reserve(dic.size());
@@ -30,7 +30,7 @@ PrefixMatcher::PrefixMatcher(const std::set<StringPiece> &dic) {
nullptr));
}
-int PrefixMatcher::PrefixMatch(StringPiece w, bool *found) const {
+int PrefixMatcher::PrefixMatch(absl::string_view w, bool *found) const {
if (trie_ == nullptr) {
if (found) *found = false;
return std::min<int>(w.size(), string_util::OneCharLen(w.data()));
@@ -54,7 +54,8 @@ int PrefixMatcher::PrefixMatch(StringPiece w, bool *found) const {
return mblen;
}
-std::string PrefixMatcher::GlobalReplace(StringPiece w, StringPiece out) const {
+std::string PrefixMatcher::GlobalReplace(absl::string_view w,
+ absl::string_view out) const {
std::string result;
while (!w.empty()) {
bool found = false;
@@ -73,7 +74,7 @@ ModelInterface::ModelInterface(const ModelProto &model_proto)
: model_proto_(&model_proto), status_(util::OkStatus()) {}
ModelInterface::~ModelInterface() {}
-int ModelInterface::PieceToId(StringPiece piece) const {
+int ModelInterface::PieceToId(absl::string_view piece) const {
auto it = reserved_id_map_.find(piece);
if (it != reserved_id_map_.end()) {
return it->second;
@@ -119,7 +120,7 @@ void ModelInterface::InitializePieces(bool use_prefix_matcher) {
reserved_id_map_.clear();
unk_id_ = -1;
- std::set<StringPiece> user_defined_symbols;
+ std::set<absl::string_view> user_defined_symbols;
for (int i = 0; i < model_proto_->pieces_size(); ++i) {
const auto &sp = model_proto_->pieces(i);
@@ -162,22 +163,23 @@ void ModelInterface::InitializePieces(bool use_prefix_matcher) {
}
}
-std::vector<StringPiece> SplitIntoWords(StringPiece text) {
+std::vector<absl::string_view> SplitIntoWords(absl::string_view text) {
const char *begin = text.data();
const char *end = text.data() + text.size();
// Space symbol (U+2581)
- const StringPiece kSpaceSymbol = "\xe2\x96\x81";
+ const absl::string_view kSpaceSymbol = "\xe2\x96\x81";
- std::vector<StringPiece> result;
+ std::vector<absl::string_view> result;
while (begin < end) {
const int mblen =
std::min<int>(string_util::OneCharLen(begin), end - begin);
- if (begin == text.data() || StringPiece(begin, mblen) == kSpaceSymbol) {
+ if (begin == text.data() ||
+ absl::string_view(begin, mblen) == kSpaceSymbol) {
result.emplace_back(begin, 0); // add empty string piece.
}
result.back() =
- StringPiece(result.back().data(), result.back().size() + mblen);
+ absl::string_view(result.back().data(), result.back().size() + mblen);
begin += mblen;
}
diff --git a/src/model_interface.h b/src/model_interface.h
index f70c58a..04b733c 100644
--- a/src/model_interface.h
+++ b/src/model_interface.h
@@ -24,15 +24,16 @@
#include "common.h"
#include "sentencepiece_processor.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "third_party/darts_clone/darts.h"
+#include "util.h"
namespace sentencepiece {
// "_this_is_a_pen" => ["_this", "_is", "_a", "_pen"]
-std::vector<StringPiece> SplitIntoWords(StringPiece text);
+std::vector<absl::string_view> SplitIntoWords(absl::string_view text);
-using EncodeResult = std::vector<std::pair<StringPiece, int>>;
+using EncodeResult = std::vector<std::pair<absl::string_view, int>>;
using NBestEncodeResult = std::vector<std::pair<EncodeResult, float>>;
class ModelProto;
@@ -42,16 +43,16 @@ class ModelProto;
class PrefixMatcher {
public:
// Initializes the PrefixMatcher with `dic`.
- explicit PrefixMatcher(const std::set<StringPiece> &dic);
+ explicit PrefixMatcher(const std::set<absl::string_view> &dic);
// Finds the longest string in dic, which is a prefix of `w`.
// Returns the UTF8 byte length of matched string.
// `found` is set if a prefix match exists.
// If no entry is found, consumes one Unicode character.
- int PrefixMatch(StringPiece w, bool *found = nullptr) const;
+ int PrefixMatch(absl::string_view w, bool *found = nullptr) const;
// Replaces entries in `w` with `out`.
- std::string GlobalReplace(StringPiece w, StringPiece out) const;
+ std::string GlobalReplace(absl::string_view w, absl::string_view out) const;
private:
std::unique_ptr<Darts::DoubleArray> trie_;
@@ -61,7 +62,8 @@ class PrefixMatcher {
// Given a normalized string, returns a sequence of sentence pieces with ids.
class ModelInterface {
public:
- using PieceToIdMap = std::unordered_map<StringPiece, int, StringPieceHash>;
+ using PieceToIdMap =
+ std::unordered_map<absl::string_view, int, string_util::string_view_hash>;
// `model_proto` should not be deleted until ModelInterface is destroyed.
explicit ModelInterface(const ModelProto &model_proto);
@@ -77,16 +79,17 @@ class ModelInterface {
// Given a normalized string, returns a sequence of sentence pieces with ids.
// The concatenation of pieces must be the same as `normalized`.
- virtual EncodeResult Encode(StringPiece normalized) const = 0;
+ virtual EncodeResult Encode(absl::string_view normalized) const = 0;
// The same as above, but returns nbest result with score.
- virtual NBestEncodeResult NBestEncode(StringPiece normalized,
+ virtual NBestEncodeResult NBestEncode(absl::string_view normalized,
int nbest_size) const {
LOG(ERROR) << "Not implemented.";
return NBestEncodeResult();
}
- virtual EncodeResult SampleEncode(StringPiece normalized, float alpha) const {
+ virtual EncodeResult SampleEncode(absl::string_view normalized,
+ float alpha) const {
LOG(ERROR) << "Not implemented.";
return EncodeResult();
}
@@ -97,7 +100,7 @@ class ModelInterface {
// Returns the vocab id of `piece`.
// Returns UNK(0) if `piece` is unknown
- virtual int PieceToId(StringPiece piece) const;
+ virtual int PieceToId(absl::string_view piece) const;
// Returns the string representation of vocab with `id`.
// id must be 0 <= id < GetPieceSize().
diff --git a/src/normalizer.cc b/src/normalizer.cc
index 4ba724f..52999ae 100644
--- a/src/normalizer.cc
+++ b/src/normalizer.cc
@@ -17,7 +17,7 @@
#include <utility>
#include <vector>
#include "common.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "third_party/darts_clone/darts.h"
#include "util.h"
@@ -28,11 +28,11 @@ constexpr int Normalizer::kMaxTrieResultsSize;
Normalizer::Normalizer(const NormalizerSpec &spec)
: spec_(&spec), status_(util::OkStatus()) {
- StringPiece index = spec.precompiled_charsmap();
+ absl::string_view index = spec.precompiled_charsmap();
if (index.empty()) {
LOG(INFO) << "precompiled_charsmap is empty. use identity normalization.";
} else {
- StringPiece trie_blob, normalized;
+ absl::string_view trie_blob, normalized;
status_ = DecodePrecompiledCharsMap(index, &trie_blob, &normalized);
if (!status_.ok()) return;
@@ -50,7 +50,8 @@ Normalizer::Normalizer(const NormalizerSpec &spec)
Normalizer::~Normalizer() {}
-util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
+util::Status Normalizer::Normalize(absl::string_view input,
+ std::string *normalized,
std::vector<size_t> *norm_to_orig) const {
norm_to_orig->clear();
normalized->clear();
@@ -87,7 +88,7 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
// Replaces white space with U+2581 (LOWER ONE EIGHT BLOCK)
// if escape_whitespaces() is set (default = true).
- const StringPiece kSpaceSymbol = "\xe2\x96\x81";
+ const absl::string_view kSpaceSymbol = "\xe2\x96\x81";
// Adds a space symbol as a prefix (default is true)
// With this prefix, "world" and "hello world" are converted into
@@ -108,11 +109,11 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
bool is_prev_space = spec_->remove_extra_whitespaces();
while (!input.empty()) {
auto p = NormalizePrefix(input);
- StringPiece sp = p.first;
+ absl::string_view sp = p.first;
// Removes heading spaces in sentence piece,
// if the previous sentence piece ends with whitespace.
- while (is_prev_space && sp.Consume(" ")) {
+ while (is_prev_space && string_util::ConsumePrefix(&sp, " ")) {
}
if (!sp.empty()) {
@@ -130,7 +131,7 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
}
}
// Checks whether the last character of sp is whitespace.
- is_prev_space = sp.ends_with(" ");
+ is_prev_space = string_util::EndsWith(sp, " ");
}
consumed += p.second;
@@ -142,7 +143,8 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
// Ignores tailing space.
if (spec_->remove_extra_whitespaces()) {
- const StringPiece space = spec_->escape_whitespaces() ? kSpaceSymbol : " ";
+ const absl::string_view space =
+ spec_->escape_whitespaces() ? kSpaceSymbol : " ";
while (string_util::EndsWith(*normalized, space)) {
const int length = normalized->size() - space.size();
CHECK_GE_OR_RETURN(length, 0);
@@ -159,16 +161,16 @@ util::Status Normalizer::Normalize(StringPiece input, std::string *normalized,
return util::OkStatus();
}
-std::string Normalizer::Normalize(StringPiece input) const {
+std::string Normalizer::Normalize(absl::string_view input) const {
std::vector<size_t> norm_to_orig;
std::string normalized;
Normalize(input, &normalized, &norm_to_orig);
return normalized;
}
-std::pair<StringPiece, int> Normalizer::NormalizePrefix(
- StringPiece input) const {
- std::pair<StringPiece, int> result;
+std::pair<absl::string_view, int> Normalizer::NormalizePrefix(
+ absl::string_view input) const {
+ std::pair<absl::string_view, int> result;
if (input.empty()) return result;
@@ -205,24 +207,24 @@ std::pair<StringPiece, int> Normalizer::NormalizePrefix(
// but here we only consume one byte.
result.second = 1;
static const char kReplacementChar[] = "\xEF\xBF\xBD";
- result.first.set(kReplacementChar, 3);
+ result.first = absl::string_view(kReplacementChar);
} else {
result.second = length;
- result.first.set(input.data(), result.second);
+ result.first = absl::string_view(input.data(), result.second);
}
} else {
result.second = longest_length;
// No need to pass the size of normalized sentence,
// since |normalized| is delimitered by "\0".
- result.first.set(&normalized_[longest_value]);
+ result.first = absl::string_view(&normalized_[longest_value]);
}
return result;
}
// static
-std::string Normalizer::EncodePrecompiledCharsMap(StringPiece trie_blob,
- StringPiece normalized) {
+std::string Normalizer::EncodePrecompiledCharsMap(
+ absl::string_view trie_blob, absl::string_view normalized) {
// <trie size(4byte)><double array trie><normalized string>
std::string blob;
blob.append(string_util::EncodePOD<uint32>(trie_blob.size()));
@@ -232,22 +234,23 @@ std::string Normalizer::EncodePrecompiledCharsMap(StringPiece trie_blob,
}
// static
-util::Status Normalizer::DecodePrecompiledCharsMap(StringPiece blob,
- StringPiece *trie_blob,
- StringPiece *normalized) {
+util::Status Normalizer::DecodePrecompiledCharsMap(
+ absl::string_view blob, absl::string_view *trie_blob,
+ absl::string_view *normalized) {
uint32 trie_blob_size = 0;
if (blob.size() <= sizeof(trie_blob_size) ||
!string_util::DecodePOD<uint32>(
- StringPiece(blob.data(), sizeof(trie_blob_size)), &trie_blob_size) ||
+ absl::string_view(blob.data(), sizeof(trie_blob_size)),
+ &trie_blob_size) ||
trie_blob_size >= blob.size()) {
return util::InternalError("Blob for normalization rule is broken.");
}
blob.remove_prefix(sizeof(trie_blob_size));
- *trie_blob = StringPiece(blob.data(), trie_blob_size);
+ *trie_blob = absl::string_view(blob.data(), trie_blob_size);
blob.remove_prefix(trie_blob_size);
- *normalized = StringPiece(blob.data(), blob.size());
+ *normalized = absl::string_view(blob.data(), blob.size());
return util::OkStatus();
}
diff --git a/src/normalizer.h b/src/normalizer.h
index 8b4100e..4970d8c 100644
--- a/src/normalizer.h
+++ b/src/normalizer.h
@@ -23,7 +23,7 @@
#include "common.h"
#include "sentencepiece_model.pb.h"
#include "sentencepiece_processor.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "third_party/darts_clone/darts.h"
namespace sentencepiece {
@@ -59,12 +59,12 @@ class Normalizer {
// - Adds a prefix space.
// - Replaces a space with a meta symbol.
// - Removing heading, tailing and other redundant spaces.
- virtual util::Status Normalize(StringPiece input, std::string *normalized,
+ virtual util::Status Normalize(absl::string_view input, std::string *normalized,
std::vector<size_t> *norm_to_orig) const;
// Returns a normalized string without alignments.
// This function is used in sentencepiece training.
- virtual std::string Normalize(StringPiece input) const;
+ virtual std::string Normalize(absl::string_view input) const;
friend class Builder;
@@ -77,22 +77,22 @@ class Normalizer {
// Here's the sample code for the full text normalization.
//
// string output;
- // StringPiece input = "...";
+ // absl::string_view input = "...";
// while (!input.empty()) {
// const auto p = normalizer.NormalizePrefix(input);
// output.append(p.first.data(), p.first.size());
// input.remove_prefix(p.second);
// }
- std::pair<StringPiece, int> NormalizePrefix(StringPiece input) const;
+ std::pair<absl::string_view, int> NormalizePrefix(absl::string_view input) const;
// Encodes trie_blob and normalized string and return compiled blob.
- static std::string EncodePrecompiledCharsMap(StringPiece trie_blob,
- StringPiece normalized);
+ static std::string EncodePrecompiledCharsMap(absl::string_view trie_blob,
+ absl::string_view normalized);
// Decodes blob into trie_blob and normalized string.
- static util::Status DecodePrecompiledCharsMap(StringPiece blob,
- StringPiece *trie_blob,
- StringPiece *normalized);
+ static util::Status DecodePrecompiledCharsMap(absl::string_view blob,
+ absl::string_view *trie_blob,
+ absl::string_view *normalized);
// Maximum size of the return value of Trie, which corresponds
// to the maximum size of shared common prefix in the chars map.
diff --git a/src/normalizer_test.cc b/src/normalizer_test.cc
index 6d20cb5..d86ed24 100644
--- a/src/normalizer_test.cc
+++ b/src/normalizer_test.cc
@@ -332,7 +332,7 @@ TEST(NormalizerTest, NormalizeFullTest) {
TEST(NormalizerTest, EncodeDecodePrecompiledCharsMapTest) {
const std::string blob = Normalizer::EncodePrecompiledCharsMap("foo", "bar");
- StringPiece trie_blob, normalized_blob;
+ absl::string_view trie_blob, normalized_blob;
EXPECT_OK(Normalizer::DecodePrecompiledCharsMap(blob, &trie_blob,
&normalized_blob));
EXPECT_EQ("foo", trie_blob);
diff --git a/src/sentencepiece_processor.cc b/src/sentencepiece_processor.cc
index ca7d548..f9638c4 100644
--- a/src/sentencepiece_processor.cc
+++ b/src/sentencepiece_processor.cc
@@ -282,8 +282,8 @@ util::Status SentencePieceProcessor::PopulateSentencePieceText(
size_t consumed = 0;
bool is_prev_unk = false;
for (const auto &p : result) {
- const StringPiece w = p.first; // piece
- const int id = p.second; // id
+ const absl::string_view w = p.first; // piece
+ const int id = p.second; // id
CHECK_OR_RETURN(!w.empty()) << "Empty piece is not allowed.";
@@ -292,7 +292,7 @@ util::Status SentencePieceProcessor::PopulateSentencePieceText(
if (IsControl(id)) {
// Control symbol has no corresponding source surface, so begin == end.
auto *sp = spt->add_pieces();
- sp->set_piece(w.to_string());
+ sp->set_piece(w.data(), w.size());
sp->set_id(id);
sp->set_begin(norm_to_orig[consumed]);
sp->set_end(norm_to_orig[consumed]);
@@ -306,21 +306,22 @@ util::Status SentencePieceProcessor::PopulateSentencePieceText(
CHECK_LE_OR_RETURN(orig_begin, input.size());
CHECK_LE_OR_RETURN(orig_end, input.size());
CHECK_LE_OR_RETURN(orig_begin, orig_end);
- const auto surface = input.substr(orig_begin, orig_end - orig_begin);
+ const auto surface =
+ absl::ClippedSubstr(input, orig_begin, orig_end - orig_begin);
// Merges continuous run of unknown pieces so that decoder
// can copy or generate unknown tokens easily.
// Note that merged tokens are still unknown,
// since known pieces never consist of unknown characters.
if (is_prev_unk && is_unk) {
auto *sp = spt->mutable_pieces(spt->pieces_size() - 1);
- sp->set_piece(sp->piece() + w.to_string());
- sp->set_surface(sp->surface() + surface);
+ sp->set_piece(sp->piece() + std::string(w));
+ sp->set_surface(sp->surface() + std::string(surface));
sp->set_end(orig_end);
} else {
auto *sp = spt->add_pieces();
- sp->set_piece(w.to_string());
+ sp->set_piece(w.data(), w.size());
sp->set_id(id);
- sp->set_surface(surface);
+ sp->set_surface(surface.data(), surface.size());
sp->set_begin(orig_begin);
sp->set_end(orig_end);
}
@@ -418,7 +419,7 @@ util::Status SentencePieceProcessor::Decode(
const std::vector<std::string> &pieces, SentencePieceText *spt) const {
CHECK_OR_RETURN_STATUS_PROTO(spt);
- auto DecodeSentencePiece = [&](StringPiece piece, int id,
+ auto DecodeSentencePiece = [&](absl::string_view piece, int id,
bool is_bos_ws) -> std::string {
if (IsControl(id)) { // <s>, </s>
return ""; // invisible symbol.
@@ -426,14 +427,14 @@ util::Status SentencePieceProcessor::Decode(
if (IdToPiece(id) == piece) { // <unk>
return kUnknownSymbol;
} else { // return piece when piece is not <unk>.
- return piece.to_string();
+ return std::string(piece);
}
}
if (is_bos_ws) {
// Consume if the current position is bos and
// piece starts with kSpaceSymbol.
- piece.Consume(kSpaceSymbol);
+ string_util::ConsumePrefix(&piece, kSpaceSymbol);
}
return string_util::StringReplace(piece, kSpaceSymbol, " ", true);
diff --git a/src/sentencepiece_processor.h b/src/sentencepiece_processor.h
index 1fa260a..55efc46 100644
--- a/src/sentencepiece_processor.h
+++ b/src/sentencepiece_processor.h
@@ -20,6 +20,10 @@
#include <utility>
#include <vector>
+namespace absl {
+class string_view;
+} // namespace absl
+
namespace sentencepiece {
// SentencePieceProcessor:
@@ -80,10 +84,9 @@ class SentencePieceText;
class NBestSentencePieceText;
class ModelInterface;
class ModelProto;
-class StringPiece;
#ifndef SWIG
-using EncodeResult = std::vector<std::pair<StringPiece, int>>;
+using EncodeResult = std::vector<std::pair<absl::string_view, int>>;
#endif // SWIG
namespace normalizer {
diff --git a/src/sentencepiece_processor_test.cc b/src/sentencepiece_processor_test.cc
index 7d77a33..cca3de4 100644
--- a/src/sentencepiece_processor_test.cc
+++ b/src/sentencepiece_processor_test.cc
@@ -23,8 +23,8 @@
#include "sentencepiece.pb.h"
#include "sentencepiece_model.pb.h"
#include "sentencepiece_trainer.h"
-#include "stringpiece.h"
#include "testharness.h"
+#include "third_party/absl/strings/string_view.h"
#include "util.h"
namespace sentencepiece {
@@ -35,28 +35,29 @@ using port::MakeUnique;
class MockModel : public ModelInterface {
public:
- void SetEncodeResult(StringPiece input, const EncodeResult &output) {
+ void SetEncodeResult(absl::string_view input, const EncodeResult &output) {
input_ = input;
output_ = output;
}
- void SetNBestEncodeResult(StringPiece input,
+ void SetNBestEncodeResult(absl::string_view input,
const NBestEncodeResult &output) {
input_ = input;
nbest_output_ = output;
}
- EncodeResult Encode(StringPiece normalized) const {
+ EncodeResult Encode(absl::string_view normalized) const {
EXPECT_EQ(normalized, input_);
return output_;
}
- EncodeResult SampleEncode(StringPiece normalized, float alpha) const {
+ EncodeResult SampleEncode(absl::string_view normalized, float alpha) const {
EXPECT_EQ(normalized, input_);
return output_;
}
- NBestEncodeResult NBestEncode(StringPiece normalized, int nbest_size) const {
+ NBestEncodeResult NBestEncode(absl::string_view normalized,
+ int nbest_size) const {
EXPECT_EQ(normalized, input_);
return nbest_output_;
}
@@ -67,14 +68,14 @@ class MockModel : public ModelInterface {
int GetPieceSize() const { return 10; }
- int PieceToId(StringPiece piece) const { return 0; }
+ int PieceToId(absl::string_view piece) const { return 0; }
std::string IdToPiece(int id) const { return ""; }
float GetScore(int id) const { return 0.0; }
private:
- StringPiece input_;
+ absl::string_view input_;
EncodeResult output_;
NBestEncodeResult nbest_output_;
};
@@ -82,7 +83,7 @@ class MockModel : public ModelInterface {
std::vector<std::string> GetSpVec(const EncodeResult &pieces) {
std::vector<std::string> sps;
for (const auto &p : pieces) {
- sps.emplace_back(p.first.to_string());
+ sps.emplace_back(std::string(p.first));
}
return sps;
}
@@ -116,7 +117,7 @@ TEST(SentencepieceProcessorTest, StatusTest) {
}
TEST(SentencepieceProcessorTest, EncodeTest) {
- const StringPiece kInput = WS "ABC" WS "DEF";
+ const absl::string_view kInput = WS "ABC" WS "DEF";
SentencePieceProcessor sp;
const auto normalization_spec = MakeDefaultNormalizerSpec();
@@ -259,7 +260,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
{
auto mock = MakeUnique<MockModel>();
const EncodeResult result = {{WS "グー", 3}, {"グル", 4}, {"</s>", 2}};
- const StringPiece input = WS "グーグル";
+ const absl::string_view input = WS "グーグル";
mock->SetEncodeResult(input, result);
sp.SetModel(std::move(mock));
std::vector<std::string> output;
@@ -293,7 +294,7 @@ TEST(SentencepieceProcessorTest, EncodeTest) {
{
auto mock = MakeUnique<MockModel>();
const EncodeResult result = {{WS "株式", 3}, {"会社", 4}, {"</s>", 2}};
- const StringPiece input = WS "株式会社";
+ const absl::string_view input = WS "株式会社";
mock->SetEncodeResult(input, result);
sp.SetModel(std::move(mock));
std::vector<std::string> output;
@@ -437,14 +438,17 @@ TEST(SentencepieceProcessorTest, SampleEncodeTest) {
TEST(SentencepieceProcessorTest, DecodeTest) {
class DecodeMockModel : public ModelInterface {
public:
- EncodeResult Encode(StringPiece normalized) const override { return {}; }
+ EncodeResult Encode(absl::string_view normalized) const override {
+ return {};
+ }
int GetPieceSize() const override { return 7; }
- int PieceToId(StringPiece piece) const override {
- static std::unordered_map<StringPiece, int, StringPieceHash> kMap = {
- {"<unk>", 0}, {"<s>", 1}, {"</s>", 2}, {WS "ABC", 3},
- {WS "DE", 4}, {"F", 5}, {"G" WS "H", 6}};
+ int PieceToId(absl::string_view piece) const override {
+ static std::unordered_map<absl::string_view, int,
+ string_util::string_view_hash>
+ kMap = {{"<unk>", 0}, {"<s>", 1}, {"</s>", 2}, {WS "ABC", 3},
+ {WS "DE", 4}, {"F", 5}, {"G" WS "H", 6}};
return port::FindWithDefault(kMap, piece, 0);
}
@@ -508,9 +512,10 @@ TEST(SentencepieceProcessorTest, DecodeTest) {
EXPECT_EQ(16, spt.pieces(7).end());
}
-void AddPiece(ModelProto *model_proto, StringPiece piece, float score = 0.0) {
+void AddPiece(ModelProto *model_proto, absl::string_view piece,
+ float score = 0.0) {
auto *sp = model_proto->add_pieces();
- sp->set_piece(piece.to_string());
+ sp->set_piece(std::string(piece));
sp->set_score(score);
}
diff --git a/src/sentencepiece_trainer.cc b/src/sentencepiece_trainer.cc
index 56f43d5..d03c39c 100644
--- a/src/sentencepiece_trainer.cc
+++ b/src/sentencepiece_trainer.cc
@@ -136,14 +136,14 @@ util::Status SentencePieceTrainer::MergeSpecsFromArgs(
if (args.empty()) return util::OkStatus();
for (auto arg : string_util::SplitPiece(args, " ")) {
- arg.Consume("--");
+ string_util::ConsumePrefix(&arg, "--");
std::string key, value;
auto pos = arg.find("=");
- if (pos == StringPiece::npos) {
- key = arg.ToString();
+ if (pos == absl::string_view::npos) {
+ key = std::string(arg);
} else {
- key = arg.substr(0, pos).ToString();
- value = arg.substr(pos + 1).ToString();
+ key = std::string(arg.substr(0, pos));
+ value = std::string(arg.substr(pos + 1));
}
// Exception.
diff --git a/src/stringpiece.h b/src/stringpiece.h
index ca2613d..b237c18 100644
--- a/src/stringpiece.h
+++ b/src/stringpiece.h
@@ -21,21 +21,21 @@
namespace sentencepiece {
-class StringPiece {
+class absl::string_view {
public:
typedef size_t size_type;
// Create an empty slice.
- StringPiece() : data_(""), size_(0) {}
+ absl::string_view() : data_(""), size_(0) {}
// Create a slice that refers to d[0,n-1].
- StringPiece(const char *d, size_t n) : data_(d), size_(n) {}
+ absl::string_view(const char *d, size_t n) : data_(d), size_(n) {}
// Create a slice that refers to the contents of "s"
- StringPiece(const std::string &s) : data_(s.data()), size_(s.size()) {}
+ absl::string_view(const std::string &s) : data_(s.data()), size_(s.size()) {}
// Create a slice that refers to s[0,strlen(s)-1]
- StringPiece(const char *s) : data_(s), size_(strlen(s)) {}
+ absl::string_view(const char *s) : data_(s), size_(strlen(s)) {}
void set(const void *data, size_t len) {
data_ = reinterpret_cast<const char *>(data);
@@ -79,7 +79,7 @@ class StringPiece {
void remove_suffix(size_t n) { size_ -= n; }
- size_type find(StringPiece s, size_type pos = 0) const {
+ size_type find(absl::string_view s, size_type pos = 0) const {
if (size_ <= 0 || pos > static_cast<size_type>(size_)) {
if (size_ == 0 && pos == 0 && s.size_ == 0) {
return 0;
@@ -103,7 +103,7 @@ class StringPiece {
return find(c, pos);
}
- size_type find_first_of(StringPiece s, size_type pos = 0) const {
+ size_type find_first_of(absl::string_view s, size_type pos = 0) const {
if (size_ <= 0 || s.size_ <= 0) {
return npos;
}
@@ -125,7 +125,7 @@ class StringPiece {
return npos;
}
- bool Consume(StringPiece x) {
+ bool Consume(absl::string_view x) {
if (starts_with(x)) {
remove_prefix(x.size_);
return true;
@@ -133,11 +133,11 @@ class StringPiece {
return false;
}
- StringPiece substr(size_type pos, size_type n = npos) const {
+ absl::string_view substr(size_type pos, size_type n = npos) const {
size_type size = static_cast<size_type>(size_);
if (pos > size) pos = size;
if (n > size - pos) n = size - pos;
- return StringPiece(data_ + pos, n);
+ return absl::string_view(data_ + pos, n);
}
// Return a string that contains the copy of the referenced data.
@@ -148,14 +148,14 @@ class StringPiece {
// < 0 iff "*this" < "b",
// == 0 iff "*this" == "b",
// > 0 iff "*this" > "b"
- int compare(StringPiece b) const;
+ int compare(absl::string_view b) const;
// Return true iff "x" is a prefix of "*this"
- bool starts_with(StringPiece x) const {
+ bool starts_with(absl::string_view x) const {
return ((size_ >= x.size_) && (memcmp(data_, x.data_, x.size_) == 0));
}
// Return true iff "x" is a suffix of "*this"
- bool ends_with(StringPiece x) const {
+ bool ends_with(absl::string_view x) const {
return ((size_ >= x.size_) &&
(memcmp(data_ + (size_ - x.size_), x.data_, x.size_) == 0));
}
@@ -186,23 +186,23 @@ class StringPiece {
size_t size_;
};
-inline bool operator==(StringPiece x, StringPiece y) {
+inline bool operator==(absl::string_view x, absl::string_view y) {
return ((x.size() == y.size()) &&
(memcmp(x.data(), y.data(), x.size()) == 0));
}
-inline bool operator!=(StringPiece x, StringPiece y) { return !(x == y); }
+inline bool operator!=(absl::string_view x, absl::string_view y) { return !(x == y); }
-inline bool operator<(StringPiece x, StringPiece y) { return x.compare(y) < 0; }
-inline bool operator>(StringPiece x, StringPiece y) { return x.compare(y) > 0; }
-inline bool operator<=(StringPiece x, StringPiece y) {
+inline bool operator<(absl::string_view x, absl::string_view y) { return x.compare(y) < 0; }
+inline bool operator>(absl::string_view x, absl::string_view y) { return x.compare(y) > 0; }
+inline bool operator<=(absl::string_view x, absl::string_view y) {
return x.compare(y) <= 0;
}
-inline bool operator>=(StringPiece x, StringPiece y) {
+inline bool operator>=(absl::string_view x, absl::string_view y) {
return x.compare(y) >= 0;
}
-inline int StringPiece::compare(StringPiece b) const {
+inline int absl::string_view::compare(absl::string_view b) const {
const size_t min_len = (size_ < b.size_) ? size_ : b.size_;
int r = memcmp(data_, b.data_, min_len);
if (r == 0) {
@@ -215,13 +215,13 @@ inline int StringPiece::compare(StringPiece b) const {
return r;
}
-inline std::ostream &operator<<(std::ostream &o, StringPiece piece) {
+inline std::ostream &operator<<(std::ostream &o, absl::string_view piece) {
return o.write(piece.data(), static_cast<std::streamsize>(piece.size()));
}
-struct StringPieceHash {
+struct absl::string_viewHash {
// DJB hash function.
- inline size_t operator()(const StringPiece &sp) const {
+ inline size_t operator()(const absl::string_view &sp) const {
size_t hash = 5381;
for (size_t i = 0; i < sp.size(); ++i) {
hash = ((hash << 5) + hash) + sp[i];
diff --git a/src/trainer_interface.cc b/src/trainer_interface.cc
index 9cbcefc..19ca8ba 100644
--- a/src/trainer_interface.cc
+++ b/src/trainer_interface.cc
@@ -153,7 +153,7 @@ util::Status TrainerInterface::LoadSentences() {
const bool is_tsv = trainer_spec_.input_format() == "tsv";
- std::set<StringPiece> meta_pieces_set;
+ std::set<absl::string_view> meta_pieces_set;
for (const auto &it : meta_pieces_) meta_pieces_set.insert(it.second.first);
const PrefixMatcher meta_pieces_matcher(meta_pieces_set);
@@ -286,7 +286,7 @@ void TrainerInterface::SplitSentencesByWhitespace() {
std::unordered_map<std::string, int64> tokens;
for (const auto &s : sentences_) {
for (const auto &w : SplitIntoWords(s.first)) {
- tokens[w.to_string()] += s.second;
+ tokens[std::string(w)] += s.second;
}
}
sentences_ = Sorted(tokens);
@@ -345,7 +345,7 @@ util::Status TrainerInterface::Serialize(ModelProto *model_proto) const {
return util::OkStatus();
}
-util::Status TrainerInterface::SaveModel(StringPiece filename) const {
+util::Status TrainerInterface::SaveModel(absl::string_view filename) const {
LOG(INFO) << "Saving model: " << filename;
ModelProto model_proto;
RETURN_IF_ERROR(Serialize(&model_proto));
@@ -356,7 +356,7 @@ util::Status TrainerInterface::SaveModel(StringPiece filename) const {
return util::OkStatus();
}
-util::Status TrainerInterface::SaveVocab(StringPiece filename) const {
+util::Status TrainerInterface::SaveVocab(absl::string_view filename) const {
LOG(INFO) << "Saving vocabs: " << filename;
ModelProto model_proto;
Serialize(&model_proto);
diff --git a/src/trainer_interface.h b/src/trainer_interface.h
index 88dc1ff..b4abb12 100644
--- a/src/trainer_interface.h
+++ b/src/trainer_interface.h
@@ -126,13 +126,13 @@ class TrainerInterface {
util::Status Serialize(ModelProto *model_proto) const;
// Saves the best sentence split with the current model for debugging.
- util::Status SaveSplits(StringPiece filename) const;
+ util::Status SaveSplits(absl::string_view filename) const;
// Saves model file.
- util::Status SaveModel(StringPiece filename) const;
+ util::Status SaveModel(absl::string_view filename) const;
// Saves vocabulary file for NMT.
- util::Status SaveVocab(StringPiece filename) const;
+ util::Status SaveVocab(absl::string_view filename) const;
// Initializes `meta_pieces_` from TrainerSpec.
util::Status InitMetaPieces();
diff --git a/src/unicode_script_test.cc b/src/unicode_script_test.cc
index 56b80ad..6a79cf1 100644
--- a/src/unicode_script_test.cc
+++ b/src/unicode_script_test.cc
@@ -14,13 +14,13 @@
#include "unicode_script.h"
#include "common.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "testharness.h"
#include "util.h"
namespace sentencepiece {
namespace unicode_script {
-ScriptType GetScriptType(StringPiece s) {
+ScriptType GetScriptType(absl::string_view s) {
const auto ut = string_util::UTF8ToUnicodeText(s);
CHECK_EQ(1, ut.size());
return GetScript(ut[0]);
diff --git a/src/unigram_model.cc b/src/unigram_model.cc
index 17365e3..34f0595 100644
--- a/src/unigram_model.cc
+++ b/src/unigram_model.cc
@@ -24,7 +24,7 @@
#include <utility>
#include <vector>
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "util.h"
namespace sentencepiece {
@@ -88,13 +88,13 @@ Lattice::Node *Lattice::NewNode() {
void Lattice::Clear() {
begin_nodes_.clear();
end_nodes_.clear();
- sentence_.clear();
+ sentence_ = absl::string_view("");
surface_.clear();
port::STLDeleteElements(&all_nodes_);
all_nodes_.clear();
}
-void Lattice::SetSentence(StringPiece sentence) {
+void Lattice::SetSentence(absl::string_view sentence) {
Clear();
sentence_ = sentence;
@@ -133,7 +133,7 @@ Lattice::Node *Lattice::Insert(int pos, int length) {
node->length = length;
const int utf8_length =
static_cast<int>(surface(pos + length) - surface(pos));
- node->piece.set(surface(pos), utf8_length);
+ node->piece = absl::string_view(surface(pos), utf8_length);
begin_nodes_[pos].push_back(node);
end_nodes_[pos + node->length].push_back(node);
@@ -424,7 +424,7 @@ void ModelBase::PopulateNodes(Lattice *lattice) const {
}
}
-int ModelBase::PieceToId(StringPiece piece) const {
+int ModelBase::PieceToId(absl::string_view piece) const {
auto it = reserved_id_map_.find(piece);
if (it != reserved_id_map_.end()) {
return it->second;
@@ -434,7 +434,8 @@ int ModelBase::PieceToId(StringPiece piece) const {
return id == -1 ? unk_id_ : id;
}
-void ModelBase::BuildTrie(std::vector<std::pair<StringPiece, int>> *pieces) {
+void ModelBase::BuildTrie(
+ std::vector<std::pair<absl::string_view, int>> *pieces) {
if (!status().ok()) return;
if (pieces->empty()) {
@@ -492,7 +493,7 @@ Model::Model(const ModelProto &model_proto) {
}
}
- std::vector<std::pair<StringPiece, int>> pieces;
+ std::vector<std::pair<absl::string_view, int>> pieces;
for (const auto &it : pieces_) pieces.emplace_back(it.first, it.second);
BuildTrie(&pieces);
@@ -500,7 +501,7 @@ Model::Model(const ModelProto &model_proto) {
Model::~Model() {}
-EncodeResult Model::Encode(StringPiece normalized) const {
+EncodeResult Model::Encode(absl::string_view normalized) const {
if (!status().ok() || normalized.empty()) {
return {};
}
@@ -517,7 +518,7 @@ EncodeResult Model::Encode(StringPiece normalized) const {
return results;
}
-NBestEncodeResult Model::NBestEncode(StringPiece normalized,
+NBestEncodeResult Model::NBestEncode(absl::string_view normalized,
int nbest_size) const {
if (!status().ok() || normalized.empty()) {
return {{{}, 0.0}};
@@ -543,7 +544,8 @@ NBestEncodeResult Model::NBestEncode(StringPiece normalized,
return nbest_results;
}
-EncodeResult Model::SampleEncode(StringPiece normalized, float theta) const {
+EncodeResult Model::SampleEncode(absl::string_view normalized,
+ float theta) const {
if (!status().ok() || normalized.empty()) {
return {};
}
diff --git a/src/unigram_model.h b/src/unigram_model.h
index 50b152a..aee61b6 100644
--- a/src/unigram_model.h
+++ b/src/unigram_model.h
@@ -35,7 +35,7 @@ class Lattice {
virtual ~Lattice();
struct Node {
- StringPiece piece; // Sentence piece representation.
+ absl::string_view piece; // Sentence piece representation.
uint32 pos; // Unicode position in the sentence.
uint32 length; // Unicode length, not UT8 byte.
uint32 node_id; // unique id in the current lattice.
@@ -75,7 +75,7 @@ class Lattice {
void Clear();
// Sets new sentence.
- void SetSentence(StringPiece sentence);
+ void SetSentence(absl::string_view sentence);
// Inserts a new node at [pos, pos + length - 1].
// After calling this method, The caller must set Node::score and Node::id.
@@ -105,7 +105,7 @@ class Lattice {
// Lattice class has the ownership of the returned value.
Node *NewNode();
- StringPiece sentence_;
+ absl::string_view sentence_;
std::vector<const char *> surface_;
std::vector<std::vector<Node *>> begin_nodes_;
std::vector<std::vector<Node *>> end_nodes_;
@@ -135,11 +135,11 @@ class ModelBase : public ModelInterface {
void PopulateNodes(Lattice *lattice) const;
// Returns a vocab id of |piece|.
- int PieceToId(StringPiece piece) const override;
+ int PieceToId(absl::string_view piece) const override;
protected:
// Builds a Trie index.
- void BuildTrie(std::vector<std::pair<StringPiece, int>> *pieces);
+ void BuildTrie(std::vector<std::pair<absl::string_view, int>> *pieces);
float min_score_ = 0.0;
float max_score_ = 0.0;
@@ -156,12 +156,12 @@ class Model : public ModelBase {
explicit Model(const ModelProto &model_proto);
~Model() override;
- EncodeResult Encode(StringPiece normalized) const override;
+ EncodeResult Encode(absl::string_view normalized) const override;
- NBestEncodeResult NBestEncode(StringPiece normalized,
+ NBestEncodeResult NBestEncode(absl::string_view normalized,
int nbest_size) const override;
- EncodeResult SampleEncode(StringPiece normalized, float theta) const override;
+ EncodeResult SampleEncode(absl::string_view normalized, float theta) const override;
};
} // namespace unigram
} // namespace sentencepiece
diff --git a/src/unigram_model_test.cc b/src/unigram_model_test.cc
index 7e604ab..7f99550 100644
--- a/src/unigram_model_test.cc
+++ b/src/unigram_model_test.cc
@@ -175,7 +175,7 @@ TEST(LatticeTest, ViterbiFromIncompleteLatticeTest) {
std::string GetTokenized(const std::vector<Lattice::Node *> &nodes) {
std::vector<std::string> tokens;
for (auto *node : nodes) {
- tokens.push_back(node->piece.to_string());
+ tokens.push_back(std::string(node->piece));
}
return string_util::Join(tokens, " ");
}
diff --git a/src/unigram_model_trainer.cc b/src/unigram_model_trainer.cc
index d3c1326..0974cb4 100644
--- a/src/unigram_model_trainer.cc
+++ b/src/unigram_model_trainer.cc
@@ -92,9 +92,9 @@ void TrainerModel::SetSentencePieces(SentencePieces &&sentencepieces) {
CHECK(!sentencepieces_.empty());
min_score_ = FLT_MAX;
- std::vector<std::pair<StringPiece, int>> pieces;
+ std::vector<std::pair<absl::string_view, int>> pieces;
for (size_t i = 0; i < sentencepieces_.size(); ++i) {
- const StringPiece w = sentencepieces_[i].first; // piece
+ const absl::string_view w = sentencepieces_[i].first; // piece
const float score = sentencepieces_[i].second; // score.
CHECK(!std::isnan(score));
pieces.emplace_back(w, i);
diff --git a/src/unigram_model_trainer.h b/src/unigram_model_trainer.h
index 41138b7..2bd31d0 100644
--- a/src/unigram_model_trainer.h
+++ b/src/unigram_model_trainer.h
@@ -22,7 +22,7 @@
#include <vector>
#include "sentencepiece_model.pb.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "trainer_interface.h"
#include "unigram_model.h"
#include "util.h"
@@ -67,7 +67,7 @@ class TrainerModel : public ModelBase {
bool IsUserDefined(int id) const override { return false; }
- EncodeResult Encode(StringPiece normalized) const override { return {}; }
+ EncodeResult Encode(absl::string_view normalized) const override { return {}; }
private:
SentencePieces sentencepieces_;
diff --git a/src/util.cc b/src/util.cc
index ac0c57e..cbf0d1a 100644
--- a/src/util.cc
+++ b/src/util.cc
@@ -41,12 +41,14 @@ std::vector<std::string> Split(const std::string &str, const std::string &delim,
return SplitInternal<std::string>(str, delim, allow_empty);
}
-std::vector<StringPiece> SplitPiece(StringPiece str, StringPiece delim,
- bool allow_empty) {
- return SplitInternal<StringPiece>(str, delim, allow_empty);
+std::vector<absl::string_view> SplitPiece(absl::string_view str,
+ absl::string_view delim,
+ bool allow_empty) {
+ return SplitInternal<absl::string_view>(str, delim, allow_empty);
}
-std::string Join(const std::vector<std::string> &tokens, StringPiece delim) {
+std::string Join(const std::vector<std::string> &tokens,
+ absl::string_view delim) {
std::string result;
if (!tokens.empty()) {
result.append(tokens[0]);
@@ -58,7 +60,7 @@ std::string Join(const std::vector<std::string> &tokens, StringPiece delim) {
return result;
}
-std::string Join(const std::vector<int> &tokens, StringPiece delim) {
+std::string Join(const std::vector<int> &tokens, absl::string_view delim) {
std::string result;
char buf[32];
if (!tokens.empty()) {
@@ -73,24 +75,25 @@ std::string Join(const std::vector<int> &tokens, StringPiece delim) {
return result;
}
-std::string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
- bool replace_all) {
+std::string StringReplace(absl::string_view s, absl::string_view oldsub,
+ absl::string_view newsub, bool replace_all) {
std::string ret;
StringReplace(s, oldsub, newsub, replace_all, &ret);
return ret;
}
-void StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
- bool replace_all, std::string *res) {
+void StringReplace(absl::string_view s, absl::string_view oldsub,
+ absl::string_view newsub, bool replace_all,
+ std::string *res) {
if (oldsub.empty()) {
res->append(s.data(), s.size());
return;
}
- StringPiece::size_type start_pos = 0;
+ absl::string_view::size_type start_pos = 0;
do {
- const StringPiece::size_type pos = s.find(oldsub, start_pos);
- if (pos == StringPiece::npos) {
+ const absl::string_view::size_type pos = s.find(oldsub, start_pos);
+ if (pos == absl::string_view::npos) {
break;
}
res->append(s.data() + start_pos, pos - start_pos);
@@ -136,7 +139,7 @@ char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen) {
return kUnicodeError;
}
-bool IsStructurallyValid(StringPiece str) {
+bool IsStructurallyValid(absl::string_view str) {
const char *begin = str.data();
const char *end = str.data() + str.size();
size_t mblen = 0;
@@ -188,7 +191,7 @@ size_t EncodeUTF8(char32 c, char *output) {
std::string UnicodeCharToUTF8(const char32 c) { return UnicodeTextToUTF8({c}); }
-UnicodeText UTF8ToUnicodeText(StringPiece utf8) {
+UnicodeText UTF8ToUnicodeText(absl::string_view utf8) {
UnicodeText uc;
const char *begin = utf8.data();
const char *end = utf8.data() + utf8.size();
@@ -214,7 +217,7 @@ std::string UnicodeTextToUTF8(const UnicodeText &utext) {
namespace io {
-InputBuffer::InputBuffer(StringPiece filename)
+InputBuffer::InputBuffer(absl::string_view filename)
: is_(filename.empty() ? &std::cin
: new std::ifstream(WPATH(filename.data()))) {
if (!*is_)
@@ -234,7 +237,7 @@ bool InputBuffer::ReadLine(std::string *line) {
return static_cast<bool>(std::getline(*is_, *line));
}
-OutputBuffer::OutputBuffer(StringPiece filename)
+OutputBuffer::OutputBuffer(absl::string_view filename)
: os_(filename.empty()
? &std::cout
: new std::ofstream(WPATH(filename.data()), OUTPUT_MODE)) {
@@ -251,12 +254,12 @@ OutputBuffer::~OutputBuffer() {
util::Status OutputBuffer::status() const { return status_; }
-bool OutputBuffer::Write(StringPiece text) {
+bool OutputBuffer::Write(absl::string_view text) {
os_->write(text.data(), text.size());
return os_->good();
}
-bool OutputBuffer::WriteLine(StringPiece text) {
+bool OutputBuffer::WriteLine(absl::string_view text) {
return Write(text) && Write("\n");
}
} // namespace io
diff --git a/src/util.h b/src/util.h
index be4e70b..d649e32 100644
--- a/src/util.h
+++ b/src/util.h
@@ -27,7 +27,7 @@
#include <vector>
#include "common.h"
#include "sentencepiece_processor.h"
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
namespace sentencepiece {
@@ -42,31 +42,42 @@ std::ostream &operator<<(std::ostream &out, const std::vector<T> &v) {
// String utilities
namespace string_util {
-inline std::string ToLower(StringPiece arg) {
- std::string lower_value = arg.ToString();
+struct string_view_hash {
+ // DJB hash function.
+ inline size_t operator()(const absl::string_view &sp) const {
+ size_t hash = 5381;
+ for (size_t i = 0; i < sp.size(); ++i) {
+ hash = ((hash << 5) + hash) + sp[i];
+ }
+ return hash;
+ }
+};
+
+inline std::string ToLower(absl::string_view arg) {
+ std::string lower_value = std::string(arg);
std::transform(lower_value.begin(), lower_value.end(), lower_value.begin(),
::tolower);
return lower_value;
}
-inline std::string ToUpper(StringPiece arg) {
- std::string upper_value = arg.ToString();
+inline std::string ToUpper(absl::string_view arg) {
+ std::string upper_value = std::string(arg);
std::transform(upper_value.begin(), upper_value.end(), upper_value.begin(),
::toupper);
return upper_value;
}
template <typename Target>
-inline bool lexical_cast(StringPiece arg, Target *result) {
+inline bool lexical_cast(absl::string_view arg, Target *result) {
std::stringstream ss;
return (ss << arg.data() && ss >> *result);
}
template <>
-inline bool lexical_cast(StringPiece arg, bool *result) {
+inline bool lexical_cast(absl::string_view arg, bool *result) {
const char *kTrue[] = {"1", "t", "true", "y", "yes"};
const char *kFalse[] = {"0", "f", "false", "n", "no"};
- std::string lower_value = arg.ToString();
+ std::string lower_value = std::string(arg);
std::transform(lower_value.begin(), lower_value.end(), lower_value.begin(),
::tolower);
for (size_t i = 0; i < 5; ++i) {
@@ -83,29 +94,32 @@ inline bool lexical_cast(StringPiece arg, bool *result) {
}
template <>
-inline bool lexical_cast(StringPiece arg, std::string *result) {
- *result = arg.ToString();
+inline bool lexical_cast(absl::string_view arg, std::string *result) {
+ *result = std::string(arg);
return true;
}
std::vector<std::string> Split(const std::string &str, const std::string &delim,
bool allow_empty = false);
-std::vector<StringPiece> SplitPiece(StringPiece str, StringPiece delim,
- bool allow_empty = false);
+std::vector<absl::string_view> SplitPiece(absl::string_view str,
+ absl::string_view delim,
+ bool allow_empty = false);
-std::string Join(const std::vector<std::string> &tokens, StringPiece delim);
+std::string Join(const std::vector<std::string> &tokens,
+ absl::string_view delim);
-std::string Join(const std::vector<int> &tokens, StringPiece delim);
+std::string Join(const std::vector<int> &tokens, absl::string_view delim);
-std::string StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
- bool replace_all);
+std::string StringReplace(absl::string_view s, absl::string_view oldsub,
+ absl::string_view newsub, bool replace_all);
-void StringReplace(StringPiece s, StringPiece oldsub, StringPiece newsub,
- bool replace_all, std::string *res);
+void StringReplace(absl::string_view s, absl::string_view oldsub,
+ absl::string_view newsub, bool replace_all,
+ std::string *res);
template <typename T>
-inline bool DecodePOD(StringPiece str, T *result) {
+inline bool DecodePOD(absl::string_view str, T *result) {
CHECK_NOTNULL(result);
if (sizeof(*result) != str.size()) {
return false;
@@ -122,12 +136,22 @@ inline std::string EncodePOD(const T &value) {
return s;
}
-inline bool StartsWith(const StringPiece str, StringPiece prefix) {
- return str.starts_with(prefix);
+inline bool StartsWith(absl::string_view text, absl::string_view prefix) {
+ return prefix.empty() ||
+ (text.size() >= prefix.size() &&
+ memcmp(text.data(), prefix.data(), prefix.size()) == 0);
}
-inline bool EndsWith(const StringPiece str, StringPiece suffix) {
- return str.ends_with(suffix);
+inline bool EndsWith(absl::string_view text, absl::string_view suffix) {
+ return suffix.empty() || (text.size() >= suffix.size() &&
+ memcmp(text.data() + (text.size() - suffix.size()),
+ suffix.data(), suffix.size()) == 0);
+}
+
+inline bool ConsumePrefix(absl::string_view *str, absl::string_view expected) {
+ if (!StartsWith(*str, expected)) return false;
+ str->remove_prefix(expected.size());
+ return true;
}
template <typename T>
@@ -138,7 +162,7 @@ inline std::string IntToHex(T value) {
}
template <typename T>
-inline T HexToInt(StringPiece value) {
+inline T HexToInt(absl::string_view value) {
T n;
std::istringstream is(value.data());
is >> std::hex >> n;
@@ -191,17 +215,17 @@ inline bool IsValidCodepoint(char32 c) {
return (static_cast<uint32>(c) < 0xD800) || (c >= 0xE000 && c <= 0x10FFFF);
}
-bool IsStructurallyValid(StringPiece str);
+bool IsStructurallyValid(absl::string_view str);
using UnicodeText = std::vector<char32>;
char32 DecodeUTF8(const char *begin, const char *end, size_t *mblen);
-inline char32 DecodeUTF8(StringPiece input, size_t *mblen) {
+inline char32 DecodeUTF8(absl::string_view input, size_t *mblen) {
return DecodeUTF8(input.data(), input.data() + input.size(), mblen);
}
-inline bool IsValidDecodeUTF8(StringPiece input, size_t *mblen) {
+inline bool IsValidDecodeUTF8(absl::string_view input, size_t *mblen) {
const char32 c = DecodeUTF8(input, mblen);
return c != kUnicodeError || *mblen == 3;
}
@@ -210,7 +234,7 @@ size_t EncodeUTF8(char32 c, char *output);
std::string UnicodeCharToUTF8(const char32 c);
-UnicodeText UTF8ToUnicodeText(StringPiece utf8);
+UnicodeText UTF8ToUnicodeText(absl::string_view utf8);
std::string UnicodeTextToUTF8(const UnicodeText &utext);
@@ -220,7 +244,7 @@ std::string UnicodeTextToUTF8(const UnicodeText &utext);
namespace io {
class InputBuffer {
public:
- explicit InputBuffer(StringPiece filename);
+ explicit InputBuffer(absl::string_view filename);
util::Status status() const;
~InputBuffer();
bool ReadLine(std::string *line);
@@ -232,11 +256,11 @@ class InputBuffer {
class OutputBuffer {
public:
- explicit OutputBuffer(StringPiece filename);
+ explicit OutputBuffer(absl::string_view filename);
util::Status status() const;
~OutputBuffer();
- bool Write(StringPiece text);
- bool WriteLine(StringPiece text);
+ bool Write(absl::string_view text);
+ bool WriteLine(absl::string_view text);
private:
util::Status status_;
@@ -396,12 +420,12 @@ inline const std::string StrError(int n) {
inline Status OkStatus() { return Status(); }
-#define DECLARE_ERROR(FUNC, CODE) \
- inline util::Status FUNC##Error(StringPiece str) { \
- return util::Status(error::CODE, str.data()); \
- } \
- inline bool Is##FUNC(const util::Status &status) { \
- return status.code() == error::CODE; \
+#define DECLARE_ERROR(FUNC, CODE) \
+ inline util::Status FUNC##Error(absl::string_view str) { \
+ return util::Status(error::CODE, str.data()); \
+ } \
+ inline bool Is##FUNC(const util::Status &status) { \
+ return status.code() == error::CODE; \
}
DECLARE_ERROR(Cancelled, CANCELLED)
diff --git a/src/util_test.cc b/src/util_test.cc
index cc39cbe..ba5e224 100644
--- a/src/util_test.cc
+++ b/src/util_test.cc
@@ -129,7 +129,7 @@ TEST(UtilTest, SplitTest) {
}
TEST(UtilTest, SplitPieceTest) {
- std::vector<StringPiece> tokens;
+ std::vector<absl::string_view> tokens;
tokens = string_util::SplitPiece("this is a\ttest", " \t");
EXPECT_EQ(4, tokens.size());
@@ -195,8 +195,8 @@ TEST(UtilTest, JoinIntTest) {
EXPECT_EQ(string_util::Join(tokens, ""), "102-45");
}
-TEST(UtilTest, StringPieceTest) {
- StringPiece s;
+TEST(UtilTest, StringViewTest) {
+ absl::string_view s;
EXPECT_EQ(0, s.find("", 0));
}
@@ -410,7 +410,7 @@ TEST(UtilTest, UnicodeCharToUTF8Test) {
TEST(UtilTest, IsStructurallyValidTest) {
EXPECT_TRUE(string_util::IsStructurallyValid("abcd"));
EXPECT_TRUE(
- string_util::IsStructurallyValid(StringPiece("a\0cd", 4))); // NUL
+ string_util::IsStructurallyValid(absl::string_view("a\0cd", 4))); // NUL
EXPECT_TRUE(string_util::IsStructurallyValid("ab\xc3\x81")); // 2-byte
EXPECT_TRUE(string_util::IsStructurallyValid("a\xe3\x81\x81")); // 3-byte
EXPECT_TRUE(string_util::IsStructurallyValid("\xf2\x82\x81\x84")); // 4
diff --git a/src/word_model.cc b/src/word_model.cc
index c55934d..bdc79b8 100644
--- a/src/word_model.cc
+++ b/src/word_model.cc
@@ -25,7 +25,7 @@ Model::Model(const ModelProto &model_proto) {
Model::~Model() {}
-EncodeResult Model::Encode(StringPiece normalized) const {
+EncodeResult Model::Encode(absl::string_view normalized) const {
if (!status().ok() || normalized.empty()) {
return {};
}
diff --git a/src/word_model.h b/src/word_model.h
index c645f0f..34470f9 100644
--- a/src/word_model.h
+++ b/src/word_model.h
@@ -27,7 +27,7 @@ class Model : public ModelInterface {
explicit Model(const ModelProto &model_proto);
~Model() override;
- EncodeResult Encode(StringPiece normalized) const override;
+ EncodeResult Encode(absl::string_view normalized) const override;
};
} // namespace word
} // namespace sentencepiece
diff --git a/src/word_model_trainer.cc b/src/word_model_trainer.cc
index 2a21076..23a35ee 100644
--- a/src/word_model_trainer.cc
+++ b/src/word_model_trainer.cc
@@ -18,7 +18,7 @@
#include <string>
#include <unordered_map>
-#include "stringpiece.h"
+#include "third_party/absl/strings/string_view.h"
#include "util.h"
#include "word_model.h"
@@ -38,7 +38,7 @@ util::Status Trainer::Train() {
std::unordered_map<std::string, uint64> freq;
for (const auto &it : sentences_) {
for (const auto &s : SplitIntoWords(it.first)) {
- freq[s.to_string()] += it.second;
+ freq[std::string(s)] += it.second;
}
}