author | Taku Kudo <taku@google.com> | 2018-06-26 08:59:55 +0300
---|---|---
committer | Taku Kudo <taku@google.com> | 2018-06-26 08:59:55 +0300
commit | 81d1a091fbe294886d6d73ccdaf4b645fa8b17b1 (patch) |
tree | 4f47cb2ec157781c55d6666735fabecf94a51f68 /tensorflow |
parent | 65da95da9abc95948cf1876c53c2491aded8dc91 (diff) |
Added Tensorflow module
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/.gitignore | 5
-rw-r--r-- | tensorflow/__init__.py | 0
-rwxr-xr-x | tensorflow/make_py_wheel.sh | 103
-rw-r--r-- | tensorflow/sentencepiece_processor_ops.cc | 532
-rwxr-xr-x | tensorflow/setup.py | 46
-rw-r--r-- | tensorflow/test/__init__.py | 0
-rwxr-xr-x | tensorflow/test/tf_sentencepiece_test.py | 283
-rw-r--r-- | tensorflow/tf_sentencepiece/__init__.py | 5
-rwxr-xr-x | tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so | bin | 0 -> 3508648 bytes
-rw-r--r-- | tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py | 192
10 files changed, 1166 insertions, 0 deletions
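The diff below adds the build script, the C++ op kernels, the tf_sentencepiece Python wrapper, and its tests. As a minimal sketch of how the build is driven: make_py_wheel.sh runs the whole compilation inside a manylinux1 Docker container by default, or directly on the host when given the `native` argument, and setup.py then packages the compiled _sentencepiece_processor_ops.so into a wheel under dist/. The wheel filename shown here is illustrative, not taken from the commit:

    # build inside quay.io/pypa/manylinux1_x86_64 (the script's default path)
    ./make_py_wheel.sh

    # or build directly against the host toolchain
    ./make_py_wheel.sh native

    # install the resulting package (exact filename may differ)
    pip install dist/tf_sentencepiece-0.1.1-py2.py3-none-linux_x86_64.whl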
diff --git a/tensorflow/.gitignore b/tensorflow/.gitignore new file mode 100644 index 0000000..0918968 --- /dev/null +++ b/tensorflow/.gitignore @@ -0,0 +1,5 @@ +build/ +sdist/ +dist/ +tmp/ +*py[cod] diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tensorflow/__init__.py diff --git a/tensorflow/make_py_wheel.sh b/tensorflow/make_py_wheel.sh new file mode 100755 index 0000000..96517df --- /dev/null +++ b/tensorflow/make_py_wheel.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.! + +set -e # exit immediately on error +set -x # display all commands + +PROTOBUF_VERSION=3.6.0 + +run_docker() { + cd `dirname $0` + docker pull $1 + docker run --rm -ti --name tf_sentencepiece \ + -v `pwd`/../:/sentencepiece -w /sentencepiece/tensorflow \ + -td $1 /bin/bash + docker exec tf_sentencepiece bash -c "./build.sh native" + docker stop tf_sentencepiece +} + +build() { + rm -fr tmp + mkdir -p tmp + + export PATH="/opt/python/cp27-cp27mu/bin:${PATH}" + + # Installs necessary libraries under `tmp` sub directory. + cd tmp + + # Install libtool + curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz + tar zxfv libtool-2.4.6.tar.gz + cd libtool-2.4.6 + ./configure + make -j4 + make install + cd .. + + # Install protobuf + curl -L -O https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-cpp-${PROTOBUF_VERSION}.tar.gz + tar zxfv protobuf-cpp-${PROTOBUF_VERSION}.tar.gz + cd protobuf-${PROTOBUF_VERSION} + ./configure --disable-shared --with-pic + make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \ + CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 + make install || true + cd ../.. + + # Install sentencepiece + cd .. + make distclean || true + ./autogen.sh + grep -v PKG_CHECK_MODULES configure > tmp + mv tmp -f configure + chmod +x configure + LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic + make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \ + CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 + make install || true + + # Builds _sentencepiece_processor_ops.so + cd tensorflow + pip install tensorflow + TF_CFLAGS="-I/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow/include" + TF_LFLAGS="-L/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow -ltensorflow_framework" + + g++ -std=c++11 -shared \ + -I../src \ + -fPIC ${TF_CFLAGS[@]} -O2 \ + -D_GLIBCXX_USE_CXX11_ABI=0 \ + -Wl,--whole-archive \ + /usr/local/lib/libprotobuf.a \ + /usr/local/lib/libsentencepiece.a \ + -Wl,--no-whole-archive \ + sentencepiece_processor_ops.cc \ + -o tf_sentencepiece/_sentencepiece_processor_ops.so \ + ${TF_LFLAGS[@]} + strip tf_sentencepiece/_sentencepiece_processor_ops.so + + # Builds Python manylinux wheel package. + python setup.py bdist_wheel --universal --plat-name=linux_x86_64 + python setup.py sdist + + rm -fr build tf_sentencepiece.egg-info tmp + cd .. 
&& make distclean +} + +if [ "$1" = "native" ]; then + build +else + run_docker quay.io/pypa/manylinux1_x86_64 +fi diff --git a/tensorflow/sentencepiece_processor_ops.cc b/tensorflow/sentencepiece_processor_ops.cc new file mode 100644 index 0000000..5d2df57 --- /dev/null +++ b/tensorflow/sentencepiece_processor_ops.cc @@ -0,0 +1,532 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.! + +#include <string> +#include <vector> + +#include "sentencepiece_processor.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor_shape.h" + +typedef int int32; +typedef long long int int64; + +namespace sentencepiece { +using ::tensorflow::DEVICE_CPU; +using ::tensorflow::OpKernel; +using ::tensorflow::OpKernelConstruction; +using ::tensorflow::OpKernelContext; +using ::tensorflow::Tensor; +using ::tensorflow::TensorShapeUtils; +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeHandle; + +namespace { + +// A utility function to convert sentencepiece::util::Status to +// ::tensorflow::Status +::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) { + if (s.ok()) return ::tensorflow::Status(); + return ::tensorflow::Status(static_cast<::tensorflow::error::Code>(s.code()), + ::tensorflow::string(s.error_message())); +} + +// A factory function to initialize SentencePieceProcessor with +// OpKernelConstruction `context`. +enum InitType { GENERAL, ENCODE, DECODE }; // purpose of processor. + +void InitializeModel(OpKernelConstruction* context, + SentencePieceProcessor* sentencepiece_processor, + InitType type) { + std::string model_file_attr, model_proto_attr; + OP_REQUIRES_OK(context, context->GetAttr("model_file", &model_file_attr)); + OP_REQUIRES_OK(context, context->GetAttr("model_proto", &model_proto_attr)); + + if (!model_file_attr.empty()) { + OP_REQUIRES( + context, model_proto_attr.empty(), + ::tensorflow::errors::InvalidArgument( + "`model_proto` must be empty when `model_file` is specified.")); + OP_REQUIRES_OK(context, + ToTFStatus(sentencepiece_processor->Load(model_file_attr))); + } else { + // Loads serialized sentencepiece model proto to enable embedding the + // relatively small sentencepiece model proto into the tensorflow graph + // such that the tensorflow graph is self-contained. + OP_REQUIRES_OK(context, + ToTFStatus(sentencepiece_processor->LoadFromSerializedProto( + model_proto_attr))); + } + + // Sets extra options to add <s>, </s>. 
+ std::string options; + auto add_options = [&options, &context](const std::string& name, + const std::string& v) { + bool flag = false; + OP_REQUIRES_OK(context, context->GetAttr(name, &flag)); + if (flag) { + if (!options.empty()) options += ':'; + options += v; + } + }; + + if (type == ENCODE || type == DECODE) { + add_options("reverse", "reverse"); + } + + if (type == ENCODE) { + add_options("add_bos", "bos"); + add_options("add_eos", "eos"); + OP_REQUIRES_OK( + context, + ToTFStatus(sentencepiece_processor->SetEncodeExtraOptions(options))); + } else if (type == DECODE) { + OP_REQUIRES_OK( + context, + ToTFStatus(sentencepiece_processor->SetDecodeExtraOptions(options))); + } +} +} // namespace + +class SentencePieceGetPieceSizeOp : public OpKernel { + public: + explicit SentencePieceGetPieceSizeOp(OpKernelConstruction* context) + : OpKernel(context) { + SentencePieceProcessor sp; + InitializeModel(context, &sp, GENERAL); + vocab_size_ = sp.GetPieceSize(); + } + + void Compute(OpKernelContext* context) override { + Tensor* vocab_size_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, {}, &vocab_size_tensor)); + vocab_size_tensor->scalar<int32>()() = vocab_size_; + } + + private: + int32 vocab_size_ = 0; +}; + +template <typename S, typename T> +class SentencePieceConvertPieceOp : public OpKernel { + public: + explicit SentencePieceConvertPieceOp(OpKernelConstruction* context) + : OpKernel(context) { + InitializeModel(context, &sentencepiece_processor_, GENERAL); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor = nullptr; + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), + &output_tensor)); + for (int i = 0; i < input_tensor->NumElements(); ++i) + output_tensor->flat<T>()(i) = Convert(input_tensor->flat<S>()(i)); + } + + int32 Convert(const std::string& piece) const { + return sentencepiece_processor_.PieceToId(piece); + } + + std::string Convert(int32 id) const { + if (id >= 0 && id < sentencepiece_processor_.GetPieceSize()) { + return sentencepiece_processor_.IdToPiece(id); + } + return ""; + } + + private: + SentencePieceProcessor sentencepiece_processor_; +}; + +template <typename T> +class SentencePieceEncodeOpBase : public OpKernel { + public: + explicit SentencePieceEncodeOpBase(OpKernelConstruction* context) + : OpKernel(context) { + InitializeModel(context, &sentencepiece_processor_, ENCODE); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor = nullptr; + + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor->shape()), + ::tensorflow::errors::InvalidArgument( + "`input` must be a vector, got shape: ", + input_tensor->shape().DebugString())); + const auto& input_sentences = input_tensor->vec<std::string>(); + const int64 batch_size = input_sentences.size(); + + const Tensor* nbest_size_tensor = nullptr; + OP_REQUIRES_OK(context, context->input("nbest_size", &nbest_size_tensor)); + OP_REQUIRES(context, nbest_size_tensor->dims() <= 1, + ::tensorflow::errors::InvalidArgument( + "`nbest_size` must be a scalar or vector. 
got shape: ", + nbest_size_tensor->shape().DebugString())); + if (nbest_size_tensor->dims() == 1) { + OP_REQUIRES( + context, batch_size == nbest_size_tensor->dim_size(0), + ::tensorflow::errors::InvalidArgument( + "`nbest_size` must have the same batch size as `input`.")); + } + + const Tensor* alpha_tensor = nullptr; + OP_REQUIRES_OK(context, context->input("alpha", &alpha_tensor)); + OP_REQUIRES(context, alpha_tensor->dims() <= 1, + ::tensorflow::errors::InvalidArgument( + "`alpha` must be a scalar or vector, got shape: ", + alpha_tensor->shape().DebugString())); + if (alpha_tensor->dims() == 1) { + OP_REQUIRES(context, batch_size == alpha_tensor->dim_size(0), + ::tensorflow::errors::InvalidArgument( + "`alpha` must have the same batch size as `input`.")); + } + + std::vector<std::vector<T>> pieces(batch_size); + + for (int64 i = 0; i < batch_size; ++i) { + const int32 nbest_size = nbest_size_tensor->dims() == 1 + ? nbest_size_tensor->vec<int32>()(i) + : nbest_size_tensor->scalar<int32>()(); + if (nbest_size == 0 || nbest_size == 1) { + OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Encode( + input_sentences(i), &pieces[i]))); + } else { + const float alpha = alpha_tensor->dims() == 1 + ? alpha_tensor->vec<float>()(i) + : alpha_tensor->scalar<float>()(); + OP_REQUIRES_OK(context, + ToTFStatus(sentencepiece_processor_.SampleEncode( + input_sentences(i), nbest_size, alpha, &pieces[i]))); + } + } + + MakeOutputTensor(context, pieces); + } + + private: + virtual void MakeOutputTensor(OpKernelContext* context, + const std::vector<std::vector<T>>& pieces) = 0; + + SentencePieceProcessor sentencepiece_processor_; +}; + +template <typename T> +class SentencePieceEncodeSparseOp : public SentencePieceEncodeOpBase<T> { + public: + explicit SentencePieceEncodeSparseOp(OpKernelConstruction* context) + : SentencePieceEncodeOpBase<T>(context) {} + + private: + void MakeOutputTensor(OpKernelContext* context, + const std::vector<std::vector<T>>& pieces) override { + const int64 batch_size = pieces.size(); + + int64 max_sequence_length = 0; + int64 indices_size = 0; + for (int row = 0; row < batch_size; ++row) { + const int col_size = pieces[row].size(); + max_sequence_length = std::max<int64>(col_size, max_sequence_length); + indices_size += col_size; + } + + // Creates the indices output tensor. + Tensor* indices_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, {indices_size, 2}, + &indices_tensor)); + + auto indices_tensor_output = indices_tensor->matrix<int64>(); + int item_idx = 0; + for (int row = 0; row < batch_size; ++row) { + for (int col = 0; col < pieces[row].size(); ++col) { + indices_tensor_output(item_idx, 0) = row; + indices_tensor_output(item_idx, 1) = col; + ++item_idx; + } + } + + // Creates the values output tensor. + Tensor* values_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(1, {indices_size}, &values_tensor)); + + auto values_tensor_output = values_tensor->flat<T>(); + item_idx = 0; + for (int row = 0; row < batch_size; ++row) { + std::copy(pieces[row].begin(), pieces[row].end(), + &values_tensor_output(item_idx)); + item_idx += pieces[row].size(); + } + + // Creates the shape output tensor. 
+ Tensor* shape_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(2, {2}, &shape_tensor)); + + auto shape_tensor_output = shape_tensor->flat<int64>(); + shape_tensor_output(0) = batch_size; + shape_tensor_output(1) = max_sequence_length; + } +}; + +template <typename T> +class SentencePieceEncodeDenseOp : public SentencePieceEncodeOpBase<T> { + public: + explicit SentencePieceEncodeDenseOp(OpKernelConstruction* context) + : SentencePieceEncodeOpBase<T>(context) {} + + private: + void MakeOutputTensor(OpKernelContext* context, + const std::vector<std::vector<T>>& pieces) override { + const int64 batch_size = pieces.size(); + + int64 max_sequence_length = 0; + for (int row = 0; row < batch_size; ++row) { + max_sequence_length = + std::max<int64>(pieces[row].size(), max_sequence_length); + } + + Tensor* values_tensor = nullptr; + Tensor* length_tensor = nullptr; + + OP_REQUIRES_OK( + context, context->allocate_output(0, {batch_size, max_sequence_length}, + &values_tensor)); + OP_REQUIRES_OK(context, + context->allocate_output(1, {batch_size}, &length_tensor)); + + auto values_tensor_output = values_tensor->matrix<T>(); + auto length_tensor_output = length_tensor->vec<int32>(); + + for (int row = 0; row < batch_size; ++row) { + for (int col = 0; col < max_sequence_length; ++col) { + values_tensor_output(row, col) = + col < pieces[row].size() ? pieces[row][col] : T(); + } + length_tensor_output(row) = pieces[row].size(); + } + } +}; + +template <typename T> +class SentencePieceDecodeOp : public OpKernel { + public: + explicit SentencePieceDecodeOp(OpKernelConstruction* context) + : OpKernel(context) { + InitializeModel(context, &sentencepiece_processor_, DECODE); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor = nullptr; + const Tensor* length_tensor = nullptr; + + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor->shape()), + ::tensorflow::errors::InvalidArgument( + "`input` must be a 2-D matrix. got shape: ", + input_tensor->shape().DebugString())); + OP_REQUIRES_OK(context, context->input("sequence_length", &length_tensor)); + OP_REQUIRES(context, TensorShapeUtils::IsVector(length_tensor->shape()), + ::tensorflow::errors::InvalidArgument( + "`sequence_length` must be a vector. 
got shape: ", + length_tensor->shape().DebugString())); + OP_REQUIRES( + context, input_tensor->dim_size(0) == length_tensor->dim_size(0), + ::tensorflow::errors::InvalidArgument( + "`sequence_length` must have the same batch size as `input`.")); + + const auto& input_sentences = input_tensor->matrix<T>(); + const auto& sequence_length = length_tensor->vec<int32>(); + const int64 batch_size = input_tensor->dim_size(0); + const int max_sequence_length = input_tensor->dim_size(1); + + Tensor* values_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, {batch_size}, &values_tensor)); + auto values_tensor_output = values_tensor->vec<std::string>(); + + for (int64 i = 0; i < batch_size; ++i) { + OP_REQUIRES(context, + (sequence_length(i) >= 0 && + sequence_length(i) <= max_sequence_length), + ::tensorflow::errors::InvalidArgument( + "`sequence_length` is out-of-range.")); + const std::vector<T> pieces(&input_sentences(i, 0), + &input_sentences(i, 0) + sequence_length(i)); + OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Decode( + pieces, &values_tensor_output(i)))); + } + } + + private: + SentencePieceProcessor sentencepiece_processor_; +}; + +namespace { +// The snake case of this variables are used as the function names. +constexpr char kGetPieceSizeOpName[] = "SentencepieceGetPieceSize"; +constexpr char kPieceToIdOpName[] = "SentencepiecePieceToId"; +constexpr char kIdToPieceOpName[] = "SentencepieceIdToPiece"; +constexpr char kEncodeDenseOpName[] = "SentencepieceEncodeDense"; +constexpr char kEncodeSparseOpName[] = "SentencepieceEncodeSparse"; +constexpr char kDecodeOpName[] = "SentencepieceDecode"; +} // namespace + +REGISTER_OP(kGetPieceSizeOpName) + .Output("vocab_size: int32") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->MakeShape({})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kGetPieceSizeOpName).Device(DEVICE_CPU), + SentencePieceGetPieceSizeOp); + +REGISTER_OP(kPieceToIdOpName) + .Input("input: string") + .Output("values: int32") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->input(0)); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kPieceToIdOpName).Device(DEVICE_CPU), + SentencePieceConvertPieceOp<std::string, int32>); + +REGISTER_OP(kIdToPieceOpName) + .Input("input: int32") + .Output("values: string") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->input(0)); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kIdToPieceOpName).Device(DEVICE_CPU), + SentencePieceConvertPieceOp<int32, std::string>); + +REGISTER_OP(kEncodeDenseOpName) + .Attr("out_type: {int32, string} = DT_INT32") + .Input("input: string") + .Input("nbest_size: int32") + .Input("alpha: float") + .Output("values: out_type") + .Output("sequence_length: int32") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .Attr("reverse: bool = false") + .Attr("add_bos: bool = false") + .Attr("add_eos: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input, nbest, alpha; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha)); + DimensionHandle batch_size = c->Dim(input, 0); + if 
(c->Rank(nbest) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size)); + if (c->Rank(alpha) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size)); + c->set_output(0, c->MakeShape({batch_size, c->UnknownDim()})); + c->set_output(1, c->MakeShape({batch_size})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<int32>("out_type"), + SentencePieceEncodeDenseOp<int32>); + +REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<std::string>("out_type"), + SentencePieceEncodeDenseOp<std::string>); + +REGISTER_OP(kEncodeSparseOpName) + .Attr("out_type: {int32, string} = DT_INT32") + .Input("input: string") + .Input("nbest_size: int32") + .Input("alpha: float") + .Output("indices: int64") + .Output("values: out_type") + .Output("dense_shape: int64") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .Attr("reverse: bool = false") + .Attr("add_bos: bool = false") + .Attr("add_eos: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input, nbest, alpha; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha)); + DimensionHandle batch_size = c->Dim(input, 0); + if (c->Rank(nbest) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size)); + if (c->Rank(alpha) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size)); + c->set_output(0, c->MakeShape({c->UnknownDim(), 2})); + c->set_output(1, c->MakeShape({c->UnknownDim()})); + c->set_output(2, c->MakeShape({2})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<int32>("out_type"), + SentencePieceEncodeSparseOp<int32>); + +REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<std::string>("out_type"), + SentencePieceEncodeSparseOp<std::string>); + +REGISTER_OP(kDecodeOpName) + .Attr("T: {int32, string}") + .Input("input: T") + .Input("sequence_length: int32") + .Output("values: string") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .Attr("reverse: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input, sequence_length; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length)); + DimensionHandle batch_size = c->Dim(input, 0); + TF_RETURN_IF_ERROR( + c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size)); + c->set_output(0, c->MakeShape({batch_size})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER( + Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<int32>("T"), + SentencePieceDecodeOp<int32>); + +REGISTER_KERNEL_BUILDER( + Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<std::string>("T"), + SentencePieceDecodeOp<std::string>); +} // namespace sentencepiece diff --git a/tensorflow/setup.py b/tensorflow/setup.py new file mode 100755 index 0000000..aca3285 --- /dev/null +++ b/tensorflow/setup.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.! + +from setuptools import setup +from setuptools import find_packages +import string +import sys + +sys.path.append('./test') + +setup(name = 'tf_sentencepiece', + author = 'Taku Kudo', + author_email='taku@google.com', + description = 'SentencePiece Encode/Decode ops for TensorFlow', + version='0.1.1', + url = 'https://github.com/google/sentencepiece', + license = 'Apache', + platforms = 'Unix', + packages=find_packages(exclude=['test']), + package_data={'tf_sentencepiece': ['_sentencepiece_processor_ops.so']}, + classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: Unix', + 'Programming Language :: Python', + 'Topic :: Text Processing :: Linguistic', + 'Topic :: Software Development :: Libraries :: Python Modules' + ], + keywords='tensorflow machine learning sentencepiece NLP segmentation', + test_suite = 'tf_sentencepiece_test.suite') diff --git a/tensorflow/test/__init__.py b/tensorflow/test/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tensorflow/test/__init__.py diff --git a/tensorflow/test/tf_sentencepiece_test.py b/tensorflow/test/tf_sentencepiece_test.py new file mode 100755 index 0000000..e1a7b52 --- /dev/null +++ b/tensorflow/test/tf_sentencepiece_test.py @@ -0,0 +1,283 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import itertools as it +import os +import unittest +import tensorflow as tf +import sentencepiece as spm +import tf_sentencepiece as tfspm + +class SentencePieceProcssorOpTest(unittest.TestCase): + + def _getSentencePieceModelFile(self): + return '../python/test/test_ja_model.model' + + def _getExpected(self, processor, reverse=False, add_bos=False, + add_eos=False, padding=''): + options = [] + if reverse: + options.append('reverse') + if add_bos: + options.append('bos') + if add_eos: + options.append('eos') + + processor.SetEncodeExtraOptions(':'.join(options)) + processor.SetDecodeExtraOptions(':'.join(options)) + + sentences = ['Hello world.', 'I have a pen.', + 'I saw a girl with a telescope.'] + pieces = [] + ids = [] + seq_len = [] + + for s in sentences: + x = processor.EncodeAsPieces(s) + y = processor.EncodeAsIds(s) + pieces.append(x) + ids.append(y) + seq_len.append(len(x)) + self.assertEqual(len(x), len(y)) + + # padding + max_len = max(seq_len) + pieces = [x + [padding] * (max_len - len(x)) for x in pieces] + ids = [x + [0] * (max_len - len(x)) for x in ids] + + return sentences, pieces, ids, seq_len + + def testGetPieceSize(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + + with tf.Session(): + s = tfspm.piece_size( + model_file=sentencepiece_model_file) + self.assertEqual(s.eval(), processor.GetPieceSize()) + + def testConvertPiece(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + (sentences, 
expected_pieces, + expected_ids, expected_seq_len) = self._getExpected(processor, + padding='<unk>') + + with tf.Session(): + ids_matrix = tfspm.piece_to_id( + tf.constant(expected_pieces), + model_file=sentencepiece_model_file) + ids_vec = tfspm.piece_to_id( + tf.constant(expected_pieces[0]), + model_file=sentencepiece_model_file) + ids_scalar = tfspm.piece_to_id( + tf.constant(expected_pieces[0][0]), + model_file=sentencepiece_model_file) + + self.assertEqual(ids_matrix.eval().tolist(), expected_ids) + self.assertEqual(ids_vec.eval().tolist(), expected_ids[0]) + self.assertEqual(ids_scalar.eval(), expected_ids[0][0]) + + pieces_matrix = tfspm.id_to_piece( + tf.constant(expected_ids), + model_file=sentencepiece_model_file) + pieces_vec = tfspm.id_to_piece( + tf.constant(expected_ids[0]), + model_file=sentencepiece_model_file) + pieces_scalar = tfspm.id_to_piece( + tf.constant(expected_ids[0][0]), + model_file=sentencepiece_model_file) + + self.assertEqual(pieces_matrix.eval().tolist(), expected_pieces) + self.assertEqual(pieces_vec.eval().tolist(), expected_pieces[0]) + self.assertEqual(pieces_scalar.eval(), expected_pieces[0][0]) + + + def testEncodeAndDecode(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + + with tf.Session(): + for reverse, add_bos, add_eos in list(it.product( + (True, False), repeat=3)): + (sentences, expected_pieces, + expected_ids, expected_seq_len) = self._getExpected( + processor, reverse, add_bos, add_eos) + + # Encode sentences into pieces/ids. + s = tf.constant(sentences) + pieces, seq_len1 = tfspm.encode( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=tf.string) + ids, seq_len2 = tfspm.encode( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos) + + self.assertEqual(pieces.eval().tolist(), expected_pieces) + self.assertEqual(ids.eval().tolist(), expected_ids) + self.assertEqual(seq_len1.eval().tolist(), expected_seq_len) + self.assertEqual(seq_len2.eval().tolist(), expected_seq_len) + + # Decode pieces into sentences/ids. + pieces = tf.constant(expected_pieces) + ids = tf.constant(expected_ids) + seq_len = tf.constant(expected_seq_len, dtype=tf.int32) + decoded_sentences1 = tfspm.decode( + pieces, seq_len, model_file=sentencepiece_model_file, + reverse=reverse) + decoded_sentences2 = tfspm.decode( + ids, seq_len, model_file=sentencepiece_model_file, + reverse=reverse) + + self.assertEqual(decoded_sentences1.eval().tolist(), sentences) + self.assertEqual(decoded_sentences2.eval().tolist(), sentences) + + def testSampleEncodeAndDecode(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + sentences, _, _, _ = self._getExpected(processor) + + with tf.Session(): + for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]: + # Round trip test. 
+ nbest_size = tf.constant(n) + alpha = tf.constant(a) + s = tf.constant(sentences) + + pieces, seq_len1 = tfspm.encode( + s, nbest_size=nbest_size, alpha=alpha, + model_file=sentencepiece_model_file, out_type=tf.string) + ids, seq_len2 = tfspm.encode( + s, nbest_size=nbest_size, alpha=alpha, + model_file=sentencepiece_model_file) + decoded_sentences1 = tfspm.decode( + pieces, seq_len1, model_file=sentencepiece_model_file) + decoded_sentences2 = tfspm.decode( + ids, seq_len2, model_file=sentencepiece_model_file) + + self.assertEqual(decoded_sentences1.eval().tolist(), sentences) + self.assertEqual(decoded_sentences2.eval().tolist(), sentences) + + def testEncodeAndDecodeSparse(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + + with tf.Session(): + for reverse, add_bos, add_eos in list(it.product( + (True, False), repeat=3)): + (sentences, expected_pieces, expected_ids, + _) = self._getExpected(processor, reverse, add_bos, add_eos) + + # Encode sentences into sparse pieces/ids. + s = tf.constant(sentences) + pieces = tfspm.encode_sparse( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=tf.string) + ids = tfspm.encode_sparse( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos) + pieces = tf.sparse_tensor_to_dense(pieces, default_value='') + ids = tf.sparse_tensor_to_dense(ids, default_value=0) + + self.assertEqual(ids.eval().tolist(), expected_ids) + self.assertEqual(pieces.eval().tolist(), expected_pieces) + + def testLoadModelProto(self): + # Makes a serialized model proto. + model_proto = open(self._getSentencePieceModelFile(), 'rb').read() + with tf.Session() as sess: + sentences = ['Hello world.'] + a = tf.constant(sentences) + sess.run(tfspm.encode( + a, model_proto=model_proto, + out_type=tf.string)) + + def testInvalidModelPath(self): + with tf.Session() as sess: + with self.assertRaises(tf.errors.NotFoundError): + sentences = ['Hello world.'] + a = tf.constant(sentences) + sess.run(tfspm.encode( + a, model_file='invalid path', out_type=tf.string)) + + def testInvalidModelProto(self): + with tf.Session() as sess: + with self.assertRaises(tf.errors.InternalError): + sentences = ['Hello world.'] + a = tf.constant(sentences) + sess.run(tfspm.encode( + a, model_proto='invalid proto', out_type=tf.string)) + + def testInvalidInput(self): + sentences = ['Hello world.', 'This is a test.'] + ids = [[0,1],[2,3]] + model_file = self._getSentencePieceModelFile() + with tf.Session() as sess: + a = tf.constant(sentences) + b = tf.constant(ids) + + alpha = tf.constant([1.0, 2.0]) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha, name='foo')) + + nbest_size = tf.constant([1, 2], dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size, name='foo')) + + alpha = tf.constant(1.0) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha, name='foo')) + + nbest_size = tf.constant(10, dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size, name='foo')) + + sess.run(tfspm.decode( + b, sequence_length=tf.constant([2, 2]), model_file=model_file)) + + with self.assertRaises(ValueError): + a = tf.constant(sentences) + alpha = tf.constant([1.0, 2.0, 3.0]) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha)) + with self.assertRaises(ValueError): + a = tf.constant(sentences) + nbest_size = 
tf.constant([1, 2, 3], dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size)) + with self.assertRaises(ValueError): + a = tf.constant(sentences) + alpha = tf.constant([[1.0], [2.0]]) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha)) + with self.assertRaises(ValueError): + a = tf.constant(sentences) + nbest_size = tf.constant([[1], [2]], dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size)) + with self.assertRaises(ValueError): + b = tf.constant(ids) + sess.run(tfspm.decode( + a, sequence_length=2, model_file=model_file)) + with self.assertRaises(ValueError): + b = tf.constant(ids) + sess.run(tfspm.decode( + a, sequence_length=tf.constant([2, 2, 2]), + model_file=model_file)) + + +def suite(): + suite = unittest.TestSuite() + suite.addTests(unittest.makeSuite(SentencePieceProcssorOpTest)) + return suite + + +if __name__ == '__main__': + unittest.main() diff --git a/tensorflow/tf_sentencepiece/__init__.py b/tensorflow/tf_sentencepiece/__init__.py new file mode 100644 index 0000000..8a3df31 --- /dev/null +++ b/tensorflow/tf_sentencepiece/__init__.py @@ -0,0 +1,5 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tf_sentencepiece.sentencepiece_processor_ops import * diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so Binary files differnew file mode 100755 index 0000000..db7fe23 --- /dev/null +++ b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so diff --git a/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py b/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py new file mode 100644 index 0000000..baafd6c --- /dev/null +++ b/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py @@ -0,0 +1,192 @@ +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.! + +r"""Ops for SentencePiece Encoding/Decoding.""" + +# TODO(taku): Implements n-best output + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tensorflow as tf + +_gen_sentencepiece_processor_op = tf.load_op_library( + os.path.join(os.path.dirname(__file__), '_sentencepiece_processor_ops.so')) + + +def piece_size(model_file=None, model_proto=None, name=None): + """Returns the piece size (vocabulary size). + + Args: + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + name: The name argument that is passed to the op function. + Returns: + A scalar representing the vocabulary size. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_get_piece_size( + model_file=model_file, model_proto=model_proto, name=name) + + +def piece_to_id(input, model_file=None, model_proto=None, name=None): + """Converts piece into vocabulary id. 
+ + Args: + input: An arbitrary tensor of string. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + name: The name argument that is passed to the op function. + Returns: + A tensor of int32 with the same shape as input. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_piece_to_id( + input, model_file=model_file, model_proto=model_proto, name=name) + + +def id_to_piece(input, model_file=None, model_proto=None, name=None): + """Converts vocabulary id into piece. + + Args: + input: An arbitrary tensor of int32. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + name: The name argument that is passed to the op function. + Returns: + A tensor of string with the same shape as input. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_id_to_piece( + input, model_file=model_file, model_proto=model_proto, name=name) + + +def encode_dense(input_sentences, nbest_size=0, alpha=1.0, + model_file=None, model_proto=None, + reverse=False, add_bos=False, add_eos=False, + out_type=tf.int32, name=None): + """Encodes sentences into pieces in dense tensor format. + + Args: + input_sentences: A 1D string tensor of arbitrary size holding the raw + text of input sentences. + nbest_size: A scalar or 1D tensor for sampling. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. + nbest_size < 0: assuming that nbest_size is infinite + and samples from the all hypothesis (lattice) using + forward-filtering-and-backward-sampling algorithm. + alpha: A scalar or 1D tensor for a moothing parameter. + Inverse temparature for probablity rescaling. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + reverse: Reverses the tokenized sequence (Default = false) + add_bos: Add <s> to the result (Default = false) + add_eos: Add </s> to the result (Default = false) + <s>/</s> is added after reversing (if enabled). + out_type: output type. tf.int32 or tf.string (Default = tf.int32) + Setting tf.int32 directly encodes the string into an id sequence. + name: The name argument that is passed to the op function. + Returns: + pieces: A dense 2D tensor representing the tokenized sentences. + sequence_length: A 1D tensor representing the length of pieces. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_encode_dense( + input_sentences, nbest_size=nbest_size, alpha=alpha, + model_file=model_file, model_proto=model_proto, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=out_type, name=name) + +# Adds an alias for encode_dense. Accepts the `encode` function. +encode = encode_dense + + +def encode_sparse(input_sentences, nbest_size=0, alpha=1.0, + model_file=None, model_proto=None, + reverse=False, add_bos=False, add_eos=False, + out_type=tf.int32, name=None): + """Encodes sentences into pieces in sparse tensor format. + + Args: + input_sentences: A 1D string tensor of arbitrary size holding the raw + text of input sentences. + nbest_size: A scalar or 1D tensor for sampling. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. 
+ nbest_size < 0: assuming that nbest_size is infinite + and samples from the all hypothesis (lattice) using + forward-filtering-and-backward-sampling algorithm. + alpha: A scalar or 1D tensor for a moothing parameter. + Inverse temparature for probablity rescaling. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + reverse: Reverses the tokenized sequence (Default = false) + add_bos: Add <s> to the result (Default = false) + add_eos: Add </s> to the result (Default = false) + <s>/</s> is added after reversing (if enabled). + out_type: output type. tf.int32 or tf.string (Default = tf.int32) + Setting tf.int32 directly encodes the string into an id sequence. + name: The name argument that is passed to the op function. + + Returns: + pieces: A sparse 2D tensor representing the tokenized sentences. + """ + + indices, values, dense_shape = ( + _gen_sentencepiece_processor_op.sentencepiece_encode_sparse( + input_sentences, nbest_size=nbest_size, alpha=alpha, + model_file=model_file, model_proto=model_proto, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=out_type, name=name)) + return tf.SparseTensor(indices, values, dense_shape) + + +def decode(pieces, sequence_length, model_file=None, model_proto=None, + reverse=False, name=None): + """Decode pieces into postproecssed text. + + Args: + pieces: A 2D int32 or string tensor [batch_size x max_length] of + encoded sequences. + sequence_length: A 1D int32 tensor [batch_size] representing the + length of pieces. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + reverse: Reverses the tokenized sequence (Default = false) + name: The name argument that is passed to the op function. + + Returns: + text: A 1D string tensor of decoded string. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_decode( + pieces, sequence_length, model_file=model_file, + model_proto=model_proto, reverse=reverse, name=name) + + +tf.NotDifferentiable('SentencepieceGetPieceSize') +tf.NotDifferentiable('SentencepieceIdToPiece') +tf.NotDifferentiable('SentencepiecePieceToId') +tf.NotDifferentiable('SentencepieceEncodeDense') +tf.NotDifferentiable('SentencepieceEncodeSparse') +tf.NotDifferentiable('SentencepieceDecode') |
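Once the wheel is installed, the ops are reached through the tf_sentencepiece wrapper documented above. A minimal usage sketch in the graph-mode style of the bundled test, assuming an already trained SentencePiece model whose path here ('m.model') is only a placeholder:

    import tensorflow as tf
    import tf_sentencepiece as tfspm

    model_file = 'm.model'  # placeholder: any trained SentencePiece model

    sentences = tf.constant(['Hello world.', 'I have a pen.'])

    # Encode to an id matrix (default out_type=tf.int32) and to pieces.
    ids, seq_len = tfspm.encode(sentences, model_file=model_file)
    pieces, _ = tfspm.encode(sentences, model_file=model_file,
                             out_type=tf.string, add_bos=True, add_eos=True)

    # Round-trip: decode the dense id matrix back into text.
    detok = tfspm.decode(ids, seq_len, model_file=model_file)

    with tf.Session() as sess:
        print(sess.run([ids, seq_len, detok]))

encode_sparse returns a tf.SparseTensor instead of a dense matrix, as exercised in testEncodeAndDecodeSparse above.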