author | Taku Kudo <taku@google.com> | 2018-06-26 08:59:55 +0300
---|---|---
committer | Taku Kudo <taku@google.com> | 2018-06-26 08:59:55 +0300
commit | 81d1a091fbe294886d6d73ccdaf4b645fa8b17b1 (patch) |
tree | 4f47cb2ec157781c55d6666735fabecf94a51f68 /tensorflow |
parent | 65da95da9abc95948cf1876c53c2491aded8dc91 (diff) |
Added Tensorflow module
Diffstat (limited to 'tensorflow')
-rw-r--r-- | tensorflow/.gitignore | 5
-rw-r--r-- | tensorflow/__init__.py | 0
-rwxr-xr-x | tensorflow/make_py_wheel.sh | 103
-rw-r--r-- | tensorflow/sentencepiece_processor_ops.cc | 532
-rwxr-xr-x | tensorflow/setup.py | 46
-rw-r--r-- | tensorflow/test/__init__.py | 0
-rwxr-xr-x | tensorflow/test/tf_sentencepiece_test.py | 283
-rw-r--r-- | tensorflow/tf_sentencepiece/__init__.py | 5
-rwxr-xr-x | tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so | bin | 0 -> 3508648 bytes
-rw-r--r-- | tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py | 192
10 files changed, 1166 insertions, 0 deletions
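The diff below adds the build script, the C++ op kernels, the tf_sentencepiece Python wrapper, and its tests. As a minimal sketch of how the build is driven: make_py_wheel.sh runs the whole compilation inside a manylinux1 Docker container by default, or directly on the host when given the `native` argument, and setup.py then packages the compiled _sentencepiece_processor_ops.so into a wheel under dist/. The wheel filename shown here is illustrative, not taken from the commit:

    # build inside quay.io/pypa/manylinux1_x86_64 (the script's default path)
    ./make_py_wheel.sh

    # or build directly against the host toolchain
    ./make_py_wheel.sh native

    # install the resulting package (exact filename may differ)
    pip install dist/tf_sentencepiece-0.1.1-py2.py3-none-linux_x86_64.whl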
diff --git a/tensorflow/.gitignore b/tensorflow/.gitignore new file mode 100644 index 0000000..0918968 --- /dev/null +++ b/tensorflow/.gitignore @@ -0,0 +1,5 @@ +build/ +sdist/ +dist/ +tmp/ +*py[cod] diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tensorflow/__init__.py diff --git a/tensorflow/make_py_wheel.sh b/tensorflow/make_py_wheel.sh new file mode 100755 index 0000000..96517df --- /dev/null +++ b/tensorflow/make_py_wheel.sh @@ -0,0 +1,103 @@ +#!/bin/bash + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.! + +set -e # exit immediately on error +set -x # display all commands + +PROTOBUF_VERSION=3.6.0 + +run_docker() { + cd `dirname $0` + docker pull $1 + docker run --rm -ti --name tf_sentencepiece \ + -v `pwd`/../:/sentencepiece -w /sentencepiece/tensorflow \ + -td $1 /bin/bash + docker exec tf_sentencepiece bash -c "./build.sh native" + docker stop tf_sentencepiece +} + +build() { + rm -fr tmp + mkdir -p tmp + + export PATH="/opt/python/cp27-cp27mu/bin:${PATH}" + + # Installs necessary libraries under `tmp` sub directory. + cd tmp + + # Install libtool + curl -L -O http://ftpmirror.gnu.org/libtool/libtool-2.4.6.tar.gz + tar zxfv libtool-2.4.6.tar.gz + cd libtool-2.4.6 + ./configure + make -j4 + make install + cd .. + + # Install protobuf + curl -L -O https://github.com/google/protobuf/releases/download/v${PROTOBUF_VERSION}/protobuf-cpp-${PROTOBUF_VERSION}.tar.gz + tar zxfv protobuf-cpp-${PROTOBUF_VERSION}.tar.gz + cd protobuf-${PROTOBUF_VERSION} + ./configure --disable-shared --with-pic + make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \ + CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 + make install || true + cd ../.. + + # Install sentencepiece + cd .. + make distclean || true + ./autogen.sh + grep -v PKG_CHECK_MODULES configure > tmp + mv tmp -f configure + chmod +x configure + LIBS+="-pthread -L/usr/local/lib -lprotobuf" ./configure --disable-shared --with-pic + make CXXFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" \ + CFLAGS+="-std=c++11 -O3 -D_GLIBCXX_USE_CXX11_ABI=0" -j4 + make install || true + + # Builds _sentencepiece_processor_ops.so + cd tensorflow + pip install tensorflow + TF_CFLAGS="-I/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow/include" + TF_LFLAGS="-L/opt/python/cp27-cp27mu/lib/python2.7/site-packages/tensorflow -ltensorflow_framework" + + g++ -std=c++11 -shared \ + -I../src \ + -fPIC ${TF_CFLAGS[@]} -O2 \ + -D_GLIBCXX_USE_CXX11_ABI=0 \ + -Wl,--whole-archive \ + /usr/local/lib/libprotobuf.a \ + /usr/local/lib/libsentencepiece.a \ + -Wl,--no-whole-archive \ + sentencepiece_processor_ops.cc \ + -o tf_sentencepiece/_sentencepiece_processor_ops.so \ + ${TF_LFLAGS[@]} + strip tf_sentencepiece/_sentencepiece_processor_ops.so + + # Builds Python manylinux wheel package. + python setup.py bdist_wheel --universal --plat-name=linux_x86_64 + python setup.py sdist + + rm -fr build tf_sentencepiece.egg-info tmp + cd .. 
&& make distclean +} + +if [ "$1" = "native" ]; then + build +else + run_docker quay.io/pypa/manylinux1_x86_64 +fi diff --git a/tensorflow/sentencepiece_processor_ops.cc b/tensorflow/sentencepiece_processor_ops.cc new file mode 100644 index 0000000..5d2df57 --- /dev/null +++ b/tensorflow/sentencepiece_processor_ops.cc @@ -0,0 +1,532 @@ +// Copyright 2016 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License.! + +#include <string> +#include <vector> + +#include "sentencepiece_processor.h" +#include "tensorflow/core/framework/op.h" +#include "tensorflow/core/framework/op_kernel.h" +#include "tensorflow/core/framework/shape_inference.h" +#include "tensorflow/core/framework/tensor_shape.h" + +typedef int int32; +typedef long long int int64; + +namespace sentencepiece { +using ::tensorflow::DEVICE_CPU; +using ::tensorflow::OpKernel; +using ::tensorflow::OpKernelConstruction; +using ::tensorflow::OpKernelContext; +using ::tensorflow::Tensor; +using ::tensorflow::TensorShapeUtils; +using ::tensorflow::shape_inference::DimensionHandle; +using ::tensorflow::shape_inference::InferenceContext; +using ::tensorflow::shape_inference::ShapeHandle; + +namespace { + +// A utility function to convert sentencepiece::util::Status to +// ::tensorflow::Status +::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) { + if (s.ok()) return ::tensorflow::Status(); + return ::tensorflow::Status(static_cast<::tensorflow::error::Code>(s.code()), + ::tensorflow::string(s.error_message())); +} + +// A factory function to initialize SentencePieceProcessor with +// OpKernelConstruction `context`. +enum InitType { GENERAL, ENCODE, DECODE }; // purpose of processor. + +void InitializeModel(OpKernelConstruction* context, + SentencePieceProcessor* sentencepiece_processor, + InitType type) { + std::string model_file_attr, model_proto_attr; + OP_REQUIRES_OK(context, context->GetAttr("model_file", &model_file_attr)); + OP_REQUIRES_OK(context, context->GetAttr("model_proto", &model_proto_attr)); + + if (!model_file_attr.empty()) { + OP_REQUIRES( + context, model_proto_attr.empty(), + ::tensorflow::errors::InvalidArgument( + "`model_proto` must be empty when `model_file` is specified.")); + OP_REQUIRES_OK(context, + ToTFStatus(sentencepiece_processor->Load(model_file_attr))); + } else { + // Loads serialized sentencepiece model proto to enable embedding the + // relatively small sentencepiece model proto into the tensorflow graph + // such that the tensorflow graph is self-contained. + OP_REQUIRES_OK(context, + ToTFStatus(sentencepiece_processor->LoadFromSerializedProto( + model_proto_attr))); + } + + // Sets extra options to add <s>, </s>. 
+ std::string options; + auto add_options = [&options, &context](const std::string& name, + const std::string& v) { + bool flag = false; + OP_REQUIRES_OK(context, context->GetAttr(name, &flag)); + if (flag) { + if (!options.empty()) options += ':'; + options += v; + } + }; + + if (type == ENCODE || type == DECODE) { + add_options("reverse", "reverse"); + } + + if (type == ENCODE) { + add_options("add_bos", "bos"); + add_options("add_eos", "eos"); + OP_REQUIRES_OK( + context, + ToTFStatus(sentencepiece_processor->SetEncodeExtraOptions(options))); + } else if (type == DECODE) { + OP_REQUIRES_OK( + context, + ToTFStatus(sentencepiece_processor->SetDecodeExtraOptions(options))); + } +} +} // namespace + +class SentencePieceGetPieceSizeOp : public OpKernel { + public: + explicit SentencePieceGetPieceSizeOp(OpKernelConstruction* context) + : OpKernel(context) { + SentencePieceProcessor sp; + InitializeModel(context, &sp, GENERAL); + vocab_size_ = sp.GetPieceSize(); + } + + void Compute(OpKernelContext* context) override { + Tensor* vocab_size_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, {}, &vocab_size_tensor)); + vocab_size_tensor->scalar<int32>()() = vocab_size_; + } + + private: + int32 vocab_size_ = 0; +}; + +template <typename S, typename T> +class SentencePieceConvertPieceOp : public OpKernel { + public: + explicit SentencePieceConvertPieceOp(OpKernelConstruction* context) + : OpKernel(context) { + InitializeModel(context, &sentencepiece_processor_, GENERAL); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor = nullptr; + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + + Tensor* output_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), + &output_tensor)); + for (int i = 0; i < input_tensor->NumElements(); ++i) + output_tensor->flat<T>()(i) = Convert(input_tensor->flat<S>()(i)); + } + + int32 Convert(const std::string& piece) const { + return sentencepiece_processor_.PieceToId(piece); + } + + std::string Convert(int32 id) const { + if (id >= 0 && id < sentencepiece_processor_.GetPieceSize()) { + return sentencepiece_processor_.IdToPiece(id); + } + return ""; + } + + private: + SentencePieceProcessor sentencepiece_processor_; +}; + +template <typename T> +class SentencePieceEncodeOpBase : public OpKernel { + public: + explicit SentencePieceEncodeOpBase(OpKernelConstruction* context) + : OpKernel(context) { + InitializeModel(context, &sentencepiece_processor_, ENCODE); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor = nullptr; + + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor->shape()), + ::tensorflow::errors::InvalidArgument( + "`input` must be a vector, got shape: ", + input_tensor->shape().DebugString())); + const auto& input_sentences = input_tensor->vec<std::string>(); + const int64 batch_size = input_sentences.size(); + + const Tensor* nbest_size_tensor = nullptr; + OP_REQUIRES_OK(context, context->input("nbest_size", &nbest_size_tensor)); + OP_REQUIRES(context, nbest_size_tensor->dims() <= 1, + ::tensorflow::errors::InvalidArgument( + "`nbest_size` must be a scalar or vector. 
got shape: ", + nbest_size_tensor->shape().DebugString())); + if (nbest_size_tensor->dims() == 1) { + OP_REQUIRES( + context, batch_size == nbest_size_tensor->dim_size(0), + ::tensorflow::errors::InvalidArgument( + "`nbest_size` must have the same batch size as `input`.")); + } + + const Tensor* alpha_tensor = nullptr; + OP_REQUIRES_OK(context, context->input("alpha", &alpha_tensor)); + OP_REQUIRES(context, alpha_tensor->dims() <= 1, + ::tensorflow::errors::InvalidArgument( + "`alpha` must be a scalar or vector, got shape: ", + alpha_tensor->shape().DebugString())); + if (alpha_tensor->dims() == 1) { + OP_REQUIRES(context, batch_size == alpha_tensor->dim_size(0), + ::tensorflow::errors::InvalidArgument( + "`alpha` must have the same batch size as `input`.")); + } + + std::vector<std::vector<T>> pieces(batch_size); + + for (int64 i = 0; i < batch_size; ++i) { + const int32 nbest_size = nbest_size_tensor->dims() == 1 + ? nbest_size_tensor->vec<int32>()(i) + : nbest_size_tensor->scalar<int32>()(); + if (nbest_size == 0 || nbest_size == 1) { + OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Encode( + input_sentences(i), &pieces[i]))); + } else { + const float alpha = alpha_tensor->dims() == 1 + ? alpha_tensor->vec<float>()(i) + : alpha_tensor->scalar<float>()(); + OP_REQUIRES_OK(context, + ToTFStatus(sentencepiece_processor_.SampleEncode( + input_sentences(i), nbest_size, alpha, &pieces[i]))); + } + } + + MakeOutputTensor(context, pieces); + } + + private: + virtual void MakeOutputTensor(OpKernelContext* context, + const std::vector<std::vector<T>>& pieces) = 0; + + SentencePieceProcessor sentencepiece_processor_; +}; + +template <typename T> +class SentencePieceEncodeSparseOp : public SentencePieceEncodeOpBase<T> { + public: + explicit SentencePieceEncodeSparseOp(OpKernelConstruction* context) + : SentencePieceEncodeOpBase<T>(context) {} + + private: + void MakeOutputTensor(OpKernelContext* context, + const std::vector<std::vector<T>>& pieces) override { + const int64 batch_size = pieces.size(); + + int64 max_sequence_length = 0; + int64 indices_size = 0; + for (int row = 0; row < batch_size; ++row) { + const int col_size = pieces[row].size(); + max_sequence_length = std::max<int64>(col_size, max_sequence_length); + indices_size += col_size; + } + + // Creates the indices output tensor. + Tensor* indices_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(0, {indices_size, 2}, + &indices_tensor)); + + auto indices_tensor_output = indices_tensor->matrix<int64>(); + int item_idx = 0; + for (int row = 0; row < batch_size; ++row) { + for (int col = 0; col < pieces[row].size(); ++col) { + indices_tensor_output(item_idx, 0) = row; + indices_tensor_output(item_idx, 1) = col; + ++item_idx; + } + } + + // Creates the values output tensor. + Tensor* values_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(1, {indices_size}, &values_tensor)); + + auto values_tensor_output = values_tensor->flat<T>(); + item_idx = 0; + for (int row = 0; row < batch_size; ++row) { + std::copy(pieces[row].begin(), pieces[row].end(), + &values_tensor_output(item_idx)); + item_idx += pieces[row].size(); + } + + // Creates the shape output tensor. 
+ Tensor* shape_tensor = nullptr; + OP_REQUIRES_OK(context, context->allocate_output(2, {2}, &shape_tensor)); + + auto shape_tensor_output = shape_tensor->flat<int64>(); + shape_tensor_output(0) = batch_size; + shape_tensor_output(1) = max_sequence_length; + } +}; + +template <typename T> +class SentencePieceEncodeDenseOp : public SentencePieceEncodeOpBase<T> { + public: + explicit SentencePieceEncodeDenseOp(OpKernelConstruction* context) + : SentencePieceEncodeOpBase<T>(context) {} + + private: + void MakeOutputTensor(OpKernelContext* context, + const std::vector<std::vector<T>>& pieces) override { + const int64 batch_size = pieces.size(); + + int64 max_sequence_length = 0; + for (int row = 0; row < batch_size; ++row) { + max_sequence_length = + std::max<int64>(pieces[row].size(), max_sequence_length); + } + + Tensor* values_tensor = nullptr; + Tensor* length_tensor = nullptr; + + OP_REQUIRES_OK( + context, context->allocate_output(0, {batch_size, max_sequence_length}, + &values_tensor)); + OP_REQUIRES_OK(context, + context->allocate_output(1, {batch_size}, &length_tensor)); + + auto values_tensor_output = values_tensor->matrix<T>(); + auto length_tensor_output = length_tensor->vec<int32>(); + + for (int row = 0; row < batch_size; ++row) { + for (int col = 0; col < max_sequence_length; ++col) { + values_tensor_output(row, col) = + col < pieces[row].size() ? pieces[row][col] : T(); + } + length_tensor_output(row) = pieces[row].size(); + } + } +}; + +template <typename T> +class SentencePieceDecodeOp : public OpKernel { + public: + explicit SentencePieceDecodeOp(OpKernelConstruction* context) + : OpKernel(context) { + InitializeModel(context, &sentencepiece_processor_, DECODE); + } + + void Compute(OpKernelContext* context) override { + const Tensor* input_tensor = nullptr; + const Tensor* length_tensor = nullptr; + + OP_REQUIRES_OK(context, context->input("input", &input_tensor)); + OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor->shape()), + ::tensorflow::errors::InvalidArgument( + "`input` must be a 2-D matrix. got shape: ", + input_tensor->shape().DebugString())); + OP_REQUIRES_OK(context, context->input("sequence_length", &length_tensor)); + OP_REQUIRES(context, TensorShapeUtils::IsVector(length_tensor->shape()), + ::tensorflow::errors::InvalidArgument( + "`sequence_length` must be a vector. 
got shape: ", + length_tensor->shape().DebugString())); + OP_REQUIRES( + context, input_tensor->dim_size(0) == length_tensor->dim_size(0), + ::tensorflow::errors::InvalidArgument( + "`sequence_length` must have the same batch size as `input`.")); + + const auto& input_sentences = input_tensor->matrix<T>(); + const auto& sequence_length = length_tensor->vec<int32>(); + const int64 batch_size = input_tensor->dim_size(0); + const int max_sequence_length = input_tensor->dim_size(1); + + Tensor* values_tensor = nullptr; + OP_REQUIRES_OK(context, + context->allocate_output(0, {batch_size}, &values_tensor)); + auto values_tensor_output = values_tensor->vec<std::string>(); + + for (int64 i = 0; i < batch_size; ++i) { + OP_REQUIRES(context, + (sequence_length(i) >= 0 && + sequence_length(i) <= max_sequence_length), + ::tensorflow::errors::InvalidArgument( + "`sequence_length` is out-of-range.")); + const std::vector<T> pieces(&input_sentences(i, 0), + &input_sentences(i, 0) + sequence_length(i)); + OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_.Decode( + pieces, &values_tensor_output(i)))); + } + } + + private: + SentencePieceProcessor sentencepiece_processor_; +}; + +namespace { +// The snake case of this variables are used as the function names. +constexpr char kGetPieceSizeOpName[] = "SentencepieceGetPieceSize"; +constexpr char kPieceToIdOpName[] = "SentencepiecePieceToId"; +constexpr char kIdToPieceOpName[] = "SentencepieceIdToPiece"; +constexpr char kEncodeDenseOpName[] = "SentencepieceEncodeDense"; +constexpr char kEncodeSparseOpName[] = "SentencepieceEncodeSparse"; +constexpr char kDecodeOpName[] = "SentencepieceDecode"; +} // namespace + +REGISTER_OP(kGetPieceSizeOpName) + .Output("vocab_size: int32") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->MakeShape({})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kGetPieceSizeOpName).Device(DEVICE_CPU), + SentencePieceGetPieceSizeOp); + +REGISTER_OP(kPieceToIdOpName) + .Input("input: string") + .Output("values: int32") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->input(0)); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kPieceToIdOpName).Device(DEVICE_CPU), + SentencePieceConvertPieceOp<std::string, int32>); + +REGISTER_OP(kIdToPieceOpName) + .Input("input: int32") + .Output("values: string") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .SetShapeFn([](InferenceContext* c) { + c->set_output(0, c->input(0)); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kIdToPieceOpName).Device(DEVICE_CPU), + SentencePieceConvertPieceOp<int32, std::string>); + +REGISTER_OP(kEncodeDenseOpName) + .Attr("out_type: {int32, string} = DT_INT32") + .Input("input: string") + .Input("nbest_size: int32") + .Input("alpha: float") + .Output("values: out_type") + .Output("sequence_length: int32") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .Attr("reverse: bool = false") + .Attr("add_bos: bool = false") + .Attr("add_eos: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input, nbest, alpha; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha)); + DimensionHandle batch_size = c->Dim(input, 0); + if 
(c->Rank(nbest) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size)); + if (c->Rank(alpha) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size)); + c->set_output(0, c->MakeShape({batch_size, c->UnknownDim()})); + c->set_output(1, c->MakeShape({batch_size})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<int32>("out_type"), + SentencePieceEncodeDenseOp<int32>); + +REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<std::string>("out_type"), + SentencePieceEncodeDenseOp<std::string>); + +REGISTER_OP(kEncodeSparseOpName) + .Attr("out_type: {int32, string} = DT_INT32") + .Input("input: string") + .Input("nbest_size: int32") + .Input("alpha: float") + .Output("indices: int64") + .Output("values: out_type") + .Output("dense_shape: int64") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .Attr("reverse: bool = false") + .Attr("add_bos: bool = false") + .Attr("add_eos: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input, nbest, alpha; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest)); + TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha)); + DimensionHandle batch_size = c->Dim(input, 0); + if (c->Rank(nbest) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size)); + if (c->Rank(alpha) == 1) + TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size)); + c->set_output(0, c->MakeShape({c->UnknownDim(), 2})); + c->set_output(1, c->MakeShape({c->UnknownDim()})); + c->set_output(2, c->MakeShape({2})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<int32>("out_type"), + SentencePieceEncodeSparseOp<int32>); + +REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName) + .Device(DEVICE_CPU) + .TypeConstraint<std::string>("out_type"), + SentencePieceEncodeSparseOp<std::string>); + +REGISTER_OP(kDecodeOpName) + .Attr("T: {int32, string}") + .Input("input: T") + .Input("sequence_length: int32") + .Output("values: string") + .Attr("model_file: string = ''") + .Attr("model_proto: string = ''") + .Attr("reverse: bool = false") + .SetShapeFn([](InferenceContext* c) { + ShapeHandle input, sequence_length; + TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input)); + TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length)); + DimensionHandle batch_size = c->Dim(input, 0); + TF_RETURN_IF_ERROR( + c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size)); + c->set_output(0, c->MakeShape({batch_size})); + return ::tensorflow::Status::OK(); + }); + +REGISTER_KERNEL_BUILDER( + Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<int32>("T"), + SentencePieceDecodeOp<int32>); + +REGISTER_KERNEL_BUILDER( + Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<std::string>("T"), + SentencePieceDecodeOp<std::string>); +} // namespace sentencepiece diff --git a/tensorflow/setup.py b/tensorflow/setup.py new file mode 100755 index 0000000..aca3285 --- /dev/null +++ b/tensorflow/setup.py @@ -0,0 +1,46 @@ +#!/usr/bin/env python + +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.! + +from setuptools import setup +from setuptools import find_packages +import string +import sys + +sys.path.append('./test') + +setup(name = 'tf_sentencepiece', + author = 'Taku Kudo', + author_email='taku@google.com', + description = 'SentencePiece Encode/Decode ops for TensorFlow', + version='0.1.1', + url = 'https://github.com/google/sentencepiece', + license = 'Apache', + platforms = 'Unix', + packages=find_packages(exclude=['test']), + package_data={'tf_sentencepiece': ['_sentencepiece_processor_ops.so']}, + classifiers = [ + 'Development Status :: 5 - Production/Stable', + 'Environment :: Console', + 'Intended Audience :: Developers', + 'Intended Audience :: Science/Research', + 'License :: OSI Approved :: Apache Software License', + 'Operating System :: Unix', + 'Programming Language :: Python', + 'Topic :: Text Processing :: Linguistic', + 'Topic :: Software Development :: Libraries :: Python Modules' + ], + keywords='tensorflow machine learning sentencepiece NLP segmentation', + test_suite = 'tf_sentencepiece_test.suite') diff --git a/tensorflow/test/__init__.py b/tensorflow/test/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/tensorflow/test/__init__.py diff --git a/tensorflow/test/tf_sentencepiece_test.py b/tensorflow/test/tf_sentencepiece_test.py new file mode 100755 index 0000000..e1a7b52 --- /dev/null +++ b/tensorflow/test/tf_sentencepiece_test.py @@ -0,0 +1,283 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +import itertools as it +import os +import unittest +import tensorflow as tf +import sentencepiece as spm +import tf_sentencepiece as tfspm + +class SentencePieceProcssorOpTest(unittest.TestCase): + + def _getSentencePieceModelFile(self): + return '../python/test/test_ja_model.model' + + def _getExpected(self, processor, reverse=False, add_bos=False, + add_eos=False, padding=''): + options = [] + if reverse: + options.append('reverse') + if add_bos: + options.append('bos') + if add_eos: + options.append('eos') + + processor.SetEncodeExtraOptions(':'.join(options)) + processor.SetDecodeExtraOptions(':'.join(options)) + + sentences = ['Hello world.', 'I have a pen.', + 'I saw a girl with a telescope.'] + pieces = [] + ids = [] + seq_len = [] + + for s in sentences: + x = processor.EncodeAsPieces(s) + y = processor.EncodeAsIds(s) + pieces.append(x) + ids.append(y) + seq_len.append(len(x)) + self.assertEqual(len(x), len(y)) + + # padding + max_len = max(seq_len) + pieces = [x + [padding] * (max_len - len(x)) for x in pieces] + ids = [x + [0] * (max_len - len(x)) for x in ids] + + return sentences, pieces, ids, seq_len + + def testGetPieceSize(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + + with tf.Session(): + s = tfspm.piece_size( + model_file=sentencepiece_model_file) + self.assertEqual(s.eval(), processor.GetPieceSize()) + + def testConvertPiece(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + (sentences, 
expected_pieces, + expected_ids, expected_seq_len) = self._getExpected(processor, + padding='<unk>') + + with tf.Session(): + ids_matrix = tfspm.piece_to_id( + tf.constant(expected_pieces), + model_file=sentencepiece_model_file) + ids_vec = tfspm.piece_to_id( + tf.constant(expected_pieces[0]), + model_file=sentencepiece_model_file) + ids_scalar = tfspm.piece_to_id( + tf.constant(expected_pieces[0][0]), + model_file=sentencepiece_model_file) + + self.assertEqual(ids_matrix.eval().tolist(), expected_ids) + self.assertEqual(ids_vec.eval().tolist(), expected_ids[0]) + self.assertEqual(ids_scalar.eval(), expected_ids[0][0]) + + pieces_matrix = tfspm.id_to_piece( + tf.constant(expected_ids), + model_file=sentencepiece_model_file) + pieces_vec = tfspm.id_to_piece( + tf.constant(expected_ids[0]), + model_file=sentencepiece_model_file) + pieces_scalar = tfspm.id_to_piece( + tf.constant(expected_ids[0][0]), + model_file=sentencepiece_model_file) + + self.assertEqual(pieces_matrix.eval().tolist(), expected_pieces) + self.assertEqual(pieces_vec.eval().tolist(), expected_pieces[0]) + self.assertEqual(pieces_scalar.eval(), expected_pieces[0][0]) + + + def testEncodeAndDecode(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + + with tf.Session(): + for reverse, add_bos, add_eos in list(it.product( + (True, False), repeat=3)): + (sentences, expected_pieces, + expected_ids, expected_seq_len) = self._getExpected( + processor, reverse, add_bos, add_eos) + + # Encode sentences into pieces/ids. + s = tf.constant(sentences) + pieces, seq_len1 = tfspm.encode( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=tf.string) + ids, seq_len2 = tfspm.encode( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos) + + self.assertEqual(pieces.eval().tolist(), expected_pieces) + self.assertEqual(ids.eval().tolist(), expected_ids) + self.assertEqual(seq_len1.eval().tolist(), expected_seq_len) + self.assertEqual(seq_len2.eval().tolist(), expected_seq_len) + + # Decode pieces into sentences/ids. + pieces = tf.constant(expected_pieces) + ids = tf.constant(expected_ids) + seq_len = tf.constant(expected_seq_len, dtype=tf.int32) + decoded_sentences1 = tfspm.decode( + pieces, seq_len, model_file=sentencepiece_model_file, + reverse=reverse) + decoded_sentences2 = tfspm.decode( + ids, seq_len, model_file=sentencepiece_model_file, + reverse=reverse) + + self.assertEqual(decoded_sentences1.eval().tolist(), sentences) + self.assertEqual(decoded_sentences2.eval().tolist(), sentences) + + def testSampleEncodeAndDecode(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + sentences, _, _, _ = self._getExpected(processor) + + with tf.Session(): + for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]: + # Round trip test. 
+ nbest_size = tf.constant(n) + alpha = tf.constant(a) + s = tf.constant(sentences) + + pieces, seq_len1 = tfspm.encode( + s, nbest_size=nbest_size, alpha=alpha, + model_file=sentencepiece_model_file, out_type=tf.string) + ids, seq_len2 = tfspm.encode( + s, nbest_size=nbest_size, alpha=alpha, + model_file=sentencepiece_model_file) + decoded_sentences1 = tfspm.decode( + pieces, seq_len1, model_file=sentencepiece_model_file) + decoded_sentences2 = tfspm.decode( + ids, seq_len2, model_file=sentencepiece_model_file) + + self.assertEqual(decoded_sentences1.eval().tolist(), sentences) + self.assertEqual(decoded_sentences2.eval().tolist(), sentences) + + def testEncodeAndDecodeSparse(self): + sentencepiece_model_file = self._getSentencePieceModelFile() + processor = spm.SentencePieceProcessor() + processor.Load(sentencepiece_model_file) + + with tf.Session(): + for reverse, add_bos, add_eos in list(it.product( + (True, False), repeat=3)): + (sentences, expected_pieces, expected_ids, + _) = self._getExpected(processor, reverse, add_bos, add_eos) + + # Encode sentences into sparse pieces/ids. + s = tf.constant(sentences) + pieces = tfspm.encode_sparse( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=tf.string) + ids = tfspm.encode_sparse( + s, model_file=sentencepiece_model_file, + reverse=reverse, add_bos=add_bos, add_eos=add_eos) + pieces = tf.sparse_tensor_to_dense(pieces, default_value='') + ids = tf.sparse_tensor_to_dense(ids, default_value=0) + + self.assertEqual(ids.eval().tolist(), expected_ids) + self.assertEqual(pieces.eval().tolist(), expected_pieces) + + def testLoadModelProto(self): + # Makes a serialized model proto. + model_proto = open(self._getSentencePieceModelFile(), 'rb').read() + with tf.Session() as sess: + sentences = ['Hello world.'] + a = tf.constant(sentences) + sess.run(tfspm.encode( + a, model_proto=model_proto, + out_type=tf.string)) + + def testInvalidModelPath(self): + with tf.Session() as sess: + with self.assertRaises(tf.errors.NotFoundError): + sentences = ['Hello world.'] + a = tf.constant(sentences) + sess.run(tfspm.encode( + a, model_file='invalid path', out_type=tf.string)) + + def testInvalidModelProto(self): + with tf.Session() as sess: + with self.assertRaises(tf.errors.InternalError): + sentences = ['Hello world.'] + a = tf.constant(sentences) + sess.run(tfspm.encode( + a, model_proto='invalid proto', out_type=tf.string)) + + def testInvalidInput(self): + sentences = ['Hello world.', 'This is a test.'] + ids = [[0,1],[2,3]] + model_file = self._getSentencePieceModelFile() + with tf.Session() as sess: + a = tf.constant(sentences) + b = tf.constant(ids) + + alpha = tf.constant([1.0, 2.0]) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha, name='foo')) + + nbest_size = tf.constant([1, 2], dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size, name='foo')) + + alpha = tf.constant(1.0) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha, name='foo')) + + nbest_size = tf.constant(10, dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size, name='foo')) + + sess.run(tfspm.decode( + b, sequence_length=tf.constant([2, 2]), model_file=model_file)) + + with self.assertRaises(ValueError): + a = tf.constant(sentences) + alpha = tf.constant([1.0, 2.0, 3.0]) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha)) + with self.assertRaises(ValueError): + a = tf.constant(sentences) + nbest_size = 
tf.constant([1, 2, 3], dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size)) + with self.assertRaises(ValueError): + a = tf.constant(sentences) + alpha = tf.constant([[1.0], [2.0]]) + sess.run(tfspm.encode( + a, model_file=model_file, alpha=alpha)) + with self.assertRaises(ValueError): + a = tf.constant(sentences) + nbest_size = tf.constant([[1], [2]], dtype=tf.int32) + sess.run(tfspm.encode( + a, model_file=model_file, nbest_size=nbest_size)) + with self.assertRaises(ValueError): + b = tf.constant(ids) + sess.run(tfspm.decode( + a, sequence_length=2, model_file=model_file)) + with self.assertRaises(ValueError): + b = tf.constant(ids) + sess.run(tfspm.decode( + a, sequence_length=tf.constant([2, 2, 2]), + model_file=model_file)) + + +def suite(): + suite = unittest.TestSuite() + suite.addTests(unittest.makeSuite(SentencePieceProcssorOpTest)) + return suite + + +if __name__ == '__main__': + unittest.main() diff --git a/tensorflow/tf_sentencepiece/__init__.py b/tensorflow/tf_sentencepiece/__init__.py new file mode 100644 index 0000000..8a3df31 --- /dev/null +++ b/tensorflow/tf_sentencepiece/__init__.py @@ -0,0 +1,5 @@ +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +from tf_sentencepiece.sentencepiece_processor_ops import * diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so Binary files differnew file mode 100755 index 0000000..db7fe23 --- /dev/null +++ b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so diff --git a/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py b/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py new file mode 100644 index 0000000..baafd6c --- /dev/null +++ b/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py @@ -0,0 +1,192 @@ +# Copyright 2018 Google Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.! + +r"""Ops for SentencePiece Encoding/Decoding.""" + +# TODO(taku): Implements n-best output + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import os +import tensorflow as tf + +_gen_sentencepiece_processor_op = tf.load_op_library( + os.path.join(os.path.dirname(__file__), '_sentencepiece_processor_ops.so')) + + +def piece_size(model_file=None, model_proto=None, name=None): + """Returns the piece size (vocabulary size). + + Args: + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + name: The name argument that is passed to the op function. + Returns: + A scalar representing the vocabulary size. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_get_piece_size( + model_file=model_file, model_proto=model_proto, name=name) + + +def piece_to_id(input, model_file=None, model_proto=None, name=None): + """Converts piece into vocabulary id. 
+ + Args: + input: An arbitrary tensor of string. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + name: The name argument that is passed to the op function. + Returns: + A tensor of int32 with the same shape as input. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_piece_to_id( + input, model_file=model_file, model_proto=model_proto, name=name) + + +def id_to_piece(input, model_file=None, model_proto=None, name=None): + """Converts vocabulary id into piece. + + Args: + input: An arbitrary tensor of int32. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + name: The name argument that is passed to the op function. + Returns: + A tensor of string with the same shape as input. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_id_to_piece( + input, model_file=model_file, model_proto=model_proto, name=name) + + +def encode_dense(input_sentences, nbest_size=0, alpha=1.0, + model_file=None, model_proto=None, + reverse=False, add_bos=False, add_eos=False, + out_type=tf.int32, name=None): + """Encodes sentences into pieces in dense tensor format. + + Args: + input_sentences: A 1D string tensor of arbitrary size holding the raw + text of input sentences. + nbest_size: A scalar or 1D tensor for sampling. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. + nbest_size < 0: assuming that nbest_size is infinite + and samples from the all hypothesis (lattice) using + forward-filtering-and-backward-sampling algorithm. + alpha: A scalar or 1D tensor for a moothing parameter. + Inverse temparature for probablity rescaling. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + reverse: Reverses the tokenized sequence (Default = false) + add_bos: Add <s> to the result (Default = false) + add_eos: Add </s> to the result (Default = false) + <s>/</s> is added after reversing (if enabled). + out_type: output type. tf.int32 or tf.string (Default = tf.int32) + Setting tf.int32 directly encodes the string into an id sequence. + name: The name argument that is passed to the op function. + Returns: + pieces: A dense 2D tensor representing the tokenized sentences. + sequence_length: A 1D tensor representing the length of pieces. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_encode_dense( + input_sentences, nbest_size=nbest_size, alpha=alpha, + model_file=model_file, model_proto=model_proto, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=out_type, name=name) + +# Adds an alias for encode_dense. Accepts the `encode` function. +encode = encode_dense + + +def encode_sparse(input_sentences, nbest_size=0, alpha=1.0, + model_file=None, model_proto=None, + reverse=False, add_bos=False, add_eos=False, + out_type=tf.int32, name=None): + """Encodes sentences into pieces in sparse tensor format. + + Args: + input_sentences: A 1D string tensor of arbitrary size holding the raw + text of input sentences. + nbest_size: A scalar or 1D tensor for sampling. + nbest_size = {0,1}: No sampling is performed. + nbest_size > 1: samples from the nbest_size results. 
+ nbest_size < 0: assuming that nbest_size is infinite + and samples from the all hypothesis (lattice) using + forward-filtering-and-backward-sampling algorithm. + alpha: A scalar or 1D tensor for a moothing parameter. + Inverse temparature for probablity rescaling. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + reverse: Reverses the tokenized sequence (Default = false) + add_bos: Add <s> to the result (Default = false) + add_eos: Add </s> to the result (Default = false) + <s>/</s> is added after reversing (if enabled). + out_type: output type. tf.int32 or tf.string (Default = tf.int32) + Setting tf.int32 directly encodes the string into an id sequence. + name: The name argument that is passed to the op function. + + Returns: + pieces: A sparse 2D tensor representing the tokenized sentences. + """ + + indices, values, dense_shape = ( + _gen_sentencepiece_processor_op.sentencepiece_encode_sparse( + input_sentences, nbest_size=nbest_size, alpha=alpha, + model_file=model_file, model_proto=model_proto, + reverse=reverse, add_bos=add_bos, add_eos=add_eos, + out_type=out_type, name=name)) + return tf.SparseTensor(indices, values, dense_shape) + + +def decode(pieces, sequence_length, model_file=None, model_proto=None, + reverse=False, name=None): + """Decode pieces into postproecssed text. + + Args: + pieces: A 2D int32 or string tensor [batch_size x max_length] of + encoded sequences. + sequence_length: A 1D int32 tensor [batch_size] representing the + length of pieces. + model_file: The sentencepiece model file path. + model_proto: The sentencepiece model serialized proto. + Either `model_file` or `model_proto` must be set. + reverse: Reverses the tokenized sequence (Default = false) + name: The name argument that is passed to the op function. + + Returns: + text: A 1D string tensor of decoded string. + """ + + return _gen_sentencepiece_processor_op.sentencepiece_decode( + pieces, sequence_length, model_file=model_file, + model_proto=model_proto, reverse=reverse, name=name) + + +tf.NotDifferentiable('SentencepieceGetPieceSize') +tf.NotDifferentiable('SentencepieceIdToPiece') +tf.NotDifferentiable('SentencepiecePieceToId') +tf.NotDifferentiable('SentencepieceEncodeDense') +tf.NotDifferentiable('SentencepieceEncodeSparse') +tf.NotDifferentiable('SentencepieceDecode') |
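Once the wheel is installed, the ops are reached through the tf_sentencepiece wrapper documented above. A minimal usage sketch in the graph-mode style of the bundled test, assuming an already trained SentencePiece model whose path here ('m.model') is only a placeholder:

    import tensorflow as tf
    import tf_sentencepiece as tfspm

    model_file = 'm.model'  # placeholder: any trained SentencePiece model

    sentences = tf.constant(['Hello world.', 'I have a pen.'])

    # Encode to an id matrix (default out_type=tf.int32) and to pieces.
    ids, seq_len = tfspm.encode(sentences, model_file=model_file)
    pieces, _ = tfspm.encode(sentences, model_file=model_file,
                             out_type=tf.string, add_bos=True, add_eos=True)

    # Round-trip: decode the dense id matrix back into text.
    detok = tfspm.decode(ids, seq_len, model_file=model_file)

    with tf.Session() as sess:
        print(sess.run([ids, seq_len, detok]))

encode_sparse returns a tf.SparseTensor instead of a dense matrix, as exercised in testEncodeAndDecodeSparse above.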