Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/sentencepiece.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTaku Kudo <taku@google.com>2020-10-12 19:22:31 +0300
committerTaku Kudo <taku@google.com>2020-10-12 19:22:31 +0300
commit8f5610375816b7ae4b9999f6984c0f14810d1a25 (patch)
treedd8327bbe29d4dfe228b8d3fb5d3408b15877029
parent9cf136582d9cce492ba5a0cfb775f9e777fe07ea (diff)
remove tensorflow module
-rw-r--r--.travis.yml18
-rw-r--r--tensorflow/__init__.py0
-rwxr-xr-xtensorflow/make_py_wheel.sh88
-rwxr-xr-xtensorflow/make_py_wheel_mac.sh116
-rw-r--r--tensorflow/sentencepiece_processor_ops.cc652
-rwxr-xr-xtensorflow/setup.py52
-rw-r--r--tensorflow/test/__init__.py0
-rwxr-xr-xtensorflow/test/tf_sentencepiece_test.py322
-rw-r--r--tensorflow/tf_sentencepiece/__init__.py5
-rwxr-xr-xtensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.0bin943032 -> 0 bytes
-rwxr-xr-xtensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.2bin943032 -> 0 bytes
-rwxr-xr-xtensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.0bin943032 -> 0 bytes
-rwxr-xr-xtensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.1bin943032 -> 0 bytes
-rwxr-xr-xtensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.1.0bin943032 -> 0 bytes
-rwxr-xr-xtensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.2.0bin947160 -> 0 bytes
-rw-r--r--tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py317
-rwxr-xr-xtest.sh19
17 files changed, 3 insertions, 1586 deletions
diff --git a/.travis.yml b/.travis.yml
index 4467f48..1c5625d 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -8,13 +8,13 @@ matrix:
env: IMAGE=i386/ubuntu:rolling COMMAND=build_linux_gcc_ubuntu_i386
services: docker
- os: linux
- env: IMAGE=ubuntu:bionic COMMAND=build_linux_gcc_ubuntu_no_tf
+ env: IMAGE=ubuntu:bionic COMMAND=build_linux_gcc_ubuntu
services: docker
- os: linux
- env: IMAGE=ubuntu:xenial COMMAND=build_linux_gcc_ubuntu_no_tf
+ env: IMAGE=ubuntu:xenial COMMAND=build_linux_gcc_ubuntu
services: docker
- os: linux
- env: IMAGE=ubuntu:trusty COMMAND=build_linux_gcc_ubuntu_no_tf
+ env: IMAGE=ubuntu:trusty COMMAND=build_linux_gcc_ubuntu
services: docker
- os: linux
env: IMAGE=debian:stable COMMAND=build_linux_gcc_debian
@@ -37,12 +37,6 @@ matrix:
- $TRAVIS_BUILD_DIR/python/make_py_wheel.sh ${IMAGE}
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
services: docker
- - os: linux
- env: IMAGE=x86_64 COMMAND=make_py_wheel_tf RELEASE_FILES="$TRAVIS_BUILD_DIR/tensorflow/dist/*.whl"
- script:
- - $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel.sh
- - if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
- services: docker
- os: osx
osx_image: xcode9.4
env: IMAGE=native COMMAND=build_osx
@@ -52,12 +46,6 @@ matrix:
script:
- $TRAVIS_BUILD_DIR/python/make_py_wheel_mac.sh
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
- - os: osx
- osx_image: xcode9.4
- env: IMAGE=native COMMAND=make_py_wheel_mac_tf RELEASE_FILES="$TRAVIS_BUILD_DIR/tensorflow/dist/*.whl"
- script:
- - $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel_mac.sh
- - if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
script:
- $TRAVIS_BUILD_DIR/test.sh ${IMAGE} ${COMMAND}
- if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi
diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/tensorflow/__init__.py
+++ /dev/null
diff --git a/tensorflow/make_py_wheel.sh b/tensorflow/make_py_wheel.sh
deleted file mode 100755
index 8ccf4e1..0000000
--- a/tensorflow/make_py_wheel.sh
+++ /dev/null
@@ -1,88 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.!
-
-set -e # exit immediately on error
-set -x # display all commands
-
-CMAKE_VERSION=3.12.0
-
-run_docker() {
- cd `dirname $0`
- docker pull $1
- docker run --rm -ti --name tf_sentencepiece \
- -v `pwd`/../:/sentencepiece -w /sentencepiece/tensorflow \
- -td $1 /bin/bash
- docker exec tf_sentencepiece bash -c "./make_py_wheel.sh native $2"
- docker stop tf_sentencepiece
-}
-
-build_tf_wrapper() {
- pkg_name="==$1"
-
- pip3 install tensorflow${pkg_name} --upgrade
-
- TF_CFLAGS=( $(python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
- TF_LFLAGS=( $(python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
- TF_VERSION=( $(python3 -c 'import tensorflow as tf; print(tf.__version__)') )
-
- echo TF_CFLAGS=${TF_CFLAGS[@]}
- echo TF_LFLAGS=${TF_LFLAGS[@]}
- echo TF_VERSION=${TF_VERSION}
-
- g++ -std=c++11 -shared \
- -I../../src \
- -D_USE_TF_STRING_VIEW \
- -fPIC ${TF_CFLAGS[@]} -O2 \
- -Wl,--whole-archive \
- /usr/local/lib/libsentencepiece.a \
- -Wl,--no-whole-archive \
- sentencepiece_processor_ops.cc \
- -o tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION} \
- ${TF_LFLAGS[@]}
-
- strip tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION}
-
- python3 setup.py test
-}
-
-build() {
- rm -fr build
- mkdir -p build
- cd build
-
- cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_ENABLE_TENSORFLOW_SHARED=ON
- make -j4
- make install
- cd ..
-
- for v in $@; do
- build_tf_wrapper $v
- done
-
- python3 setup.py bdist_wheel --universal --plat-name=manylinux1_x86_64
- python3 setup.py sdist
- rm -fr build tf_sentencepiece.egg-info
-}
-
-if [ "$1" = "native" ]; then
- shift
- build $@
-else
-# Do not support TF<=1.14 because API compatiblity issue is not fixed.
-# run_docker tensorflow/tensorflow:custom-op-ubuntu14 "1.13.1 1.13.2 1.14.0"
- run_docker tensorflow/tensorflow:custom-op-ubuntu16 "1.15.0 1.15.2 2.0.0 2.0.1"
- run_docker tensorflow/tensorflow:2.1.0-custom-op-ubuntu16 "2.1.0 2.2.0"
-fi
diff --git a/tensorflow/make_py_wheel_mac.sh b/tensorflow/make_py_wheel_mac.sh
deleted file mode 100755
index 20c28af..0000000
--- a/tensorflow/make_py_wheel_mac.sh
+++ /dev/null
@@ -1,116 +0,0 @@
-#!/bin/bash
-
-# Copyright 2018 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.!
-
-set -e # exit immediately on error
-set -x # display all commands
-
-build_tf_wrapper() {
- if [ "$1" != "" ]; then
- pkg_name="==$1"
- fi
-
- # Builds _sentencepiece_processor_ops.so
- pip install tensorflow${pkg_name} --upgrade --no-cache-dir -I
-
- pip uninstall numpy -y || true
- pip uninstall numpy -y || true
- pip uninstall numpy -y || true
- pip install numpy --upgrade --no-cache-dir -I
-
- TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') )
- TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') )
- TF_VERSION=( $(python -c 'import tensorflow as tf; print(tf.__version__)') )
-
- TF_LFLAGS2=`echo -n ${TF_LFLAGS[@]} | sed -e 's/-l:lib/-l/' -e 's/.[12].dylib//'`
-
- g++ -std=c++11 -shared -undefined dynamic_lookup \
- -I../../src \
- -D_USE_TF_STRING_VIEW \
- -fPIC ${TF_CFLAGS[@]} -O2 \
- -D_GLIBCXX_USE_CXX11_ABI=0 \
- -Wl,-force_load \
- /usr/local/lib/libsentencepiece.a \
- sentencepiece_processor_ops.cc \
- -o tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION} \
- ${TF_LFLAGS2}
-
- strip -x tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION}
-}
-
-build() {
- VERSION="3.7"
- URL="https://www.python.org/ftp/python/3.7.0/python-3.7.0-macosx10.6.pkg"
- INSTALL_PATH="/Library/Frameworks/Python.framework/Versions/${VERSION}/bin"
- CURRENT_PATH=${PATH}
-
- curl -L -o python.pkg ${URL}
- sudo installer -pkg python.pkg -target /
-
- if [ -f "${INSTALL_PATH}/python3" ]; then
- ln -s ${INSTALL_PATH}/python3 ${INSTALL_PATH}/python
- ln -s ${INSTALL_PATH}/python3-config ${INSTALL_PATH}/python-config
- ln -s ${INSTALL_PATH}/pip3 ${INSTALL_PATH}/pip
- fi
-
- curl -L -O https://bootstrap.pypa.io/get-pip.py
-
- export PATH="${INSTALL_PATH}:${CURRENT_PATH}"
- ls -l ${INSTALL_PATH}
- which python
- which pip
- python --version
- sudo python get-pip.py --no-setuptools --no-wheel --ignore-installed
- pip install --upgrade setuptools
- pip install wheel
- pip install delocate
-
- cd tensorflow
- rm -fr build
- mkdir -p build
- cd build
-
- # Install sentencepiece
- cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_ENABLE_TENSORFLOW_SHARED=ON
- make -j4 VERBOSE=1
- make install
- cd ..
-
- # Remove pre-installed Linux so files.
- rm -f tf_sentencepiece/*.so.*
-
- build_tf_wrapper "2.2.0"
- build_tf_wrapper "2.1.0"
-# build_tf_wrapper "2.0.1"
- build_tf_wrapper "2.0.0"
-# build_tf_wrapper "1.15.2"
- build_tf_wrapper "1.15.0"
-# build_tf_wrapper "1.14.0"
-# build_tf_wrapper "1.13.2"
-# build_tf_wrapper "1.13.1"
-
- # Builds Python manylinux wheel package.
- # Platform name is determined by the tensorflow pip package.
- # TODO(taku): Automatically detect the platname of tensoflow-pip
- # PLAT_NAME=$(python -c 'import distutils.util; print(distutils.util.get_platform())')
- PLAT_NAME=macosx_10_10_x86_64
- python setup.py bdist_wheel --universal --plat-name=${PLAT_NAME}
- # python setup.py test
- python setup.py sdist
-
- rm -fr build tf_sentencepiece.egg-info tmp
-}
-
-build
diff --git a/tensorflow/sentencepiece_processor_ops.cc b/tensorflow/sentencepiece_processor_ops.cc
deleted file mode 100644
index 7cf915f..0000000
--- a/tensorflow/sentencepiece_processor_ops.cc
+++ /dev/null
@@ -1,652 +0,0 @@
-// Copyright 2018 Google Inc.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.!
-
-#include <mutex>
-#include <string>
-#include <unordered_map>
-#include <vector>
-
-#include "sentencepiece_processor.h"
-#include "tensorflow/core/framework/op.h"
-#include "tensorflow/core/framework/op_kernel.h"
-#include "tensorflow/core/framework/shape_inference.h"
-#include "tensorflow/core/framework/tensor_shape.h"
-#include "tensorflow/core/lib/hash/hash.h"
-
-typedef int int32;
-typedef long long int int64;
-typedef unsigned long long int uint64;
-
-namespace sentencepiece {
-using ::tensorflow::DEVICE_CPU;
-using ::tensorflow::Hash64;
-using ::tensorflow::OpKernel;
-using ::tensorflow::OpKernelConstruction;
-using ::tensorflow::OpKernelContext;
-using ::tensorflow::Tensor;
-using ::tensorflow::TensorShapeUtils;
-using ::tensorflow::tstring;
-using ::tensorflow::shape_inference::DimensionHandle;
-using ::tensorflow::shape_inference::InferenceContext;
-using ::tensorflow::shape_inference::ShapeHandle;
-
-namespace {
-
-// A utility function to convert sentencepiece::util::Status to
-// ::tensorflow::Status
-::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) {
- if (s.ok()) return ::tensorflow::Status();
- return ::tensorflow::Status(static_cast<::tensorflow::error::Code>(s.code()),
- ::tensorflow::string(s.error_message()));
-}
-
-// Global cache to reuse SentencePieceProcessor with the same
-// model file or model proto. The instance is managed with shared_ptr so
-// the instance is deleted when no client is using it (refcount is zero).
-class SentencePieceProcessorCache {
- public:
- std::shared_ptr<SentencePieceProcessor> get(
- const std::string key, bool is_proto,
- sentencepiece::util::Status* status) {
- std::lock_guard<std::mutex> l(mutex_);
-
- const uint64 fp = Hash64(key.data(), key.size());
- auto sp = data_[fp].lock();
-
- if (sp) {
- *status = sp->status();
- return sp;
- }
-
- sp = std::make_shared<SentencePieceProcessor>();
- *status = is_proto ? sp->LoadFromSerializedProto(key) : sp->Load(key);
- if (!status->ok()) return nullptr;
-
- data_[fp] = sp;
- return sp;
- }
-
- private:
- std::mutex mutex_;
- std::unordered_map<uint64, std::weak_ptr<SentencePieceProcessor>> data_;
-};
-
-class SentencePieceBaseOp : public OpKernel {
- public:
- explicit SentencePieceBaseOp(OpKernelConstruction* context)
- : OpKernel(context) {
- std::string model_file_attr, model_proto_attr;
- OP_REQUIRES_OK(context, context->GetAttr("model_file", &model_file_attr));
- OP_REQUIRES_OK(context, context->GetAttr("model_proto", &model_proto_attr));
-
- // Initializes global cache.
- static SentencePieceProcessorCache* cache = new SentencePieceProcessorCache;
- sentencepiece::util::Status status;
-
- OP_REQUIRES(context,
- ((model_proto_attr.empty() && !model_file_attr.empty()) ||
- (!model_proto_attr.empty() && model_file_attr.empty())),
- ::tensorflow::errors::InvalidArgument(
- "Either `model_proto` or `model_file` must be set."));
-
- if (!model_file_attr.empty()) {
- sentencepiece_processor_ = cache->get(model_file_attr, false, &status);
- } else {
- // Loads serialized sentencepiece model proto to enable embedding the
- // relatively small sentencepiece model proto into the tensorflow graph
- // such that the tensorflow graph is self-contained.
- sentencepiece_processor_ = cache->get(model_proto_attr, true, &status);
- }
-
- OP_REQUIRES_OK(context, ToTFStatus(status));
- OP_REQUIRES(context, sentencepiece_processor_,
- ::tensorflow::errors::InvalidArgument(
- "Failed to initialize SentencePieceProcessor"));
-
- // Sets extra options to add <s>, </s>.
- auto has_attribute = [&context](const std::string& name) {
- bool flag = false;
- context->GetAttr(name, &flag).IgnoreError();
- return flag;
- };
-
- if (has_attribute("add_bos")) {
- bos_id_ = sentencepiece_processor_->bos_id();
- OP_REQUIRES(context, bos_id_ >= 0,
- ::tensorflow::errors::InvalidArgument(
- "`bos_id` is not defined in model"));
- }
-
- if (has_attribute("add_eos")) {
- eos_id_ = sentencepiece_processor_->eos_id();
- OP_REQUIRES(context, eos_id_ >= 0,
- ::tensorflow::errors::InvalidArgument(
- "`eos_id` is not defined in model"));
- }
-
- reverse_ = has_attribute("reverse");
-
- pad_id_ = sentencepiece_processor_->pad_id();
- if (pad_id_ == -1) pad_id_ = sentencepiece_processor_->unk_id();
- }
-
- protected:
- void GetPad(int32* pad) const { *pad = pad_id_; }
-
- void GetPad(tstring* pad) const {
- pad->clear();
- if (sentencepiece_processor_ && pad_id_ >= 0 &&
- pad_id_ != sentencepiece_processor_->unk_id())
- *pad = sentencepiece_processor_->IdToPiece(pad_id_);
- }
-
- std::shared_ptr<SentencePieceProcessor> sentencepiece_processor_;
- int bos_id_ = -1;
- int eos_id_ = -1;
- int pad_id_ = -1;
- bool reverse_ = false;
-};
-} // namespace
-
-class SentencePieceGetPieceSizeOp : public SentencePieceBaseOp {
- public:
- explicit SentencePieceGetPieceSizeOp(OpKernelConstruction* context)
- : SentencePieceBaseOp(context) {}
-
- void Compute(OpKernelContext* context) override {
- Tensor* vocab_size_tensor = nullptr;
- OP_REQUIRES_OK(context,
- context->allocate_output(0, {}, &vocab_size_tensor));
- vocab_size_tensor->scalar<int32>()() =
- sentencepiece_processor_->GetPieceSize();
- }
-};
-
-template <typename S, typename T>
-class SentencePieceConvertPieceOp : public SentencePieceBaseOp {
- public:
- explicit SentencePieceConvertPieceOp(OpKernelConstruction* context)
- : SentencePieceBaseOp(context) {}
-
- void Compute(OpKernelContext* context) override {
- const Tensor* input_tensor = nullptr;
- OP_REQUIRES_OK(context, context->input("input", &input_tensor));
-
- Tensor* output_tensor = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(),
- &output_tensor));
- for (int i = 0; i < input_tensor->NumElements(); ++i)
- output_tensor->flat<T>()(i) = Convert(input_tensor->flat<S>()(i));
- }
-
- int32 Convert(const std::string& piece) const {
- return sentencepiece_processor_->PieceToId(piece);
- }
-
- std::string Convert(int32 id) const {
- if (id >= 0 && id < sentencepiece_processor_->GetPieceSize()) {
- return sentencepiece_processor_->IdToPiece(id);
- }
- return "";
- }
-};
-
-class SentencePieceGetPieceTypeOp : public SentencePieceBaseOp {
- public:
- explicit SentencePieceGetPieceTypeOp(OpKernelConstruction* context)
- : SentencePieceBaseOp(context) {
- OP_REQUIRES_OK(context, context->GetAttr("piece_type", &piece_type_));
- }
-
- void Compute(OpKernelContext* context) override {
- const Tensor* input_tensor = nullptr;
- OP_REQUIRES_OK(context, context->input("input", &input_tensor));
-
- Tensor* output_tensor = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(),
- &output_tensor));
-
- for (int i = 0; i < input_tensor->NumElements(); ++i) {
- const int id = input_tensor->flat<int32>()(i);
- switch (piece_type_) {
- case 0:
- output_tensor->flat<bool>()(i) =
- sentencepiece_processor_->IsUnknown(id);
- break;
- case 1:
- output_tensor->flat<bool>()(i) =
- sentencepiece_processor_->IsControl(id);
- break;
- case 2:
- output_tensor->flat<bool>()(i) =
- sentencepiece_processor_->IsUnused(id);
- break;
- default:
- break;
- }
- }
- }
-
- private:
- int piece_type_;
-};
-
-template <typename T, typename U = T>
-class SentencePieceEncodeOpBase : public SentencePieceBaseOp {
- public:
- explicit SentencePieceEncodeOpBase(OpKernelConstruction* context)
- : SentencePieceBaseOp(context) {}
-
- void Compute(OpKernelContext* context) override {
- const Tensor* input_tensor = nullptr;
-
- OP_REQUIRES_OK(context, context->input("input", &input_tensor));
- OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor->shape()),
- ::tensorflow::errors::InvalidArgument(
- "`input` must be a vector, got shape: ",
- input_tensor->shape().DebugString()));
- const auto& input_sentences = input_tensor->vec<tstring>();
- const int64 batch_size = input_sentences.size();
-
- const Tensor* nbest_size_tensor = nullptr;
- OP_REQUIRES_OK(context, context->input("nbest_size", &nbest_size_tensor));
- OP_REQUIRES(context, nbest_size_tensor->dims() <= 1,
- ::tensorflow::errors::InvalidArgument(
- "`nbest_size` must be a scalar or vector. got shape: ",
- nbest_size_tensor->shape().DebugString()));
- if (nbest_size_tensor->dims() == 1) {
- OP_REQUIRES(
- context, batch_size == nbest_size_tensor->dim_size(0),
- ::tensorflow::errors::InvalidArgument(
- "`nbest_size` must have the same batch size as `input`."));
- }
-
- const Tensor* alpha_tensor = nullptr;
- OP_REQUIRES_OK(context, context->input("alpha", &alpha_tensor));
- OP_REQUIRES(context, alpha_tensor->dims() <= 1,
- ::tensorflow::errors::InvalidArgument(
- "`alpha` must be a scalar or vector, got shape: ",
- alpha_tensor->shape().DebugString()));
- if (alpha_tensor->dims() == 1) {
- OP_REQUIRES(context, batch_size == alpha_tensor->dim_size(0),
- ::tensorflow::errors::InvalidArgument(
- "`alpha` must have the same batch size as `input`."));
- }
-
- std::vector<std::vector<U>> pieces(batch_size);
-
- for (int64 i = 0; i < batch_size; ++i) {
- const int32 nbest_size = nbest_size_tensor->dims() == 1
- ? nbest_size_tensor->vec<int32>()(i)
- : nbest_size_tensor->scalar<int32>()();
- if (nbest_size == 0 || nbest_size == 1) {
- OP_REQUIRES_OK(context,
- ToTFStatus(sentencepiece_processor_->Encode(
- absl::string_view(input_sentences(i)), &pieces[i])));
- } else {
- const float alpha = alpha_tensor->dims() == 1
- ? alpha_tensor->vec<float>()(i)
- : alpha_tensor->scalar<float>()();
- OP_REQUIRES_OK(context,
- ToTFStatus(sentencepiece_processor_->SampleEncode(
- absl::string_view(input_sentences(i)), nbest_size,
- alpha, &pieces[i])));
- }
- RewritePieces(&pieces[i]);
- }
-
- MakeOutputTensor(context, pieces);
- }
-
- protected:
- void RewritePieces(std::vector<std::string>* pieces) const {
- if (reverse_) std::reverse(pieces->begin(), pieces->end());
- if (bos_id_ > 0)
- pieces->insert(pieces->begin(),
- sentencepiece_processor_->IdToPiece(bos_id_));
- if (eos_id_ > 0)
- pieces->push_back(sentencepiece_processor_->IdToPiece(eos_id_));
- }
-
- void RewritePieces(std::vector<int32>* pieces) const {
- if (reverse_) std::reverse(pieces->begin(), pieces->end());
- if (bos_id_ > 0) pieces->insert(pieces->begin(), bos_id_);
- if (eos_id_ > 0) pieces->push_back(eos_id_);
- }
-
- virtual void MakeOutputTensor(OpKernelContext* context,
- const std::vector<std::vector<U>>& pieces) = 0;
-};
-
-template <typename T, typename U = T>
-class SentencePieceEncodeSparseOp : public SentencePieceEncodeOpBase<T, U> {
- public:
- explicit SentencePieceEncodeSparseOp(OpKernelConstruction* context)
- : SentencePieceEncodeOpBase<T, U>(context) {}
-
- protected:
- void MakeOutputTensor(OpKernelContext* context,
- const std::vector<std::vector<U>>& pieces) override {
- const int64 batch_size = pieces.size();
-
- int64 max_sequence_length = 0;
- int64 indices_size = 0;
- for (int row = 0; row < batch_size; ++row) {
- const int col_size = pieces[row].size();
- max_sequence_length = std::max<int64>(col_size, max_sequence_length);
- indices_size += col_size;
- }
-
- // Creates the indices output tensor.
- Tensor* indices_tensor = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(0, {indices_size, 2},
- &indices_tensor));
-
- auto indices_tensor_output = indices_tensor->matrix<int64>();
- int item_idx = 0;
- for (int row = 0; row < batch_size; ++row) {
- for (int col = 0; col < pieces[row].size(); ++col) {
- indices_tensor_output(item_idx, 0) = row;
- indices_tensor_output(item_idx, 1) = col;
- ++item_idx;
- }
- }
-
- // Creates the values output tensor.
- Tensor* values_tensor = nullptr;
- OP_REQUIRES_OK(context,
- context->allocate_output(1, {indices_size}, &values_tensor));
-
- auto values_tensor_output = values_tensor->flat<T>();
- item_idx = 0;
- for (int row = 0; row < batch_size; ++row) {
- std::copy(pieces[row].begin(), pieces[row].end(),
- &values_tensor_output(item_idx));
- item_idx += pieces[row].size();
- }
-
- // Creates the shape output tensor.
- Tensor* shape_tensor = nullptr;
- OP_REQUIRES_OK(context, context->allocate_output(2, {2}, &shape_tensor));
-
- auto shape_tensor_output = shape_tensor->flat<int64>();
- shape_tensor_output(0) = batch_size;
- shape_tensor_output(1) = max_sequence_length;
- }
-};
-
-template <typename T, typename U = T>
-class SentencePieceEncodeDenseOp : public SentencePieceEncodeOpBase<T, U> {
- public:
- explicit SentencePieceEncodeDenseOp(OpKernelConstruction* context)
- : SentencePieceEncodeOpBase<T, U>(context) {
- this->GetPad(&pad_);
- }
-
- // protected:
- void MakeOutputTensor(OpKernelContext* context,
- const std::vector<std::vector<U>>& pieces) override {
- const int64 batch_size = pieces.size();
-
- int64 max_sequence_length = 0;
- for (int row = 0; row < batch_size; ++row) {
- max_sequence_length =
- std::max<int64>(pieces[row].size(), max_sequence_length);
- }
-
- Tensor* values_tensor = nullptr;
- Tensor* length_tensor = nullptr;
-
- OP_REQUIRES_OK(
- context, context->allocate_output(0, {batch_size, max_sequence_length},
- &values_tensor));
- OP_REQUIRES_OK(context,
- context->allocate_output(1, {batch_size}, &length_tensor));
-
- auto values_tensor_output = values_tensor->matrix<T>();
- auto length_tensor_output = length_tensor->vec<int32>();
-
- U pad = pad_;
-
- for (int row = 0; row < batch_size; ++row) {
- for (int col = 0; col < max_sequence_length; ++col) {
- values_tensor_output(row, col) =
- col < pieces[row].size() ? pieces[row][col] : pad;
- }
- length_tensor_output(row) = pieces[row].size();
- }
- }
-
- private:
- T pad_;
-};
-
-template <typename T, typename U = T>
-class SentencePieceDecodeOp : public SentencePieceBaseOp {
- public:
- explicit SentencePieceDecodeOp(OpKernelConstruction* context)
- : SentencePieceBaseOp(context) {}
-
- void Compute(OpKernelContext* context) override {
- const Tensor* input_tensor = nullptr;
- const Tensor* length_tensor = nullptr;
-
- OP_REQUIRES_OK(context, context->input("input", &input_tensor));
- OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor->shape()),
- ::tensorflow::errors::InvalidArgument(
- "`input` must be a 2-D matrix. got shape: ",
- input_tensor->shape().DebugString()));
- OP_REQUIRES_OK(context, context->input("sequence_length", &length_tensor));
- OP_REQUIRES(context, TensorShapeUtils::IsVector(length_tensor->shape()),
- ::tensorflow::errors::InvalidArgument(
- "`sequence_length` must be a vector. got shape: ",
- length_tensor->shape().DebugString()));
- OP_REQUIRES(
- context, input_tensor->dim_size(0) == length_tensor->dim_size(0),
- ::tensorflow::errors::InvalidArgument(
- "`sequence_length` must have the same batch size as `input`."));
-
- const auto& input_sentences = input_tensor->matrix<T>();
- const auto& sequence_length = length_tensor->vec<int32>();
- const int64 batch_size = input_tensor->dim_size(0);
- const int max_sequence_length = input_tensor->dim_size(1);
-
- Tensor* values_tensor = nullptr;
- OP_REQUIRES_OK(context,
- context->allocate_output(0, {batch_size}, &values_tensor));
- auto values_tensor_output = values_tensor->vec<tstring>();
-
- for (int64 i = 0; i < batch_size; ++i) {
- OP_REQUIRES(context,
- (sequence_length(i) >= 0 &&
- sequence_length(i) <= max_sequence_length),
- ::tensorflow::errors::InvalidArgument(
- "`sequence_length` is out-of-range."));
- std::vector<U> pieces(&input_sentences(i, 0),
- &input_sentences(i, 0) + sequence_length(i));
- if (reverse_) std::reverse(pieces.begin(), pieces.end());
- std::string detokenized_str;
- OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_->Decode(
- pieces, &detokenized_str)));
- values_tensor_output(i) = detokenized_str;
- }
- }
-};
-
-namespace {
-// The snake case of this variables are used as the function names.
-constexpr char kGetPieceSizeOpName[] = "SentencepieceGetPieceSize";
-constexpr char kPieceToIdOpName[] = "SentencepiecePieceToId";
-constexpr char kIdToPieceOpName[] = "SentencepieceIdToPiece";
-constexpr char kGetPieceTypeOpName[] = "SentencepieceGetPieceType";
-constexpr char kEncodeDenseOpName[] = "SentencepieceEncodeDense";
-constexpr char kEncodeSparseOpName[] = "SentencepieceEncodeSparse";
-constexpr char kDecodeOpName[] = "SentencepieceDecode";
-} // namespace
-
-REGISTER_OP(kGetPieceSizeOpName)
- .Output("vocab_size: int32")
- .Attr("model_file: string = ''")
- .Attr("model_proto: string = ''")
- .SetShapeFn([](InferenceContext* c) {
- c->set_output(0, c->MakeShape({}));
- return ::tensorflow::Status::OK();
- });
-
-REGISTER_KERNEL_BUILDER(Name(kGetPieceSizeOpName).Device(DEVICE_CPU),
- SentencePieceGetPieceSizeOp);
-
-REGISTER_OP(kPieceToIdOpName)
- .Input("input: string")
- .Output("values: int32")
- .Attr("model_file: string = ''")
- .Attr("model_proto: string = ''")
- .SetShapeFn([](InferenceContext* c) {
- c->set_output(0, c->input(0));
- return ::tensorflow::Status::OK();
- });
-
-REGISTER_KERNEL_BUILDER(Name(kPieceToIdOpName).Device(DEVICE_CPU),
- SentencePieceConvertPieceOp<tstring, int32>);
-
-REGISTER_OP(kIdToPieceOpName)
- .Input("input: int32")
- .Output("values: string")
- .Attr("model_file: string = ''")
- .Attr("model_proto: string = ''")
- .SetShapeFn([](InferenceContext* c) {
- c->set_output(0, c->input(0));
- return ::tensorflow::Status::OK();
- });
-
-REGISTER_KERNEL_BUILDER(Name(kIdToPieceOpName).Device(DEVICE_CPU),
- SentencePieceConvertPieceOp<int32, tstring>);
-
-REGISTER_OP(kGetPieceTypeOpName)
- .Input("input: int32")
- .Output("values: bool")
- .Attr("model_file: string = ''")
- .Attr("model_proto: string = ''")
- .Attr("piece_type: int = 0")
- .SetShapeFn([](InferenceContext* c) {
- c->set_output(0, c->input(0));
- return ::tensorflow::Status::OK();
- });
-
-REGISTER_KERNEL_BUILDER(Name(kGetPieceTypeOpName).Device(DEVICE_CPU),
- SentencePieceGetPieceTypeOp);
-
-REGISTER_OP(kEncodeDenseOpName)
- .Attr("out_type: {int32, string} = DT_INT32")
- .Input("input: string")
- .Input("nbest_size: int32")
- .Input("alpha: float")
- .Output("values: out_type")
- .Output("sequence_length: int32")
- .Attr("model_file: string = ''")
- .Attr("model_proto: string = ''")
- .Attr("reverse: bool = false")
- .Attr("add_bos: bool = false")
- .Attr("add_eos: bool = false")
- .SetShapeFn([](InferenceContext* c) {
- ShapeHandle input, nbest, alpha;
- TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
- TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
- TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
- DimensionHandle batch_size = c->Dim(input, 0);
- if (c->Rank(nbest) == 1)
- TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
- if (c->Rank(alpha) == 1)
- TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
- c->set_output(0, c->MakeShape({batch_size, c->UnknownDim()}));
- c->set_output(1, c->MakeShape({batch_size}));
- return ::tensorflow::Status::OK();
- });
-
-REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
- .Device(DEVICE_CPU)
- .TypeConstraint<int32>("out_type"),
- SentencePieceEncodeDenseOp<int32>);
-
-REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName)
- .Device(DEVICE_CPU)
- .TypeConstraint<tstring>("out_type"),
- SentencePieceEncodeDenseOp<tstring, std::string>);
-
-REGISTER_OP(kEncodeSparseOpName)
- .Attr("out_type: {int32, string} = DT_INT32")
- .Input("input: string")
- .Input("nbest_size: int32")
- .Input("alpha: float")
- .Output("indices: int64")
- .Output("values: out_type")
- .Output("dense_shape: int64")
- .Attr("model_file: string = ''")
- .Attr("model_proto: string = ''")
- .Attr("reverse: bool = false")
- .Attr("add_bos: bool = false")
- .Attr("add_eos: bool = false")
- .SetShapeFn([](InferenceContext* c) {
- ShapeHandle input, nbest, alpha;
- TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input));
- TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest));
- TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha));
- DimensionHandle batch_size = c->Dim(input, 0);
- if (c->Rank(nbest) == 1)
- TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size));
- if (c->Rank(alpha) == 1)
- TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size));
- c->set_output(0, c->MakeShape({c->UnknownDim(), 2}));
- c->set_output(1, c->MakeShape({c->UnknownDim()}));
- c->set_output(2, c->MakeShape({2}));
- return ::tensorflow::Status::OK();
- });
-
-REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
- .Device(DEVICE_CPU)
- .TypeConstraint<int32>("out_type"),
- SentencePieceEncodeSparseOp<int32>);
-
-REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName)
- .Device(DEVICE_CPU)
- .TypeConstraint<tstring>("out_type"),
- SentencePieceEncodeSparseOp<tstring, std::string>);
-
-REGISTER_OP(kDecodeOpName)
- .Attr("T: {int32, string}")
- .Input("input: T")
- .Input("sequence_length: int32")
- .Output("values: string")
- .Attr("model_file: string = ''")
- .Attr("model_proto: string = ''")
- .Attr("reverse: bool = false")
- .SetShapeFn([](InferenceContext* c) {
- ShapeHandle input, sequence_length;
- TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input));
- TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length));
- DimensionHandle batch_size = c->Dim(input, 0);
- TF_RETURN_IF_ERROR(
- c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size));
- c->set_output(0, c->MakeShape({batch_size}));
- return ::tensorflow::Status::OK();
- });
-
-REGISTER_KERNEL_BUILDER(
- Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<int32>("T"),
- SentencePieceDecodeOp<int32>);
-
-REGISTER_KERNEL_BUILDER(
- Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<tstring>("T"),
- SentencePieceDecodeOp<tstring, std::string>);
-} // namespace sentencepiece
diff --git a/tensorflow/setup.py b/tensorflow/setup.py
deleted file mode 100755
index db3c6d9..0000000
--- a/tensorflow/setup.py
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env python
-
-# Copyright 2018 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.!
-
-from setuptools import setup
-from setuptools import find_packages
-import tensorflow as tf
-import codecs
-import string
-import sys
-import os
-
-sys.path.append(os.path.join('.', 'test'))
-
-with codecs.open(os.path.join('..', 'VERSION'), 'r', 'utf-8') as f:
- version = f.read()
-
-setup(name = 'tf_sentencepiece',
- author = 'Taku Kudo',
- author_email='taku@google.com',
- description = 'SentencePiece Encode/Decode ops for TensorFlow',
- version=version,
- url = 'https://github.com/google/sentencepiece',
- license = 'Apache',
- platforms = 'Unix',
- packages=find_packages(exclude=['test']),
- package_data={'tf_sentencepiece': ['_sentencepiece_processor_ops.so*']},
- classifiers = [
- 'Development Status :: 5 - Production/Stable',
- 'Environment :: Console',
- 'Intended Audience :: Developers',
- 'Intended Audience :: Science/Research',
- 'License :: OSI Approved :: Apache Software License',
- 'Operating System :: Unix',
- 'Programming Language :: Python',
- 'Topic :: Text Processing :: Linguistic',
- 'Topic :: Software Development :: Libraries :: Python Modules'
- ],
- keywords='tensorflow machine learning sentencepiece NLP segmentation',
- test_suite = 'tf_sentencepiece_test.suite')
diff --git a/tensorflow/test/__init__.py b/tensorflow/test/__init__.py
deleted file mode 100644
index e69de29..0000000
--- a/tensorflow/test/__init__.py
+++ /dev/null
diff --git a/tensorflow/test/tf_sentencepiece_test.py b/tensorflow/test/tf_sentencepiece_test.py
deleted file mode 100755
index 34f6fcd..0000000
--- a/tensorflow/test/tf_sentencepiece_test.py
+++ /dev/null
@@ -1,322 +0,0 @@
-#!/usr/bin/python
-# -*- coding: utf-8 -*-
-
-import itertools as it
-import os
-import sys
-import unittest
-import tensorflow as tf
-import tf_sentencepiece as tfspm
-
-try:
- tf.Session = tf.compat.v1.Session
- tf.sparse_tensor_to_dense = tf.compat.v1.sparse_tensor_to_dense
-except:
- pass
-
-
-class SentencePieceProcssorOpTest(unittest.TestCase):
-
- def _getSentencePieceModelFile(self):
- return os.path.join('..', 'python', 'test', 'test_model.model')
-
- def _getPieceSize(self):
- return 1000
-
- def _getExpected(self,
- reverse=False,
- add_bos=False,
- add_eos=False,
- padding=''):
- # TF uses str(bytes) as a string representation.
- padding = padding.encode('utf8')
- sentences = [
- b'Hello world.', b'I have a pen.', b'I saw a girl with a telescope.'
- ]
- pieces = [[b'\xe2\x96\x81He', b'll', b'o', b'\xe2\x96\x81world', b'.'],
- [
- b'\xe2\x96\x81I', b'\xe2\x96\x81have', b'\xe2\x96\x81a',
- b'\xe2\x96\x81p', b'en', b'.'
- ],
- [
- b'\xe2\x96\x81I', b'\xe2\x96\x81saw', b'\xe2\x96\x81a',
- b'\xe2\x96\x81girl', b'\xe2\x96\x81with', b'\xe2\x96\x81a',
- b'\xe2\x96\x81', b'te', b'le', b's', b'c', b'o', b'pe', b'.'
- ]]
- ids = [[151, 88, 21, 887, 6], [9, 76, 11, 68, 98, 6],
- [9, 459, 11, 939, 44, 11, 4, 142, 82, 8, 28, 21, 132, 6]]
- seq_len = [5, 6, 14]
-
- if reverse:
- ids = [x[::-1] for x in ids]
- pieces = [x[::-1] for x in pieces]
-
- if add_bos:
- ids = [[1] + x for x in ids]
- pieces = [[b'<s>'] + x for x in pieces]
- seq_len = [x + 1 for x in seq_len]
-
- if add_eos:
- ids = [x + [2] for x in ids]
- pieces = [x + [b'</s>'] for x in pieces]
- seq_len = [x + 1 for x in seq_len]
-
- max_len = max(seq_len)
- pieces = [x + [padding] * (max_len - len(x)) for x in pieces]
- ids = [x + [0] * (max_len - len(x)) for x in ids]
-
- return sentences, pieces, ids, seq_len
-
- def testGetPieceSize(self):
- sentencepiece_model_file = self._getSentencePieceModelFile()
-
- with tf.Session():
- s = tfspm.piece_size(model_file=sentencepiece_model_file)
- self.assertEqual(s.eval(), self._getPieceSize())
-
- def testConvertPiece(self):
- sentencepiece_model_file = self._getSentencePieceModelFile()
- (sentences, expected_pieces, expected_ids,
- expected_seq_len) = self._getExpected(padding='<unk>')
-
- with tf.Session():
- ids_matrix = tfspm.piece_to_id(
- tf.constant(expected_pieces), model_file=sentencepiece_model_file)
- ids_vec = tfspm.piece_to_id(
- tf.constant(expected_pieces[0]), model_file=sentencepiece_model_file)
- ids_scalar = tfspm.piece_to_id(
- tf.constant(expected_pieces[0][0]),
- model_file=sentencepiece_model_file)
-
- self.assertEqual(ids_matrix.eval().tolist(), expected_ids)
- self.assertEqual(ids_vec.eval().tolist(), expected_ids[0])
- self.assertEqual(ids_scalar.eval(), expected_ids[0][0])
-
- pieces_matrix = tfspm.id_to_piece(
- tf.constant(expected_ids), model_file=sentencepiece_model_file)
- pieces_vec = tfspm.id_to_piece(
- tf.constant(expected_ids[0]), model_file=sentencepiece_model_file)
- pieces_scalar = tfspm.id_to_piece(
- tf.constant(expected_ids[0][0]), model_file=sentencepiece_model_file)
-
- self.assertEqual(pieces_matrix.eval().tolist(), expected_pieces)
- self.assertEqual(pieces_vec.eval().tolist(), expected_pieces[0])
- self.assertEqual(pieces_scalar.eval(), expected_pieces[0][0])
-
- def testEncodeAndDecode(self):
- sentencepiece_model_file = self._getSentencePieceModelFile()
-
- with tf.Session():
- for reverse, add_bos, add_eos in list(
- it.product((True, False), repeat=3)):
- (sentences, expected_pieces, expected_ids,
- expected_seq_len) = self._getExpected(
- reverse=reverse, add_bos=add_bos, add_eos=add_eos)
-
- # Encode sentences into pieces/ids.
- s = tf.constant(sentences)
- pieces, seq_len1 = tfspm.encode(
- s,
- model_file=sentencepiece_model_file,
- reverse=reverse,
- add_bos=add_bos,
- add_eos=add_eos,
- out_type=tf.string)
- ids, seq_len2 = tfspm.encode(
- s,
- model_file=sentencepiece_model_file,
- reverse=reverse,
- add_bos=add_bos,
- add_eos=add_eos)
-
- self.assertEqual(pieces.eval().tolist(), expected_pieces)
- self.assertEqual(ids.eval().tolist(), expected_ids)
- self.assertEqual(seq_len1.eval().tolist(), expected_seq_len)
- self.assertEqual(seq_len2.eval().tolist(), expected_seq_len)
-
- # Decode pieces into sentences/ids.
- pieces = tf.constant(expected_pieces)
- ids = tf.constant(expected_ids)
- seq_len = tf.constant(expected_seq_len, dtype=tf.int32)
- decoded_sentences1 = tfspm.decode(
- pieces,
- seq_len,
- model_file=sentencepiece_model_file,
- reverse=reverse)
- decoded_sentences2 = tfspm.decode(
- ids, seq_len, model_file=sentencepiece_model_file, reverse=reverse)
-
- self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
- self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
-
- def testSampleEncodeAndDecode(self):
- sentencepiece_model_file = self._getSentencePieceModelFile()
- sentences, _, _, _ = self._getExpected()
-
- with tf.Session():
- for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]:
- # Round trip test.
- nbest_size = tf.constant(n)
- alpha = tf.constant(a)
- s = tf.constant(sentences)
-
- pieces, seq_len1 = tfspm.encode(
- s,
- nbest_size=nbest_size,
- alpha=alpha,
- model_file=sentencepiece_model_file,
- out_type=tf.string)
- ids, seq_len2 = tfspm.encode(
- s,
- nbest_size=nbest_size,
- alpha=alpha,
- model_file=sentencepiece_model_file)
- decoded_sentences1 = tfspm.decode(
- pieces, seq_len1, model_file=sentencepiece_model_file)
- decoded_sentences2 = tfspm.decode(
- ids, seq_len2, model_file=sentencepiece_model_file)
-
- self.assertEqual(decoded_sentences1.eval().tolist(), sentences)
- self.assertEqual(decoded_sentences2.eval().tolist(), sentences)
-
- def testEncodeAndDecodeSparse(self):
- sentencepiece_model_file = self._getSentencePieceModelFile()
-
- with tf.Session():
- for reverse, add_bos, add_eos in list(
- it.product((True, False), repeat=3)):
- (sentences, expected_pieces, expected_ids,
- _) = self._getExpected(reverse, add_bos, add_eos)
-
- # Encode sentences into sparse pieces/ids.
- s = tf.constant(sentences)
- pieces = tfspm.encode_sparse(
- s,
- model_file=sentencepiece_model_file,
- reverse=reverse,
- add_bos=add_bos,
- add_eos=add_eos,
- out_type=tf.string)
- ids = tfspm.encode_sparse(
- s,
- model_file=sentencepiece_model_file,
- reverse=reverse,
- add_bos=add_bos,
- add_eos=add_eos)
- pieces = tf.sparse_tensor_to_dense(pieces, default_value='')
- ids = tf.sparse_tensor_to_dense(ids, default_value=0)
-
- self.assertEqual(ids.eval().tolist(), expected_ids)
- self.assertEqual(pieces.eval().tolist(), expected_pieces)
-
- def testGetPieceType(self):
- sentencepiece_model_file = self._getSentencePieceModelFile()
- expected_is_unknown = []
- expected_is_control = []
- expected_is_unused = []
- ids = []
-
- for i in range(self._getPieceSize()):
- ids.append(i)
- expected_is_unknown.append(i == 0)
- expected_is_control.append(i == 1 or i == 2)
- expected_is_unused.append(False)
-
- with tf.Session():
- s = tf.constant(ids)
- is_unknown = tfspm.is_unknown(s, model_file=sentencepiece_model_file)
- is_control = tfspm.is_control(s, model_file=sentencepiece_model_file)
- is_unused = tfspm.is_unused(s, model_file=sentencepiece_model_file)
-
- self.assertEqual(is_unknown.eval().tolist(), expected_is_unknown)
- self.assertEqual(is_control.eval().tolist(), expected_is_control)
- self.assertEqual(is_unused.eval().tolist(), expected_is_unused)
-
- def testLoadModelProto(self):
- # Makes a serialized model proto.
- with open(self._getSentencePieceModelFile(), 'rb') as f:
- model_proto = f.read()
- with tf.Session() as sess:
- sentences = ['Hello world.']
- a = tf.constant(sentences)
- sess.run(tfspm.encode(a, model_proto=model_proto, out_type=tf.string))
-
- def testInvalidModelPath(self):
- with tf.Session() as sess:
- with self.assertRaises(tf.errors.NotFoundError):
- sentences = ['Hello world.']
- a = tf.constant(sentences)
- sess.run(tfspm.encode(a, model_file='invalid path', out_type=tf.string))
-
- def testInvalidModelProto(self):
- with tf.Session() as sess:
- with self.assertRaises(tf.errors.InternalError):
- sentences = ['Hello world.']
- a = tf.constant(sentences)
- sess.run(
- tfspm.encode(a, model_proto='invalid proto', out_type=tf.string))
-
- def testInvalidInput(self):
- sentences = ['Hello world.', 'This is a test.']
- ids = [[0, 1], [2, 3]]
- model_file = self._getSentencePieceModelFile()
- with tf.Session() as sess:
- a = tf.constant(sentences)
- b = tf.constant(ids)
-
- alpha = tf.constant([1.0, 2.0])
- sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))
-
- nbest_size = tf.constant([1, 2], dtype=tf.int32)
- sess.run(
- tfspm.encode(
- a, model_file=model_file, nbest_size=nbest_size, name='foo'))
-
- alpha = tf.constant(1.0)
- sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo'))
-
- nbest_size = tf.constant(10, dtype=tf.int32)
- sess.run(
- tfspm.encode(
- a, model_file=model_file, nbest_size=nbest_size, name='foo'))
-
- sess.run(
- tfspm.decode(
- b, sequence_length=tf.constant([2, 2]), model_file=model_file))
-
- with self.assertRaises(ValueError):
- a = tf.constant(sentences)
- alpha = tf.constant([1.0, 2.0, 3.0])
- sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))
- with self.assertRaises(ValueError):
- a = tf.constant(sentences)
- nbest_size = tf.constant([1, 2, 3], dtype=tf.int32)
- sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))
- with self.assertRaises(ValueError):
- a = tf.constant(sentences)
- alpha = tf.constant([[1.0], [2.0]])
- sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha))
- with self.assertRaises(ValueError):
- a = tf.constant(sentences)
- nbest_size = tf.constant([[1], [2]], dtype=tf.int32)
- sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size))
- with self.assertRaises(ValueError):
- b = tf.constant(ids)
- sess.run(tfspm.decode(a, sequence_length=2, model_file=model_file))
- with self.assertRaises(ValueError):
- b = tf.constant(ids)
- sess.run(
- tfspm.decode(
- a,
- sequence_length=tf.constant([2, 2, 2]),
- model_file=model_file))
-
-
-def suite():
- suite = unittest.TestSuite()
- suite.addTests(unittest.makeSuite(SentencePieceProcssorOpTest))
- return suite
-
-
-if __name__ == '__main__':
- unittest.main()
diff --git a/tensorflow/tf_sentencepiece/__init__.py b/tensorflow/tf_sentencepiece/__init__.py
deleted file mode 100644
index 8a3df31..0000000
--- a/tensorflow/tf_sentencepiece/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-from tf_sentencepiece.sentencepiece_processor_ops import *
diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.0
deleted file mode 100755
index b7af398..0000000
--- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.0
+++ /dev/null
Binary files differ
diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.2 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.2
deleted file mode 100755
index b7af398..0000000
--- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.2
+++ /dev/null
Binary files differ
diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.0
deleted file mode 100755
index 06a5258..0000000
--- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.0
+++ /dev/null
Binary files differ
diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.1 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.1
deleted file mode 100755
index 06a5258..0000000
--- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.1
+++ /dev/null
Binary files differ
diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.1.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.1.0
deleted file mode 100755
index 31b6668..0000000
--- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.1.0
+++ /dev/null
Binary files differ
diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.2.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.2.0
deleted file mode 100755
index fa8b9ea..0000000
--- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.2.0
+++ /dev/null
Binary files differ
diff --git a/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py b/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py
deleted file mode 100644
index 5c2aa39..0000000
--- a/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# Copyright 2018 Google Inc.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.!
-r"""Ops for SentencePiece Encoding/Decoding."""
-
-# TODO(taku): Implements n-best output
-
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-from distutils.version import LooseVersion
-
-import warnings
-import glob
-import re
-import os
-import tensorflow as tf
-
-so_base = os.path.join(
- os.path.dirname(__file__), '_sentencepiece_processor_ops.so')
-so_file = so_base + '.' + tf.__version__
-
-if not hasattr(tf, 'no_gradient'):
- tf.no_gradient = tf.NotDifferentiable
-
-if not os.path.exists(so_file):
- versions = [
- re.search('so.([0-9]+\.[0-9\.]+.*)$', os.path.basename(n)).group(1)
- for n in glob.glob(so_base + '.*')
- ]
- latest = sorted(versions, key=LooseVersion)[-1]
- warnings.warn('No so file is found for [%s] from [%s]' %
- (tf.__version__, ', '.join(versions)))
- warnings.warn('use the latest version %s' % (latest))
- so_file = so_base + '.' + latest
-
-_gen_sentencepiece_processor_op = tf.load_op_library(so_file)
-
-
-def piece_size(model_file=None, model_proto=None, name=None):
- """Returns the piece size (vocabulary size).
-
- Args:
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- name: The name argument that is passed to the op function.
-
- Returns:
- A scalar representing the vocabulary size.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_get_piece_size(
- model_file=model_file, model_proto=model_proto, name=name)
-
-
-def piece_to_id(input, model_file=None, model_proto=None, name=None):
- """Converts piece into vocabulary id.
-
- Args:
- input: An arbitrary tensor of string.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- name: The name argument that is passed to the op function.
-
- Returns:
- A tensor of int32 with the same shape as input.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_piece_to_id(
- input, model_file=model_file, model_proto=model_proto, name=name)
-
-
-def id_to_piece(input, model_file=None, model_proto=None, name=None):
- """Converts vocabulary id into piece.
-
- Args:
- input: An arbitrary tensor of int32.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- name: The name argument that is passed to the op function.
-
- Returns:
- A tensor of string with the same shape as input.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_id_to_piece(
- input, model_file=model_file, model_proto=model_proto, name=name)
-
-
-def is_unknown(input, model_file=None, model_proto=None, name=None):
- """Returns true if input id is unknown piece.
-
- Args:
- input: An arbitrary tensor of int32.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- name: The name argument that is passed to the op function.
-
- Returns:
- A tensor of bool with the same shape as input.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_get_piece_type(
- input,
- model_file=model_file,
- model_proto=model_proto,
- name=name,
- piece_type=0)
-
-
-def is_control(input, model_file=None, model_proto=None, name=None):
- """Returns true if input id is control piece.
-
- Args:
- input: An arbitrary tensor of int32.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- name: The name argument that is passed to the op function.
-
- Returns:
- A tensor of bool with the same shape as input.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_get_piece_type(
- input,
- model_file=model_file,
- model_proto=model_proto,
- name=name,
- piece_type=1)
-
-
-def is_unused(input, model_file=None, model_proto=None, name=None):
- """Returns true if input id is unused piece.
-
- Args:
- input: An arbitrary tensor of int32.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- name: The name argument that is passed to the op function.
-
- Returns:
- A tensor of bool with the same shape as input.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_get_piece_type(
- input,
- model_file=model_file,
- model_proto=model_proto,
- name=name,
- piece_type=2)
-
-
-def encode_dense(input_sentences,
- nbest_size=0,
- alpha=1.0,
- model_file=None,
- model_proto=None,
- reverse=False,
- add_bos=False,
- add_eos=False,
- out_type=tf.int32,
- name=None):
- """Encodes sentences into pieces in dense tensor format.
-
- Args:
- input_sentences: A 1D string tensor of arbitrary size holding the raw text
- of input sentences.
- nbest_size: A scalar or 1D tensor for sampling.
- nbest_size = {0,1}: No sampling is performed.
- nbest_size > 1: samples from the nbest_size results.
- nbest_size < 0: assuming that nbest_size is infinite and samples
- from the all hypothesis (lattice) using
- forward-filtering-and-backward-sampling algorithm.
- alpha: A scalar or 1D tensor for a smoothing parameter. Inverse temperature
- for probability rescaling.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- reverse: Reverses the tokenized sequence (Default = false)
- add_bos: Add <s> to the result (Default = false)
- add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
- reversing (if enabled).
- out_type: output type. tf.int32 or tf.string (Default = tf.int32) Setting
- tf.int32 directly encodes the string into an id sequence.
- name: The name argument that is passed to the op function.
-
- Returns:
- pieces: A dense 2D tensor representing the tokenized sentences.
- sequence_length: A 1D tensor representing the length of pieces.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_encode_dense(
- input_sentences,
- nbest_size=nbest_size,
- alpha=alpha,
- model_file=model_file,
- model_proto=model_proto,
- reverse=reverse,
- add_bos=add_bos,
- add_eos=add_eos,
- out_type=out_type,
- name=name)
-
-
-def encode_sparse(input_sentences,
- nbest_size=0,
- alpha=1.0,
- model_file=None,
- model_proto=None,
- reverse=False,
- add_bos=False,
- add_eos=False,
- out_type=tf.int32,
- name=None):
- """Encodes sentences into pieces in sparse tensor format.
-
- Args:
- input_sentences: A 1D string tensor of arbitrary size holding the raw text
- of input sentences.
- nbest_size: A scalar or 1D tensor for sampling.
- nbest_size = {0,1}: No sampling is performed.
- nbest_size > 1: samples from the nbest_size results.
- nbest_size < 0: assuming that nbest_size is infinite and samples
- from the all hypothesis (lattice) using
- forward-filtering-and-backward-sampling algorithm.
- alpha: A scalar or 1D tensor for a smoothing parameter. Inverse temperature
- for probability rescaling.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- reverse: Reverses the tokenized sequence (Default = false)
- add_bos: Add <s> to the result (Default = false)
- add_eos: Add </s> to the result (Default = false) <s>/</s> is added after
- reversing (if enabled).
- out_type: output type. tf.int32 or tf.string (Default = tf.int32) Setting
- tf.int32 directly encodes the string into an id sequence.
- name: The name argument that is passed to the op function.
-
- Returns:
- pieces: A sparse 2D tensor representing the tokenized sentences.
- """
-
- indices, values, dense_shape = (
- _gen_sentencepiece_processor_op.sentencepiece_encode_sparse(
- input_sentences,
- nbest_size=nbest_size,
- alpha=alpha,
- model_file=model_file,
- model_proto=model_proto,
- reverse=reverse,
- add_bos=add_bos,
- add_eos=add_eos,
- out_type=out_type,
- name=name))
- return tf.SparseTensor(indices, values, dense_shape)
-
-
-def decode(pieces,
- sequence_length,
- model_file=None,
- model_proto=None,
- reverse=False,
- name=None):
- """Decode pieces into postprocessed text.
-
- Args:
- pieces: A 2D int32 or string tensor [batch_size x max_length] of encoded
- sequences.
- sequence_length: A 1D int32 tensor [batch_size] representing the length of
- pieces.
- model_file: The sentencepiece model file path.
- model_proto: The sentencepiece model serialized proto. Either `model_file`
- or `model_proto` must be set.
- reverse: Reverses the tokenized sequence (Default = false)
- name: The name argument that is passed to the op function.
-
- Returns:
- text: A 1D string tensor of decoded string.
- """
-
- return _gen_sentencepiece_processor_op.sentencepiece_decode(
- pieces,
- sequence_length,
- model_file=model_file,
- model_proto=model_proto,
- reverse=reverse,
- name=name)
-
-
-# Adds an alias for encode_dense. Accepts the `encode` function.
-encode = encode_dense
-sparse_encode = encode_sparse
-dense_encode = encode_dense
-
-tf.no_gradient('SentencepieceGetPieceSize')
-tf.no_gradient('SentencepieceIdToPiece')
-tf.no_gradient('SentencepiecePieceToId')
-tf.no_gradient('SentencepieceGetPieceType')
-tf.no_gradient('SentencepieceEncodeDense')
-tf.no_gradient('SentencepieceEncodeSparse')
-tf.no_gradient('SentencepieceDecode')
diff --git a/test.sh b/test.sh
index 306cd45..59b546c 100755
--- a/test.sh
+++ b/test.sh
@@ -62,15 +62,6 @@ build_python() {
cd ..
}
-build_tensorflow() {
- cd tensorflow
- pip3 install tensorflow
- python3 setup.py bdist_wheel
- python3 setup.py sdist
- python3 setup.py test
- cd ..
-}
-
build_linux_gcc_coverall_ubuntu() {
setup_debian
apt-get install -y lcov
@@ -78,7 +69,6 @@ build_linux_gcc_coverall_ubuntu() {
pip3 install 'requests[security]'
build_generic
build_python
- build_tensorflow
mkdir -p build
cd build
cmake .. -DSPM_COVERAGE=ON
@@ -92,13 +82,6 @@ build_linux_gcc_ubuntu() {
setup_ubuntu
build_generic
build_python
- build_tensorflow
-}
-
-build_linux_gcc_ubuntu_no_tf() {
- setup_ubuntu
- build_generic
- build_python
}
build_linux_gcc_ubuntu_i386() {
@@ -111,14 +94,12 @@ build_linux_gcc_debian() {
setup_debian
build_generic
build_python
- build_tensorflow
}
build_linux_gcc_fedora() {
setup_fedora
build_generic
build_python
-# build_tensorflow
}
build_linux_clang_ubuntu() {