diff options
author | Taku Kudo <taku@google.com> | 2020-10-12 19:22:31 +0300 |
---|---|---|
committer | Taku Kudo <taku@google.com> | 2020-10-12 19:22:31 +0300 |
commit | 8f5610375816b7ae4b9999f6984c0f14810d1a25 (patch) | |
tree | dd8327bbe29d4dfe228b8d3fb5d3408b15877029 | |
parent | 9cf136582d9cce492ba5a0cfb775f9e777fe07ea (diff) |
remove tensorflow module
17 files changed, 3 insertions, 1586 deletions
diff --git a/.travis.yml b/.travis.yml index 4467f48..1c5625d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,13 +8,13 @@ matrix: env: IMAGE=i386/ubuntu:rolling COMMAND=build_linux_gcc_ubuntu_i386 services: docker - os: linux - env: IMAGE=ubuntu:bionic COMMAND=build_linux_gcc_ubuntu_no_tf + env: IMAGE=ubuntu:bionic COMMAND=build_linux_gcc_ubuntu services: docker - os: linux - env: IMAGE=ubuntu:xenial COMMAND=build_linux_gcc_ubuntu_no_tf + env: IMAGE=ubuntu:xenial COMMAND=build_linux_gcc_ubuntu services: docker - os: linux - env: IMAGE=ubuntu:trusty COMMAND=build_linux_gcc_ubuntu_no_tf + env: IMAGE=ubuntu:trusty COMMAND=build_linux_gcc_ubuntu services: docker - os: linux env: IMAGE=debian:stable COMMAND=build_linux_gcc_debian @@ -37,12 +37,6 @@ matrix: - $TRAVIS_BUILD_DIR/python/make_py_wheel.sh ${IMAGE} - if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi services: docker - - os: linux - env: IMAGE=x86_64 COMMAND=make_py_wheel_tf RELEASE_FILES="$TRAVIS_BUILD_DIR/tensorflow/dist/*.whl" - script: - - $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel.sh - - if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi - services: docker - os: osx osx_image: xcode9.4 env: IMAGE=native COMMAND=build_osx @@ -52,12 +46,6 @@ matrix: script: - $TRAVIS_BUILD_DIR/python/make_py_wheel_mac.sh - if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi - - os: osx - osx_image: xcode9.4 - env: IMAGE=native COMMAND=make_py_wheel_mac_tf RELEASE_FILES="$TRAVIS_BUILD_DIR/tensorflow/dist/*.whl" - script: - - $TRAVIS_BUILD_DIR/tensorflow/make_py_wheel_mac.sh - - if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi script: - $TRAVIS_BUILD_DIR/test.sh ${IMAGE} ${COMMAND} - if [[ "$RELEASE_FILES" != "" ]]; then ls -l $RELEASE_FILES ; fi diff --git a/tensorflow/__init__.py b/tensorflow/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/tensorflow/__init__.py +++ /dev/null diff --git a/tensorflow/make_py_wheel.sh b/tensorflow/make_py_wheel.sh deleted file mode 100755 index 8ccf4e1..0000000 --- a/tensorflow/make_py_wheel.sh +++ /dev/null @@ -1,88 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.! - -set -e # exit immediately on error -set -x # display all commands - -CMAKE_VERSION=3.12.0 - -run_docker() { - cd `dirname $0` - docker pull $1 - docker run --rm -ti --name tf_sentencepiece \ - -v `pwd`/../:/sentencepiece -w /sentencepiece/tensorflow \ - -td $1 /bin/bash - docker exec tf_sentencepiece bash -c "./make_py_wheel.sh native $2" - docker stop tf_sentencepiece -} - -build_tf_wrapper() { - pkg_name="==$1" - - pip3 install tensorflow${pkg_name} --upgrade - - TF_CFLAGS=( $(python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) - TF_LFLAGS=( $(python3 -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) - TF_VERSION=( $(python3 -c 'import tensorflow as tf; print(tf.__version__)') ) - - echo TF_CFLAGS=${TF_CFLAGS[@]} - echo TF_LFLAGS=${TF_LFLAGS[@]} - echo TF_VERSION=${TF_VERSION} - - g++ -std=c++11 -shared \ - -I../../src \ - -D_USE_TF_STRING_VIEW \ - -fPIC ${TF_CFLAGS[@]} -O2 \ - -Wl,--whole-archive \ - /usr/local/lib/libsentencepiece.a \ - -Wl,--no-whole-archive \ - sentencepiece_processor_ops.cc \ - -o tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION} \ - ${TF_LFLAGS[@]} - - strip tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION} - - python3 setup.py test -} - -build() { - rm -fr build - mkdir -p build - cd build - - cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_ENABLE_TENSORFLOW_SHARED=ON - make -j4 - make install - cd .. - - for v in $@; do - build_tf_wrapper $v - done - - python3 setup.py bdist_wheel --universal --plat-name=manylinux1_x86_64 - python3 setup.py sdist - rm -fr build tf_sentencepiece.egg-info -} - -if [ "$1" = "native" ]; then - shift - build $@ -else -# Do not support TF<=1.14 because API compatiblity issue is not fixed. -# run_docker tensorflow/tensorflow:custom-op-ubuntu14 "1.13.1 1.13.2 1.14.0" - run_docker tensorflow/tensorflow:custom-op-ubuntu16 "1.15.0 1.15.2 2.0.0 2.0.1" - run_docker tensorflow/tensorflow:2.1.0-custom-op-ubuntu16 "2.1.0 2.2.0" -fi diff --git a/tensorflow/make_py_wheel_mac.sh b/tensorflow/make_py_wheel_mac.sh deleted file mode 100755 index 20c28af..0000000 --- a/tensorflow/make_py_wheel_mac.sh +++ /dev/null @@ -1,116 +0,0 @@ -#!/bin/bash - -# Copyright 2018 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.! - -set -e # exit immediately on error -set -x # display all commands - -build_tf_wrapper() { - if [ "$1" != "" ]; then - pkg_name="==$1" - fi - - # Builds _sentencepiece_processor_ops.so - pip install tensorflow${pkg_name} --upgrade --no-cache-dir -I - - pip uninstall numpy -y || true - pip uninstall numpy -y || true - pip uninstall numpy -y || true - pip install numpy --upgrade --no-cache-dir -I - - TF_CFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_compile_flags()))') ) - TF_LFLAGS=( $(python -c 'import tensorflow as tf; print(" ".join(tf.sysconfig.get_link_flags()))') ) - TF_VERSION=( $(python -c 'import tensorflow as tf; print(tf.__version__)') ) - - TF_LFLAGS2=`echo -n ${TF_LFLAGS[@]} | sed -e 's/-l:lib/-l/' -e 's/.[12].dylib//'` - - g++ -std=c++11 -shared -undefined dynamic_lookup \ - -I../../src \ - -D_USE_TF_STRING_VIEW \ - -fPIC ${TF_CFLAGS[@]} -O2 \ - -D_GLIBCXX_USE_CXX11_ABI=0 \ - -Wl,-force_load \ - /usr/local/lib/libsentencepiece.a \ - sentencepiece_processor_ops.cc \ - -o tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION} \ - ${TF_LFLAGS2} - - strip -x tf_sentencepiece/_sentencepiece_processor_ops.so.${TF_VERSION} -} - -build() { - VERSION="3.7" - URL="https://www.python.org/ftp/python/3.7.0/python-3.7.0-macosx10.6.pkg" - INSTALL_PATH="/Library/Frameworks/Python.framework/Versions/${VERSION}/bin" - CURRENT_PATH=${PATH} - - curl -L -o python.pkg ${URL} - sudo installer -pkg python.pkg -target / - - if [ -f "${INSTALL_PATH}/python3" ]; then - ln -s ${INSTALL_PATH}/python3 ${INSTALL_PATH}/python - ln -s ${INSTALL_PATH}/python3-config ${INSTALL_PATH}/python-config - ln -s ${INSTALL_PATH}/pip3 ${INSTALL_PATH}/pip - fi - - curl -L -O https://bootstrap.pypa.io/get-pip.py - - export PATH="${INSTALL_PATH}:${CURRENT_PATH}" - ls -l ${INSTALL_PATH} - which python - which pip - python --version - sudo python get-pip.py --no-setuptools --no-wheel --ignore-installed - pip install --upgrade setuptools - pip install wheel - pip install delocate - - cd tensorflow - rm -fr build - mkdir -p build - cd build - - # Install sentencepiece - cmake ../.. -DSPM_ENABLE_SHARED=OFF -DSPM_ENABLE_TENSORFLOW_SHARED=ON - make -j4 VERBOSE=1 - make install - cd .. - - # Remove pre-installed Linux so files. - rm -f tf_sentencepiece/*.so.* - - build_tf_wrapper "2.2.0" - build_tf_wrapper "2.1.0" -# build_tf_wrapper "2.0.1" - build_tf_wrapper "2.0.0" -# build_tf_wrapper "1.15.2" - build_tf_wrapper "1.15.0" -# build_tf_wrapper "1.14.0" -# build_tf_wrapper "1.13.2" -# build_tf_wrapper "1.13.1" - - # Builds Python manylinux wheel package. - # Platform name is determined by the tensorflow pip package. - # TODO(taku): Automatically detect the platname of tensoflow-pip - # PLAT_NAME=$(python -c 'import distutils.util; print(distutils.util.get_platform())') - PLAT_NAME=macosx_10_10_x86_64 - python setup.py bdist_wheel --universal --plat-name=${PLAT_NAME} - # python setup.py test - python setup.py sdist - - rm -fr build tf_sentencepiece.egg-info tmp -} - -build diff --git a/tensorflow/sentencepiece_processor_ops.cc b/tensorflow/sentencepiece_processor_ops.cc deleted file mode 100644 index 7cf915f..0000000 --- a/tensorflow/sentencepiece_processor_ops.cc +++ /dev/null @@ -1,652 +0,0 @@ -// Copyright 2018 Google Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License.! - -#include <mutex> -#include <string> -#include <unordered_map> -#include <vector> - -#include "sentencepiece_processor.h" -#include "tensorflow/core/framework/op.h" -#include "tensorflow/core/framework/op_kernel.h" -#include "tensorflow/core/framework/shape_inference.h" -#include "tensorflow/core/framework/tensor_shape.h" -#include "tensorflow/core/lib/hash/hash.h" - -typedef int int32; -typedef long long int int64; -typedef unsigned long long int uint64; - -namespace sentencepiece { -using ::tensorflow::DEVICE_CPU; -using ::tensorflow::Hash64; -using ::tensorflow::OpKernel; -using ::tensorflow::OpKernelConstruction; -using ::tensorflow::OpKernelContext; -using ::tensorflow::Tensor; -using ::tensorflow::TensorShapeUtils; -using ::tensorflow::tstring; -using ::tensorflow::shape_inference::DimensionHandle; -using ::tensorflow::shape_inference::InferenceContext; -using ::tensorflow::shape_inference::ShapeHandle; - -namespace { - -// A utility function to convert sentencepiece::util::Status to -// ::tensorflow::Status -::tensorflow::Status ToTFStatus(const sentencepiece::util::Status& s) { - if (s.ok()) return ::tensorflow::Status(); - return ::tensorflow::Status(static_cast<::tensorflow::error::Code>(s.code()), - ::tensorflow::string(s.error_message())); -} - -// Global cache to reuse SentencePieceProcessor with the same -// model file or model proto. The instance is managed with shared_ptr so -// the instance is deleted when no client is using it (refcount is zero). -class SentencePieceProcessorCache { - public: - std::shared_ptr<SentencePieceProcessor> get( - const std::string key, bool is_proto, - sentencepiece::util::Status* status) { - std::lock_guard<std::mutex> l(mutex_); - - const uint64 fp = Hash64(key.data(), key.size()); - auto sp = data_[fp].lock(); - - if (sp) { - *status = sp->status(); - return sp; - } - - sp = std::make_shared<SentencePieceProcessor>(); - *status = is_proto ? sp->LoadFromSerializedProto(key) : sp->Load(key); - if (!status->ok()) return nullptr; - - data_[fp] = sp; - return sp; - } - - private: - std::mutex mutex_; - std::unordered_map<uint64, std::weak_ptr<SentencePieceProcessor>> data_; -}; - -class SentencePieceBaseOp : public OpKernel { - public: - explicit SentencePieceBaseOp(OpKernelConstruction* context) - : OpKernel(context) { - std::string model_file_attr, model_proto_attr; - OP_REQUIRES_OK(context, context->GetAttr("model_file", &model_file_attr)); - OP_REQUIRES_OK(context, context->GetAttr("model_proto", &model_proto_attr)); - - // Initializes global cache. - static SentencePieceProcessorCache* cache = new SentencePieceProcessorCache; - sentencepiece::util::Status status; - - OP_REQUIRES(context, - ((model_proto_attr.empty() && !model_file_attr.empty()) || - (!model_proto_attr.empty() && model_file_attr.empty())), - ::tensorflow::errors::InvalidArgument( - "Either `model_proto` or `model_file` must be set.")); - - if (!model_file_attr.empty()) { - sentencepiece_processor_ = cache->get(model_file_attr, false, &status); - } else { - // Loads serialized sentencepiece model proto to enable embedding the - // relatively small sentencepiece model proto into the tensorflow graph - // such that the tensorflow graph is self-contained. - sentencepiece_processor_ = cache->get(model_proto_attr, true, &status); - } - - OP_REQUIRES_OK(context, ToTFStatus(status)); - OP_REQUIRES(context, sentencepiece_processor_, - ::tensorflow::errors::InvalidArgument( - "Failed to initialize SentencePieceProcessor")); - - // Sets extra options to add <s>, </s>. - auto has_attribute = [&context](const std::string& name) { - bool flag = false; - context->GetAttr(name, &flag).IgnoreError(); - return flag; - }; - - if (has_attribute("add_bos")) { - bos_id_ = sentencepiece_processor_->bos_id(); - OP_REQUIRES(context, bos_id_ >= 0, - ::tensorflow::errors::InvalidArgument( - "`bos_id` is not defined in model")); - } - - if (has_attribute("add_eos")) { - eos_id_ = sentencepiece_processor_->eos_id(); - OP_REQUIRES(context, eos_id_ >= 0, - ::tensorflow::errors::InvalidArgument( - "`eos_id` is not defined in model")); - } - - reverse_ = has_attribute("reverse"); - - pad_id_ = sentencepiece_processor_->pad_id(); - if (pad_id_ == -1) pad_id_ = sentencepiece_processor_->unk_id(); - } - - protected: - void GetPad(int32* pad) const { *pad = pad_id_; } - - void GetPad(tstring* pad) const { - pad->clear(); - if (sentencepiece_processor_ && pad_id_ >= 0 && - pad_id_ != sentencepiece_processor_->unk_id()) - *pad = sentencepiece_processor_->IdToPiece(pad_id_); - } - - std::shared_ptr<SentencePieceProcessor> sentencepiece_processor_; - int bos_id_ = -1; - int eos_id_ = -1; - int pad_id_ = -1; - bool reverse_ = false; -}; -} // namespace - -class SentencePieceGetPieceSizeOp : public SentencePieceBaseOp { - public: - explicit SentencePieceGetPieceSizeOp(OpKernelConstruction* context) - : SentencePieceBaseOp(context) {} - - void Compute(OpKernelContext* context) override { - Tensor* vocab_size_tensor = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, {}, &vocab_size_tensor)); - vocab_size_tensor->scalar<int32>()() = - sentencepiece_processor_->GetPieceSize(); - } -}; - -template <typename S, typename T> -class SentencePieceConvertPieceOp : public SentencePieceBaseOp { - public: - explicit SentencePieceConvertPieceOp(OpKernelConstruction* context) - : SentencePieceBaseOp(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor* input_tensor = nullptr; - OP_REQUIRES_OK(context, context->input("input", &input_tensor)); - - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), - &output_tensor)); - for (int i = 0; i < input_tensor->NumElements(); ++i) - output_tensor->flat<T>()(i) = Convert(input_tensor->flat<S>()(i)); - } - - int32 Convert(const std::string& piece) const { - return sentencepiece_processor_->PieceToId(piece); - } - - std::string Convert(int32 id) const { - if (id >= 0 && id < sentencepiece_processor_->GetPieceSize()) { - return sentencepiece_processor_->IdToPiece(id); - } - return ""; - } -}; - -class SentencePieceGetPieceTypeOp : public SentencePieceBaseOp { - public: - explicit SentencePieceGetPieceTypeOp(OpKernelConstruction* context) - : SentencePieceBaseOp(context) { - OP_REQUIRES_OK(context, context->GetAttr("piece_type", &piece_type_)); - } - - void Compute(OpKernelContext* context) override { - const Tensor* input_tensor = nullptr; - OP_REQUIRES_OK(context, context->input("input", &input_tensor)); - - Tensor* output_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, input_tensor->shape(), - &output_tensor)); - - for (int i = 0; i < input_tensor->NumElements(); ++i) { - const int id = input_tensor->flat<int32>()(i); - switch (piece_type_) { - case 0: - output_tensor->flat<bool>()(i) = - sentencepiece_processor_->IsUnknown(id); - break; - case 1: - output_tensor->flat<bool>()(i) = - sentencepiece_processor_->IsControl(id); - break; - case 2: - output_tensor->flat<bool>()(i) = - sentencepiece_processor_->IsUnused(id); - break; - default: - break; - } - } - } - - private: - int piece_type_; -}; - -template <typename T, typename U = T> -class SentencePieceEncodeOpBase : public SentencePieceBaseOp { - public: - explicit SentencePieceEncodeOpBase(OpKernelConstruction* context) - : SentencePieceBaseOp(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor* input_tensor = nullptr; - - OP_REQUIRES_OK(context, context->input("input", &input_tensor)); - OP_REQUIRES(context, TensorShapeUtils::IsVector(input_tensor->shape()), - ::tensorflow::errors::InvalidArgument( - "`input` must be a vector, got shape: ", - input_tensor->shape().DebugString())); - const auto& input_sentences = input_tensor->vec<tstring>(); - const int64 batch_size = input_sentences.size(); - - const Tensor* nbest_size_tensor = nullptr; - OP_REQUIRES_OK(context, context->input("nbest_size", &nbest_size_tensor)); - OP_REQUIRES(context, nbest_size_tensor->dims() <= 1, - ::tensorflow::errors::InvalidArgument( - "`nbest_size` must be a scalar or vector. got shape: ", - nbest_size_tensor->shape().DebugString())); - if (nbest_size_tensor->dims() == 1) { - OP_REQUIRES( - context, batch_size == nbest_size_tensor->dim_size(0), - ::tensorflow::errors::InvalidArgument( - "`nbest_size` must have the same batch size as `input`.")); - } - - const Tensor* alpha_tensor = nullptr; - OP_REQUIRES_OK(context, context->input("alpha", &alpha_tensor)); - OP_REQUIRES(context, alpha_tensor->dims() <= 1, - ::tensorflow::errors::InvalidArgument( - "`alpha` must be a scalar or vector, got shape: ", - alpha_tensor->shape().DebugString())); - if (alpha_tensor->dims() == 1) { - OP_REQUIRES(context, batch_size == alpha_tensor->dim_size(0), - ::tensorflow::errors::InvalidArgument( - "`alpha` must have the same batch size as `input`.")); - } - - std::vector<std::vector<U>> pieces(batch_size); - - for (int64 i = 0; i < batch_size; ++i) { - const int32 nbest_size = nbest_size_tensor->dims() == 1 - ? nbest_size_tensor->vec<int32>()(i) - : nbest_size_tensor->scalar<int32>()(); - if (nbest_size == 0 || nbest_size == 1) { - OP_REQUIRES_OK(context, - ToTFStatus(sentencepiece_processor_->Encode( - absl::string_view(input_sentences(i)), &pieces[i]))); - } else { - const float alpha = alpha_tensor->dims() == 1 - ? alpha_tensor->vec<float>()(i) - : alpha_tensor->scalar<float>()(); - OP_REQUIRES_OK(context, - ToTFStatus(sentencepiece_processor_->SampleEncode( - absl::string_view(input_sentences(i)), nbest_size, - alpha, &pieces[i]))); - } - RewritePieces(&pieces[i]); - } - - MakeOutputTensor(context, pieces); - } - - protected: - void RewritePieces(std::vector<std::string>* pieces) const { - if (reverse_) std::reverse(pieces->begin(), pieces->end()); - if (bos_id_ > 0) - pieces->insert(pieces->begin(), - sentencepiece_processor_->IdToPiece(bos_id_)); - if (eos_id_ > 0) - pieces->push_back(sentencepiece_processor_->IdToPiece(eos_id_)); - } - - void RewritePieces(std::vector<int32>* pieces) const { - if (reverse_) std::reverse(pieces->begin(), pieces->end()); - if (bos_id_ > 0) pieces->insert(pieces->begin(), bos_id_); - if (eos_id_ > 0) pieces->push_back(eos_id_); - } - - virtual void MakeOutputTensor(OpKernelContext* context, - const std::vector<std::vector<U>>& pieces) = 0; -}; - -template <typename T, typename U = T> -class SentencePieceEncodeSparseOp : public SentencePieceEncodeOpBase<T, U> { - public: - explicit SentencePieceEncodeSparseOp(OpKernelConstruction* context) - : SentencePieceEncodeOpBase<T, U>(context) {} - - protected: - void MakeOutputTensor(OpKernelContext* context, - const std::vector<std::vector<U>>& pieces) override { - const int64 batch_size = pieces.size(); - - int64 max_sequence_length = 0; - int64 indices_size = 0; - for (int row = 0; row < batch_size; ++row) { - const int col_size = pieces[row].size(); - max_sequence_length = std::max<int64>(col_size, max_sequence_length); - indices_size += col_size; - } - - // Creates the indices output tensor. - Tensor* indices_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(0, {indices_size, 2}, - &indices_tensor)); - - auto indices_tensor_output = indices_tensor->matrix<int64>(); - int item_idx = 0; - for (int row = 0; row < batch_size; ++row) { - for (int col = 0; col < pieces[row].size(); ++col) { - indices_tensor_output(item_idx, 0) = row; - indices_tensor_output(item_idx, 1) = col; - ++item_idx; - } - } - - // Creates the values output tensor. - Tensor* values_tensor = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(1, {indices_size}, &values_tensor)); - - auto values_tensor_output = values_tensor->flat<T>(); - item_idx = 0; - for (int row = 0; row < batch_size; ++row) { - std::copy(pieces[row].begin(), pieces[row].end(), - &values_tensor_output(item_idx)); - item_idx += pieces[row].size(); - } - - // Creates the shape output tensor. - Tensor* shape_tensor = nullptr; - OP_REQUIRES_OK(context, context->allocate_output(2, {2}, &shape_tensor)); - - auto shape_tensor_output = shape_tensor->flat<int64>(); - shape_tensor_output(0) = batch_size; - shape_tensor_output(1) = max_sequence_length; - } -}; - -template <typename T, typename U = T> -class SentencePieceEncodeDenseOp : public SentencePieceEncodeOpBase<T, U> { - public: - explicit SentencePieceEncodeDenseOp(OpKernelConstruction* context) - : SentencePieceEncodeOpBase<T, U>(context) { - this->GetPad(&pad_); - } - - // protected: - void MakeOutputTensor(OpKernelContext* context, - const std::vector<std::vector<U>>& pieces) override { - const int64 batch_size = pieces.size(); - - int64 max_sequence_length = 0; - for (int row = 0; row < batch_size; ++row) { - max_sequence_length = - std::max<int64>(pieces[row].size(), max_sequence_length); - } - - Tensor* values_tensor = nullptr; - Tensor* length_tensor = nullptr; - - OP_REQUIRES_OK( - context, context->allocate_output(0, {batch_size, max_sequence_length}, - &values_tensor)); - OP_REQUIRES_OK(context, - context->allocate_output(1, {batch_size}, &length_tensor)); - - auto values_tensor_output = values_tensor->matrix<T>(); - auto length_tensor_output = length_tensor->vec<int32>(); - - U pad = pad_; - - for (int row = 0; row < batch_size; ++row) { - for (int col = 0; col < max_sequence_length; ++col) { - values_tensor_output(row, col) = - col < pieces[row].size() ? pieces[row][col] : pad; - } - length_tensor_output(row) = pieces[row].size(); - } - } - - private: - T pad_; -}; - -template <typename T, typename U = T> -class SentencePieceDecodeOp : public SentencePieceBaseOp { - public: - explicit SentencePieceDecodeOp(OpKernelConstruction* context) - : SentencePieceBaseOp(context) {} - - void Compute(OpKernelContext* context) override { - const Tensor* input_tensor = nullptr; - const Tensor* length_tensor = nullptr; - - OP_REQUIRES_OK(context, context->input("input", &input_tensor)); - OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_tensor->shape()), - ::tensorflow::errors::InvalidArgument( - "`input` must be a 2-D matrix. got shape: ", - input_tensor->shape().DebugString())); - OP_REQUIRES_OK(context, context->input("sequence_length", &length_tensor)); - OP_REQUIRES(context, TensorShapeUtils::IsVector(length_tensor->shape()), - ::tensorflow::errors::InvalidArgument( - "`sequence_length` must be a vector. got shape: ", - length_tensor->shape().DebugString())); - OP_REQUIRES( - context, input_tensor->dim_size(0) == length_tensor->dim_size(0), - ::tensorflow::errors::InvalidArgument( - "`sequence_length` must have the same batch size as `input`.")); - - const auto& input_sentences = input_tensor->matrix<T>(); - const auto& sequence_length = length_tensor->vec<int32>(); - const int64 batch_size = input_tensor->dim_size(0); - const int max_sequence_length = input_tensor->dim_size(1); - - Tensor* values_tensor = nullptr; - OP_REQUIRES_OK(context, - context->allocate_output(0, {batch_size}, &values_tensor)); - auto values_tensor_output = values_tensor->vec<tstring>(); - - for (int64 i = 0; i < batch_size; ++i) { - OP_REQUIRES(context, - (sequence_length(i) >= 0 && - sequence_length(i) <= max_sequence_length), - ::tensorflow::errors::InvalidArgument( - "`sequence_length` is out-of-range.")); - std::vector<U> pieces(&input_sentences(i, 0), - &input_sentences(i, 0) + sequence_length(i)); - if (reverse_) std::reverse(pieces.begin(), pieces.end()); - std::string detokenized_str; - OP_REQUIRES_OK(context, ToTFStatus(sentencepiece_processor_->Decode( - pieces, &detokenized_str))); - values_tensor_output(i) = detokenized_str; - } - } -}; - -namespace { -// The snake case of this variables are used as the function names. -constexpr char kGetPieceSizeOpName[] = "SentencepieceGetPieceSize"; -constexpr char kPieceToIdOpName[] = "SentencepiecePieceToId"; -constexpr char kIdToPieceOpName[] = "SentencepieceIdToPiece"; -constexpr char kGetPieceTypeOpName[] = "SentencepieceGetPieceType"; -constexpr char kEncodeDenseOpName[] = "SentencepieceEncodeDense"; -constexpr char kEncodeSparseOpName[] = "SentencepieceEncodeSparse"; -constexpr char kDecodeOpName[] = "SentencepieceDecode"; -} // namespace - -REGISTER_OP(kGetPieceSizeOpName) - .Output("vocab_size: int32") - .Attr("model_file: string = ''") - .Attr("model_proto: string = ''") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->MakeShape({})); - return ::tensorflow::Status::OK(); - }); - -REGISTER_KERNEL_BUILDER(Name(kGetPieceSizeOpName).Device(DEVICE_CPU), - SentencePieceGetPieceSizeOp); - -REGISTER_OP(kPieceToIdOpName) - .Input("input: string") - .Output("values: int32") - .Attr("model_file: string = ''") - .Attr("model_proto: string = ''") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return ::tensorflow::Status::OK(); - }); - -REGISTER_KERNEL_BUILDER(Name(kPieceToIdOpName).Device(DEVICE_CPU), - SentencePieceConvertPieceOp<tstring, int32>); - -REGISTER_OP(kIdToPieceOpName) - .Input("input: int32") - .Output("values: string") - .Attr("model_file: string = ''") - .Attr("model_proto: string = ''") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return ::tensorflow::Status::OK(); - }); - -REGISTER_KERNEL_BUILDER(Name(kIdToPieceOpName).Device(DEVICE_CPU), - SentencePieceConvertPieceOp<int32, tstring>); - -REGISTER_OP(kGetPieceTypeOpName) - .Input("input: int32") - .Output("values: bool") - .Attr("model_file: string = ''") - .Attr("model_proto: string = ''") - .Attr("piece_type: int = 0") - .SetShapeFn([](InferenceContext* c) { - c->set_output(0, c->input(0)); - return ::tensorflow::Status::OK(); - }); - -REGISTER_KERNEL_BUILDER(Name(kGetPieceTypeOpName).Device(DEVICE_CPU), - SentencePieceGetPieceTypeOp); - -REGISTER_OP(kEncodeDenseOpName) - .Attr("out_type: {int32, string} = DT_INT32") - .Input("input: string") - .Input("nbest_size: int32") - .Input("alpha: float") - .Output("values: out_type") - .Output("sequence_length: int32") - .Attr("model_file: string = ''") - .Attr("model_proto: string = ''") - .Attr("reverse: bool = false") - .Attr("add_bos: bool = false") - .Attr("add_eos: bool = false") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle input, nbest, alpha; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); - TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest)); - TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha)); - DimensionHandle batch_size = c->Dim(input, 0); - if (c->Rank(nbest) == 1) - TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size)); - if (c->Rank(alpha) == 1) - TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size)); - c->set_output(0, c->MakeShape({batch_size, c->UnknownDim()})); - c->set_output(1, c->MakeShape({batch_size})); - return ::tensorflow::Status::OK(); - }); - -REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName) - .Device(DEVICE_CPU) - .TypeConstraint<int32>("out_type"), - SentencePieceEncodeDenseOp<int32>); - -REGISTER_KERNEL_BUILDER(Name(kEncodeDenseOpName) - .Device(DEVICE_CPU) - .TypeConstraint<tstring>("out_type"), - SentencePieceEncodeDenseOp<tstring, std::string>); - -REGISTER_OP(kEncodeSparseOpName) - .Attr("out_type: {int32, string} = DT_INT32") - .Input("input: string") - .Input("nbest_size: int32") - .Input("alpha: float") - .Output("indices: int64") - .Output("values: out_type") - .Output("dense_shape: int64") - .Attr("model_file: string = ''") - .Attr("model_proto: string = ''") - .Attr("reverse: bool = false") - .Attr("add_bos: bool = false") - .Attr("add_eos: bool = false") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle input, nbest, alpha; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 1, &input)); - TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(1), 1, &nbest)); - TF_RETURN_IF_ERROR(c->WithRankAtMost(c->input(2), 1, &alpha)); - DimensionHandle batch_size = c->Dim(input, 0); - if (c->Rank(nbest) == 1) - TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(nbest, 0), &batch_size)); - if (c->Rank(alpha) == 1) - TF_RETURN_IF_ERROR(c->Merge(batch_size, c->Dim(alpha, 0), &batch_size)); - c->set_output(0, c->MakeShape({c->UnknownDim(), 2})); - c->set_output(1, c->MakeShape({c->UnknownDim()})); - c->set_output(2, c->MakeShape({2})); - return ::tensorflow::Status::OK(); - }); - -REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName) - .Device(DEVICE_CPU) - .TypeConstraint<int32>("out_type"), - SentencePieceEncodeSparseOp<int32>); - -REGISTER_KERNEL_BUILDER(Name(kEncodeSparseOpName) - .Device(DEVICE_CPU) - .TypeConstraint<tstring>("out_type"), - SentencePieceEncodeSparseOp<tstring, std::string>); - -REGISTER_OP(kDecodeOpName) - .Attr("T: {int32, string}") - .Input("input: T") - .Input("sequence_length: int32") - .Output("values: string") - .Attr("model_file: string = ''") - .Attr("model_proto: string = ''") - .Attr("reverse: bool = false") - .SetShapeFn([](InferenceContext* c) { - ShapeHandle input, sequence_length; - TF_RETURN_IF_ERROR(c->WithRank(c->input(0), 2, &input)); - TF_RETURN_IF_ERROR(c->WithRank(c->input(1), 1, &sequence_length)); - DimensionHandle batch_size = c->Dim(input, 0); - TF_RETURN_IF_ERROR( - c->Merge(batch_size, c->Dim(sequence_length, 0), &batch_size)); - c->set_output(0, c->MakeShape({batch_size})); - return ::tensorflow::Status::OK(); - }); - -REGISTER_KERNEL_BUILDER( - Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<int32>("T"), - SentencePieceDecodeOp<int32>); - -REGISTER_KERNEL_BUILDER( - Name(kDecodeOpName).Device(DEVICE_CPU).TypeConstraint<tstring>("T"), - SentencePieceDecodeOp<tstring, std::string>); -} // namespace sentencepiece diff --git a/tensorflow/setup.py b/tensorflow/setup.py deleted file mode 100755 index db3c6d9..0000000 --- a/tensorflow/setup.py +++ /dev/null @@ -1,52 +0,0 @@ -#!/usr/bin/env python - -# Copyright 2018 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.! - -from setuptools import setup -from setuptools import find_packages -import tensorflow as tf -import codecs -import string -import sys -import os - -sys.path.append(os.path.join('.', 'test')) - -with codecs.open(os.path.join('..', 'VERSION'), 'r', 'utf-8') as f: - version = f.read() - -setup(name = 'tf_sentencepiece', - author = 'Taku Kudo', - author_email='taku@google.com', - description = 'SentencePiece Encode/Decode ops for TensorFlow', - version=version, - url = 'https://github.com/google/sentencepiece', - license = 'Apache', - platforms = 'Unix', - packages=find_packages(exclude=['test']), - package_data={'tf_sentencepiece': ['_sentencepiece_processor_ops.so*']}, - classifiers = [ - 'Development Status :: 5 - Production/Stable', - 'Environment :: Console', - 'Intended Audience :: Developers', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Operating System :: Unix', - 'Programming Language :: Python', - 'Topic :: Text Processing :: Linguistic', - 'Topic :: Software Development :: Libraries :: Python Modules' - ], - keywords='tensorflow machine learning sentencepiece NLP segmentation', - test_suite = 'tf_sentencepiece_test.suite') diff --git a/tensorflow/test/__init__.py b/tensorflow/test/__init__.py deleted file mode 100644 index e69de29..0000000 --- a/tensorflow/test/__init__.py +++ /dev/null diff --git a/tensorflow/test/tf_sentencepiece_test.py b/tensorflow/test/tf_sentencepiece_test.py deleted file mode 100755 index 34f6fcd..0000000 --- a/tensorflow/test/tf_sentencepiece_test.py +++ /dev/null @@ -1,322 +0,0 @@ -#!/usr/bin/python -# -*- coding: utf-8 -*- - -import itertools as it -import os -import sys -import unittest -import tensorflow as tf -import tf_sentencepiece as tfspm - -try: - tf.Session = tf.compat.v1.Session - tf.sparse_tensor_to_dense = tf.compat.v1.sparse_tensor_to_dense -except: - pass - - -class SentencePieceProcssorOpTest(unittest.TestCase): - - def _getSentencePieceModelFile(self): - return os.path.join('..', 'python', 'test', 'test_model.model') - - def _getPieceSize(self): - return 1000 - - def _getExpected(self, - reverse=False, - add_bos=False, - add_eos=False, - padding=''): - # TF uses str(bytes) as a string representation. - padding = padding.encode('utf8') - sentences = [ - b'Hello world.', b'I have a pen.', b'I saw a girl with a telescope.' - ] - pieces = [[b'\xe2\x96\x81He', b'll', b'o', b'\xe2\x96\x81world', b'.'], - [ - b'\xe2\x96\x81I', b'\xe2\x96\x81have', b'\xe2\x96\x81a', - b'\xe2\x96\x81p', b'en', b'.' - ], - [ - b'\xe2\x96\x81I', b'\xe2\x96\x81saw', b'\xe2\x96\x81a', - b'\xe2\x96\x81girl', b'\xe2\x96\x81with', b'\xe2\x96\x81a', - b'\xe2\x96\x81', b'te', b'le', b's', b'c', b'o', b'pe', b'.' - ]] - ids = [[151, 88, 21, 887, 6], [9, 76, 11, 68, 98, 6], - [9, 459, 11, 939, 44, 11, 4, 142, 82, 8, 28, 21, 132, 6]] - seq_len = [5, 6, 14] - - if reverse: - ids = [x[::-1] for x in ids] - pieces = [x[::-1] for x in pieces] - - if add_bos: - ids = [[1] + x for x in ids] - pieces = [[b'<s>'] + x for x in pieces] - seq_len = [x + 1 for x in seq_len] - - if add_eos: - ids = [x + [2] for x in ids] - pieces = [x + [b'</s>'] for x in pieces] - seq_len = [x + 1 for x in seq_len] - - max_len = max(seq_len) - pieces = [x + [padding] * (max_len - len(x)) for x in pieces] - ids = [x + [0] * (max_len - len(x)) for x in ids] - - return sentences, pieces, ids, seq_len - - def testGetPieceSize(self): - sentencepiece_model_file = self._getSentencePieceModelFile() - - with tf.Session(): - s = tfspm.piece_size(model_file=sentencepiece_model_file) - self.assertEqual(s.eval(), self._getPieceSize()) - - def testConvertPiece(self): - sentencepiece_model_file = self._getSentencePieceModelFile() - (sentences, expected_pieces, expected_ids, - expected_seq_len) = self._getExpected(padding='<unk>') - - with tf.Session(): - ids_matrix = tfspm.piece_to_id( - tf.constant(expected_pieces), model_file=sentencepiece_model_file) - ids_vec = tfspm.piece_to_id( - tf.constant(expected_pieces[0]), model_file=sentencepiece_model_file) - ids_scalar = tfspm.piece_to_id( - tf.constant(expected_pieces[0][0]), - model_file=sentencepiece_model_file) - - self.assertEqual(ids_matrix.eval().tolist(), expected_ids) - self.assertEqual(ids_vec.eval().tolist(), expected_ids[0]) - self.assertEqual(ids_scalar.eval(), expected_ids[0][0]) - - pieces_matrix = tfspm.id_to_piece( - tf.constant(expected_ids), model_file=sentencepiece_model_file) - pieces_vec = tfspm.id_to_piece( - tf.constant(expected_ids[0]), model_file=sentencepiece_model_file) - pieces_scalar = tfspm.id_to_piece( - tf.constant(expected_ids[0][0]), model_file=sentencepiece_model_file) - - self.assertEqual(pieces_matrix.eval().tolist(), expected_pieces) - self.assertEqual(pieces_vec.eval().tolist(), expected_pieces[0]) - self.assertEqual(pieces_scalar.eval(), expected_pieces[0][0]) - - def testEncodeAndDecode(self): - sentencepiece_model_file = self._getSentencePieceModelFile() - - with tf.Session(): - for reverse, add_bos, add_eos in list( - it.product((True, False), repeat=3)): - (sentences, expected_pieces, expected_ids, - expected_seq_len) = self._getExpected( - reverse=reverse, add_bos=add_bos, add_eos=add_eos) - - # Encode sentences into pieces/ids. - s = tf.constant(sentences) - pieces, seq_len1 = tfspm.encode( - s, - model_file=sentencepiece_model_file, - reverse=reverse, - add_bos=add_bos, - add_eos=add_eos, - out_type=tf.string) - ids, seq_len2 = tfspm.encode( - s, - model_file=sentencepiece_model_file, - reverse=reverse, - add_bos=add_bos, - add_eos=add_eos) - - self.assertEqual(pieces.eval().tolist(), expected_pieces) - self.assertEqual(ids.eval().tolist(), expected_ids) - self.assertEqual(seq_len1.eval().tolist(), expected_seq_len) - self.assertEqual(seq_len2.eval().tolist(), expected_seq_len) - - # Decode pieces into sentences/ids. - pieces = tf.constant(expected_pieces) - ids = tf.constant(expected_ids) - seq_len = tf.constant(expected_seq_len, dtype=tf.int32) - decoded_sentences1 = tfspm.decode( - pieces, - seq_len, - model_file=sentencepiece_model_file, - reverse=reverse) - decoded_sentences2 = tfspm.decode( - ids, seq_len, model_file=sentencepiece_model_file, reverse=reverse) - - self.assertEqual(decoded_sentences1.eval().tolist(), sentences) - self.assertEqual(decoded_sentences2.eval().tolist(), sentences) - - def testSampleEncodeAndDecode(self): - sentencepiece_model_file = self._getSentencePieceModelFile() - sentences, _, _, _ = self._getExpected() - - with tf.Session(): - for n, a in [(-1, 0.1), (64, 0.1), (0, 0.0)]: - # Round trip test. - nbest_size = tf.constant(n) - alpha = tf.constant(a) - s = tf.constant(sentences) - - pieces, seq_len1 = tfspm.encode( - s, - nbest_size=nbest_size, - alpha=alpha, - model_file=sentencepiece_model_file, - out_type=tf.string) - ids, seq_len2 = tfspm.encode( - s, - nbest_size=nbest_size, - alpha=alpha, - model_file=sentencepiece_model_file) - decoded_sentences1 = tfspm.decode( - pieces, seq_len1, model_file=sentencepiece_model_file) - decoded_sentences2 = tfspm.decode( - ids, seq_len2, model_file=sentencepiece_model_file) - - self.assertEqual(decoded_sentences1.eval().tolist(), sentences) - self.assertEqual(decoded_sentences2.eval().tolist(), sentences) - - def testEncodeAndDecodeSparse(self): - sentencepiece_model_file = self._getSentencePieceModelFile() - - with tf.Session(): - for reverse, add_bos, add_eos in list( - it.product((True, False), repeat=3)): - (sentences, expected_pieces, expected_ids, - _) = self._getExpected(reverse, add_bos, add_eos) - - # Encode sentences into sparse pieces/ids. - s = tf.constant(sentences) - pieces = tfspm.encode_sparse( - s, - model_file=sentencepiece_model_file, - reverse=reverse, - add_bos=add_bos, - add_eos=add_eos, - out_type=tf.string) - ids = tfspm.encode_sparse( - s, - model_file=sentencepiece_model_file, - reverse=reverse, - add_bos=add_bos, - add_eos=add_eos) - pieces = tf.sparse_tensor_to_dense(pieces, default_value='') - ids = tf.sparse_tensor_to_dense(ids, default_value=0) - - self.assertEqual(ids.eval().tolist(), expected_ids) - self.assertEqual(pieces.eval().tolist(), expected_pieces) - - def testGetPieceType(self): - sentencepiece_model_file = self._getSentencePieceModelFile() - expected_is_unknown = [] - expected_is_control = [] - expected_is_unused = [] - ids = [] - - for i in range(self._getPieceSize()): - ids.append(i) - expected_is_unknown.append(i == 0) - expected_is_control.append(i == 1 or i == 2) - expected_is_unused.append(False) - - with tf.Session(): - s = tf.constant(ids) - is_unknown = tfspm.is_unknown(s, model_file=sentencepiece_model_file) - is_control = tfspm.is_control(s, model_file=sentencepiece_model_file) - is_unused = tfspm.is_unused(s, model_file=sentencepiece_model_file) - - self.assertEqual(is_unknown.eval().tolist(), expected_is_unknown) - self.assertEqual(is_control.eval().tolist(), expected_is_control) - self.assertEqual(is_unused.eval().tolist(), expected_is_unused) - - def testLoadModelProto(self): - # Makes a serialized model proto. - with open(self._getSentencePieceModelFile(), 'rb') as f: - model_proto = f.read() - with tf.Session() as sess: - sentences = ['Hello world.'] - a = tf.constant(sentences) - sess.run(tfspm.encode(a, model_proto=model_proto, out_type=tf.string)) - - def testInvalidModelPath(self): - with tf.Session() as sess: - with self.assertRaises(tf.errors.NotFoundError): - sentences = ['Hello world.'] - a = tf.constant(sentences) - sess.run(tfspm.encode(a, model_file='invalid path', out_type=tf.string)) - - def testInvalidModelProto(self): - with tf.Session() as sess: - with self.assertRaises(tf.errors.InternalError): - sentences = ['Hello world.'] - a = tf.constant(sentences) - sess.run( - tfspm.encode(a, model_proto='invalid proto', out_type=tf.string)) - - def testInvalidInput(self): - sentences = ['Hello world.', 'This is a test.'] - ids = [[0, 1], [2, 3]] - model_file = self._getSentencePieceModelFile() - with tf.Session() as sess: - a = tf.constant(sentences) - b = tf.constant(ids) - - alpha = tf.constant([1.0, 2.0]) - sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo')) - - nbest_size = tf.constant([1, 2], dtype=tf.int32) - sess.run( - tfspm.encode( - a, model_file=model_file, nbest_size=nbest_size, name='foo')) - - alpha = tf.constant(1.0) - sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha, name='foo')) - - nbest_size = tf.constant(10, dtype=tf.int32) - sess.run( - tfspm.encode( - a, model_file=model_file, nbest_size=nbest_size, name='foo')) - - sess.run( - tfspm.decode( - b, sequence_length=tf.constant([2, 2]), model_file=model_file)) - - with self.assertRaises(ValueError): - a = tf.constant(sentences) - alpha = tf.constant([1.0, 2.0, 3.0]) - sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha)) - with self.assertRaises(ValueError): - a = tf.constant(sentences) - nbest_size = tf.constant([1, 2, 3], dtype=tf.int32) - sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size)) - with self.assertRaises(ValueError): - a = tf.constant(sentences) - alpha = tf.constant([[1.0], [2.0]]) - sess.run(tfspm.encode(a, model_file=model_file, alpha=alpha)) - with self.assertRaises(ValueError): - a = tf.constant(sentences) - nbest_size = tf.constant([[1], [2]], dtype=tf.int32) - sess.run(tfspm.encode(a, model_file=model_file, nbest_size=nbest_size)) - with self.assertRaises(ValueError): - b = tf.constant(ids) - sess.run(tfspm.decode(a, sequence_length=2, model_file=model_file)) - with self.assertRaises(ValueError): - b = tf.constant(ids) - sess.run( - tfspm.decode( - a, - sequence_length=tf.constant([2, 2, 2]), - model_file=model_file)) - - -def suite(): - suite = unittest.TestSuite() - suite.addTests(unittest.makeSuite(SentencePieceProcssorOpTest)) - return suite - - -if __name__ == '__main__': - unittest.main() diff --git a/tensorflow/tf_sentencepiece/__init__.py b/tensorflow/tf_sentencepiece/__init__.py deleted file mode 100644 index 8a3df31..0000000 --- a/tensorflow/tf_sentencepiece/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - -from tf_sentencepiece.sentencepiece_processor_ops import * diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.0 Binary files differdeleted file mode 100755 index b7af398..0000000 --- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.0 +++ /dev/null diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.2 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.2 Binary files differdeleted file mode 100755 index b7af398..0000000 --- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.1.15.2 +++ /dev/null diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.0 Binary files differdeleted file mode 100755 index 06a5258..0000000 --- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.0 +++ /dev/null diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.1 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.1 Binary files differdeleted file mode 100755 index 06a5258..0000000 --- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.0.1 +++ /dev/null diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.1.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.1.0 Binary files differdeleted file mode 100755 index 31b6668..0000000 --- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.1.0 +++ /dev/null diff --git a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.2.0 b/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.2.0 Binary files differdeleted file mode 100755 index fa8b9ea..0000000 --- a/tensorflow/tf_sentencepiece/_sentencepiece_processor_ops.so.2.2.0 +++ /dev/null diff --git a/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py b/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py deleted file mode 100644 index 5c2aa39..0000000 --- a/tensorflow/tf_sentencepiece/sentencepiece_processor_ops.py +++ /dev/null @@ -1,317 +0,0 @@ -# Copyright 2018 Google Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License.! -r"""Ops for SentencePiece Encoding/Decoding.""" - -# TODO(taku): Implements n-best output - -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function -from distutils.version import LooseVersion - -import warnings -import glob -import re -import os -import tensorflow as tf - -so_base = os.path.join( - os.path.dirname(__file__), '_sentencepiece_processor_ops.so') -so_file = so_base + '.' + tf.__version__ - -if not hasattr(tf, 'no_gradient'): - tf.no_gradient = tf.NotDifferentiable - -if not os.path.exists(so_file): - versions = [ - re.search('so.([0-9]+\.[0-9\.]+.*)$', os.path.basename(n)).group(1) - for n in glob.glob(so_base + '.*') - ] - latest = sorted(versions, key=LooseVersion)[-1] - warnings.warn('No so file is found for [%s] from [%s]' % - (tf.__version__, ', '.join(versions))) - warnings.warn('use the latest version %s' % (latest)) - so_file = so_base + '.' + latest - -_gen_sentencepiece_processor_op = tf.load_op_library(so_file) - - -def piece_size(model_file=None, model_proto=None, name=None): - """Returns the piece size (vocabulary size). - - Args: - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - name: The name argument that is passed to the op function. - - Returns: - A scalar representing the vocabulary size. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_get_piece_size( - model_file=model_file, model_proto=model_proto, name=name) - - -def piece_to_id(input, model_file=None, model_proto=None, name=None): - """Converts piece into vocabulary id. - - Args: - input: An arbitrary tensor of string. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - name: The name argument that is passed to the op function. - - Returns: - A tensor of int32 with the same shape as input. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_piece_to_id( - input, model_file=model_file, model_proto=model_proto, name=name) - - -def id_to_piece(input, model_file=None, model_proto=None, name=None): - """Converts vocabulary id into piece. - - Args: - input: An arbitrary tensor of int32. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - name: The name argument that is passed to the op function. - - Returns: - A tensor of string with the same shape as input. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_id_to_piece( - input, model_file=model_file, model_proto=model_proto, name=name) - - -def is_unknown(input, model_file=None, model_proto=None, name=None): - """Returns true if input id is unknown piece. - - Args: - input: An arbitrary tensor of int32. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - name: The name argument that is passed to the op function. - - Returns: - A tensor of bool with the same shape as input. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_get_piece_type( - input, - model_file=model_file, - model_proto=model_proto, - name=name, - piece_type=0) - - -def is_control(input, model_file=None, model_proto=None, name=None): - """Returns true if input id is control piece. - - Args: - input: An arbitrary tensor of int32. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - name: The name argument that is passed to the op function. - - Returns: - A tensor of bool with the same shape as input. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_get_piece_type( - input, - model_file=model_file, - model_proto=model_proto, - name=name, - piece_type=1) - - -def is_unused(input, model_file=None, model_proto=None, name=None): - """Returns true if input id is unused piece. - - Args: - input: An arbitrary tensor of int32. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - name: The name argument that is passed to the op function. - - Returns: - A tensor of bool with the same shape as input. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_get_piece_type( - input, - model_file=model_file, - model_proto=model_proto, - name=name, - piece_type=2) - - -def encode_dense(input_sentences, - nbest_size=0, - alpha=1.0, - model_file=None, - model_proto=None, - reverse=False, - add_bos=False, - add_eos=False, - out_type=tf.int32, - name=None): - """Encodes sentences into pieces in dense tensor format. - - Args: - input_sentences: A 1D string tensor of arbitrary size holding the raw text - of input sentences. - nbest_size: A scalar or 1D tensor for sampling. - nbest_size = {0,1}: No sampling is performed. - nbest_size > 1: samples from the nbest_size results. - nbest_size < 0: assuming that nbest_size is infinite and samples - from the all hypothesis (lattice) using - forward-filtering-and-backward-sampling algorithm. - alpha: A scalar or 1D tensor for a smoothing parameter. Inverse temperature - for probability rescaling. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - reverse: Reverses the tokenized sequence (Default = false) - add_bos: Add <s> to the result (Default = false) - add_eos: Add </s> to the result (Default = false) <s>/</s> is added after - reversing (if enabled). - out_type: output type. tf.int32 or tf.string (Default = tf.int32) Setting - tf.int32 directly encodes the string into an id sequence. - name: The name argument that is passed to the op function. - - Returns: - pieces: A dense 2D tensor representing the tokenized sentences. - sequence_length: A 1D tensor representing the length of pieces. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_encode_dense( - input_sentences, - nbest_size=nbest_size, - alpha=alpha, - model_file=model_file, - model_proto=model_proto, - reverse=reverse, - add_bos=add_bos, - add_eos=add_eos, - out_type=out_type, - name=name) - - -def encode_sparse(input_sentences, - nbest_size=0, - alpha=1.0, - model_file=None, - model_proto=None, - reverse=False, - add_bos=False, - add_eos=False, - out_type=tf.int32, - name=None): - """Encodes sentences into pieces in sparse tensor format. - - Args: - input_sentences: A 1D string tensor of arbitrary size holding the raw text - of input sentences. - nbest_size: A scalar or 1D tensor for sampling. - nbest_size = {0,1}: No sampling is performed. - nbest_size > 1: samples from the nbest_size results. - nbest_size < 0: assuming that nbest_size is infinite and samples - from the all hypothesis (lattice) using - forward-filtering-and-backward-sampling algorithm. - alpha: A scalar or 1D tensor for a smoothing parameter. Inverse temperature - for probability rescaling. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - reverse: Reverses the tokenized sequence (Default = false) - add_bos: Add <s> to the result (Default = false) - add_eos: Add </s> to the result (Default = false) <s>/</s> is added after - reversing (if enabled). - out_type: output type. tf.int32 or tf.string (Default = tf.int32) Setting - tf.int32 directly encodes the string into an id sequence. - name: The name argument that is passed to the op function. - - Returns: - pieces: A sparse 2D tensor representing the tokenized sentences. - """ - - indices, values, dense_shape = ( - _gen_sentencepiece_processor_op.sentencepiece_encode_sparse( - input_sentences, - nbest_size=nbest_size, - alpha=alpha, - model_file=model_file, - model_proto=model_proto, - reverse=reverse, - add_bos=add_bos, - add_eos=add_eos, - out_type=out_type, - name=name)) - return tf.SparseTensor(indices, values, dense_shape) - - -def decode(pieces, - sequence_length, - model_file=None, - model_proto=None, - reverse=False, - name=None): - """Decode pieces into postprocessed text. - - Args: - pieces: A 2D int32 or string tensor [batch_size x max_length] of encoded - sequences. - sequence_length: A 1D int32 tensor [batch_size] representing the length of - pieces. - model_file: The sentencepiece model file path. - model_proto: The sentencepiece model serialized proto. Either `model_file` - or `model_proto` must be set. - reverse: Reverses the tokenized sequence (Default = false) - name: The name argument that is passed to the op function. - - Returns: - text: A 1D string tensor of decoded string. - """ - - return _gen_sentencepiece_processor_op.sentencepiece_decode( - pieces, - sequence_length, - model_file=model_file, - model_proto=model_proto, - reverse=reverse, - name=name) - - -# Adds an alias for encode_dense. Accepts the `encode` function. -encode = encode_dense -sparse_encode = encode_sparse -dense_encode = encode_dense - -tf.no_gradient('SentencepieceGetPieceSize') -tf.no_gradient('SentencepieceIdToPiece') -tf.no_gradient('SentencepiecePieceToId') -tf.no_gradient('SentencepieceGetPieceType') -tf.no_gradient('SentencepieceEncodeDense') -tf.no_gradient('SentencepieceEncodeSparse') -tf.no_gradient('SentencepieceDecode') @@ -62,15 +62,6 @@ build_python() { cd .. } -build_tensorflow() { - cd tensorflow - pip3 install tensorflow - python3 setup.py bdist_wheel - python3 setup.py sdist - python3 setup.py test - cd .. -} - build_linux_gcc_coverall_ubuntu() { setup_debian apt-get install -y lcov @@ -78,7 +69,6 @@ build_linux_gcc_coverall_ubuntu() { pip3 install 'requests[security]' build_generic build_python - build_tensorflow mkdir -p build cd build cmake .. -DSPM_COVERAGE=ON @@ -92,13 +82,6 @@ build_linux_gcc_ubuntu() { setup_ubuntu build_generic build_python - build_tensorflow -} - -build_linux_gcc_ubuntu_no_tf() { - setup_ubuntu - build_generic - build_python } build_linux_gcc_ubuntu_i386() { @@ -111,14 +94,12 @@ build_linux_gcc_debian() { setup_debian build_generic build_python - build_tensorflow } build_linux_gcc_fedora() { setup_fedora build_generic build_python -# build_tensorflow } build_linux_clang_ubuntu() { |