diff options
author | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2018-02-22 04:44:04 +0300 |
---|---|---|
committer | Marcin Junczys-Dowmunt <junczys@amu.edu.pl> | 2018-02-22 04:44:04 +0300 |
commit | d9d66f416b3284516f44e5b8e7b1fc7a0623e33f (patch) | |
tree | 59cd81a1b74c4ce395279b552bf7e6b724e82a4a | |
parent | 6e421f7a741dca8d7181f87acd398da60bb77f7d (diff) |
prototype cpu version
49 files changed, 1981 insertions, 1055 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 23598378..2b9067e5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,14 @@ endif(Tcmalloc_FOUND) # add_definitions(-DMPI_FOUND=1) #endif(MPI_FOUND) +# currently for normal (openblas) blas +find_package(BLAS) +if(BLAS_FOUND) + include_directories(${BLAS_INCLUDE_DIRS}) + set(EXT_LIBS ${EXT_LIBS} ${BLAS_LIBRARIES}) + add_definitions(-DBLAS_FOUND=1) +endif(BLAS_FOUND) + find_package(ZLIB) if(ZLIB_FOUND) include_directories(${ZLIB_INCLUDE_DIRS}) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 92c8a776..cb1606f6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -8,20 +8,24 @@ cuda_add_library(marian 3rd_party/cnpy/cnpy.cpp 3rd_party/exception.cpp 3rd_party/svd/svd.cpp -# tensors/tensor.cu - tensors/device.cu - tensors/device.cpp + tensors/backend.cpp + tensors/gpu/device.cu + tensors/cpu/device.cpp tensors/gpu/algorithm.cu tensors/gpu/dropout.cu tensors/cpu/dropout.cpp - kernels/tensor_operators.cu - kernels/cudnn_wrappers.cu + tensors/gpu/prod.cu + tensors/cpu/prod.cpp + tensors/gpu/tensor_operators.cu + tensors/cpu/tensor_operators.cpp + tensors/gpu/cudnn_wrappers.cu + graph/expression_graph.cpp graph/expression_operators.cu graph/node.cpp graph/node_operators.cu - graph/node_initializers.cu + graph/node_initializers.cpp layers/convolution.cu rnn/cells.cu diff --git a/src/examples/mnist/model_lenet.h b/src/examples/mnist/model_lenet.h index 968ceaf3..8dd96634 100644 --- a/src/examples/mnist/model_lenet.h +++ b/src/examples/mnist/model_lenet.h @@ -33,6 +33,9 @@ protected: // Construct hidden layers + ABORT("TEMPORARY"); + + /* auto conv_1 = convolution(g) ("prefix", "conv_1") ("kernel-dims", std::make_pair(3,3)) @@ -99,6 +102,7 @@ protected: // Define a top-level node for inference return logsoftmax(last); } + */; } }; } diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 934e2b73..520476ae 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -1,7 +1,7 @@ #include <sstream> #include "graph/expression_graph.h" -#include "tensors/dispatch.h" +#include "tensors/tensor_operators.h" namespace marian { diff --git a/src/graph/expression_operators.cu b/src/graph/expression_operators.cu index 20f025ad..0cb65c1e 100644 --- a/src/graph/expression_operators.cu +++ b/src/graph/expression_operators.cu @@ -1,5 +1,5 @@ #include "graph/expression_operators.h" -#include "kernels/sparse.h" +//#include "kernels/sparse.h" #include "layers/constructors.h" #include "graph/node_operators.h" @@ -333,81 +333,81 @@ Expr shift(Expr a, Shape shift) { // return Expression<LexicalProbNodeOp>(logits, att, eps, lf); //} -Expr avg_pooling( - Expr x, - int height, - int width, - int padHeight, - int padWidth, - int strideHeight, - int strideWidth) { - return Expression<PoolingOp>(x, - height, - width, - padHeight, - padWidth, - strideHeight, - strideWidth, - "avg"); -} - -Expr max_pooling( - Expr x, - int height, - int width, - int padHeight, - int padWidth, - int strideHeight, - int strideWidth) -{ - return Expression<PoolingOp>(x, - height, - width, - padHeight, - padWidth, - strideHeight, - strideWidth, - "max"); -} - -Expr convert2cudnnFormat(Expr x) { - int numWords = x->shape()[0]; - int numExamples = x->shape()[1]; - int embSize = x->shape()[2]; - - std::vector<size_t> newIndeces; - for (int b = 0; b < numExamples; ++b) { - for (int t = 0; t < numWords; ++t) { - newIndeces.push_back((t * numExamples) + b); - } - } - - auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]}); - - Shape outShape({numExamples, 1, numWords, embSize}); - return reshape(rows(xRows, newIndeces), outShape); -} - -Expr convertFromcudnnFormat(Expr x) { - int batchDim = x->shape()[0]; - int sentenceDim = x->shape()[2]; - int embSize = x->shape()[3]; - - auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize}); - - std::vector<size_t> newIndeces; - for (int t = 0; t < sentenceDim; ++t) { - for (int b = 0; b < batchDim; ++b) { - newIndeces.push_back(b * sentenceDim + t); - } - } - - Shape shape({batchDim, sentenceDim, embSize}); - return reshape(rows(reshapedX, newIndeces), shape); -} - -Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) { - return Expression<PoolingWithMaskingOp>(x, mask, width, isEven); -} +//Expr avg_pooling( +// Expr x, +// int height, +// int width, +// int padHeight, +// int padWidth, +// int strideHeight, +// int strideWidth) { +// return Expression<PoolingOp>(x, +// height, +// width, +// padHeight, +// padWidth, +// strideHeight, +// strideWidth, +// "avg"); +//} +// +//Expr max_pooling( +// Expr x, +// int height, +// int width, +// int padHeight, +// int padWidth, +// int strideHeight, +// int strideWidth) +//{ +// return Expression<PoolingOp>(x, +// height, +// width, +// padHeight, +// padWidth, +// strideHeight, +// strideWidth, +// "max"); +//} +// +//Expr convert2cudnnFormat(Expr x) { +// int numWords = x->shape()[0]; +// int numExamples = x->shape()[1]; +// int embSize = x->shape()[2]; +// +// std::vector<size_t> newIndeces; +// for (int b = 0; b < numExamples; ++b) { +// for (int t = 0; t < numWords; ++t) { +// newIndeces.push_back((t * numExamples) + b); +// } +// } +// +// auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]}); +// +// Shape outShape({numExamples, 1, numWords, embSize}); +// return reshape(rows(xRows, newIndeces), outShape); +//} +// +//Expr convertFromcudnnFormat(Expr x) { +// int batchDim = x->shape()[0]; +// int sentenceDim = x->shape()[2]; +// int embSize = x->shape()[3]; +// +// auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize}); +// +// std::vector<size_t> newIndeces; +// for (int t = 0; t < sentenceDim; ++t) { +// for (int b = 0; b < batchDim; ++b) { +// newIndeces.push_back(b * sentenceDim + t); +// } +// } +// +// Shape shape({batchDim, sentenceDim, embSize}); +// return reshape(rows(reshapedX, newIndeces), shape); +//} +// +//Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) { +// return Expression<PoolingWithMaskingOp>(x, mask, width, isEven); +//} } diff --git a/src/graph/node_initializers.cu b/src/graph/node_initializers.cpp index 2010c17d..20aa9603 100644 --- a/src/graph/node_initializers.cu +++ b/src/graph/node_initializers.cpp @@ -1,7 +1,3 @@ -// TODO: move to backend, into graph/ - -#include "kernels/cuda_helpers.h" -#include "kernels/tensor_operators.h" #include "graph/node_initializers.h" #include "3rd_party/svd/svd.h" @@ -145,16 +141,19 @@ std::function<void(Tensor)> from_word2vec(const std::string& file, int dimVoc, int dimEmb, bool normalize /*= false*/) { - using namespace functional; - return [file, dimVoc, dimEmb, normalize](Tensor t) { auto embs = Word2VecReader().read(file, dimVoc, dimEmb); - t->set(embs); + if(normalize) { - float l2Norm = L2Norm(t); - if(l2Norm != 0) - Element(_1 = _1 / l2Norm, t); + float norm = 0; + for(auto e : embs) + norm += e * e; + norm = std::sqrt(norm); + if(norm != 0) + for(auto& e : embs) + e = e / norm; } + t->set(embs); }; } } diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 7d6836a3..9b0655ec 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -2,11 +2,9 @@ #include <thread> -#include "tensors/gpu/backend.h" #include "graph/node.h" #include "functional/functional.h" -#include "kernels/tensor_operators.h" -#include "kernels/cudnn_wrappers.h" +#include "tensors/tensor_operators.h" namespace marian { @@ -54,7 +52,6 @@ public: NodeOps forwardOps() { // C = alpha * dot(op(A), op(B)) return {NodeOp(Prod( - std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(), val_, child(0)->val(), child(1)->val(), @@ -72,18 +69,14 @@ public: // to sum gradients from different graph parts if(!transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), adj_, child(1)->val(), false, false, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), adj_, child(0)->val(), true, @@ -92,18 +85,14 @@ public: scalar_))}; if(transA_ && !transB_) - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), child(1)->val(), adj_, false, true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), child(0)->val(), adj_, false, @@ -112,18 +101,14 @@ public: scalar_))}; if(transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), child(1)->val(), adj_, true, true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), adj_, child(0)->val(), true, @@ -131,18 +116,14 @@ public: 1.0, scalar_))}; - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), adj_, child(1)->val(), false, true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), child(0)->val(), adj_, true, @@ -198,7 +179,6 @@ public: using namespace functional; return { NodeOp(Prod( - std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(), val_, child(0)->val(), child(1)->val(), @@ -219,18 +199,14 @@ public: using namespace functional; if(!transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), adj_, child(1)->val(), false, false, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), adj_, child(0)->val(), true, @@ -240,18 +216,14 @@ public: NodeOp(Add(_1, child(2)->grad(), adj_))}; if(transA_ && !transB_) - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), child(1)->val(), adj_, false, true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), child(0)->val(), adj_, false, @@ -261,18 +233,14 @@ public: NodeOp(Add(_1, child(2)->grad(), adj_))}; if(transA_ && transB_) - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), child(1)->val(), adj_, true, true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), adj_, child(0)->val(), true, @@ -281,18 +249,14 @@ public: scalar_)), NodeOp(Add(_1, child(2)->grad(), adj_))}; - return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + return {NodeOp(Prod(child(0)->grad(), adj_, child(1)->val(), false, true, 1.0, scalar_)), - NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(Prod(child(1)->grad(), child(0)->val(), adj_, true, @@ -350,7 +314,6 @@ public: NodeOps forwardOps() { // C = alpha * dot(op(A), op(B)) return {NodeOp(ProdBatched( - std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(), val_, child(0)->val(), child(1)->val(), @@ -369,18 +332,14 @@ public: if(!transA_ && transB_) return { - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + NodeOp(ProdBatched(child(0)->grad(), adj_, child(1)->val(), false, false, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(ProdBatched(child(1)->grad(), adj_, child(0)->val(), true, @@ -390,18 +349,14 @@ public: if(transA_ && !transB_) return { - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + NodeOp(ProdBatched(child(0)->grad(), child(1)->val(), adj_, false, true, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(ProdBatched(child(1)->grad(), child(0)->val(), adj_, false, @@ -411,18 +366,14 @@ public: if(transA_ && transB_) return { - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + NodeOp(ProdBatched(child(0)->grad(), child(1)->val(), adj_, true, true, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(ProdBatched(child(1)->grad(), adj_, child(0)->val(), true, @@ -431,18 +382,14 @@ public: scalar_))}; return { - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(0)->grad(), + NodeOp(ProdBatched(child(0)->grad(), adj_, child(1)->val(), false, true, 1.0, scalar_)), - NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend()) - ->getCublasHandle(), - child(1)->grad(), + NodeOp(ProdBatched(child(1)->grad(), child(0)->val(), adj_, true, @@ -796,46 +743,46 @@ struct HighwayNodeOp : public NaryNodeOp { const std::string type() { return "highway"; } }; -class ConvolutionOp : public NaryNodeOp { -public: - ConvolutionOp( - const std::vector<Expr>& nodes, - int hPad = 0, - int wPad = 0, - int hStride = 1, - int wStride = 1) - : NaryNodeOp(nodes), - conv_(nodes[1]->shape(), - nodes[2]->shape(), - hPad, - wPad, - hStride, - wStride) { - conv_.getOutputShape(nodes[0]->shape(), shape_); - } - - NodeOps forwardOps() { - return {NodeOp(conv_.forward( - child(0)->val(), - child(1)->val(), - child(2)->val(), - val_))}; - } - - NodeOps backwardOps() { - return {NodeOp(conv_.backward( - child(0)->val(), - child(0)->grad(), - child(1)->val(), - child(1)->grad(), - child(2)->grad(), - adj_))}; - } - - const std::string type() { return "layer_convolution"; } - -protected: - ConvolutionWrapper conv_; -}; +//class ConvolutionOp : public NaryNodeOp { +//public: +// ConvolutionOp( +// const std::vector<Expr>& nodes, +// int hPad = 0, +// int wPad = 0, +// int hStride = 1, +// int wStride = 1) +// : NaryNodeOp(nodes), +// conv_(nodes[1]->shape(), +// nodes[2]->shape(), +// hPad, +// wPad, +// hStride, +// wStride) { +// conv_.getOutputShape(nodes[0]->shape(), shape_); +// } +// +// NodeOps forwardOps() { +// return {NodeOp(conv_.forward( +// child(0)->val(), +// child(1)->val(), +// child(2)->val(), +// val_))}; +// } +// +// NodeOps backwardOps() { +// return {NodeOp(conv_.backward( +// child(0)->val(), +// child(0)->grad(), +// child(1)->val(), +// child(1)->grad(), +// child(2)->grad(), +// adj_))}; +// } +// +// const std::string type() { return "layer_convolution"; } +// +//protected: +// ConvolutionWrapper conv_; +//}; } diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 0170fc73..07c06fda 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -4,10 +4,11 @@ #include "tensors/gpu/backend.h" #include "graph/node.h" -#include "kernels/sparse.h" -#include "kernels/tensor_operators.h" +//#include "kernels/sparse.h" +#include "tensors/tensor_operators.h" #include "functional/functional.h" -#include "kernels/cudnn_wrappers.h" + +#include "tensors/gpu/cudnn_wrappers.h" namespace marian { @@ -735,12 +736,12 @@ struct SelectNodeOp : public UnaryNodeOp { NodeOps forwardOps() { return {NodeOp( - Select(graph()->allocator(), val_, child(0)->val(), axis_, indices_))}; + Select(val_, child(0)->val(), axis_, indices_, graph()->allocator()))}; } NodeOps backwardOps() { return {NodeOp( - Insert(graph()->allocator(), child(0)->grad(), adj_, axis_, indices_))}; + Insert(child(0)->grad(), adj_, axis_, indices_, graph()->allocator()))}; } Shape newShape(Expr a, int axis, const std::vector<size_t>& indeces) { @@ -985,7 +986,7 @@ struct ShiftNodeOp : public UnaryNodeOp { : UnaryNodeOp(a, keywords::shape = a->shape(), args...), shift_(shift) {} NodeOps forwardOps() { - return {NodeOp(Shift(val_, child(0)->val(), shift_))}; + return {NodeOp(Shift(val_, child(0)->val(), shift_, false))}; } NodeOps backwardOps() { @@ -1054,84 +1055,84 @@ struct ShiftNodeOp : public UnaryNodeOp { // Ptr<sparse::CSR> lf_; //}; -class PoolingOp : public UnaryNodeOp { -public: - PoolingOp(Expr x, - int height, - int width, - int padHeight, - int padWidth, - int strideHeight, - int strideWidth, - std::string mode) - : UnaryNodeOp(x), - pooling_(height, - width, - padHeight, - padWidth, - strideHeight, - strideWidth, - mode) { - } - - NodeOps forwardOps() { - return {NodeOp(pooling_.forward(child(0)->val(), val_))}; - } - - NodeOps backwardOps() { - return {NodeOp(pooling_.backward( - child(0)->val(), - child(0)->grad(), - val_, - adj_))}; - } - - const std::string type() { return "layer_pooling"; } - - -protected: - PoolingWrapper pooling_; -}; - -class PoolingWithMaskingOp : public UnaryNodeOp { - public: - PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false) - : UnaryNodeOp(x), - mask_(mask), - width_(width), - isEven_(isEven) - { - auto xShape = x->shape(); - int dimBatch = xShape[0]; - int dimWord = xShape[1]; - int cols = (isEven_) ? xShape[2] - 1 : xShape[2]; - int dimSentence = (cols / width_) + (cols % width_ != 0); - shape_ = {dimBatch, dimWord, dimSentence}; - } - - NodeOps forwardOps() { - return {NodeOp(PoolingWithMaskingForward(val_, - child(0)->val(), - mask_->val(), - width_, - isEven_))}; - } - - NodeOps backwardOps() { - return {NodeOp(PoolingWithMaskingBackward(adj_, - child(0)->grad(), - child(0)->val(), - mask_->val(), - width_, - isEven_))}; - } - - const std::string type() {return "layer_pooling";} - - protected: - Expr mask_; - int width_; - bool isEven_; -}; +//class PoolingOp : public UnaryNodeOp { +//public: +// PoolingOp(Expr x, +// int height, +// int width, +// int padHeight, +// int padWidth, +// int strideHeight, +// int strideWidth, +// std::string mode) +// : UnaryNodeOp(x), +// pooling_(height, +// width, +// padHeight, +// padWidth, +// strideHeight, +// strideWidth, +// mode) { +// } +// +// NodeOps forwardOps() { +// return {NodeOp(pooling_.forward(child(0)->val(), val_))}; +// } +// +// NodeOps backwardOps() { +// return {NodeOp(pooling_.backward( +// child(0)->val(), +// child(0)->grad(), +// val_, +// adj_))}; +// } +// +// const std::string type() { return "layer_pooling"; } +// +// +//protected: +// PoolingWrapper pooling_; +//}; +// +//class PoolingWithMaskingOp : public UnaryNodeOp { +// public: +// PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false) +// : UnaryNodeOp(x), +// mask_(mask), +// width_(width), +// isEven_(isEven) +// { +// auto xShape = x->shape(); +// int dimBatch = xShape[0]; +// int dimWord = xShape[1]; +// int cols = (isEven_) ? xShape[2] - 1 : xShape[2]; +// int dimSentence = (cols / width_) + (cols % width_ != 0); +// shape_ = {dimBatch, dimWord, dimSentence}; +// } +// +// NodeOps forwardOps() { +// return {NodeOp(PoolingWithMaskingForward(val_, +// child(0)->val(), +// mask_->val(), +// width_, +// isEven_))}; +// } +// +// NodeOps backwardOps() { +// return {NodeOp(PoolingWithMaskingBackward(adj_, +// child(0)->grad(), +// child(0)->val(), +// mask_->val(), +// width_, +// isEven_))}; +// } +// +// const std::string type() {return "layer_pooling";} +// +// protected: +// Expr mask_; +// int width_; +// bool isEven_; +//}; } diff --git a/src/kernels/tensor_operators.h b/src/kernels/tensor_operators.h deleted file mode 100644 index 77e58670..00000000 --- a/src/kernels/tensor_operators.h +++ /dev/null @@ -1,396 +0,0 @@ -#pragma once - -#include <cublas_v2.h> - -#include <thrust/pair.h> - -#include "tensors/tensor.h" - -#include "tensors/allocator.h" - -#include "gpu/shape.h" -#include "gpu/tmp.h" -#include "gpu/tensor.h" -#include "functional/functional.h" - -namespace marian { - -bool IsNan(Tensor in); - -const int MAX_THREADS = 512; -const int MAX_BLOCKS = 65535; - -cublasHandle_t create_handle(size_t); - -template <size_t K, bool broadcast, class Functor> -__global__ void gElement(Functor functor, - gpu::Array<gpu::Tensor<float>, K> tensors) { - - int length = tensors[0].shape().elements(); - gpu::Array<int, gpu::Shape::size()> dims; - gpu::Array<int, K> indices; - - for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { - int index = bid + blockDim.x * blockIdx.x + threadIdx.x; - if(index < length) { - - indices.fill(index); - - if(broadcast) { - tensors[0].shape().dims(index, dims); - for(int i = 1; i < K; ++i) - indices[i] = tensors[i].shape().bindex(dims); - } - - tensors[0][index] = gpu::apply(functor, tensors, indices); - } - } -} - -template <class Functor, class ...Tensors> -void Element(Functor functor, Tensor out, Tensors ...tensors) { - cudaSetDevice(out->getDevice().no); - - constexpr size_t K = sizeof...(tensors) + 1; - gpu::Array<gpu::Tensor<float>, K> gTensors = {out, tensors...}; - - int length = gTensors[0].shape().elements(); - int threads = std::min(MAX_THREADS, length); - int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); - - bool broadcast = false; - for(int i = 1; i < K; ++i) - broadcast = broadcast || gTensors[0].shape() != gTensors[i].shape(); - - if(broadcast) - gElement<K, true><<<blocks, threads>>>(functor, gTensors); - else - gElement<K, false><<<blocks, threads>>>(functor, gTensors); -} - -void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis); - -void Select(Ptr<Allocator> allocator, - Tensor out, - Tensor in, - int axis, - const std::vector<size_t>&); - -void Insert(Ptr<Allocator> allocator, - Tensor out, - Tensor in, - int axis, - const std::vector<size_t>&); - -void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax); - -void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax); - -template <size_t K, class Functor> -__global__ void gAddGeneric(Functor functor, - const gpu::Shape full, - gpu::Tensor<float> out, - gpu::Array<gpu::Tensor<float>, K> ins, - float scale = 1.0) { - - int outLength = out.shape().elements(); - bool same = outLength == full.elements(); - for(int i = 0; i < K; ++i) - same = same && outLength == ins[i].shape().elements(); - - constexpr size_t N = gpu::Shape::size(); - gpu::Array<int, N> len; - for(int i = 0; i < N; ++i) - len[i] = full[i] / out.shape()[i]; - - gpu::Array<int, N> dims; - for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) { - int index = bid + blockDim.x * blockIdx.x + threadIdx.x; - if(index < outLength) { - - if(same) { - out[index] += gpu::apply(functor, ins, index) * scale; - } else { - out.shape().dims(index, dims); - out[index] += gpu::loops(functor, ins, len, dims) * scale; - } - - } - } -} - -template <size_t K, class Functor> -__global__ void gAddEqual(Functor functor, - gpu::Tensor<float> out, - gpu::Array<gpu::Tensor<float>, K> ins, - float scale, - bool broadcast) { - int length = out.shape().elements(); - gpu::Array<int, gpu::Shape::size()> dims; - - for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { - int index = bid + blockDim.x * blockIdx.x + threadIdx.x; - if(index < length) { - gpu::Array<int, K> indices; - indices.fill(index); - - if(broadcast) { - out.shape().dims(index, dims); - for(size_t i = 0; i < K; ++i) - indices[i] = ins[i].shape().bindex(dims); - } - - out[index] += gpu::apply(functor, ins, indices) * scale; - } - } -} - -template <size_t K, class Functor> -__global__ void gAddReduce(Functor functor, - const gpu::Shape full, - gpu::Tensor<float> out, - gpu::Array<gpu::Tensor<float>, K> ins, - float scale = 1.0) { - - int rows = full.elements() / full.back(); - int cols = full.back(); - - bool same = true; - for(int i = 0; i < K; ++i) - same = same && ins[i].shape().elements() == full.elements(); - - for(int bid = 0; bid < rows; bid += gridDim.x) { - int j = bid + blockIdx.x; - if(j < rows) { - extern __shared__ float _share[]; - float* _sum = _share + blockDim.x; - - if(same) { - _sum[threadIdx.x] = 0; - for(int tid = 0; tid < cols; tid += blockDim.x) { - int id = tid + threadIdx.x; - if(id < cols) - _sum[threadIdx.x] += gpu::apply(functor, ins, j * cols + id); - } - } else { - gpu::Array<int, gpu::Shape::size()> dims; - _sum[threadIdx.x] = 0; - - for(int tid = 0; tid < cols; tid += blockDim.x) { - int id = tid + threadIdx.x; - if(id < cols) { - full.dims(j * cols + id, dims); - gpu::Array<int, K> indices; - for(int i = 0; i < K; ++i) - indices[i] = ins[i].shape().bindex(dims); - _sum[threadIdx.x] += gpu::apply(functor, ins, indices); - } - } - } - __syncthreads(); - int len = blockDim.x; - while(len != 1) { - __syncthreads(); - int skip = (len + 1) >> 1; - if(threadIdx.x < (len >> 1)) { - _sum[threadIdx.x] += _sum[threadIdx.x + skip]; - } - len = (len + 1) >> 1; - } - __syncthreads(); - out[j] += _sum[0] * scale; - } - } -} - -template <class Functor, class ...Tensors> -void Add(Functor functor, - float scale, - Tensor out, - Tensors... tensors) { - cudaSetDevice(out->getDevice().no); - - Shape full = Shape::broadcast({out, tensors...}); - - int length = out->shape().elements(); - - constexpr size_t K = sizeof...(Tensors); - - gpu::Tensor<float> gOut = out; - gpu::Array<gpu::Tensor<float>, K> gIns = {tensors ...}; - - if(full.back() != 1 && out->shape().back() == 1) { - size_t m = full.elements() / length; - size_t k = full.back(); - - int blocks = std::min(MAX_BLOCKS, (int)m); - int threads = std::min(MAX_THREADS, (int)k); - int shared = sizeof(float) * threads * 2; - - gAddReduce<<<blocks, threads, shared>>>(functor, full, gOut, gIns, scale); - - } else if(out->shape() == full) { - int threads = std::min(MAX_THREADS, length); - int blocks - = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); - - bool broadcast = false; - for(int i = 0; i < K; ++i) - broadcast = broadcast || gOut.shape() != gIns[i].shape(); - - gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast); - } else { - int threads = std::min(MAX_THREADS, length); - int blocks - = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); - - gAddGeneric<<<blocks, threads>>>(functor, full, gOut, gIns, scale); - } -} - -template <class Functor, class ...Tensors> -void Add(Functor functor, - Tensor out, - Tensors... tensors) { - Add(functor, 1, out, tensors...); -} - -template <class Functor, class ...Tensors> -void Reduce(Functor functor, - float scale, - Tensor out, - Tensors... tensors) { - out->set(0); - Add(functor, scale, out, tensors...); -} - -template <class Functor, class ...Tensors> -void Reduce(Functor functor, - Tensor out, - Tensors... tensors) { - out->set(0); - Add(functor, out, tensors...); -} - -float L2Norm(Tensor in); - -void Softmax(Tensor out, Tensor in, Tensor mask = nullptr); -void LogSoftmax(Tensor out, Tensor in); - -void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val); -void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val); - -void CudnnSoftmax(Tensor out, Tensor in); -void CudnnSoftmaxGrad(Tensor grad, Tensor adj, Tensor val); - -void CudnnLogSoftmax(Tensor out, Tensor in); -void CudnnLogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val); - -void CrossEntropyPick(Tensor out, Tensor in, Tensor pick); -void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick); - -void Argmax(Tensor Out, const Tensor In); - -void Prod(cublasHandle_t handle, - Tensor C, - const Tensor A, - const Tensor B, - bool transA, - bool transB, - float beta = 0, - float scalar = 1); - -void ProdBatched(cublasHandle_t handle, - Tensor C, - const Tensor A, - const Tensor B, - bool transA, - bool transB, - float beta = 0, - float scalar = 1); - -void CopyRowsByIndex(Tensor out, - const Tensor in, - thrust::pair<size_t, size_t>* ipair, - size_t length); - -void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indeces); - -void PasteRows(Tensor out, const Tensor in, const std::vector<size_t>& indeces); - -void CopyCols(Tensor out, const Tensor in, const std::vector<size_t>& indeces); - -void PasteCols(Tensor out, const Tensor in, const std::vector<size_t>& indeces); - -void LSTMCellForward(Tensor out, std::vector<Tensor> inputs); -void LSTMOutputForward(Tensor out, std::vector<Tensor> inputs); -void LSTMCellBackward(std::vector<Tensor> outputs, - std::vector<Tensor> inputs, - Tensor adj); -void LSTMOutputBackward(std::vector<Tensor> outputs, - std::vector<Tensor> inputs, - Tensor adj); - -void GRUFastForward(Tensor out, std::vector<Tensor> inputs, bool final = false); - -void GRUFastBackward(std::vector<Tensor> outputs, - std::vector<Tensor> inputs, - Tensor adj, - bool final = false); - -void Att(Tensor out, Tensor va, Tensor context, Tensor state); -void AttBack(Tensor gva, - Tensor gContext, - Tensor gState, - Tensor va, - Tensor context, - Tensor state, - Tensor adj); - -void LayerNormalization(Tensor out, - Tensor in, - Tensor gamma, - Tensor beta, - float eps = 1e-9); -void LayerNormalizationGrad(Tensor gradX, - Tensor gradGamma, - Tensor gradBeta, - Tensor adj, - Tensor y, - Tensor x, - Tensor gamma, - Tensor beta, - float eps = 1e-9); - -void Shift(Tensor out, Tensor in, Shape shift, bool invert = false); - -void SetSparse(float*, - const std::vector<size_t>& indeces, - const std::vector<float>& values); - -void HighwayForward(Tensor out, - const Tensor in1, - const Tensor in2, - const Tensor t); - -void HighwayBackward(Tensor out1, - Tensor out2, - Tensor outt, - const Tensor in1, - const Tensor in2, - const Tensor t, - const Tensor adj); - -void PoolingWithMaskingForward(Tensor out, - Tensor in, - Tensor mask, - int width, - bool isEven=false); - -void PoolingWithMaskingBackward(Tensor adj, - Tensor adjIn, - Tensor in, - Tensor mask, - int width, - bool isEven=false); -} diff --git a/src/layers/convolution.cu b/src/layers/convolution.cu index 958ff4b4..83e881bf 100644 --- a/src/layers/convolution.cu +++ b/src/layers/convolution.cu @@ -25,11 +25,12 @@ Expr Convolution::apply(Expr x) { keywords::init=inits::zeros); std::vector<Expr> nodes = {x, kernel, bias}; - return Expression<ConvolutionOp>(nodes, - paddings.first, - paddings.second, - strides.first, - strides.second); + ABORT("Temporarily not implemented"); + //return Expression<ConvolutionOp>(nodes, + // paddings.first, + // paddings.second, + // strides.first, + // strides.second); } Expr Convolution::apply(const std::vector<Expr>&) { @@ -38,4 +39,3 @@ Expr Convolution::apply(const std::vector<Expr>&) { } } - diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp index 2cd47fac..5a771f05 100644 --- a/src/models/model_factory.cpp +++ b/src/models/model_factory.cpp @@ -2,15 +2,22 @@ #include "models/model_factory.h" #include "models/s2s.h" -#include "models/char_s2s.h" #include "models/transformer.h" #include "models/hardatt.h" #include "models/amun.h" #include "models/nematus.h" #include "models/encdec.h" +#ifdef USE_CUDNN +#include "models/char_s2s.h" +#endif + +#ifdef COMPILE_EXAMPLES #include "examples/mnist/model.h" +#ifdef USE_CUDNN #include "examples/mnist/model_lenet.h" +#endif +#endif namespace marian { namespace models { @@ -18,8 +25,12 @@ namespace models { Ptr<EncoderBase> EncoderFactory::construct() { if(options_->get<std::string>("type") == "s2s") return New<EncoderS2S>(options_); + +#ifdef USE_CUDNN if(options_->get<std::string>("type") == "char-s2s") return New<CharS2SEncoder>(options_); +#endif + if(options_->get<std::string>("type") == "transformer") return New<EncoderTransformer>(options_); @@ -172,15 +183,19 @@ Ptr<ModelBase> by_type(std::string type, Ptr<Options> options) { .construct(); } +#ifdef COMPILE_EXAMPLES // @TODO: examples should be compiled optionally if(type == "mnist-ffnn") { return New<MnistFeedForwardNet>(options); } +#endif +#ifdef USE_CUDNN +#ifdef COMPILE_EXAMPLES if(type == "mnist-lenet") { return New<MnistLeNet>(options); } - +#endif if(type == "char-s2s") { return models::encoder_decoder()(options) ("original-type", type) @@ -188,6 +203,7 @@ Ptr<ModelBase> by_type(std::string type, Ptr<Options> options) { .push_back(models::decoder()("type", "s2s")) .construct(); } +#endif // clang-format on ABORT("Unknown model type: {}", type); diff --git a/src/optimizers/clippers.cu b/src/optimizers/clippers.cu index 7289955c..e42586af 100644 --- a/src/optimizers/clippers.cu +++ b/src/optimizers/clippers.cu @@ -1,6 +1,6 @@ #include "clippers.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" #include "functional/functional.h" namespace marian { diff --git a/src/optimizers/optimizers.cu b/src/optimizers/optimizers.cu index e82800c9..d2bfa113 100644 --- a/src/optimizers/optimizers.cu +++ b/src/optimizers/optimizers.cu @@ -1,6 +1,6 @@ #include "optimizers.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" #include "functional/functional.h" namespace marian { @@ -8,7 +8,7 @@ void Sgd::updateImpl(Tensor params, Tensor grads) { using namespace functional; Element(_1 -= (multiplyFactor_ * eta_) * _2, params, grads); - cudaStreamSynchronize(0); + //cudaStreamSynchronize(0); } void Adagrad::updateImpl(Tensor params, Tensor grads) { @@ -31,13 +31,13 @@ void Adagrad::updateImpl(Tensor params, Tensor grads) { gt_, grads); - cudaStreamSynchronize(0); + //cudaStreamSynchronize(0); } void Adagrad::resetStats() { if(gt_) gt_->set(0); - cudaStreamSynchronize(0); + //cudaStreamSynchronize(0); } void Adam::updateImpl(Tensor params, Tensor grads) { @@ -69,7 +69,7 @@ void Adam::updateImpl(Tensor params, Tensor grads) { mt_, vt_); - cudaStreamSynchronize(0); + //cudaStreamSynchronize(0); } void Adam::resetStats() { @@ -79,7 +79,7 @@ void Adam::resetStats() { if(vt_) vt_->set(0); - cudaStreamSynchronize(0); + //cudaStreamSynchronize(0); } Ptr<OptimizerBase> Optimizer(Ptr<Config> options) { diff --git a/src/rnn/attention.cu b/src/rnn/attention.cu index a1c48a69..2faa9d9a 100644 --- a/src/rnn/attention.cu +++ b/src/rnn/attention.cu @@ -1,7 +1,7 @@ #include "attention.h" #include "graph/node_operators_binary.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" namespace marian { diff --git a/src/rnn/cells.cu b/src/rnn/cells.cu index 5134307f..42373eab 100644 --- a/src/rnn/cells.cu +++ b/src/rnn/cells.cu @@ -1,7 +1,7 @@ #include "rnn/cells.h" #include "graph/node_operators_binary.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" namespace marian { namespace rnn { diff --git a/src/tensors/cpu/add.h b/src/tensors/cpu/add.h new file mode 100644 index 00000000..d76cc3c5 --- /dev/null +++ b/src/tensors/cpu/add.h @@ -0,0 +1,135 @@ +/* All or part of this file was contributed by Intel under license: + * Copyright (C) 2017-2018 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "tensors/tensor.h" + +namespace marian { + +namespace cpu { + +#include "gpu/shape.h" +#include "gpu/tmp.h" +#include "gpu/tensor.h" +#include "functional/functional.h" + +template <size_t K, class Functor> +void gAddGeneric(Functor functor, + const gpu::Shape full, + gpu::Tensor<float> out, + gpu::Array<gpu::Tensor<float>, K> ins, + float scale = 1.0) { + + int outLength = out.shape().elements(); + bool same = outLength == full.elements(); + for(int i = 0; i < K; ++i) + same = same && outLength == ins[i].shape().elements(); + + constexpr size_t N = gpu::Shape::size(); + gpu::Array<int, N> len; + for(int i = 0; i < N; ++i) + len[i] = full[i] / out.shape()[i]; + + gpu::Array<int, N> dims; + for(int index = 0; index < outLength; ++index) { + if(same) { + out[index] += gpu::apply(functor, ins, index) * scale; + } else { + out.shape().dims(index, dims); + out[index] += gpu::loops(functor, ins, len, dims) * scale; + } + } +} + +template <size_t K, class Functor> +void gAddEqual(Functor functor, + gpu::Tensor<float> out, + gpu::Array<gpu::Tensor<float>, K> ins, + float scale, + bool broadcast) { + int length = out.shape().elements(); + gpu::Array<int, gpu::Shape::size()> dims; + + for(int index = 0; index < length; ++index) { + gpu::Array<int, K> indices; + indices.fill(index); + + if(broadcast) { + out.shape().dims(index, dims); + for(size_t i = 0; i < K; ++i) + indices[i] = ins[i].shape().bindex(dims); + } + + out[index] += gpu::apply(functor, ins, indices) * scale; + } +} + +template <size_t K, class Functor> +void gAddReduce(Functor functor, + const gpu::Shape full, + gpu::Tensor<float> out, + gpu::Array<gpu::Tensor<float>, K> ins, + float scale = 1.0) { + + int rows = full.elements() / full.back(); + int cols = full.back(); + + bool same = true; + for(int i = 0; i < K; ++i) + same = same && ins[i].shape().elements() == full.elements(); + + for(int j = 0; j < rows; ++j) { + float sum = 0; + if(same) { + for(int id = 0; id < cols; ++id) + sum += gpu::apply(functor, ins, j * cols + id); + } else { + gpu::Array<int, gpu::Shape::size()> dims; + for(int id = 0; id < cols; ++id) { + full.dims(j * cols + id, dims); + gpu::Array<int, K> indices; + for(int i = 0; i < K; ++i) + indices[i] = ins[i].shape().bindex(dims); + sum += gpu::apply(functor, ins, indices); + } + } + out[j] += sum * scale; + } +} + +template <class Functor, class ...Tensors> +void Add(Functor functor, + float scale, + marian::Tensor out, + Tensors... tensors) { + + auto full = marian::Shape::broadcast({out, tensors...}); + + int length = out->shape().elements(); + + constexpr size_t K = sizeof...(Tensors); + + gpu::Tensor<float> gOut = out; + gpu::Array<gpu::Tensor<float>, K> gIns = {tensors ...}; + + if(full.back() != 1 && out->shape().back() == 1) { + size_t m = full.elements() / length; + size_t k = full.back(); + cpu::gAddReduce(functor, full, gOut, gIns, scale); + } else if(out->shape() == full) { + bool broadcast = false; + for(int i = 0; i < K; ++i) + broadcast = broadcast || gOut.shape() != gIns[i].shape(); + cpu::gAddEqual(functor, gOut, gIns, scale, broadcast); + } else { + cpu::gAddGeneric(functor, full, gOut, gIns, scale); + } +} + + +} + +} diff --git a/src/tensors/device.cpp b/src/tensors/cpu/device.cpp index e4bf9d17..e4bf9d17 100644 --- a/src/tensors/device.cpp +++ b/src/tensors/cpu/device.cpp diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp index bb6ee799..6187cf3d 100644 --- a/src/tensors/cpu/dropout.cpp +++ b/src/tensors/cpu/dropout.cpp @@ -1,7 +1,7 @@ #include <algorithm> #include <random> -#include "tensors/dispatch.h" +#include "tensors/tensor_operators.h" #include "tensors/cpu/backend.h" namespace marian { diff --git a/src/tensors/cpu/element.h b/src/tensors/cpu/element.h new file mode 100644 index 00000000..cdff1170 --- /dev/null +++ b/src/tensors/cpu/element.h @@ -0,0 +1,51 @@ +/* All or part of this file was contributed by Intel under license: + * Copyright (C) 2017-2018 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#pragma once + +#include "tensors/tensor.h" + +namespace marian { +namespace cpu { + +template <size_t K, bool broadcast, class Functor> +void gElement(Functor functor, + gpu::Array<gpu::Tensor<float>, K> tensors) { + + int length = tensors[0].shape().elements(); + gpu::Array<int, gpu::Shape::size()> dims; + gpu::Array<int, K> indices; + + #pragma omp parallel for simd + for(int index = 0; index < length; ++index) { + indices.fill(index); + if(broadcast) { + tensors[0].shape().dims(index, dims); + for(int i = 1; i < K; ++i) + indices[i] = tensors[i].shape().bindex(dims); + } + tensors[0][index] = gpu::apply(functor, tensors, indices); + } +} + +template <class Functor, class ...Tensors> +void Element(Functor functor, marian::Tensor out, Tensors ...tensors) { + constexpr size_t K = sizeof...(tensors) + 1; + gpu::Array<gpu::Tensor<float>, K> gTensors = {out, tensors...}; + + int length = gTensors[0].shape().elements(); + + bool broadcast = false; + for(int i = 1; i < K; ++i) + broadcast = broadcast || gTensors[0].shape() != gTensors[i].shape(); + + if(broadcast) + cpu::gElement<K, true>(functor, gTensors); + else + cpu::gElement<K, false>(functor, gTensors); +} + +} +} diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp new file mode 100644 index 00000000..f009ee3f --- /dev/null +++ b/src/tensors/cpu/prod.cpp @@ -0,0 +1,69 @@ +#include "tensors/gpu/prod.h" +#include "tensors/gpu/backend.h" + +#if BLAS_FOUND +#include <cblas.h> +#endif + +namespace marian { + +namespace cpu { + +void Prod(marian::Tensor C, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta, + float scalar) { + +#if BLAS_FOUND + float alpha = scalar; + + int m = A->shape().elements() / A->shape()[-1]; + int k = A->shape().back(); + if(transA) + std::swap(m, k); + + int l = B->shape().elements() / B->shape()[-1]; + int n = B->shape()[-1]; + if(transB) + std::swap(l, n); + + int lda = A->shape()[-1]; + int ldb = B->shape()[-1]; + int ldc = B->shape()[-1]; + + if(transB) + ldc = B->shape().elements() / B->shape()[-1]; + + cblas_sgemm( + CblasColMajor, + transB ? CblasTrans : CblasNoTrans, + transA ? CblasTrans : CblasNoTrans, + n, m, k, + alpha, + B->data(), + ldb, + A->data(), + lda, + beta, + C->data(), + ldc); +#else + ABORT("Not implemented!"); +#endif +} + +void ProdBatched(marian::Tensor C, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta, + float scalar) { + ABORT("Not implemented!"); +} + +} +} diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp new file mode 100644 index 00000000..46dcb919 --- /dev/null +++ b/src/tensors/cpu/tensor_operators.cpp @@ -0,0 +1,456 @@ +/* All or part of this file was contributed by Intel under license: + * Copyright (C) 2017-2018 Intel Corporation + * SPDX-License-Identifier: MIT + */ + +#include "tensors/tensor_operators.h" +#include "tensors/cpu/backend.h" + +#include "gpu/tensor.h" +#include "functional/functional.h" + +namespace marian { + +namespace cpu { + +void ConcatCont(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int axis) { + int step = 1; + for(int i = 0; i < axis; ++i) + step *= out->shape()[i]; + + size_t offset1 = 0; + for(int i = 0; i < step; ++i) { + for(auto in : inputs) { + size_t size = in->shape().elements() / step; + size_t offset2 = i * size; + + std::copy(in->data() + offset2, + in->data() + offset2 + size, + out->data() + offset1); + + offset1 += size; + } + } +} + +inline void gInsertCols(float* out, + const float* in, + size_t rows, + size_t cols, + size_t cols_out, + size_t cols_in, + size_t offset_out, + size_t offset_in) { + for(int j = 0; j < rows; ++j) { + float* rowOut = out + j * cols_out + offset_out; + const float* rowIn = in + j * cols_in + offset_in; + for(int i = 0; i < cols; ++i) { + rowOut[i] = rowIn[i]; + } + } +} + +void Concatenate1(marian::Tensor out, const std::vector<marian::Tensor>& inputs) { + int rows = out->shape().elements() / out->shape().back(); + + size_t offset = 0; + int cols_out = out->shape().back(); + + for(auto in : inputs) { + ABORT_IF(rows != in->shape().elements() / in->shape().back(), + "First dimension must be equal"); + int cols_in = in->shape().back(); + cpu::gInsertCols(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0); + offset += cols_in; + } +} + +void Concatenate(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int ax) { + if(ax == out->shape().size() - 1) + Concatenate1(out, inputs); + else + ConcatCont(out, inputs, ax); +} + +void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) { + ABORT("Not implemented!"); +} + +// @TODO: optimize this, currently it's quite horrible +void TransposeND(marian::Tensor out, marian::Tensor in, const std::vector<int>& vAxis) { + gpu::Array<int, gpu::Shape::size()> permute; + int diff = gpu::Shape::size() - vAxis.size(); + for(int i = 0; i < permute.size(); ++i) + if(i < diff) + permute[i] = i; + else + permute[i] = vAxis[i - diff] + diff; + + int length = out->shape().elements(); + + constexpr size_t N = gpu::Shape::size(); + gpu::Array<int, N> oDims; + gpu::Array<int, N> pDims; + gpu::Tensor<float> gOut = out; + gpu::Tensor<float> gIn = in; + + for(int index = 0; index < length; ++index) { + gOut.shape().dims(index, oDims); + for(int i = 0; i < N; ++i) + pDims[permute[i]] = oDims[i]; + gOut[index] = gIn[pDims]; + } +} + +void Softmax(Tensor out_, Tensor in_, Tensor mask_) { + float* out = out_->data(); + const float* in = in_->data(); + const float* mask = mask_ ? mask_->data() : nullptr; + + int rows = out_->shape().elements() / out_->shape().back(); + int cols = out_->shape().back(); + + for (int j = 0; j < rows; ++j) { + float* so = out + j*cols; + const float* sp = in + j*cols; + const float* mp = mask ? mask + j*cols : nullptr; + + float max = sp[0]; + for (int i = 1; i < cols; ++i) { + max = std::max(max, sp[i]); + } + + float sum = 0.f; + for (int i = 0; i < cols; ++i) { + float ex = !mask || mp[i] ? std::exp(sp[i] - max) : 0.f; + so[i] = ex; + sum += ex; + } + + for (int i = 0; i < cols; ++i) { + so[i] /= sum; + } + } +} + +void LogSoftmax(Tensor out_, Tensor in_) { + float* out = out_->data(); + const float* in = in_->data(); + + int rows = out_->shape().elements() / out_->shape().back(); + int cols = out_->shape().back(); + + for (int j = 0; j < rows; ++j) { + float* so = out + j * cols; + const float* sp = in + j*cols; + + float max = sp[0]; + for (int i = 1; i < cols; ++i) { + max = std::max(max, sp[i]); + } + + float sum = 0.f; + for (int i = 0; i < cols; ++i) { + float sm = sp[i] - max; + float ex = std::exp(sm); + so[i] = sm; + sum += ex; + } + + for (int i = 0; i < cols; ++i) { + so[i] -= std::log(sum); + } + } +} + +void SoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) { + ABORT("Not implemented!"); +} + +void LogSoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) { + ABORT("Not implemented!"); +} + +void CopyRows(marian::Tensor out_, const marian::Tensor in_, const std::vector<size_t>& indices) { + size_t cols = in_->shape()[1]; + size_t rows = indices.size(); + + float* out = out_->data(); + const float* in = in_->data(); + + #pragma omp parallel for + for (int j = 0; j < rows; ++j) { + size_t dst = j; + size_t src = indices[j]; + + float* rowOut = out + dst*cols; + const float* rowIn = in + src*cols; + + std::copy(rowIn, rowIn + cols, rowOut); + } +} + +void PasteRows(marian::Tensor out, + const marian::Tensor in, + const std::vector<size_t>& indices) { + ABORT("Not implemented!"); +} + +void CopyCols(marian::Tensor out, const marian::Tensor in, const std::vector<size_t>& indices) { + ABORT("Not implemented!"); +} + +void PasteCols(marian::Tensor out, + const marian::Tensor in, + const std::vector<size_t>& indices) { + ABORT("Not implemented!"); +} + +void Select(marian::Tensor out, + const marian::Tensor in, + int axis, + const std::vector<size_t>& indices, + Ptr<Allocator> allocator) { + ABORT("Not implemented!"); +} + +void Insert(marian::Tensor out, + const marian::Tensor in, + int axis, + const std::vector<size_t>& indices, + Ptr<Allocator> allocator) { + ABORT("Not implemented!"); +} + +void GRUFastForward(marian::Tensor out_, std::vector<marian::Tensor> inputs, bool final) { + int rows = out_->shape().elements() / out_->shape().back(); + int cols = out_->shape().back(); + + float* out = out_->data(); + + const float* state = inputs[0]->data(); + const float* xW = inputs[1]->data(); + const float* sU = inputs[2]->data(); + const float* b = inputs[3]->data(); + const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr; + + #pragma omp parallel for + for (int j = 0; j < rows; ++j) { + float m = !mask || mask[j]; + float* rowOut = out + j * cols; + const float* rowState = state + j * cols; + + const float* xWrow = xW + j * cols * 3; + const float* sUrow = sU + j * cols * 3; + + #pragma omp simd + for (int i = 0; i < cols; ++i) { + // @TODO: stable logit + float ev1 = std::exp(-(xWrow[i] + sUrow[i] + b[i])); + float r = 1.0f / (1.0f + ev1); + + int k = i + cols; + // @TODO: stable logit + float ev2 = std::exp(-(xWrow[k] + sUrow[k] + b[k])); + float z = 1.0f / (1.0f + ev2); + + int l = i + 2 * cols; + float h; + if(final) + h = std::tanh(xWrow[l] + (sUrow[l] + b[l]) * r); + else + h = std::tanh(xWrow[l] + sUrow[l] * r + b[l]); + + float out = (1.0f - z) * h + z * rowState[i]; + rowOut[i] = m * out + (1 - m) * rowState[i]; + } + } +} + +void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, + bool final) { + ABORT("Not implemented!"); +} + +void CrossEntropyPick(marian::Tensor out, marian::Tensor in, marian::Tensor pick) { + ABORT("Not implemented!"); +} + +void CrossEntropyPickBackward(marian::Tensor out, marian::Tensor adj, marian::Tensor a, marian::Tensor pick) { + ABORT("Not implemented!"); +} + + +float L2Norm(marian::Tensor in) { + ABORT("Not implemented!"); +} + +void Att(marian::Tensor out_, marian::Tensor va_, marian::Tensor context_, marian::Tensor state_) { + float* out = out_->data(); + const float* va = va_->data(); + const float* ctx = context_->data(); + const float* state = state_->data(); + + int m = out_->shape().elements() / out_->shape().back(); + int k = context_->shape()[-1]; + int b = context_->shape()[-2]; + int t = context_->shape()[-3]; + + int rows = m; + int cols = k; + + #pragma omp parallel for + for (size_t j = 0; j < rows; ++j) { + const float* vaRow = va; + const float* ctxRow = ctx + (j % (b * t)) * cols; + const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols; + + float sum = 0.f; + #pragma omp simd reduction(+:sum) + for (size_t i = 0; i < cols; ++i) { + float z = ctxRow[i] + stateRow[i]; + sum += std::tanh(z) * vaRow[i]; + } + + out[j] = sum; + } +} + +void AttBack(marian::Tensor gVa, + marian::Tensor gContext, + marian::Tensor gState, + marian::Tensor va, + marian::Tensor context, + marian::Tensor state, + marian::Tensor adj) { + ABORT("Not implemented!"); +} + +void LayerNormalization(marian::Tensor out_, + marian::Tensor in_, + marian::Tensor gamma_, + marian::Tensor beta_, + float eps) { + float* out = out_->data(); + const float* in = in_->data(); + const float* alpha = gamma_->data(); + const float* beta = beta_ ? beta_->data() : nullptr; + + int rows = in_->shape().elements() / in_->shape().back(); + int cols = in_->shape().back(); + + #pragma omp parallel for + for (int j = 0; j < rows; ++j) { + float* so = out + j*cols; + const float* sp = in + j*cols; + + float sum = 0.f; + #pragma omp simd reduction(+:sum) + for (int i = 0; i < cols; ++i) { + sum += sp[i]; + } + + float mean = sum / cols; + float sqSum = 0.f; + #pragma omp simd reduction(+:sqSum) + for (int i = 0; i < cols; ++i) { + float ex = sp[i] - mean; + sqSum += ex*ex; + } + + float sigma = std::sqrt(eps + sqSum / cols); + + #pragma omp simd + for (int i = 0; i < cols; ++i) { + float t = alpha[i] * ((sp[i] - mean) / sigma); + if (beta != nullptr) { + t += beta[i]; + } + + so[i] = t; + } + } +} + +void LayerNormalizationGrad(marian::Tensor gradX, + marian::Tensor gradGamma, + marian::Tensor gradBeta, + marian::Tensor adj, + marian::Tensor y, + marian::Tensor x, + marian::Tensor gamma, + marian::Tensor beta, + float eps) { + ABORT("Not implemented!"); +} + + +void Shift(marian::Tensor out, marian::Tensor in, marian::Shape shift, bool invert) { + ABORT("Not implemented!"); +} + +void SetSparse(float* out, + const std::vector<size_t>& indices, + const std::vector<float>& values) { + ABORT("Not implemented!"); +} + + +void LSTMCellForward(marian::Tensor out, std::vector<marian::Tensor> inputs) { + ABORT("Not implemented!"); +} + +void LSTMOutputForward(marian::Tensor out, std::vector<marian::Tensor> inputs) { + ABORT("Not implemented!"); +} + +void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { + ABORT("Not implemented!"); +} + +void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { + ABORT("Not implemented!"); +} + +void HighwayForward(marian::Tensor out, + const marian::Tensor in1, + const marian::Tensor in2, + const marian::Tensor t) { + ABORT("Not implemented!"); +} + +void HighwayBackward(marian::Tensor out1, + marian::Tensor out2, + marian::Tensor outt, + const marian::Tensor in1, + const marian::Tensor in2, + const marian::Tensor t, + const marian::Tensor adj) { + ABORT("Not implemented!"); +} + +void PoolingWithMaskingForward(marian::Tensor out, + marian::Tensor in, + marian::Tensor mask, + int width, + bool isEven) { + ABORT("Not implemented!"); +} + +void PoolingWithMaskingBackward(marian::Tensor adj, + marian::Tensor adjIn, + marian::Tensor in, + marian::Tensor mask, + int width, + bool isEven) { + ABORT("Not implemented!"); +} + +} +} // namespace marian diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h index e63a6af1..d8e218b3 100644 --- a/src/tensors/dispatch.h +++ b/src/tensors/dispatch.h @@ -1,38 +1,129 @@ -#pragma once
-
-#include "common/definitions.h"
-#include "tensors/tensor.h"
-
-#define DISPATCH1(Function, Arg1) \
- namespace gpu { \
- void Function(Arg1); \
- } \
- namespace cpu { \
- void Function(Arg1); \
- } \
- void Function(Arg1 arg1) { \
- if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
- gpu::Function(arg1); \
- else \
- cpu::Function(arg1); \
- }
-
-#define DISPATCH2(Function, Arg1, Arg2) \
- namespace gpu { \
- void Function(Arg1, Arg2); \
- } \
- namespace cpu { \
- void Function(Arg1, Arg2); \
- } \
- static inline void Function(Arg1 arg1, Arg2 arg2) { \
- if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
- gpu::Function(arg1, arg2); \
- else \
- cpu::Function(arg1, arg2); \
- }
-
-namespace marian {
-
- DISPATCH2(Dropout, Tensor, float)
-
-}
+#pragma once + + +#define DISPATCH1(Function, Arg1) \ + namespace gpu { \ + void Function(Arg1); \ + } \ + namespace cpu { \ + void Function(Arg1); \ + } \ + void Function(Arg1 arg1) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1); \ + else \ + cpu::Function(arg1); \ + } + +#define DISPATCH2(Function, Arg1, Arg2) \ + namespace gpu { \ + void Function(Arg1, Arg2); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2); \ + else \ + cpu::Function(arg1, arg2); \ + } + +#define DISPATCH3(Function, Arg1, Arg2, Arg3) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3); \ + else \ + cpu::Function(arg1, arg2, arg3); \ + } + +#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4); \ + } + +#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5); \ + } + +#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \ + } + +#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \ + } + +#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \ + } + +#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \ + namespace gpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ + } \ + namespace cpu { \ + void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \ + } \ + static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \ + if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \ + gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ + else \ + cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \ + } + diff --git a/src/tensors/gpu/add.h b/src/tensors/gpu/add.h new file mode 100644 index 00000000..13ffc500 --- /dev/null +++ b/src/tensors/gpu/add.h @@ -0,0 +1,180 @@ +#include "gpu/shape.h" +#include "gpu/tmp.h" +#include "gpu/tensor.h" +#include "functional/functional.h" + +namespace marian { + +namespace gpu { + +#ifdef __CUDACC__ +template <size_t K, class Functor> +__global__ void gAddGeneric(Functor functor, + const gpu::Shape full, + gpu::Tensor<float> out, + gpu::Array<gpu::Tensor<float>, K> ins, + float scale = 1.0) { + + int outLength = out.shape().elements(); + bool same = outLength == full.elements(); + for(int i = 0; i < K; ++i) + same = same && outLength == ins[i].shape().elements(); + + constexpr size_t N = gpu::Shape::size(); + gpu::Array<int, N> len; + for(int i = 0; i < N; ++i) + len[i] = full[i] / out.shape()[i]; + + gpu::Array<int, N> dims; + for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < outLength) { + + if(same) { + out[index] += gpu::apply(functor, ins, index) * scale; + } else { + out.shape().dims(index, dims); + out[index] += gpu::loops(functor, ins, len, dims) * scale; + } + + } + } +} + +template <size_t K, class Functor> +__global__ void gAddEqual(Functor functor, + gpu::Tensor<float> out, + gpu::Array<gpu::Tensor<float>, K> ins, + float scale, + bool broadcast) { + int length = out.shape().elements(); + gpu::Array<int, gpu::Shape::size()> dims; + + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + gpu::Array<int, K> indices; + indices.fill(index); + + if(broadcast) { + out.shape().dims(index, dims); + for(size_t i = 0; i < K; ++i) + indices[i] = ins[i].shape().bindex(dims); + } + + out[index] += gpu::apply(functor, ins, indices) * scale; + } + } +} + +template <size_t K, class Functor> +__global__ void gAddReduce(Functor functor, + const gpu::Shape full, + gpu::Tensor<float> out, + gpu::Array<gpu::Tensor<float>, K> ins, + float scale = 1.0) { + + int rows = full.elements() / full.back(); + int cols = full.back(); + + bool same = true; + for(int i = 0; i < K; ++i) + same = same && ins[i].shape().elements() == full.elements(); + + for(int bid = 0; bid < rows; bid += gridDim.x) { + int j = bid + blockIdx.x; + if(j < rows) { + extern __shared__ float _share[]; + float* _sum = _share + blockDim.x; + + if(same) { + _sum[threadIdx.x] = 0; + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) + _sum[threadIdx.x] += gpu::apply(functor, ins, j * cols + id); + } + } else { + gpu::Array<int, gpu::Shape::size()> dims; + _sum[threadIdx.x] = 0; + + for(int tid = 0; tid < cols; tid += blockDim.x) { + int id = tid + threadIdx.x; + if(id < cols) { + full.dims(j * cols + id, dims); + gpu::Array<int, K> indices; + for(int i = 0; i < K; ++i) + indices[i] = ins[i].shape().bindex(dims); + _sum[threadIdx.x] += gpu::apply(functor, ins, indices); + } + } + } + __syncthreads(); + int len = blockDim.x; + while(len != 1) { + __syncthreads(); + int skip = (len + 1) >> 1; + if(threadIdx.x < (len >> 1)) { + _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + } + len = (len + 1) >> 1; + } + __syncthreads(); + out[j] += _sum[0] * scale; + } + } +} +#endif + +template <class Functor, class ...Tensors> +void Add(Functor functor, + float scale, + marian::Tensor out, + Tensors... tensors) { + +#ifdef __CUDACC__ + cudaSetDevice(out->getDevice().no); + + auto full = marian::Shape::broadcast({out, tensors...}); + + int length = out->shape().elements(); + + constexpr size_t K = sizeof...(Tensors); + + gpu::Tensor<float> gOut = out; + gpu::Array<gpu::Tensor<float>, K> gIns = {tensors ...}; + + if(full.back() != 1 && out->shape().back() == 1) { + size_t m = full.elements() / length; + size_t k = full.back(); + + int blocks = std::min(MAX_BLOCKS, (int)m); + int threads = std::min(MAX_THREADS, (int)k); + int shared = sizeof(float) * threads * 2; + + gAddReduce<<<blocks, threads, shared>>>(functor, full, gOut, gIns, scale); + + } else if(out->shape() == full) { + int threads = std::min(MAX_THREADS, length); + int blocks + = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + bool broadcast = false; + for(int i = 0; i < K; ++i) + broadcast = broadcast || gOut.shape() != gIns[i].shape(); + + gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast); + } else { + int threads = std::min(MAX_THREADS, length); + int blocks + = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + gAddGeneric<<<blocks, threads>>>(functor, full, gOut, gIns, scale); + } +#else + ABORT("Not implemented"); +#endif +} + +} +} diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu index b26e00e9..2742cc20 100644 --- a/src/tensors/gpu/algorithm.cu +++ b/src/tensors/gpu/algorithm.cu @@ -1,7 +1,7 @@ #include "tensors/gpu/algorithm.h" -#include "kernels/cuda_helpers.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" +#include "tensors/gpu/cuda_helpers.h" namespace marian { namespace gpu { @@ -34,7 +34,8 @@ namespace marian { const std::vector<float>& values, float* data) { CUDA_CHECK(cudaSetDevice(backend->getDevice().no)); - SetSparse(data, keys, values); + ABORT("no SetSparse"); + //gpu::SetSparse(data, keys, values); CUDA_CHECK(cudaStreamSynchronize(0)); } diff --git a/src/kernels/cuda_helpers.h b/src/tensors/gpu/cuda_helpers.h index 46d84afe..6726e127 100644 --- a/src/kernels/cuda_helpers.h +++ b/src/tensors/gpu/cuda_helpers.h @@ -4,9 +4,14 @@ #include "3rd_party/exception.h" #include "common/logging.h" +const float CUDA_FLT_MAX = 1.70141e+38; +const int MAX_THREADS = 512; +const int MAX_BLOCKS = 65535; + #define CUDA_CHECK(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } + inline void gpuAssert(cudaError_t code, const char *file, int line, diff --git a/src/kernels/cudnn_wrappers.cu b/src/tensors/gpu/cudnn_wrappers.cu index ca5bec8c..f773d900 100644 --- a/src/kernels/cudnn_wrappers.cu +++ b/src/tensors/gpu/cudnn_wrappers.cu @@ -1,4 +1,4 @@ -#include "kernels/cudnn_wrappers.h" +#include "tensors/gpu/cudnn_wrappers.h" namespace marian { diff --git a/src/kernels/cudnn_wrappers.h b/src/tensors/gpu/cudnn_wrappers.h index fca4b6e0..fca4b6e0 100644 --- a/src/kernels/cudnn_wrappers.h +++ b/src/tensors/gpu/cudnn_wrappers.h diff --git a/src/tensors/device.cu b/src/tensors/gpu/device.cu index bef5491b..e15b80d7 100644 --- a/src/tensors/device.cu +++ b/src/tensors/gpu/device.cu @@ -2,7 +2,7 @@ #include <iostream> #include "tensors/device.h" -#include "kernels/cuda_helpers.h" +#include "tensors/gpu/cuda_helpers.h" namespace marian { namespace gpu { diff --git a/src/tensors/gpu/dropout.cu b/src/tensors/gpu/dropout.cu index 4a4223a8..5bd68352 100644 --- a/src/tensors/gpu/dropout.cu +++ b/src/tensors/gpu/dropout.cu @@ -3,7 +3,7 @@ #include <stdio.h> #include <stdlib.h> -#include "tensors/dispatch.h" +#include "tensors/tensor_operators.h" #include "tensors/gpu/backend.h" #define CUDA_CALL(x) \ @@ -35,7 +35,7 @@ namespace marian { } } - void Dropout(Tensor tensor, float p) { + void Dropout(marian::Tensor tensor, float p) { auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend()); curandGenerator_t gen = gpuBackend->getCurandGenerator(); int n = tensor->size(); diff --git a/src/tensors/gpu/element.h b/src/tensors/gpu/element.h new file mode 100644 index 00000000..2136f00a --- /dev/null +++ b/src/tensors/gpu/element.h @@ -0,0 +1,65 @@ +#pragma once + +#include "tensors/tensor.h" + +#ifdef __CUDACC__ +#include "tensors/gpu/cuda_helpers.h" +#endif + +namespace marian { +namespace gpu { + +#ifdef __CUDACC__ +template <size_t K, bool broadcast, class Functor> +__global__ void gElement(Functor functor, + gpu::Array<gpu::Tensor<float>, K> tensors) { + + int length = tensors[0].shape().elements(); + gpu::Array<int, gpu::Shape::size()> dims; + gpu::Array<int, K> indices; + + for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { + int index = bid + blockDim.x * blockIdx.x + threadIdx.x; + if(index < length) { + + indices.fill(index); + + if(broadcast) { + tensors[0].shape().dims(index, dims); + for(int i = 1; i < K; ++i) + indices[i] = tensors[i].shape().bindex(dims); + } + + tensors[0][index] = gpu::apply(functor, tensors, indices); + } + } +} +#endif + +template <class Functor, class ...Tensors> +void Element(Functor functor, marian::Tensor out, Tensors ...tensors) { +#ifdef __CUDACC__ + cudaSetDevice(out->getDevice().no); + + constexpr size_t K = sizeof...(tensors) + 1; + gpu::Array<gpu::Tensor<float>, K> gTensors = {out, tensors...}; + + int length = gTensors[0].shape().elements(); + int threads = std::min(MAX_THREADS, length); + int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + bool broadcast = false; + for(int i = 1; i < K; ++i) + broadcast = broadcast || gTensors[0].shape() != gTensors[i].shape(); + + if(broadcast) + gpu::gElement<K, true><<<blocks, threads>>>(functor, gTensors); + else + gpu::gElement<K, false><<<blocks, threads>>>(functor, gTensors); +#else + ABORT("Not implemented"); +#endif +} + +} +} diff --git a/src/tensors/gpu/prod.cu b/src/tensors/gpu/prod.cu new file mode 100644 index 00000000..38278ec5 --- /dev/null +++ b/src/tensors/gpu/prod.cu @@ -0,0 +1,129 @@ + +#include <cublas_v2.h> + +#include "tensors/gpu/prod.h" +#include "tensors/gpu/backend.h" + +namespace marian { + +namespace gpu { + +void Prod(marian::Tensor C, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta, + float scalar) { + cudaSetDevice(C->getDevice().no); + float alpha = scalar; + + size_t m = A->shape().elements() / A->shape().back(); + size_t k = A->shape().back(); + if(transA) + std::swap(m, k); + + size_t l = B->shape().elements() / B->shape().back(); + size_t n = B->shape().back(); + if(transB) + std::swap(l, n); + + size_t lda = A->shape().back(); + size_t ldb = B->shape().back(); + size_t ldc = B->shape().back(); + + if(transB) + ldc = B->shape().elements() / B->shape().back(); + + cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle(); + +#if CUDA_VERSION >= 9000 + //cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH); +#endif + + cublasSgemm(cublasHandle, + opB, + opA, + n, + m, + k, + &alpha, + B->data(), + ldb, + A->data(), + lda, + &beta, + C->data(), + ldc); +#if CUDA_VERSION >= 9000 + //cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH); +#endif +} + +void ProdBatched(marian::Tensor C, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta, + float scalar) { + cudaSetDevice(C->getDevice().no); + float alpha = scalar; + + size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); + size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]); + + size_t m = A->shape()[-2]; + size_t k = A->shape()[-1]; + if(transA) + std::swap(m, k); + + size_t l = B->shape()[-2]; + size_t n = B->shape()[-1]; + if(transB) + std::swap(l, n); + + size_t lda = A->shape()[-1]; + size_t ldb = B->shape()[-1]; + size_t ldc = B->shape()[-1]; + + if(transB) + ldc = B->shape()[-2]; + + cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle(); + +#if CUDA_VERSION >= 9000 + //cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH); +#endif + cublasSgemmStridedBatched(cublasHandle, + opB, + opA, + n, + m, + k, + &alpha, + B->data(), + ldb, + batchB == 1 ? 0 : n * k, + A->data(), + lda, + batchA == 1 ? 0 : m * k, + &beta, + C->data(), + ldc, + n * m, + std::max(batchA, batchB)); +#if CUDA_VERSION >= 9000 + //cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH); +#endif +} + + +} +} diff --git a/src/tensors/gpu/prod.h b/src/tensors/gpu/prod.h new file mode 100644 index 00000000..db9b62d8 --- /dev/null +++ b/src/tensors/gpu/prod.h @@ -0,0 +1,26 @@ +#pragma once + +#include "tensors/tensor.h" + +namespace marian { + +namespace gpu { + +void Prod(marian::Tensor C, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta = 0, + float scalar = 1); + +void ProdBatched(marian::Tensor C, + const marian::Tensor A, + const marian::Tensor B, + bool transA, + bool transB, + float beta = 0, + float scalar = 1); + +} +} diff --git a/src/kernels/sparse.cu b/src/tensors/gpu/sparse.cu index b5080c0c..b5080c0c 100644 --- a/src/kernels/sparse.cu +++ b/src/tensors/gpu/sparse.cu diff --git a/src/kernels/sparse.h b/src/tensors/gpu/sparse.h index cffb398e..cffb398e 100644 --- a/src/kernels/sparse.h +++ b/src/tensors/gpu/sparse.h diff --git a/src/kernels/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index 8dcda559..92f07a9b 100644 --- a/src/kernels/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -1,12 +1,17 @@ -#include <thrust/transform_reduce.h> -#include "kernels/cuda_helpers.h" -#include "kernels/tensor_operators.h" +//#include <thrust/transform_reduce.h> + +#include "tensors/gpu/cuda_helpers.h" +#include "tensors/tensor_operators.h" +#include "tensors/gpu/backend.h" + +#include "gpu/tensor.h" +#include "functional/functional.h" #include "3rd_party/reduce_all.h" namespace marian { -#define CUDA_FLT_MAX 1.70141e+38 +namespace gpu { struct isnan_test { __host__ __device__ bool operator()(const float a) const { return isnan(a); } @@ -22,7 +27,7 @@ __device__ inline float stableLogit(float x) { } } -bool IsNan(Tensor in) { +bool IsNan(marian::Tensor in) { //cudaSetDevice(in->getDevice().no); //thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data()); //thrust::device_ptr<float> end @@ -32,7 +37,7 @@ bool IsNan(Tensor in) { return false; } -void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) { +void ConcatCont(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int axis) { cudaSetDevice(out->getDevice().no); int step = 1; for(int i = 0; i < axis; ++i) @@ -78,7 +83,7 @@ __global__ void gInsertCols(float* out, } } -void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) { +void Concatenate1(marian::Tensor out, const std::vector<marian::Tensor>& inputs) { cudaSetDevice(out->getDevice().no); int rows = out->shape().elements() / out->shape().back(); @@ -102,14 +107,14 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) { cudaStreamSynchronize(0); } -void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax) { +void Concatenate(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int ax) { if(ax == out->shape().size() - 1) Concatenate1(out, inputs); else ConcatCont(out, inputs, ax); } -void Split1(std::vector<Tensor>& outputs, const Tensor in) { +void Split1(std::vector<marian::Tensor>& outputs, const marian::Tensor in) { cudaSetDevice(in->getDevice().no); size_t offset = 0; @@ -130,7 +135,7 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) { cudaStreamSynchronize(0); } -void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) { +void SplitCont(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int axis) { cudaSetDevice(in->getDevice().no); int step = 1; @@ -154,7 +159,7 @@ void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) { cudaStreamSynchronize(0); } -void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) { +void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) { if(ax == in->shape().size() - 1) Split1(outputs, in); else @@ -181,7 +186,7 @@ __global__ void gTransposeND(gpu::Tensor<float> out, } } -void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis) { +void TransposeND(marian::Tensor out, marian::Tensor in, const std::vector<int>& vAxis) { cudaSetDevice(out->getDevice().no); gpu::Array<int, gpu::Shape::size()> axes; @@ -297,7 +302,7 @@ __global__ void gSoftmax(float* out, } } -void Softmax(Tensor out, Tensor in, Tensor mask) { +void Softmax(marian::Tensor out, marian::Tensor in, marian::Tensor mask) { cudaSetDevice(out->getDevice().no); size_t m = out->shape().elements() / out->shape().back(); @@ -385,7 +390,7 @@ __global__ void gLogSoftmax(float* out, } } -void LogSoftmax(Tensor out, Tensor in) { +void LogSoftmax(marian::Tensor out, marian::Tensor in) { cudaSetDevice(out->getDevice().no); size_t m = out->shape().elements() / out->shape().back(); @@ -444,7 +449,7 @@ __global__ void gSoftmaxGrad(float* grad, } } -void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) { +void SoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) { cudaSetDevice(adj->getDevice().no); // grad and val are both m-by-k matrices, passed as input. // A weighted average of each row of grad (according to the weights @@ -501,7 +506,7 @@ __global__ void gLogSoftmaxGrad(float* grad, } } -void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) { +void LogSoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) { cudaSetDevice(adj->getDevice().no); // grad and val are both m-by-k matrices, passed as input. @@ -540,119 +545,6 @@ __global__ void gArgmax(float* out, /////////////////////////////////////////////////////// -void Prod(cublasHandle_t handle, - Tensor C, - const Tensor A, - const Tensor B, - bool transA, - bool transB, - float beta, - float scalar) { - cudaSetDevice(C->getDevice().no); - float alpha = scalar; - - size_t m = A->shape().elements() / A->shape().back(); - size_t k = A->shape().back(); - if(transA) - std::swap(m, k); - - size_t l = B->shape().elements() / B->shape().back(); - size_t n = B->shape().back(); - if(transB) - std::swap(l, n); - - size_t lda = A->shape().back(); - size_t ldb = B->shape().back(); - size_t ldc = B->shape().back(); - - if(transB) - ldc = B->shape().elements() / B->shape().back(); - - cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - -#if CUDA_VERSION >= 9000 - //cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); -#endif - cublasSgemm(handle, - opB, - opA, - n, - m, - k, - &alpha, - B->data(), - ldb, - A->data(), - lda, - &beta, - C->data(), - ldc); -#if CUDA_VERSION >= 9000 - //cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH); -#endif -} - -void ProdBatched(cublasHandle_t handle, - Tensor C, - const Tensor A, - const Tensor B, - bool transA, - bool transB, - float beta, - float scalar) { - cudaSetDevice(C->getDevice().no); - float alpha = scalar; - - size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]); - size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]); - - size_t m = A->shape()[-2]; - size_t k = A->shape()[-1]; - if(transA) - std::swap(m, k); - - size_t l = B->shape()[-2]; - size_t n = B->shape()[-1]; - if(transB) - std::swap(l, n); - - size_t lda = A->shape()[-1]; - size_t ldb = B->shape()[-1]; - size_t ldc = B->shape()[-1]; - - if(transB) - ldc = B->shape()[-2]; - - cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; - cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - -#if CUDA_VERSION >= 9000 - //cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH); -#endif - cublasSgemmStridedBatched(handle, - opB, - opA, - n, - m, - k, - &alpha, - B->data(), - ldb, - batchB == 1 ? 0 : n * k, - A->data(), - lda, - batchA == 1 ? 0 : m * k, - &beta, - C->data(), - ldc, - n * m, - std::max(batchA, batchB)); -#if CUDA_VERSION >= 9000 - //cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH); -#endif -} - __global__ void gCopyRows(float* out, const float* in, size_t cols, @@ -676,7 +568,7 @@ __global__ void gCopyRows(float* out, } } -void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indices) { +void CopyRows(marian::Tensor out, const marian::Tensor in, const std::vector<size_t>& indices) { cudaSetDevice(out->getDevice().no); size_t cols = in->shape().back(); @@ -721,8 +613,8 @@ __global__ void gPasteRows(float* out, } } -void PasteRows(Tensor out, - const Tensor in, +void PasteRows(marian::Tensor out, + const marian::Tensor in, const std::vector<size_t>& indices) { cudaSetDevice(out->getDevice().no); @@ -768,7 +660,7 @@ __global__ void gCopyCols(float* out, } } -void CopyCols(Tensor out, const Tensor in, const std::vector<size_t>& indices) { +void CopyCols(marian::Tensor out, const marian::Tensor in, const std::vector<size_t>& indices) { cudaSetDevice(out->getDevice().no); size_t rows = in->shape().elements() / in->shape().back(); @@ -813,8 +705,8 @@ __global__ void gPasteCols(float* out, } } -void PasteCols(Tensor out, - const Tensor in, +void PasteCols(marian::Tensor out, + const marian::Tensor in, const std::vector<size_t>& indices) { cudaSetDevice(out->getDevice().no); @@ -879,11 +771,11 @@ __global__ void gInsert(float* out, } } -void Select(Ptr<Allocator> allocator, - Tensor out, - const Tensor in, +void Select(marian::Tensor out, + const marian::Tensor in, int axis, - const std::vector<size_t>& indices) { + const std::vector<size_t>& indices, + Ptr<Allocator> allocator) { cudaSetDevice(out->getDevice().no); int length = out->shape().elements(); @@ -905,11 +797,11 @@ void Select(Ptr<Allocator> allocator, allocator->free(mp_indices); } -void Insert(Ptr<Allocator> allocator, - Tensor out, - const Tensor in, +void Insert(marian::Tensor out, + const marian::Tensor in, int axis, - const std::vector<size_t>& indices) { + const std::vector<size_t>& indices, + Ptr<Allocator> allocator) { cudaSetDevice(in->getDevice().no); int length = in->shape().elements(); @@ -974,7 +866,7 @@ __global__ void gGRUFastForward(float* out, } } -void GRUFastForward(Tensor out, std::vector<Tensor> inputs, bool final) { +void GRUFastForward(marian::Tensor out, std::vector<marian::Tensor> inputs, bool final) { cudaSetDevice(out->getDevice().no); int rows = out->shape().elements() / out->shape().back(); @@ -1084,9 +976,9 @@ __global__ void gGRUFastBackward(float* outState, } } -void GRUFastBackward(std::vector<Tensor> outputs, - std::vector<Tensor> inputs, - Tensor adj, +void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, bool final) { cudaSetDevice(adj->getDevice().no); @@ -1182,7 +1074,7 @@ __global__ void gCrossEntropyPick(float* out, } } -void CrossEntropyPick(Tensor out, Tensor in, Tensor pick) { +void CrossEntropyPick(marian::Tensor out, marian::Tensor in, marian::Tensor pick) { cudaSetDevice(out->getDevice().no); int rows = in->shape().elements() / in->shape().back(); @@ -1268,7 +1160,7 @@ __global__ void gCrossEntropyPickBackward(float* out, } } -void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) { +void CrossEntropyPickBackward(marian::Tensor out, marian::Tensor adj, marian::Tensor a, marian::Tensor pick) { cudaSetDevice(out->getDevice().no); int rows = out->shape().elements() / out->shape().back(); @@ -1283,7 +1175,7 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) { } -float L2Norm(Tensor in) { +float L2Norm(marian::Tensor in) { using namespace functional; cudaSetDevice(in->getDevice().no); @@ -1294,7 +1186,7 @@ float L2Norm(Tensor in) { uint8_t* data; cudaMalloc(&data, blocks * sizeof(float)); - Tensor out(new TensorBase( + marian::Tensor out(new TensorBase( New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend())); ReduceAll(_1 * _1, out, in); @@ -1351,7 +1243,7 @@ __global__ void gAtt(float* out, } } -void Att(Tensor out, Tensor va, Tensor context, Tensor state) { +void Att(marian::Tensor out, marian::Tensor va, marian::Tensor context, marian::Tensor state) { cudaSetDevice(out->getDevice().no); size_t m = out->shape().elements() / out->shape().back(); @@ -1412,13 +1304,13 @@ __global__ void gAttBack(float* gVa, } } -void AttBack(Tensor gVa, - Tensor gContext, - Tensor gState, - Tensor va, - Tensor context, - Tensor state, - Tensor adj) { +void AttBack(marian::Tensor gVa, + marian::Tensor gContext, + marian::Tensor gState, + marian::Tensor va, + marian::Tensor context, + marian::Tensor state, + marian::Tensor adj) { cudaSetDevice(adj->getDevice().no); size_t m = adj->shape().elements() / adj->shape().back(); @@ -1517,10 +1409,10 @@ __global__ void gLNormalization(float* out, } } -void LayerNormalization(Tensor out, - Tensor in, - Tensor gamma, - Tensor beta, +void LayerNormalization(marian::Tensor out, + marian::Tensor in, + marian::Tensor gamma, + marian::Tensor beta, float eps) { cudaSetDevice(out->getDevice().no); @@ -1642,14 +1534,14 @@ __global__ void gLayerNormalizationGrad(float* gradX, } } -void LayerNormalizationGrad(Tensor gradX, - Tensor gradGamma, - Tensor gradBeta, - Tensor adj, - Tensor y, - Tensor x, - Tensor gamma, - Tensor beta, +void LayerNormalizationGrad(marian::Tensor gradX, + marian::Tensor gradGamma, + marian::Tensor gradBeta, + marian::Tensor adj, + marian::Tensor y, + marian::Tensor x, + marian::Tensor gamma, + marian::Tensor beta, float eps) { cudaSetDevice(adj->getDevice().no); int rows = y->shape().elements() / y->shape().back(); @@ -1685,9 +1577,9 @@ __global__ void gShift(float* out, const float* in, int length, int offset) { } } -void Shift(Tensor out, Tensor in, Shape shift, bool invert) { +void Shift(marian::Tensor out, marian::Tensor in, marian::Shape shift, bool invert) { - UTIL_THROW_IF2(in->shape().size() != shift.size(), "bad dimensions"); + ABORT_IF(in->shape().size() != shift.size(), "bad dimensions"); int offset = 0; for(int i = 0; i < shift.size(); ++i) @@ -1784,7 +1676,7 @@ __global__ void gLSTMCellForward(float* out, } } -void LSTMCellForward(Tensor out, std::vector<Tensor> inputs) { +void LSTMCellForward(marian::Tensor out, std::vector<marian::Tensor> inputs) { cudaSetDevice(out->getDevice().no); int rows = out->shape().elements() / out->shape().back(); @@ -1833,7 +1725,7 @@ __global__ void gLSTMOutputForward(float* out, } } -void LSTMOutputForward(Tensor out, std::vector<Tensor> inputs) { +void LSTMOutputForward(marian::Tensor out, std::vector<marian::Tensor> inputs) { cudaSetDevice(out->getDevice().no); int rows = out->shape().elements() / out->shape().back(); @@ -1927,9 +1819,9 @@ __global__ void gLSTMCellBackward(float* outCell, } } -void LSTMCellBackward(std::vector<Tensor> outputs, - std::vector<Tensor> inputs, - Tensor adj) { +void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { cudaSetDevice(adj->getDevice().no); int rows = adj->shape().elements() / adj->shape().back(); @@ -2005,9 +1897,9 @@ __global__ void gLSTMOutputBackward(float* outCell, } } -void LSTMOutputBackward(std::vector<Tensor> outputs, - std::vector<Tensor> inputs, - Tensor adj) { +void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { cudaSetDevice(adj->getDevice().no); int rows = adj->shape().elements() / adj->shape().back(); @@ -2044,10 +1936,10 @@ __global__ void gHighwayForward(float* out, } } -void HighwayForward(Tensor out, - const Tensor in1, - const Tensor in2, - const Tensor t) { +void HighwayForward(marian::Tensor out, + const marian::Tensor in1, + const marian::Tensor in2, + const marian::Tensor t) { cudaSetDevice(out->getDevice().no); int length = out->shape().elements(); @@ -2079,13 +1971,13 @@ __global__ void gHighwayBackward(float* out1, } } -void HighwayBackward(Tensor out1, - Tensor out2, - Tensor outt, - const Tensor in1, - const Tensor in2, - const Tensor t, - const Tensor adj) { +void HighwayBackward(marian::Tensor out1, + marian::Tensor out2, + marian::Tensor outt, + const marian::Tensor in1, + const marian::Tensor in2, + const marian::Tensor t, + const marian::Tensor adj) { cudaSetDevice(out1->getDevice().no); int length = out1->shape().elements(); @@ -2138,20 +2030,20 @@ __global__ void gMaxPoolingForward(float* out, out[rowId + (colId * outCols)] = currentMax; } -void PoolingWithMaskingForward(Tensor out, - Tensor in, - Tensor mask, +void PoolingWithMaskingForward(marian::Tensor out, + marian::Tensor in, + marian::Tensor mask, int width, bool isEven) { int n = out->shape().elements(); int threads = std::min(n, MAX_THREADS); int blocks = n / threads + (n % threads != 0); - Shape& inShape = in->shape(); + auto& inShape = in->shape(); int inRows = inShape[0] * inShape[1]; int inCols = inShape[2]; - Shape& outShape = out->shape(); + auto& outShape = out->shape(); int outRows = outShape[2]; int outCols = outShape[0] * outShape[1]; @@ -2203,21 +2095,21 @@ __global__ void gMaxPoolingBackward(float* adj, adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] += adj[rowId + (colId * adjCols)]; } -void PoolingWithMaskingBackward(Tensor adj, - Tensor adjIn, - Tensor in, - Tensor mask, +void PoolingWithMaskingBackward(marian::Tensor adj, + marian::Tensor adjIn, + marian::Tensor in, + marian::Tensor mask, int width, bool isEven) { int n = adj->shape().elements(); int threads = std::min(n, 512); int blocks = n / threads + (n % threads != 0); - Shape& inShape = in->shape(); + auto& inShape = in->shape(); int inRows = inShape[0] * inShape[1]; int inCols = inShape[2]; - Shape& adjShape = adj->shape(); + auto& adjShape = adj->shape(); int adjRows = adjShape[2]; int adjCols = adjShape[0] * adjShape[1]; @@ -2232,4 +2124,5 @@ void PoolingWithMaskingBackward(Tensor adj, width, lastWidth); } +} } // namespace marian diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h new file mode 100644 index 00000000..06f76b64 --- /dev/null +++ b/src/tensors/tensor_operators.h @@ -0,0 +1,213 @@ +#pragma once + +#include "common/definitions.h" +#include "tensors/tensor.h" +#include "tensors/allocator.h" + +#include "tensors/dispatch.h" + +#include "gpu/shape.h" +#include "gpu/tmp.h" +#include "gpu/tensor.h" + +#include "tensors/gpu/element.h" +#include "tensors/gpu/add.h" +#include "tensors/gpu/prod.h" + +#include "tensors/cpu/element.h" +#include "tensors/cpu/add.h" + +namespace marian { + + template <class Functor, class ...Tensors> + void Element(Functor functor, marian::Tensor out, Tensors ...tensors) { + if(out->getBackend()->getDevice().type == DeviceType::gpu) + gpu::Element(functor, out, tensors...); + else + cpu::Element(functor, out, tensors...); + } + + template <class Functor, class ...Tensors> + void Add(Functor functor, + float scale, + marian::Tensor out, + Tensors... tensors) { + if(out->getBackend()->getDevice().type == DeviceType::gpu) + gpu::Add(functor, scale, out, tensors...); + else + cpu::Add(functor, scale, out, tensors...); + } + + template <class Functor, class ...Tensors> + void Add(Functor functor, + marian::Tensor out, + Tensors... tensors) { + Add(functor, 1, out, tensors...); + } + + template <class Functor, class ...Tensors> + void Reduce(Functor functor, + float scale, + marian::Tensor out, + Tensors... tensors) { + out->set(0); + Add(functor, scale, out, tensors...); + } + + template <class Functor, class ...Tensors> + void Reduce(Functor functor, + marian::Tensor out, + Tensors... tensors) { + out->set(0); + Add(functor, out, tensors...); + } + + DISPATCH7(Prod, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float) + DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float) + + DISPATCH2(Dropout, marian::Tensor, float) + + DISPATCH3(Softmax, marian::Tensor, marian::Tensor, marian::Tensor) + DISPATCH3(SoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor) + + DISPATCH2(LogSoftmax, marian::Tensor, marian::Tensor) + DISPATCH3(LogSoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor) + + DISPATCH3(CrossEntropyPick, marian::Tensor, marian::Tensor, marian::Tensor) + DISPATCH4(CrossEntropyPickBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor) + + DISPATCH3(TransposeND, marian::Tensor, marian::Tensor, const std::vector<int>&) + DISPATCH4(Shift, marian::Tensor, marian::Tensor, marian::Shape, bool) + + DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int) + + namespace gpu { + void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax); + } + + namespace cpu { + void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax); + } + + static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) { + if(in->getBackend()->getDevice().type == DeviceType::gpu) { + gpu::Deconcatenate(outputs, in, ax); + } + else { + cpu::Deconcatenate(outputs, in, ax); + } + } + + DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float) + DISPATCH9(LayerNormalizationGrad, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float) + + DISPATCH4(HighwayForward, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor) + DISPATCH7(HighwayBackward, marian::Tensor, marian::Tensor, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor) + + DISPATCH3(CopyRows, marian::Tensor, const marian::Tensor, const std::vector<size_t>&) + DISPATCH3(PasteRows, marian::Tensor, const marian::Tensor, const std::vector<size_t>&) + DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const std::vector<size_t>&) + DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const std::vector<size_t>&) + + DISPATCH5(Select, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>) + DISPATCH5(Insert, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>) + + + DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>) + DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>); + + namespace gpu { + void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); + } + + namespace cpu { + void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); + } + + static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { + if(adj->getBackend()->getDevice().type == DeviceType::gpu) { + gpu::LSTMCellBackward(outputs, inputs, adj); + } + else { + cpu::LSTMCellBackward(outputs, inputs, adj); + } + } + + namespace gpu { + void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); + } + + namespace cpu { + void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj); + } + + static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj) { + if(adj->getBackend()->getDevice().type == DeviceType::gpu) { + gpu::LSTMOutputBackward(outputs, inputs, adj); + } + else { + cpu::LSTMOutputBackward(outputs, inputs, adj); + } + } + + DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool) + + namespace gpu { + void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, + bool final); + } + + namespace cpu { + void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, + bool final); + } + + static inline void GRUFastBackward(std::vector<marian::Tensor> outputs, + std::vector<marian::Tensor> inputs, + marian::Tensor adj, + bool final = false) { + if(adj->getBackend()->getDevice().type == DeviceType::gpu) { + gpu::GRUFastBackward(outputs, inputs, adj, final); + } + else { + cpu::GRUFastBackward(outputs, inputs, adj, final); + } + } + + DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor) + DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor) + + namespace gpu { + float L2Norm(marian::Tensor in); + } + + namespace cpu { + float L2Norm(marian::Tensor in); + } + + static inline float L2Norm(marian::Tensor in) { + if(in->getBackend()->getDevice().type == DeviceType::gpu) { + return gpu::L2Norm(in); + } + else { + return cpu::L2Norm(in); + } + } + +} diff --git a/src/tests/attention_tests.cpp b/src/tests/attention_tests.cpp index b42c126d..1a7afd03 100644 --- a/src/tests/attention_tests.cpp +++ b/src/tests/attention_tests.cpp @@ -7,8 +7,7 @@ using namespace marian; -TEST_CASE("Model components, Attention", "[attention]") { - +void tests(DeviceType type) { auto floatApprox = [](float x, float y) { return x == Approx(y).epsilon(0.01); }; std::vector<size_t> vWords = { @@ -37,7 +36,7 @@ TEST_CASE("Model components, Attention", "[attention]") { Config::seed = 1234; auto graph = New<ExpressionGraph>(); - graph->setDevice({0, DeviceType::gpu}); + graph->setDevice({0, type}); graph->reserveWorkspaceMB(16); std::vector<float> values; @@ -109,3 +108,11 @@ TEST_CASE("Model components, Attention", "[attention]") { vAligned.begin(), floatApprox) ); } } + +TEST_CASE("Model components, Attention (gpu)", "[attention]") { + tests(DeviceType::gpu); +} + +TEST_CASE("Model components, Attention (cpu)", "[attention]") { + tests(DeviceType::cpu); +} diff --git a/src/tests/dropout_test.cpp b/src/tests/dropout_test.cpp index 5023606b..f99dbf68 100644 --- a/src/tests/dropout_test.cpp +++ b/src/tests/dropout_test.cpp @@ -12,7 +12,9 @@ using namespace keywords; int main(int argc, char** argv) { auto c = New<Config>(argc, argv); - auto type = c->get<bool>("cpu") ? DeviceType::cpu : DeviceType::gpu; + auto type = c->get<bool>("cpu") + ? DeviceType::cpu + : DeviceType::gpu; DeviceId deviceId{0, type}; auto g = New<ExpressionGraph>(); @@ -21,7 +23,11 @@ int main(int argc, char** argv) { for(int i = 0; i < 10; ++i) { g->clear(); - auto mask = g->dropout(0.2, {10, 3072}); + auto mask1 = g->dropout(0.2, {10, 3072}); + auto mask2 = g->dropout(0.3, {1, 3072}); + auto mask = mask1 + mask2; + debug(mask1, "mask1"); + debug(mask2, "mask2"); debug(mask, "mask"); g->forward(); } diff --git a/src/tests/operator_tests.cpp b/src/tests/operator_tests.cpp index 0c3cab5a..680d171a 100644 --- a/src/tests/operator_tests.cpp +++ b/src/tests/operator_tests.cpp @@ -4,33 +4,17 @@ using namespace marian; -TEST_CASE("Expression graph supports basic math operations", "[operator]") { - +void tests(DeviceType device) { auto floatApprox = [](float x, float y) { return x == Approx(y); }; auto graph = New<ExpressionGraph>(); - graph->setDevice({0, DeviceType::gpu}); + graph->setDevice({0, device}); graph->reserveWorkspaceMB(16); std::vector<float> vA({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12}); std::vector<float> vB({1, 2, 3, 4, 5, 6}); std::vector<float> values; - SECTION("dot product") { - graph->clear(); - values.clear(); - std::vector<float> vC({22, 28, 49, 64, 76, 100, 103, 136}); - - auto A = graph->param("A", {2, 2, 3}, keywords::init = inits::from_vector(vA)); - auto B = graph->param("B", {3, 2}, keywords::init = inits::from_vector(vB)); - auto C = dot(A, B); - graph->forward(); - - CHECK(C->shape() == Shape({2, 2, 2})); - C->val()->get(values); - CHECK(values == vC); - } - SECTION("scalar multiplication") { graph->clear(); values.clear(); @@ -45,38 +29,6 @@ TEST_CASE("Expression graph supports basic math operations", "[operator]") { CHECK(values == vB2); } - SECTION("softmax and logsoftmax") { - graph->clear(); - values.clear(); - std::vector<float> in({-.2, -.3, 4.5, 5.2, -10, 101.45, -100.05, 1.05e-5}); - - std::vector<float> smOut({ 0.52498f, 0.47502f, 0.33181f, 0.66819f, - 0.0f, 1.0f, 0.0f, 1.0f }); - - std::vector<float> lsmOut({ -0.6444f, -0.7444f, -1.10319f, -0.40319f, - -111.45f, 0.0f, -100.05001f, 0.0f }); - - auto input = graph->constant({2, 2, 2}, keywords::init = inits::from_vector(in)); - - auto sm = softmax(input); - auto lsm = logsoftmax(input); - - graph->forward(); - - CHECK(sm->shape() == Shape({2, 2, 2})); - CHECK(lsm->shape() == Shape({2, 2, 2})); - - sm->val()->get(values); - - CHECK( std::equal(values.begin(), values.end(), - smOut.begin(), floatApprox) ); - - lsm->val()->get(values); - - CHECK( std::equal(values.begin(), values.end(), - lsmOut.begin(), floatApprox) ); - } - SECTION("elementwise binary operators with broadcasting") { graph->clear(); values.clear(); @@ -163,6 +115,69 @@ TEST_CASE("Expression graph supports basic math operations", "[operator]") { CHECK( values == vT5 ); } + SECTION("softmax and logsoftmax") { + graph->clear(); + values.clear(); + std::vector<float> in({-.2, -.3, 4.5, 5.2, -10, 101.45, -100.05, 1.05e-5}); + + std::vector<float> smOut({ 0.52498f, 0.47502f, 0.33181f, 0.66819f, + 0.0f, 1.0f, 0.0f, 1.0f }); + + std::vector<float> lsmOut({ -0.6444f, -0.7444f, -1.10319f, -0.40319f, + -111.45f, 0.0f, -100.05001f, 0.0f }); + + auto input = graph->constant({2, 2, 2}, keywords::init = inits::from_vector(in)); + + auto sm = softmax(input); + auto lsm = logsoftmax(input); + + graph->forward(); + + CHECK(sm->shape() == Shape({2, 2, 2})); + CHECK(lsm->shape() == Shape({2, 2, 2})); + + sm->val()->get(values); + + CHECK( std::equal(values.begin(), values.end(), + smOut.begin(), floatApprox) ); + + lsm->val()->get(values); + + CHECK( std::equal(values.begin(), values.end(), + lsmOut.begin(), floatApprox) ); + } + + SECTION("layer normalization") { + graph->clear(); + values.clear(); + + Config::seed = 1234; + + std::vector<float> vLn({ + -1.20521, -0.321409, -0.0363369, 1.56296, + 0.332987, -0.613398, -1.17766, 1.45807, + -0.731601, -0.187812, -0.766431, 1.68584, + -1.31923, -0.059028, 1.49732, -0.119065 + }); + + auto a = graph->constant({2, 2, 4}, keywords::init=inits::glorot_uniform); + + auto gamma = graph->param("gamma", {1, 4}, keywords::init=inits::ones); + auto beta = graph->param("beta", {1, 4}, keywords::init=inits::zeros); + + auto ln = layer_norm(a, gamma, beta); + + graph->forward(); + + CHECK(ln->shape() == Shape({2, 2, 4})); + + + ln->val()->get(values); + CHECK( std::equal(values.begin(), values.end(), + vLn.begin(), floatApprox) ); + + } + SECTION("reductions") { graph->clear(); values.clear(); @@ -261,35 +276,26 @@ TEST_CASE("Expression graph supports basic math operations", "[operator]") { CHECK( values == vO4 ); } - SECTION("layer normalization") { + SECTION("dot product") { graph->clear(); values.clear(); + std::vector<float> vC({22, 28, 49, 64, 76, 100, 103, 136}); - Config::seed = 1234; - - std::vector<float> vLn({ - -1.20521, -0.321409, -0.0363369, 1.56296, - 0.332987, -0.613398, -1.17766, 1.45807, - -0.731601, -0.187812, -0.766431, 1.68584, - -1.31923, -0.059028, 1.49732, -0.119065 - }); - - auto a = graph->constant({2, 2, 4}, keywords::init=inits::glorot_uniform); - - auto gamma = graph->param("gamma", {1, 4}, keywords::init=inits::ones); - auto beta = graph->param("beta", {1, 4}, keywords::init=inits::zeros); - - auto ln = layer_norm(a, gamma, beta); - + auto A = graph->param("A", {2, 2, 3}, keywords::init = inits::from_vector(vA)); + auto B = graph->param("B", {3, 2}, keywords::init = inits::from_vector(vB)); + auto C = dot(A, B); graph->forward(); - CHECK(ln->shape() == Shape({2, 2, 4})); - - - ln->val()->get(values); - CHECK( std::equal(values.begin(), values.end(), - vLn.begin(), floatApprox) ); - + CHECK(C->shape() == Shape({2, 2, 2})); + C->val()->get(values); + CHECK(values == vC); } +} + +TEST_CASE("Expression graph supports basic math operations (gpu)", "[operator]") { + tests(DeviceType::gpu); +} +TEST_CASE("Expression graph supports basic math operations (cpu)", "[operator]") { + tests(DeviceType::cpu); } diff --git a/src/tests/rnn_tests.cpp b/src/tests/rnn_tests.cpp index 681bdbe8..b73e1abe 100644 --- a/src/tests/rnn_tests.cpp +++ b/src/tests/rnn_tests.cpp @@ -6,8 +6,7 @@ using namespace marian; -TEST_CASE("Model components, RNN etc.", "[model]") { - +void tests(DeviceType type) { auto floatApprox = [](float x, float y) { return x == Approx(y).epsilon(0.01); }; std::vector<size_t> vWords = { @@ -36,7 +35,7 @@ TEST_CASE("Model components, RNN etc.", "[model]") { Config::seed = 1234; auto graph = New<ExpressionGraph>(); - graph->setDevice({0, DeviceType::gpu}); + graph->setDevice({0, type}); graph->reserveWorkspaceMB(16); std::vector<float> values; @@ -74,7 +73,7 @@ TEST_CASE("Model components, RNN etc.", "[model]") { Config::seed = 1234; auto graph = New<ExpressionGraph>(); - graph->setDevice({0, DeviceType::gpu}); + graph->setDevice({0, type}); graph->reserveWorkspaceMB(16); std::vector<float> values; @@ -279,3 +278,11 @@ TEST_CASE("Model components, RNN etc.", "[model]") { // vContextSum3.begin(), floatApprox) ); } } + +TEST_CASE("Model components, RNN etc. (gpu)", "[model]") { + tests(DeviceType::gpu); +} + +TEST_CASE("Model components, RNN etc. (cpu)", "[model]") { + tests(DeviceType::cpu); +} diff --git a/src/training/dropper.cu b/src/training/dropper.cu index b8870bb3..75e82433 100644 --- a/src/training/dropper.cu +++ b/src/training/dropper.cu @@ -4,8 +4,8 @@ #include <thrust/sort.h> #include <memory> -#include "kernels/cuda_helpers.h" -#include "kernels/tensor_operators.h" +#include "tensors/gpu/cuda_helpers.h" +#include "tensors/tensor_operators.h" #include "training/dropper.h" #include "training/sparse_tensor.h" diff --git a/src/training/graph_group_async.cu b/src/training/graph_group_async.cu index 18f3908a..faba19e9 100644 --- a/src/training/graph_group_async.cu +++ b/src/training/graph_group_async.cu @@ -1,6 +1,6 @@ #include "training/graph_group_async.h" - -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" +#include "functional/functional.h" namespace marian { diff --git a/src/training/graph_group_async_drop.cu b/src/training/graph_group_async_drop.cu index f92077bc..7dabc796 100644 --- a/src/training/graph_group_async_drop.cu +++ b/src/training/graph_group_async_drop.cu @@ -2,7 +2,7 @@ #include "training/graph_group_async_drop.h" #include "functional/functional.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" #include "training/dropper.h" #include "training/sparse_tensor.h" diff --git a/src/training/graph_group_multinode.cu b/src/training/graph_group_multinode.cu index 34aa2b5d..2437cee9 100644 --- a/src/training/graph_group_multinode.cu +++ b/src/training/graph_group_multinode.cu @@ -1,5 +1,6 @@ #include "training/graph_group_multinode.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" +#include "functional/functional.h" namespace marian { diff --git a/src/training/graph_group_singleton.cu b/src/training/graph_group_singleton.cu index 68398af5..509e50c6 100644 --- a/src/training/graph_group_singleton.cu +++ b/src/training/graph_group_singleton.cu @@ -1,5 +1,6 @@ -#include "kernels/tensor_operators.h" +#include "tensors/tensor_operators.h" #include "training/graph_group_singleton.h" +#include "functional/functional.h" namespace marian { diff --git a/src/training/graph_group_sync.cu b/src/training/graph_group_sync.cu index 171c0652..87ec6784 100644 --- a/src/training/graph_group_sync.cu +++ b/src/training/graph_group_sync.cu @@ -1,5 +1,6 @@ -#include "kernels/tensor_operators.h" #include "training/graph_group_sync.h" +#include "tensors/tensor_operators.h" +#include "functional/functional.h" namespace marian { diff --git a/src/training/sparse_tensor.cu b/src/training/sparse_tensor.cu index aafafa97..d3fad64c 100644 --- a/src/training/sparse_tensor.cu +++ b/src/training/sparse_tensor.cu @@ -4,8 +4,8 @@ #include <thrust/sort.h> #include <memory> -#include "kernels/cuda_helpers.h" -#include "kernels/tensor_operators.h" +#include "tensors/tensor.h" +#include "tensors/tensor_operators.h" #include "training/sparse_tensor.h" namespace marian { diff --git a/src/training/sparse_tensor.h b/src/training/sparse_tensor.h index 9194748f..2c5fe911 100644 --- a/src/training/sparse_tensor.h +++ b/src/training/sparse_tensor.h @@ -3,6 +3,7 @@ #include <memory> #include "common/definitions.h" +#include "tensors/backend.h" namespace marian { class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> { diff --git a/src/translator/nth_element.cu b/src/translator/nth_element.cu index 9ae19478..978b6a62 100644 --- a/src/translator/nth_element.cu +++ b/src/translator/nth_element.cu @@ -1,8 +1,7 @@ #include <iostream> #include "translator/nth_element.h" - -#include "kernels/cuda_helpers.h" +#include "tensors/gpu/cuda_helpers.h" namespace marian { |