github.com/marian-nmt/marian.git
commit d9d66f416b3284516f44e5b8e7b1fc7a0623e33f
tree   59cd81a1b74c4ce395279b552bf7e6b724e82a4a
parent 6e421f7a741dca8d7181f87acd398da60bb77f7d
Author:    Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-02-22 04:44:04 +0300
Committer: Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-02-22 04:44:04 +0300

    prototype cpu version
 CMakeLists.txt | 8
 src/CMakeLists.txt | 16
 src/examples/mnist/model_lenet.h | 4
 src/graph/expression_graph.cpp | 2
 src/graph/expression_operators.cu | 154
 src/graph/node_initializers.cpp (renamed from src/graph/node_initializers.cu) | 19
 src/graph/node_operators_binary.h | 185
 src/graph/node_operators_unary.h | 171
 src/kernels/tensor_operators.h | 396
 src/layers/convolution.cu | 12
 src/models/model_factory.cpp | 20
 src/optimizers/clippers.cu | 2
 src/optimizers/optimizers.cu | 12
 src/rnn/attention.cu | 2
 src/rnn/cells.cu | 2
 src/tensors/cpu/add.h | 135
 src/tensors/cpu/device.cpp (renamed from src/tensors/device.cpp) | 0
 src/tensors/cpu/dropout.cpp | 2
 src/tensors/cpu/element.h | 51
 src/tensors/cpu/prod.cpp | 69
 src/tensors/cpu/tensor_operators.cpp | 456
 src/tensors/dispatch.h | 167
 src/tensors/gpu/add.h | 180
 src/tensors/gpu/algorithm.cu | 7
 src/tensors/gpu/cuda_helpers.h (renamed from src/kernels/cuda_helpers.h) | 5
 src/tensors/gpu/cudnn_wrappers.cu (renamed from src/kernels/cudnn_wrappers.cu) | 2
 src/tensors/gpu/cudnn_wrappers.h (renamed from src/kernels/cudnn_wrappers.h) | 0
 src/tensors/gpu/device.cu (renamed from src/tensors/device.cu) | 2
 src/tensors/gpu/dropout.cu | 4
 src/tensors/gpu/element.h | 65
 src/tensors/gpu/prod.cu | 129
 src/tensors/gpu/prod.h | 26
 src/tensors/gpu/sparse.cu (renamed from src/kernels/sparse.cu) | 0
 src/tensors/gpu/sparse.h (renamed from src/kernels/sparse.h) | 0
 src/tensors/gpu/tensor_operators.cu (renamed from src/kernels/tensor_operators.cu) | 299
 src/tensors/tensor_operators.h | 213
 src/tests/attention_tests.cpp | 13
 src/tests/dropout_test.cpp | 10
 src/tests/operator_tests.cpp | 154
 src/tests/rnn_tests.cpp | 15
 src/training/dropper.cu | 4
 src/training/graph_group_async.cu | 4
 src/training/graph_group_async_drop.cu | 2
 src/training/graph_group_multinode.cu | 3
 src/training/graph_group_singleton.cu | 3
 src/training/graph_group_sync.cu | 3
 src/training/sparse_tensor.cu | 4
 src/training/sparse_tensor.h | 1
 src/translator/nth_element.cu | 3
 49 files changed, 1981 insertions(+), 1055 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 23598378..2b9067e5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -65,6 +65,14 @@ endif(Tcmalloc_FOUND)
# add_definitions(-DMPI_FOUND=1)
#endif(MPI_FOUND)
+# currently for normal (openblas) blas
+find_package(BLAS)
+if(BLAS_FOUND)
+ include_directories(${BLAS_INCLUDE_DIRS})
+ set(EXT_LIBS ${EXT_LIBS} ${BLAS_LIBRARIES})
+ add_definitions(-DBLAS_FOUND=1)
+endif(BLAS_FOUND)
+
find_package(ZLIB)
if(ZLIB_FOUND)
include_directories(${ZLIB_INCLUDE_DIRS})
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 92c8a776..cb1606f6 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -8,20 +8,24 @@ cuda_add_library(marian
3rd_party/cnpy/cnpy.cpp
3rd_party/exception.cpp
3rd_party/svd/svd.cpp
-# tensors/tensor.cu
- tensors/device.cu
- tensors/device.cpp
+
tensors/backend.cpp
+ tensors/gpu/device.cu
+ tensors/cpu/device.cpp
tensors/gpu/algorithm.cu
tensors/gpu/dropout.cu
tensors/cpu/dropout.cpp
- kernels/tensor_operators.cu
- kernels/cudnn_wrappers.cu
+ tensors/gpu/prod.cu
+ tensors/cpu/prod.cpp
+ tensors/gpu/tensor_operators.cu
+ tensors/cpu/tensor_operators.cpp
+ tensors/gpu/cudnn_wrappers.cu
+
graph/expression_graph.cpp
graph/expression_operators.cu
graph/node.cpp
graph/node_operators.cu
- graph/node_initializers.cu
+ graph/node_initializers.cpp
layers/convolution.cu
rnn/cells.cu
diff --git a/src/examples/mnist/model_lenet.h b/src/examples/mnist/model_lenet.h
index 968ceaf3..8dd96634 100644
--- a/src/examples/mnist/model_lenet.h
+++ b/src/examples/mnist/model_lenet.h
@@ -33,6 +33,9 @@ protected:
// Construct hidden layers
+ ABORT("TEMPORARY");
+
+ /*
auto conv_1 = convolution(g)
("prefix", "conv_1")
("kernel-dims", std::make_pair(3,3))
@@ -99,6 +102,7 @@ protected:
// Define a top-level node for inference
return logsoftmax(last);
}
+ */;
}
};
}
diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp
index 934e2b73..520476ae 100644
--- a/src/graph/expression_graph.cpp
+++ b/src/graph/expression_graph.cpp
@@ -1,7 +1,7 @@
#include <sstream>
#include "graph/expression_graph.h"
-#include "tensors/dispatch.h"
+#include "tensors/tensor_operators.h"
namespace marian {
diff --git a/src/graph/expression_operators.cu b/src/graph/expression_operators.cu
index 20f025ad..0cb65c1e 100644
--- a/src/graph/expression_operators.cu
+++ b/src/graph/expression_operators.cu
@@ -1,5 +1,5 @@
#include "graph/expression_operators.h"
-#include "kernels/sparse.h"
+//#include "kernels/sparse.h"
#include "layers/constructors.h"
#include "graph/node_operators.h"
@@ -333,81 +333,81 @@ Expr shift(Expr a, Shape shift) {
// return Expression<LexicalProbNodeOp>(logits, att, eps, lf);
//}
-Expr avg_pooling(
- Expr x,
- int height,
- int width,
- int padHeight,
- int padWidth,
- int strideHeight,
- int strideWidth) {
- return Expression<PoolingOp>(x,
- height,
- width,
- padHeight,
- padWidth,
- strideHeight,
- strideWidth,
- "avg");
-}
-
-Expr max_pooling(
- Expr x,
- int height,
- int width,
- int padHeight,
- int padWidth,
- int strideHeight,
- int strideWidth)
-{
- return Expression<PoolingOp>(x,
- height,
- width,
- padHeight,
- padWidth,
- strideHeight,
- strideWidth,
- "max");
-}
-
-Expr convert2cudnnFormat(Expr x) {
- int numWords = x->shape()[0];
- int numExamples = x->shape()[1];
- int embSize = x->shape()[2];
-
- std::vector<size_t> newIndeces;
- for (int b = 0; b < numExamples; ++b) {
- for (int t = 0; t < numWords; ++t) {
- newIndeces.push_back((t * numExamples) + b);
- }
- }
-
- auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]});
-
- Shape outShape({numExamples, 1, numWords, embSize});
- return reshape(rows(xRows, newIndeces), outShape);
-}
-
-Expr convertFromcudnnFormat(Expr x) {
- int batchDim = x->shape()[0];
- int sentenceDim = x->shape()[2];
- int embSize = x->shape()[3];
-
- auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize});
-
- std::vector<size_t> newIndeces;
- for (int t = 0; t < sentenceDim; ++t) {
- for (int b = 0; b < batchDim; ++b) {
- newIndeces.push_back(b * sentenceDim + t);
- }
- }
-
- Shape shape({batchDim, sentenceDim, embSize});
- return reshape(rows(reshapedX, newIndeces), shape);
-}
-
-Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
- return Expression<PoolingWithMaskingOp>(x, mask, width, isEven);
-}
+//Expr avg_pooling(
+// Expr x,
+// int height,
+// int width,
+// int padHeight,
+// int padWidth,
+// int strideHeight,
+// int strideWidth) {
+// return Expression<PoolingOp>(x,
+// height,
+// width,
+// padHeight,
+// padWidth,
+// strideHeight,
+// strideWidth,
+// "avg");
+//}
+//
+//Expr max_pooling(
+// Expr x,
+// int height,
+// int width,
+// int padHeight,
+// int padWidth,
+// int strideHeight,
+// int strideWidth)
+//{
+// return Expression<PoolingOp>(x,
+// height,
+// width,
+// padHeight,
+// padWidth,
+// strideHeight,
+// strideWidth,
+// "max");
+//}
+//
+//Expr convert2cudnnFormat(Expr x) {
+// int numWords = x->shape()[0];
+// int numExamples = x->shape()[1];
+// int embSize = x->shape()[2];
+//
+// std::vector<size_t> newIndeces;
+// for (int b = 0; b < numExamples; ++b) {
+// for (int t = 0; t < numWords; ++t) {
+// newIndeces.push_back((t * numExamples) + b);
+// }
+// }
+//
+// auto xRows = reshape(x, {x->shape()[0] * x ->shape()[1], x->shape()[2]});
+//
+// Shape outShape({numExamples, 1, numWords, embSize});
+// return reshape(rows(xRows, newIndeces), outShape);
+//}
+//
+//Expr convertFromcudnnFormat(Expr x) {
+// int batchDim = x->shape()[0];
+// int sentenceDim = x->shape()[2];
+// int embSize = x->shape()[3];
+//
+// auto reshapedX = reshape(x, {batchDim * sentenceDim, embSize});
+//
+// std::vector<size_t> newIndeces;
+// for (int t = 0; t < sentenceDim; ++t) {
+// for (int b = 0; b < batchDim; ++b) {
+// newIndeces.push_back(b * sentenceDim + t);
+// }
+// }
+//
+// Shape shape({batchDim, sentenceDim, embSize});
+// return reshape(rows(reshapedX, newIndeces), shape);
+//}
+//
+//Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
+// return Expression<PoolingWithMaskingOp>(x, mask, width, isEven);
+//}
}
diff --git a/src/graph/node_initializers.cu b/src/graph/node_initializers.cpp
index 2010c17d..20aa9603 100644
--- a/src/graph/node_initializers.cu
+++ b/src/graph/node_initializers.cpp
@@ -1,7 +1,3 @@
-// TODO: move to backend, into graph/
-
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
#include "graph/node_initializers.h"
#include "3rd_party/svd/svd.h"
@@ -145,16 +141,19 @@ std::function<void(Tensor)> from_word2vec(const std::string& file,
int dimVoc,
int dimEmb,
bool normalize /*= false*/) {
- using namespace functional;
-
return [file, dimVoc, dimEmb, normalize](Tensor t) {
auto embs = Word2VecReader().read(file, dimVoc, dimEmb);
- t->set(embs);
+
if(normalize) {
- float l2Norm = L2Norm(t);
- if(l2Norm != 0)
- Element(_1 = _1 / l2Norm, t);
+ float norm = 0;
+ for(auto e : embs)
+ norm += e * e;
+ norm = std::sqrt(norm);
+ if(norm != 0)
+ for(auto& e : embs)
+ e = e / norm;
}
+ t->set(embs);
};
}
}
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index 7d6836a3..9b0655ec 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -2,11 +2,9 @@
#include <thread>
-#include "tensors/gpu/backend.h"
#include "graph/node.h"
#include "functional/functional.h"
-#include "kernels/tensor_operators.h"
-#include "kernels/cudnn_wrappers.h"
+#include "tensors/tensor_operators.h"
namespace marian {
@@ -54,7 +52,6 @@ public:
NodeOps forwardOps() {
// C = alpha * dot(op(A), op(B))
return {NodeOp(Prod(
- std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
val_,
child(0)->val(),
child(1)->val(),
@@ -72,18 +69,14 @@ public:
// to sum gradients from different graph parts
if(!transA_ && transB_)
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
adj_,
child(1)->val(),
false,
false,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
adj_,
child(0)->val(),
true,
@@ -92,18 +85,14 @@ public:
scalar_))};
if(transA_ && !transB_)
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
child(1)->val(),
adj_,
false,
true,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
child(0)->val(),
adj_,
false,
@@ -112,18 +101,14 @@ public:
scalar_))};
if(transA_ && transB_)
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
child(1)->val(),
adj_,
true,
true,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
adj_,
child(0)->val(),
true,
@@ -131,18 +116,14 @@ public:
1.0,
scalar_))};
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
adj_,
child(1)->val(),
false,
true,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
child(0)->val(),
adj_,
true,
@@ -198,7 +179,6 @@ public:
using namespace functional;
return {
NodeOp(Prod(
- std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
val_,
child(0)->val(),
child(1)->val(),
@@ -219,18 +199,14 @@ public:
using namespace functional;
if(!transA_ && transB_)
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
adj_,
child(1)->val(),
false,
false,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
adj_,
child(0)->val(),
true,
@@ -240,18 +216,14 @@ public:
NodeOp(Add(_1, child(2)->grad(), adj_))};
if(transA_ && !transB_)
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
child(1)->val(),
adj_,
false,
true,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
child(0)->val(),
adj_,
false,
@@ -261,18 +233,14 @@ public:
NodeOp(Add(_1, child(2)->grad(), adj_))};
if(transA_ && transB_)
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
child(1)->val(),
adj_,
true,
true,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
adj_,
child(0)->val(),
true,
@@ -281,18 +249,14 @@ public:
scalar_)),
NodeOp(Add(_1, child(2)->grad(), adj_))};
- return {NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ return {NodeOp(Prod(child(0)->grad(),
adj_,
child(1)->val(),
false,
true,
1.0,
scalar_)),
- NodeOp(Prod(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(Prod(child(1)->grad(),
child(0)->val(),
adj_,
true,
@@ -350,7 +314,6 @@ public:
NodeOps forwardOps() {
// C = alpha * dot(op(A), op(B))
return {NodeOp(ProdBatched(
- std::static_pointer_cast<gpu::Backend>(getBackend())->getCublasHandle(),
val_,
child(0)->val(),
child(1)->val(),
@@ -369,18 +332,14 @@ public:
if(!transA_ && transB_)
return {
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ NodeOp(ProdBatched(child(0)->grad(),
adj_,
child(1)->val(),
false,
false,
1.0,
scalar_)),
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(ProdBatched(child(1)->grad(),
adj_,
child(0)->val(),
true,
@@ -390,18 +349,14 @@ public:
if(transA_ && !transB_)
return {
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ NodeOp(ProdBatched(child(0)->grad(),
child(1)->val(),
adj_,
false,
true,
1.0,
scalar_)),
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(ProdBatched(child(1)->grad(),
child(0)->val(),
adj_,
false,
@@ -411,18 +366,14 @@ public:
if(transA_ && transB_)
return {
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ NodeOp(ProdBatched(child(0)->grad(),
child(1)->val(),
adj_,
true,
true,
1.0,
scalar_)),
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(ProdBatched(child(1)->grad(),
adj_,
child(0)->val(),
true,
@@ -431,18 +382,14 @@ public:
scalar_))};
return {
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(0)->grad(),
+ NodeOp(ProdBatched(child(0)->grad(),
adj_,
child(1)->val(),
false,
true,
1.0,
scalar_)),
- NodeOp(ProdBatched(std::static_pointer_cast<gpu::Backend>(getBackend())
- ->getCublasHandle(),
- child(1)->grad(),
+ NodeOp(ProdBatched(child(1)->grad(),
child(0)->val(),
adj_,
true,
@@ -796,46 +743,46 @@ struct HighwayNodeOp : public NaryNodeOp {
const std::string type() { return "highway"; }
};
-class ConvolutionOp : public NaryNodeOp {
-public:
- ConvolutionOp(
- const std::vector<Expr>& nodes,
- int hPad = 0,
- int wPad = 0,
- int hStride = 1,
- int wStride = 1)
- : NaryNodeOp(nodes),
- conv_(nodes[1]->shape(),
- nodes[2]->shape(),
- hPad,
- wPad,
- hStride,
- wStride) {
- conv_.getOutputShape(nodes[0]->shape(), shape_);
- }
-
- NodeOps forwardOps() {
- return {NodeOp(conv_.forward(
- child(0)->val(),
- child(1)->val(),
- child(2)->val(),
- val_))};
- }
-
- NodeOps backwardOps() {
- return {NodeOp(conv_.backward(
- child(0)->val(),
- child(0)->grad(),
- child(1)->val(),
- child(1)->grad(),
- child(2)->grad(),
- adj_))};
- }
-
- const std::string type() { return "layer_convolution"; }
-
-protected:
- ConvolutionWrapper conv_;
-};
+//class ConvolutionOp : public NaryNodeOp {
+//public:
+// ConvolutionOp(
+// const std::vector<Expr>& nodes,
+// int hPad = 0,
+// int wPad = 0,
+// int hStride = 1,
+// int wStride = 1)
+// : NaryNodeOp(nodes),
+// conv_(nodes[1]->shape(),
+// nodes[2]->shape(),
+// hPad,
+// wPad,
+// hStride,
+// wStride) {
+// conv_.getOutputShape(nodes[0]->shape(), shape_);
+// }
+//
+// NodeOps forwardOps() {
+// return {NodeOp(conv_.forward(
+// child(0)->val(),
+// child(1)->val(),
+// child(2)->val(),
+// val_))};
+// }
+//
+// NodeOps backwardOps() {
+// return {NodeOp(conv_.backward(
+// child(0)->val(),
+// child(0)->grad(),
+// child(1)->val(),
+// child(1)->grad(),
+// child(2)->grad(),
+// adj_))};
+// }
+//
+// const std::string type() { return "layer_convolution"; }
+//
+//protected:
+// ConvolutionWrapper conv_;
+//};
}
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index 0170fc73..07c06fda 100644
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -4,10 +4,11 @@
#include "tensors/gpu/backend.h"
#include "graph/node.h"
-#include "kernels/sparse.h"
-#include "kernels/tensor_operators.h"
+//#include "kernels/sparse.h"
+#include "tensors/tensor_operators.h"
#include "functional/functional.h"
-#include "kernels/cudnn_wrappers.h"
+
+#include "tensors/gpu/cudnn_wrappers.h"
namespace marian {
@@ -735,12 +736,12 @@ struct SelectNodeOp : public UnaryNodeOp {
NodeOps forwardOps() {
return {NodeOp(
- Select(graph()->allocator(), val_, child(0)->val(), axis_, indices_))};
+ Select(val_, child(0)->val(), axis_, indices_, graph()->allocator()))};
}
NodeOps backwardOps() {
return {NodeOp(
- Insert(graph()->allocator(), child(0)->grad(), adj_, axis_, indices_))};
+ Insert(child(0)->grad(), adj_, axis_, indices_, graph()->allocator()))};
}
Shape newShape(Expr a, int axis, const std::vector<size_t>& indeces) {
@@ -985,7 +986,7 @@ struct ShiftNodeOp : public UnaryNodeOp {
: UnaryNodeOp(a, keywords::shape = a->shape(), args...), shift_(shift) {}
NodeOps forwardOps() {
- return {NodeOp(Shift(val_, child(0)->val(), shift_))};
+ return {NodeOp(Shift(val_, child(0)->val(), shift_, false))};
}
NodeOps backwardOps() {
@@ -1054,84 +1055,84 @@ struct ShiftNodeOp : public UnaryNodeOp {
// Ptr<sparse::CSR> lf_;
//};
-class PoolingOp : public UnaryNodeOp {
-public:
- PoolingOp(Expr x,
- int height,
- int width,
- int padHeight,
- int padWidth,
- int strideHeight,
- int strideWidth,
- std::string mode)
- : UnaryNodeOp(x),
- pooling_(height,
- width,
- padHeight,
- padWidth,
- strideHeight,
- strideWidth,
- mode) {
- }
-
- NodeOps forwardOps() {
- return {NodeOp(pooling_.forward(child(0)->val(), val_))};
- }
-
- NodeOps backwardOps() {
- return {NodeOp(pooling_.backward(
- child(0)->val(),
- child(0)->grad(),
- val_,
- adj_))};
- }
-
- const std::string type() { return "layer_pooling"; }
-
-
-protected:
- PoolingWrapper pooling_;
-};
-
-class PoolingWithMaskingOp : public UnaryNodeOp {
- public:
- PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false)
- : UnaryNodeOp(x),
- mask_(mask),
- width_(width),
- isEven_(isEven)
- {
- auto xShape = x->shape();
- int dimBatch = xShape[0];
- int dimWord = xShape[1];
- int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
- int dimSentence = (cols / width_) + (cols % width_ != 0);
- shape_ = {dimBatch, dimWord, dimSentence};
- }
-
- NodeOps forwardOps() {
- return {NodeOp(PoolingWithMaskingForward(val_,
- child(0)->val(),
- mask_->val(),
- width_,
- isEven_))};
- }
-
- NodeOps backwardOps() {
- return {NodeOp(PoolingWithMaskingBackward(adj_,
- child(0)->grad(),
- child(0)->val(),
- mask_->val(),
- width_,
- isEven_))};
- }
-
- const std::string type() {return "layer_pooling";}
-
- protected:
- Expr mask_;
- int width_;
- bool isEven_;
-};
+//class PoolingOp : public UnaryNodeOp {
+//public:
+// PoolingOp(Expr x,
+// int height,
+// int width,
+// int padHeight,
+// int padWidth,
+// int strideHeight,
+// int strideWidth,
+// std::string mode)
+// : UnaryNodeOp(x),
+// pooling_(height,
+// width,
+// padHeight,
+// padWidth,
+// strideHeight,
+// strideWidth,
+// mode) {
+// }
+//
+// NodeOps forwardOps() {
+// return {NodeOp(pooling_.forward(child(0)->val(), val_))};
+// }
+//
+// NodeOps backwardOps() {
+// return {NodeOp(pooling_.backward(
+// child(0)->val(),
+// child(0)->grad(),
+// val_,
+// adj_))};
+// }
+//
+// const std::string type() { return "layer_pooling"; }
+//
+//
+//protected:
+// PoolingWrapper pooling_;
+//};
+//
+//class PoolingWithMaskingOp : public UnaryNodeOp {
+// public:
+// PoolingWithMaskingOp( Expr x, Expr mask, int width, bool isEven=false)
+// : UnaryNodeOp(x),
+// mask_(mask),
+// width_(width),
+// isEven_(isEven)
+// {
+// auto xShape = x->shape();
+// int dimBatch = xShape[0];
+// int dimWord = xShape[1];
+// int cols = (isEven_) ? xShape[2] - 1 : xShape[2];
+// int dimSentence = (cols / width_) + (cols % width_ != 0);
+// shape_ = {dimBatch, dimWord, dimSentence};
+// }
+//
+// NodeOps forwardOps() {
+// return {NodeOp(PoolingWithMaskingForward(val_,
+// child(0)->val(),
+// mask_->val(),
+// width_,
+// isEven_))};
+// }
+//
+// NodeOps backwardOps() {
+// return {NodeOp(PoolingWithMaskingBackward(adj_,
+// child(0)->grad(),
+// child(0)->val(),
+// mask_->val(),
+// width_,
+// isEven_))};
+// }
+//
+// const std::string type() {return "layer_pooling";}
+//
+// protected:
+// Expr mask_;
+// int width_;
+// bool isEven_;
+//};
}
diff --git a/src/kernels/tensor_operators.h b/src/kernels/tensor_operators.h
deleted file mode 100644
index 77e58670..00000000
--- a/src/kernels/tensor_operators.h
+++ /dev/null
@@ -1,396 +0,0 @@
-#pragma once
-
-#include <cublas_v2.h>
-
-#include <thrust/pair.h>
-
-#include "tensors/tensor.h"
-
-#include "tensors/allocator.h"
-
-#include "gpu/shape.h"
-#include "gpu/tmp.h"
-#include "gpu/tensor.h"
-#include "functional/functional.h"
-
-namespace marian {
-
-bool IsNan(Tensor in);
-
-const int MAX_THREADS = 512;
-const int MAX_BLOCKS = 65535;
-
-cublasHandle_t create_handle(size_t);
-
-template <size_t K, bool broadcast, class Functor>
-__global__ void gElement(Functor functor,
- gpu::Array<gpu::Tensor<float>, K> tensors) {
-
- int length = tensors[0].shape().elements();
- gpu::Array<int, gpu::Shape::size()> dims;
- gpu::Array<int, K> indices;
-
- for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
- int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
- if(index < length) {
-
- indices.fill(index);
-
- if(broadcast) {
- tensors[0].shape().dims(index, dims);
- for(int i = 1; i < K; ++i)
- indices[i] = tensors[i].shape().bindex(dims);
- }
-
- tensors[0][index] = gpu::apply(functor, tensors, indices);
- }
- }
-}
-
-template <class Functor, class ...Tensors>
-void Element(Functor functor, Tensor out, Tensors ...tensors) {
- cudaSetDevice(out->getDevice().no);
-
- constexpr size_t K = sizeof...(tensors) + 1;
- gpu::Array<gpu::Tensor<float>, K> gTensors = {out, tensors...};
-
- int length = gTensors[0].shape().elements();
- int threads = std::min(MAX_THREADS, length);
- int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
-
- bool broadcast = false;
- for(int i = 1; i < K; ++i)
- broadcast = broadcast || gTensors[0].shape() != gTensors[i].shape();
-
- if(broadcast)
- gElement<K, true><<<blocks, threads>>>(functor, gTensors);
- else
- gElement<K, false><<<blocks, threads>>>(functor, gTensors);
-}
-
-void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis);
-
-void Select(Ptr<Allocator> allocator,
- Tensor out,
- Tensor in,
- int axis,
- const std::vector<size_t>&);
-
-void Insert(Ptr<Allocator> allocator,
- Tensor out,
- Tensor in,
- int axis,
- const std::vector<size_t>&);
-
-void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax);
-
-void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax);
-
-template <size_t K, class Functor>
-__global__ void gAddGeneric(Functor functor,
- const gpu::Shape full,
- gpu::Tensor<float> out,
- gpu::Array<gpu::Tensor<float>, K> ins,
- float scale = 1.0) {
-
- int outLength = out.shape().elements();
- bool same = outLength == full.elements();
- for(int i = 0; i < K; ++i)
- same = same && outLength == ins[i].shape().elements();
-
- constexpr size_t N = gpu::Shape::size();
- gpu::Array<int, N> len;
- for(int i = 0; i < N; ++i)
- len[i] = full[i] / out.shape()[i];
-
- gpu::Array<int, N> dims;
- for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) {
- int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
- if(index < outLength) {
-
- if(same) {
- out[index] += gpu::apply(functor, ins, index) * scale;
- } else {
- out.shape().dims(index, dims);
- out[index] += gpu::loops(functor, ins, len, dims) * scale;
- }
-
- }
- }
-}
-
-template <size_t K, class Functor>
-__global__ void gAddEqual(Functor functor,
- gpu::Tensor<float> out,
- gpu::Array<gpu::Tensor<float>, K> ins,
- float scale,
- bool broadcast) {
- int length = out.shape().elements();
- gpu::Array<int, gpu::Shape::size()> dims;
-
- for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
- int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
- if(index < length) {
- gpu::Array<int, K> indices;
- indices.fill(index);
-
- if(broadcast) {
- out.shape().dims(index, dims);
- for(size_t i = 0; i < K; ++i)
- indices[i] = ins[i].shape().bindex(dims);
- }
-
- out[index] += gpu::apply(functor, ins, indices) * scale;
- }
- }
-}
-
-template <size_t K, class Functor>
-__global__ void gAddReduce(Functor functor,
- const gpu::Shape full,
- gpu::Tensor<float> out,
- gpu::Array<gpu::Tensor<float>, K> ins,
- float scale = 1.0) {
-
- int rows = full.elements() / full.back();
- int cols = full.back();
-
- bool same = true;
- for(int i = 0; i < K; ++i)
- same = same && ins[i].shape().elements() == full.elements();
-
- for(int bid = 0; bid < rows; bid += gridDim.x) {
- int j = bid + blockIdx.x;
- if(j < rows) {
- extern __shared__ float _share[];
- float* _sum = _share + blockDim.x;
-
- if(same) {
- _sum[threadIdx.x] = 0;
- for(int tid = 0; tid < cols; tid += blockDim.x) {
- int id = tid + threadIdx.x;
- if(id < cols)
- _sum[threadIdx.x] += gpu::apply(functor, ins, j * cols + id);
- }
- } else {
- gpu::Array<int, gpu::Shape::size()> dims;
- _sum[threadIdx.x] = 0;
-
- for(int tid = 0; tid < cols; tid += blockDim.x) {
- int id = tid + threadIdx.x;
- if(id < cols) {
- full.dims(j * cols + id, dims);
- gpu::Array<int, K> indices;
- for(int i = 0; i < K; ++i)
- indices[i] = ins[i].shape().bindex(dims);
- _sum[threadIdx.x] += gpu::apply(functor, ins, indices);
- }
- }
- }
- __syncthreads();
- int len = blockDim.x;
- while(len != 1) {
- __syncthreads();
- int skip = (len + 1) >> 1;
- if(threadIdx.x < (len >> 1)) {
- _sum[threadIdx.x] += _sum[threadIdx.x + skip];
- }
- len = (len + 1) >> 1;
- }
- __syncthreads();
- out[j] += _sum[0] * scale;
- }
- }
-}
-
-template <class Functor, class ...Tensors>
-void Add(Functor functor,
- float scale,
- Tensor out,
- Tensors... tensors) {
- cudaSetDevice(out->getDevice().no);
-
- Shape full = Shape::broadcast({out, tensors...});
-
- int length = out->shape().elements();
-
- constexpr size_t K = sizeof...(Tensors);
-
- gpu::Tensor<float> gOut = out;
- gpu::Array<gpu::Tensor<float>, K> gIns = {tensors ...};
-
- if(full.back() != 1 && out->shape().back() == 1) {
- size_t m = full.elements() / length;
- size_t k = full.back();
-
- int blocks = std::min(MAX_BLOCKS, (int)m);
- int threads = std::min(MAX_THREADS, (int)k);
- int shared = sizeof(float) * threads * 2;
-
- gAddReduce<<<blocks, threads, shared>>>(functor, full, gOut, gIns, scale);
-
- } else if(out->shape() == full) {
- int threads = std::min(MAX_THREADS, length);
- int blocks
- = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
-
- bool broadcast = false;
- for(int i = 0; i < K; ++i)
- broadcast = broadcast || gOut.shape() != gIns[i].shape();
-
- gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast);
- } else {
- int threads = std::min(MAX_THREADS, length);
- int blocks
- = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
-
- gAddGeneric<<<blocks, threads>>>(functor, full, gOut, gIns, scale);
- }
-}
-
-template <class Functor, class ...Tensors>
-void Add(Functor functor,
- Tensor out,
- Tensors... tensors) {
- Add(functor, 1, out, tensors...);
-}
-
-template <class Functor, class ...Tensors>
-void Reduce(Functor functor,
- float scale,
- Tensor out,
- Tensors... tensors) {
- out->set(0);
- Add(functor, scale, out, tensors...);
-}
-
-template <class Functor, class ...Tensors>
-void Reduce(Functor functor,
- Tensor out,
- Tensors... tensors) {
- out->set(0);
- Add(functor, out, tensors...);
-}
-
-float L2Norm(Tensor in);
-
-void Softmax(Tensor out, Tensor in, Tensor mask = nullptr);
-void LogSoftmax(Tensor out, Tensor in);
-
-void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
-void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
-
-void CudnnSoftmax(Tensor out, Tensor in);
-void CudnnSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
-
-void CudnnLogSoftmax(Tensor out, Tensor in);
-void CudnnLogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val);
-
-void CrossEntropyPick(Tensor out, Tensor in, Tensor pick);
-void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick);
-
-void Argmax(Tensor Out, const Tensor In);
-
-void Prod(cublasHandle_t handle,
- Tensor C,
- const Tensor A,
- const Tensor B,
- bool transA,
- bool transB,
- float beta = 0,
- float scalar = 1);
-
-void ProdBatched(cublasHandle_t handle,
- Tensor C,
- const Tensor A,
- const Tensor B,
- bool transA,
- bool transB,
- float beta = 0,
- float scalar = 1);
-
-void CopyRowsByIndex(Tensor out,
- const Tensor in,
- thrust::pair<size_t, size_t>* ipair,
- size_t length);
-
-void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indeces);
-
-void PasteRows(Tensor out, const Tensor in, const std::vector<size_t>& indeces);
-
-void CopyCols(Tensor out, const Tensor in, const std::vector<size_t>& indeces);
-
-void PasteCols(Tensor out, const Tensor in, const std::vector<size_t>& indeces);
-
-void LSTMCellForward(Tensor out, std::vector<Tensor> inputs);
-void LSTMOutputForward(Tensor out, std::vector<Tensor> inputs);
-void LSTMCellBackward(std::vector<Tensor> outputs,
- std::vector<Tensor> inputs,
- Tensor adj);
-void LSTMOutputBackward(std::vector<Tensor> outputs,
- std::vector<Tensor> inputs,
- Tensor adj);
-
-void GRUFastForward(Tensor out, std::vector<Tensor> inputs, bool final = false);
-
-void GRUFastBackward(std::vector<Tensor> outputs,
- std::vector<Tensor> inputs,
- Tensor adj,
- bool final = false);
-
-void Att(Tensor out, Tensor va, Tensor context, Tensor state);
-void AttBack(Tensor gva,
- Tensor gContext,
- Tensor gState,
- Tensor va,
- Tensor context,
- Tensor state,
- Tensor adj);
-
-void LayerNormalization(Tensor out,
- Tensor in,
- Tensor gamma,
- Tensor beta,
- float eps = 1e-9);
-void LayerNormalizationGrad(Tensor gradX,
- Tensor gradGamma,
- Tensor gradBeta,
- Tensor adj,
- Tensor y,
- Tensor x,
- Tensor gamma,
- Tensor beta,
- float eps = 1e-9);
-
-void Shift(Tensor out, Tensor in, Shape shift, bool invert = false);
-
-void SetSparse(float*,
- const std::vector<size_t>& indeces,
- const std::vector<float>& values);
-
-void HighwayForward(Tensor out,
- const Tensor in1,
- const Tensor in2,
- const Tensor t);
-
-void HighwayBackward(Tensor out1,
- Tensor out2,
- Tensor outt,
- const Tensor in1,
- const Tensor in2,
- const Tensor t,
- const Tensor adj);
-
-void PoolingWithMaskingForward(Tensor out,
- Tensor in,
- Tensor mask,
- int width,
- bool isEven=false);
-
-void PoolingWithMaskingBackward(Tensor adj,
- Tensor adjIn,
- Tensor in,
- Tensor mask,
- int width,
- bool isEven=false);
-}
diff --git a/src/layers/convolution.cu b/src/layers/convolution.cu
index 958ff4b4..83e881bf 100644
--- a/src/layers/convolution.cu
+++ b/src/layers/convolution.cu
@@ -25,11 +25,12 @@ Expr Convolution::apply(Expr x) {
keywords::init=inits::zeros);
std::vector<Expr> nodes = {x, kernel, bias};
- return Expression<ConvolutionOp>(nodes,
- paddings.first,
- paddings.second,
- strides.first,
- strides.second);
+ ABORT("Temporarily not implemented");
+ //return Expression<ConvolutionOp>(nodes,
+ // paddings.first,
+ // paddings.second,
+ // strides.first,
+ // strides.second);
}
Expr Convolution::apply(const std::vector<Expr>&) {
@@ -38,4 +39,3 @@ Expr Convolution::apply(const std::vector<Expr>&) {
}
}
-
diff --git a/src/models/model_factory.cpp b/src/models/model_factory.cpp
index 2cd47fac..5a771f05 100644
--- a/src/models/model_factory.cpp
+++ b/src/models/model_factory.cpp
@@ -2,15 +2,22 @@
#include "models/model_factory.h"
#include "models/s2s.h"
-#include "models/char_s2s.h"
#include "models/transformer.h"
#include "models/hardatt.h"
#include "models/amun.h"
#include "models/nematus.h"
#include "models/encdec.h"
+#ifdef USE_CUDNN
+#include "models/char_s2s.h"
+#endif
+
+#ifdef COMPILE_EXAMPLES
#include "examples/mnist/model.h"
+#ifdef USE_CUDNN
#include "examples/mnist/model_lenet.h"
+#endif
+#endif
namespace marian {
namespace models {
@@ -18,8 +25,12 @@ namespace models {
Ptr<EncoderBase> EncoderFactory::construct() {
if(options_->get<std::string>("type") == "s2s")
return New<EncoderS2S>(options_);
+
+#ifdef USE_CUDNN
if(options_->get<std::string>("type") == "char-s2s")
return New<CharS2SEncoder>(options_);
+#endif
+
if(options_->get<std::string>("type") == "transformer")
return New<EncoderTransformer>(options_);
@@ -172,15 +183,19 @@ Ptr<ModelBase> by_type(std::string type, Ptr<Options> options) {
.construct();
}
+#ifdef COMPILE_EXAMPLES
// @TODO: examples should be compiled optionally
if(type == "mnist-ffnn") {
return New<MnistFeedForwardNet>(options);
}
+#endif
+#ifdef USE_CUDNN
+#ifdef COMPILE_EXAMPLES
if(type == "mnist-lenet") {
return New<MnistLeNet>(options);
}
-
+#endif
if(type == "char-s2s") {
return models::encoder_decoder()(options)
("original-type", type)
@@ -188,6 +203,7 @@ Ptr<ModelBase> by_type(std::string type, Ptr<Options> options) {
.push_back(models::decoder()("type", "s2s"))
.construct();
}
+#endif
// clang-format on
ABORT("Unknown model type: {}", type);
diff --git a/src/optimizers/clippers.cu b/src/optimizers/clippers.cu
index 7289955c..e42586af 100644
--- a/src/optimizers/clippers.cu
+++ b/src/optimizers/clippers.cu
@@ -1,6 +1,6 @@
#include "clippers.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
#include "functional/functional.h"
namespace marian {
diff --git a/src/optimizers/optimizers.cu b/src/optimizers/optimizers.cu
index e82800c9..d2bfa113 100644
--- a/src/optimizers/optimizers.cu
+++ b/src/optimizers/optimizers.cu
@@ -1,6 +1,6 @@
#include "optimizers.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
#include "functional/functional.h"
namespace marian {
@@ -8,7 +8,7 @@ void Sgd::updateImpl(Tensor params, Tensor grads) {
using namespace functional;
Element(_1 -= (multiplyFactor_ * eta_) * _2, params, grads);
- cudaStreamSynchronize(0);
+ //cudaStreamSynchronize(0);
}
void Adagrad::updateImpl(Tensor params, Tensor grads) {
@@ -31,13 +31,13 @@ void Adagrad::updateImpl(Tensor params, Tensor grads) {
gt_,
grads);
- cudaStreamSynchronize(0);
+ //cudaStreamSynchronize(0);
}
void Adagrad::resetStats() {
if(gt_)
gt_->set(0);
- cudaStreamSynchronize(0);
+ //cudaStreamSynchronize(0);
}
void Adam::updateImpl(Tensor params, Tensor grads) {
@@ -69,7 +69,7 @@ void Adam::updateImpl(Tensor params, Tensor grads) {
mt_,
vt_);
- cudaStreamSynchronize(0);
+ //cudaStreamSynchronize(0);
}
void Adam::resetStats() {
@@ -79,7 +79,7 @@ void Adam::resetStats() {
if(vt_)
vt_->set(0);
- cudaStreamSynchronize(0);
+ //cudaStreamSynchronize(0);
}
Ptr<OptimizerBase> Optimizer(Ptr<Config> options) {
diff --git a/src/rnn/attention.cu b/src/rnn/attention.cu
index a1c48a69..2faa9d9a 100644
--- a/src/rnn/attention.cu
+++ b/src/rnn/attention.cu
@@ -1,7 +1,7 @@
#include "attention.h"
#include "graph/node_operators_binary.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
namespace marian {
diff --git a/src/rnn/cells.cu b/src/rnn/cells.cu
index 5134307f..42373eab 100644
--- a/src/rnn/cells.cu
+++ b/src/rnn/cells.cu
@@ -1,7 +1,7 @@
#include "rnn/cells.h"
#include "graph/node_operators_binary.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
namespace marian {
namespace rnn {
diff --git a/src/tensors/cpu/add.h b/src/tensors/cpu/add.h
new file mode 100644
index 00000000..d76cc3c5
--- /dev/null
+++ b/src/tensors/cpu/add.h
@@ -0,0 +1,135 @@
+/* All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include "tensors/tensor.h"
+
+namespace marian {
+
+namespace cpu {
+
+#include "gpu/shape.h"
+#include "gpu/tmp.h"
+#include "gpu/tensor.h"
+#include "functional/functional.h"
+
+template <size_t K, class Functor>
+void gAddGeneric(Functor functor,
+ const gpu::Shape full,
+ gpu::Tensor<float> out,
+ gpu::Array<gpu::Tensor<float>, K> ins,
+ float scale = 1.0) {
+
+ int outLength = out.shape().elements();
+ bool same = outLength == full.elements();
+ for(int i = 0; i < K; ++i)
+ same = same && outLength == ins[i].shape().elements();
+
+ constexpr size_t N = gpu::Shape::size();
+ gpu::Array<int, N> len;
+ for(int i = 0; i < N; ++i)
+ len[i] = full[i] / out.shape()[i];
+
+ gpu::Array<int, N> dims;
+ for(int index = 0; index < outLength; ++index) {
+ if(same) {
+ out[index] += gpu::apply(functor, ins, index) * scale;
+ } else {
+ out.shape().dims(index, dims);
+ out[index] += gpu::loops(functor, ins, len, dims) * scale;
+ }
+ }
+}
+
+template <size_t K, class Functor>
+void gAddEqual(Functor functor,
+ gpu::Tensor<float> out,
+ gpu::Array<gpu::Tensor<float>, K> ins,
+ float scale,
+ bool broadcast) {
+ int length = out.shape().elements();
+ gpu::Array<int, gpu::Shape::size()> dims;
+
+ for(int index = 0; index < length; ++index) {
+ gpu::Array<int, K> indices;
+ indices.fill(index);
+
+ if(broadcast) {
+ out.shape().dims(index, dims);
+ for(size_t i = 0; i < K; ++i)
+ indices[i] = ins[i].shape().bindex(dims);
+ }
+
+ out[index] += gpu::apply(functor, ins, indices) * scale;
+ }
+}
+
+template <size_t K, class Functor>
+void gAddReduce(Functor functor,
+ const gpu::Shape full,
+ gpu::Tensor<float> out,
+ gpu::Array<gpu::Tensor<float>, K> ins,
+ float scale = 1.0) {
+
+ int rows = full.elements() / full.back();
+ int cols = full.back();
+
+ bool same = true;
+ for(int i = 0; i < K; ++i)
+ same = same && ins[i].shape().elements() == full.elements();
+
+ for(int j = 0; j < rows; ++j) {
+ float sum = 0;
+ if(same) {
+ for(int id = 0; id < cols; ++id)
+ sum += gpu::apply(functor, ins, j * cols + id);
+ } else {
+ gpu::Array<int, gpu::Shape::size()> dims;
+ for(int id = 0; id < cols; ++id) {
+ full.dims(j * cols + id, dims);
+ gpu::Array<int, K> indices;
+ for(int i = 0; i < K; ++i)
+ indices[i] = ins[i].shape().bindex(dims);
+ sum += gpu::apply(functor, ins, indices);
+ }
+ }
+ out[j] += sum * scale;
+ }
+}
+
+template <class Functor, class ...Tensors>
+void Add(Functor functor,
+ float scale,
+ marian::Tensor out,
+ Tensors... tensors) {
+
+ auto full = marian::Shape::broadcast({out, tensors...});
+
+ int length = out->shape().elements();
+
+ constexpr size_t K = sizeof...(Tensors);
+
+ gpu::Tensor<float> gOut = out;
+ gpu::Array<gpu::Tensor<float>, K> gIns = {tensors ...};
+
+ if(full.back() != 1 && out->shape().back() == 1) {
+ size_t m = full.elements() / length;
+ size_t k = full.back();
+ cpu::gAddReduce(functor, full, gOut, gIns, scale);
+ } else if(out->shape() == full) {
+ bool broadcast = false;
+ for(int i = 0; i < K; ++i)
+ broadcast = broadcast || gOut.shape() != gIns[i].shape();
+ cpu::gAddEqual(functor, gOut, gIns, scale, broadcast);
+ } else {
+ cpu::gAddGeneric(functor, full, gOut, gIns, scale);
+ }
+}
+
+
+}
+
+}
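Note: the new cpu::Add reuses the gpu::Tensor/gpu::Shape view types over host memory and picks one of three paths: a row reduction (gAddReduce) when the output's last dimension collapses to 1, an element-wise accumulate (gAddEqual) when shapes already match, and a generic broadcast loop otherwise. Below is a minimal standalone sketch of the reduction path only — plain C++ with made-up shapes and a hand-inlined _1 * _2 functor, not code from this patch:

// Standalone sketch of the reduce-over-last-axis case handled by cpu::gAddReduce:
// out has shape {rows, 1}, the broadcast shape is {rows, cols}, so each output
// element accumulates one row of functor results, scaled.
#include <cstdio>
#include <vector>

int main() {
  const int rows = 2, cols = 3;
  std::vector<float> a   = {1, 2, 3, 4, 5, 6};        // shape {2, 3}
  std::vector<float> b   = {10, 20, 30, 40, 50, 60};  // shape {2, 3}
  std::vector<float> out(rows, 0.f);                  // shape {2, 1}
  float scale = 0.5f;

  for (int j = 0; j < rows; ++j) {        // corresponds to gAddReduce's row loop
    float sum = 0.f;
    for (int i = 0; i < cols; ++i)
      sum += a[j * cols + i] * b[j * cols + i];  // functor: _1 * _2
    out[j] += sum * scale;
  }

  std::printf("%g %g\n", out[0], out[1]);  // prints: 70 385
}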
diff --git a/src/tensors/device.cpp b/src/tensors/cpu/device.cpp
index e4bf9d17..e4bf9d17 100644
--- a/src/tensors/device.cpp
+++ b/src/tensors/cpu/device.cpp
diff --git a/src/tensors/cpu/dropout.cpp b/src/tensors/cpu/dropout.cpp
index bb6ee799..6187cf3d 100644
--- a/src/tensors/cpu/dropout.cpp
+++ b/src/tensors/cpu/dropout.cpp
@@ -1,7 +1,7 @@
#include <algorithm>
#include <random>
-#include "tensors/dispatch.h"
+#include "tensors/tensor_operators.h"
#include "tensors/cpu/backend.h"
namespace marian {
diff --git a/src/tensors/cpu/element.h b/src/tensors/cpu/element.h
new file mode 100644
index 00000000..cdff1170
--- /dev/null
+++ b/src/tensors/cpu/element.h
@@ -0,0 +1,51 @@
+/* All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#pragma once
+
+#include "tensors/tensor.h"
+
+namespace marian {
+namespace cpu {
+
+template <size_t K, bool broadcast, class Functor>
+void gElement(Functor functor,
+ gpu::Array<gpu::Tensor<float>, K> tensors) {
+
+ int length = tensors[0].shape().elements();
+ gpu::Array<int, gpu::Shape::size()> dims;
+ gpu::Array<int, K> indices;
+
+ #pragma omp parallel for simd
+ for(int index = 0; index < length; ++index) {
+ indices.fill(index);
+ if(broadcast) {
+ tensors[0].shape().dims(index, dims);
+ for(int i = 1; i < K; ++i)
+ indices[i] = tensors[i].shape().bindex(dims);
+ }
+ tensors[0][index] = gpu::apply(functor, tensors, indices);
+ }
+}
+
+template <class Functor, class ...Tensors>
+void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
+ constexpr size_t K = sizeof...(tensors) + 1;
+ gpu::Array<gpu::Tensor<float>, K> gTensors = {out, tensors...};
+
+ int length = gTensors[0].shape().elements();
+
+ bool broadcast = false;
+ for(int i = 1; i < K; ++i)
+ broadcast = broadcast || gTensors[0].shape() != gTensors[i].shape();
+
+ if(broadcast)
+ cpu::gElement<K, true>(functor, gTensors);
+ else
+ cpu::gElement<K, false>(functor, gTensors);
+}
+
+}
+}
diff --git a/src/tensors/cpu/prod.cpp b/src/tensors/cpu/prod.cpp
new file mode 100644
index 00000000..f009ee3f
--- /dev/null
+++ b/src/tensors/cpu/prod.cpp
@@ -0,0 +1,69 @@
+#include "tensors/gpu/prod.h"
+#include "tensors/gpu/backend.h"
+
+#if BLAS_FOUND
+#include <cblas.h>
+#endif
+
+namespace marian {
+
+namespace cpu {
+
+void Prod(marian::Tensor C,
+ const marian::Tensor A,
+ const marian::Tensor B,
+ bool transA,
+ bool transB,
+ float beta,
+ float scalar) {
+
+#if BLAS_FOUND
+ float alpha = scalar;
+
+ int m = A->shape().elements() / A->shape()[-1];
+ int k = A->shape().back();
+ if(transA)
+ std::swap(m, k);
+
+ int l = B->shape().elements() / B->shape()[-1];
+ int n = B->shape()[-1];
+ if(transB)
+ std::swap(l, n);
+
+ int lda = A->shape()[-1];
+ int ldb = B->shape()[-1];
+ int ldc = B->shape()[-1];
+
+ if(transB)
+ ldc = B->shape().elements() / B->shape()[-1];
+
+ cblas_sgemm(
+ CblasColMajor,
+ transB ? CblasTrans : CblasNoTrans,
+ transA ? CblasTrans : CblasNoTrans,
+ n, m, k,
+ alpha,
+ B->data(),
+ ldb,
+ A->data(),
+ lda,
+ beta,
+ C->data(),
+ ldc);
+#else
+ ABORT("Not implemented!");
+#endif
+}
+
+void ProdBatched(marian::Tensor C,
+ const marian::Tensor A,
+ const marian::Tensor B,
+ bool transA,
+ bool transB,
+ float beta,
+ float scalar) {
+ ABORT("Not implemented!");
+}
+
+}
+}
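Note: cpu::Prod above computes a row-major C = op(A)·op(B) by calling the column-major CBLAS GEMM with the operands swapped — a row-major matrix reinterpreted as column-major is its transpose, so evaluating Cᵀ = op(B)ᵀ·op(A)ᵀ in column-major order leaves row-major C in memory. A small standalone check of that identity (assumes a CBLAS implementation such as OpenBLAS is available; not part of the patch):

// Row-major C = A*B via a column-major cblas_sgemm with swapped operands.
#include <cblas.h>
#include <cstdio>

int main() {
  // Row-major A (2x3) and B (3x2); expected C = A*B = {{58,64},{139,154}}.
  float A[] = {1, 2, 3, 4, 5, 6};
  float B[] = {7, 8, 9, 10, 11, 12};
  float C[4] = {0};
  int m = 2, k = 3, n = 2;

  // Column-major call computing C^T = B^T * A^T; the resulting column-major
  // buffer is exactly the row-major C = A * B.
  cblas_sgemm(CblasColMajor, CblasNoTrans, CblasNoTrans,
              n, m, k, 1.0f,
              B, n,   // ldb: row-major leading dimension of B
              A, k,   // lda: row-major leading dimension of A
              0.0f, C, n);

  std::printf("%g %g\n%g %g\n", C[0], C[1], C[2], C[3]);  // 58 64 / 139 154
}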
diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp
new file mode 100644
index 00000000..46dcb919
--- /dev/null
+++ b/src/tensors/cpu/tensor_operators.cpp
@@ -0,0 +1,456 @@
+/* All or part of this file was contributed by Intel under license:
+ * Copyright (C) 2017-2018 Intel Corporation
+ * SPDX-License-Identifier: MIT
+ */
+
+#include "tensors/tensor_operators.h"
+#include "tensors/cpu/backend.h"
+
+#include "gpu/tensor.h"
+#include "functional/functional.h"
+
+namespace marian {
+
+namespace cpu {
+
+void ConcatCont(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int axis) {
+ int step = 1;
+ for(int i = 0; i < axis; ++i)
+ step *= out->shape()[i];
+
+ size_t offset1 = 0;
+ for(int i = 0; i < step; ++i) {
+ for(auto in : inputs) {
+ size_t size = in->shape().elements() / step;
+ size_t offset2 = i * size;
+
+ std::copy(in->data() + offset2,
+ in->data() + offset2 + size,
+ out->data() + offset1);
+
+ offset1 += size;
+ }
+ }
+}
+
+inline void gInsertCols(float* out,
+ const float* in,
+ size_t rows,
+ size_t cols,
+ size_t cols_out,
+ size_t cols_in,
+ size_t offset_out,
+ size_t offset_in) {
+ for(int j = 0; j < rows; ++j) {
+ float* rowOut = out + j * cols_out + offset_out;
+ const float* rowIn = in + j * cols_in + offset_in;
+ for(int i = 0; i < cols; ++i) {
+ rowOut[i] = rowIn[i];
+ }
+ }
+}
+
+void Concatenate1(marian::Tensor out, const std::vector<marian::Tensor>& inputs) {
+ int rows = out->shape().elements() / out->shape().back();
+
+ size_t offset = 0;
+ int cols_out = out->shape().back();
+
+ for(auto in : inputs) {
+ ABORT_IF(rows != in->shape().elements() / in->shape().back(),
+ "First dimension must be equal");
+ int cols_in = in->shape().back();
+ cpu::gInsertCols(out->data(), in->data(), rows, cols_in, cols_out, cols_in, offset, 0);
+ offset += cols_in;
+ }
+}
+
+void Concatenate(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int ax) {
+ if(ax == out->shape().size() - 1)
+ Concatenate1(out, inputs);
+ else
+ ConcatCont(out, inputs, ax);
+}
+
+void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
+ ABORT("Not implemented!");
+}
+
+// @TODO: optimize this, currently it's quite horrible
+void TransposeND(marian::Tensor out, marian::Tensor in, const std::vector<int>& vAxis) {
+ gpu::Array<int, gpu::Shape::size()> permute;
+ int diff = gpu::Shape::size() - vAxis.size();
+ for(int i = 0; i < permute.size(); ++i)
+ if(i < diff)
+ permute[i] = i;
+ else
+ permute[i] = vAxis[i - diff] + diff;
+
+ int length = out->shape().elements();
+
+ constexpr size_t N = gpu::Shape::size();
+ gpu::Array<int, N> oDims;
+ gpu::Array<int, N> pDims;
+ gpu::Tensor<float> gOut = out;
+ gpu::Tensor<float> gIn = in;
+
+ for(int index = 0; index < length; ++index) {
+ gOut.shape().dims(index, oDims);
+ for(int i = 0; i < N; ++i)
+ pDims[permute[i]] = oDims[i];
+ gOut[index] = gIn[pDims];
+ }
+}
+
+void Softmax(Tensor out_, Tensor in_, Tensor mask_) {
+ float* out = out_->data();
+ const float* in = in_->data();
+ const float* mask = mask_ ? mask_->data() : nullptr;
+
+ int rows = out_->shape().elements() / out_->shape().back();
+ int cols = out_->shape().back();
+
+ for (int j = 0; j < rows; ++j) {
+ float* so = out + j*cols;
+ const float* sp = in + j*cols;
+ const float* mp = mask ? mask + j*cols : nullptr;
+
+ float max = sp[0];
+ for (int i = 1; i < cols; ++i) {
+ max = std::max(max, sp[i]);
+ }
+
+ float sum = 0.f;
+ for (int i = 0; i < cols; ++i) {
+ float ex = !mask || mp[i] ? std::exp(sp[i] - max) : 0.f;
+ so[i] = ex;
+ sum += ex;
+ }
+
+ for (int i = 0; i < cols; ++i) {
+ so[i] /= sum;
+ }
+ }
+}
+
+void LogSoftmax(Tensor out_, Tensor in_) {
+ float* out = out_->data();
+ const float* in = in_->data();
+
+ int rows = out_->shape().elements() / out_->shape().back();
+ int cols = out_->shape().back();
+
+ for (int j = 0; j < rows; ++j) {
+ float* so = out + j * cols;
+ const float* sp = in + j*cols;
+
+ float max = sp[0];
+ for (int i = 1; i < cols; ++i) {
+ max = std::max(max, sp[i]);
+ }
+
+ float sum = 0.f;
+ for (int i = 0; i < cols; ++i) {
+ float sm = sp[i] - max;
+ float ex = std::exp(sm);
+ so[i] = sm;
+ sum += ex;
+ }
+
+ for (int i = 0; i < cols; ++i) {
+ so[i] -= std::log(sum);
+ }
+ }
+}
+
+void SoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) {
+ ABORT("Not implemented!");
+}
+
+void LogSoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) {
+ ABORT("Not implemented!");
+}
+
+void CopyRows(marian::Tensor out_, const marian::Tensor in_, const std::vector<size_t>& indices) {
+ size_t cols = in_->shape()[1];
+ size_t rows = indices.size();
+
+ float* out = out_->data();
+ const float* in = in_->data();
+
+ #pragma omp parallel for
+ for (int j = 0; j < rows; ++j) {
+ size_t dst = j;
+ size_t src = indices[j];
+
+ float* rowOut = out + dst*cols;
+ const float* rowIn = in + src*cols;
+
+ std::copy(rowIn, rowIn + cols, rowOut);
+ }
+}
+
+void PasteRows(marian::Tensor out,
+ const marian::Tensor in,
+ const std::vector<size_t>& indices) {
+ ABORT("Not implemented!");
+}
+
+void CopyCols(marian::Tensor out, const marian::Tensor in, const std::vector<size_t>& indices) {
+ ABORT("Not implemented!");
+}
+
+void PasteCols(marian::Tensor out,
+ const marian::Tensor in,
+ const std::vector<size_t>& indices) {
+ ABORT("Not implemented!");
+}
+
+void Select(marian::Tensor out,
+ const marian::Tensor in,
+ int axis,
+ const std::vector<size_t>& indices,
+ Ptr<Allocator> allocator) {
+ ABORT("Not implemented!");
+}
+
+void Insert(marian::Tensor out,
+ const marian::Tensor in,
+ int axis,
+ const std::vector<size_t>& indices,
+ Ptr<Allocator> allocator) {
+ ABORT("Not implemented!");
+}
+
+void GRUFastForward(marian::Tensor out_, std::vector<marian::Tensor> inputs, bool final) {
+ int rows = out_->shape().elements() / out_->shape().back();
+ int cols = out_->shape().back();
+
+ float* out = out_->data();
+
+ const float* state = inputs[0]->data();
+ const float* xW = inputs[1]->data();
+ const float* sU = inputs[2]->data();
+ const float* b = inputs[3]->data();
+ const float* mask = inputs.size() > 4 ? inputs[4]->data() : nullptr;
+
+ #pragma omp parallel for
+ for (int j = 0; j < rows; ++j) {
+ float m = !mask || mask[j];
+ float* rowOut = out + j * cols;
+ const float* rowState = state + j * cols;
+
+ const float* xWrow = xW + j * cols * 3;
+ const float* sUrow = sU + j * cols * 3;
+
+ #pragma omp simd
+ for (int i = 0; i < cols; ++i) {
+ // @TODO: stable logit
+ float ev1 = std::exp(-(xWrow[i] + sUrow[i] + b[i]));
+ float r = 1.0f / (1.0f + ev1);
+
+ int k = i + cols;
+ // @TODO: stable logit
+ float ev2 = std::exp(-(xWrow[k] + sUrow[k] + b[k]));
+ float z = 1.0f / (1.0f + ev2);
+
+ int l = i + 2 * cols;
+ float h;
+ if(final)
+ h = std::tanh(xWrow[l] + (sUrow[l] + b[l]) * r);
+ else
+ h = std::tanh(xWrow[l] + sUrow[l] * r + b[l]);
+
+ float out = (1.0f - z) * h + z * rowState[i];
+ rowOut[i] = m * out + (1 - m) * rowState[i];
+ }
+ }
+}
+
+void GRUFastBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj,
+ bool final) {
+ ABORT("Not implemented!");
+}
+
+void CrossEntropyPick(marian::Tensor out, marian::Tensor in, marian::Tensor pick) {
+ ABORT("Not implemented!");
+}
+
+void CrossEntropyPickBackward(marian::Tensor out, marian::Tensor adj, marian::Tensor a, marian::Tensor pick) {
+ ABORT("Not implemented!");
+}
+
+
+float L2Norm(marian::Tensor in) {
+ ABORT("Not implemented!");
+}
+
+void Att(marian::Tensor out_, marian::Tensor va_, marian::Tensor context_, marian::Tensor state_) {
+ float* out = out_->data();
+ const float* va = va_->data();
+ const float* ctx = context_->data();
+ const float* state = state_->data();
+
+ int m = out_->shape().elements() / out_->shape().back();
+ int k = context_->shape()[-1];
+ int b = context_->shape()[-2];
+ int t = context_->shape()[-3];
+
+ int rows = m;
+ int cols = k;
+
+ #pragma omp parallel for
+ for (size_t j = 0; j < rows; ++j) {
+ const float* vaRow = va;
+ const float* ctxRow = ctx + (j % (b * t)) * cols;
+ const float* stateRow = state + ((j / (b * t)) * b + j % b) * cols;
+
+ float sum = 0.f;
+ #pragma omp simd reduction(+:sum)
+ for (size_t i = 0; i < cols; ++i) {
+ float z = ctxRow[i] + stateRow[i];
+ sum += std::tanh(z) * vaRow[i];
+ }
+
+ out[j] = sum;
+ }
+}
+
+void AttBack(marian::Tensor gVa,
+ marian::Tensor gContext,
+ marian::Tensor gState,
+ marian::Tensor va,
+ marian::Tensor context,
+ marian::Tensor state,
+ marian::Tensor adj) {
+ ABORT("Not implemented!");
+}
+
+void LayerNormalization(marian::Tensor out_,
+ marian::Tensor in_,
+ marian::Tensor gamma_,
+ marian::Tensor beta_,
+ float eps) {
+ float* out = out_->data();
+ const float* in = in_->data();
+ const float* alpha = gamma_->data();
+ const float* beta = beta_ ? beta_->data() : nullptr;
+
+ int rows = in_->shape().elements() / in_->shape().back();
+ int cols = in_->shape().back();
+
+ #pragma omp parallel for
+ for (int j = 0; j < rows; ++j) {
+ float* so = out + j*cols;
+ const float* sp = in + j*cols;
+
+ float sum = 0.f;
+ #pragma omp simd reduction(+:sum)
+ for (int i = 0; i < cols; ++i) {
+ sum += sp[i];
+ }
+
+ float mean = sum / cols;
+ float sqSum = 0.f;
+ #pragma omp simd reduction(+:sqSum)
+ for (int i = 0; i < cols; ++i) {
+ float ex = sp[i] - mean;
+ sqSum += ex*ex;
+ }
+
+ float sigma = std::sqrt(eps + sqSum / cols);
+
+ #pragma omp simd
+ for (int i = 0; i < cols; ++i) {
+ float t = alpha[i] * ((sp[i] - mean) / sigma);
+ if (beta != nullptr) {
+ t += beta[i];
+ }
+
+ so[i] = t;
+ }
+ }
+}
+
+void LayerNormalizationGrad(marian::Tensor gradX,
+ marian::Tensor gradGamma,
+ marian::Tensor gradBeta,
+ marian::Tensor adj,
+ marian::Tensor y,
+ marian::Tensor x,
+ marian::Tensor gamma,
+ marian::Tensor beta,
+ float eps) {
+ ABORT("Not implemented!");
+}
+
+
+void Shift(marian::Tensor out, marian::Tensor in, marian::Shape shift, bool invert) {
+ ABORT("Not implemented!");
+}
+
+void SetSparse(float* out,
+ const std::vector<size_t>& indices,
+ const std::vector<float>& values) {
+ ABORT("Not implemented!");
+}
+
+
+void LSTMCellForward(marian::Tensor out, std::vector<marian::Tensor> inputs) {
+ ABORT("Not implemented!");
+}
+
+void LSTMOutputForward(marian::Tensor out, std::vector<marian::Tensor> inputs) {
+ ABORT("Not implemented!");
+}
+
+void LSTMCellBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj) {
+ ABORT("Not implemented!");
+}
+
+void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj) {
+ ABORT("Not implemented!");
+}
+
+void HighwayForward(marian::Tensor out,
+ const marian::Tensor in1,
+ const marian::Tensor in2,
+ const marian::Tensor t) {
+ ABORT("Not implemented!");
+}
+
+void HighwayBackward(marian::Tensor out1,
+ marian::Tensor out2,
+ marian::Tensor outt,
+ const marian::Tensor in1,
+ const marian::Tensor in2,
+ const marian::Tensor t,
+ const marian::Tensor adj) {
+ ABORT("Not implemented!");
+}
+
+void PoolingWithMaskingForward(marian::Tensor out,
+ marian::Tensor in,
+ marian::Tensor mask,
+ int width,
+ bool isEven) {
+ ABORT("Not implemented!");
+}
+
+void PoolingWithMaskingBackward(marian::Tensor adj,
+ marian::Tensor adjIn,
+ marian::Tensor in,
+ marian::Tensor mask,
+ int width,
+ bool isEven) {
+ ABORT("Not implemented!");
+}
+
+}
+} // namespace marian
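Note: for reference, the recurrence implemented by the new CPU GRUFastForward above corresponds to the following, with s the previous state, m the optional mask, and the final flag selecting where the candidate bias enters (this is a transcription of the loop in the patch, not an addition to it):

\[ r = \sigma(xW_r + sU_r + b_r), \qquad z = \sigma(xW_z + sU_z + b_z) \]
\[ h = \begin{cases} \tanh\big(xW_h + (sU_h + b_h)\,r\big) & \text{if final} \\ \tanh\big(xW_h + sU_h\,r + b_h\big) & \text{otherwise} \end{cases} \]
\[ \mathrm{out} = m\big((1-z)\,h + z\,s\big) + (1-m)\,s \]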
diff --git a/src/tensors/dispatch.h b/src/tensors/dispatch.h
index e63a6af1..d8e218b3 100644
--- a/src/tensors/dispatch.h
+++ b/src/tensors/dispatch.h
@@ -1,38 +1,129 @@
-#pragma once
-
-#include "common/definitions.h"
-#include "tensors/tensor.h"
-
-#define DISPATCH1(Function, Arg1) \
- namespace gpu { \
- void Function(Arg1); \
- } \
- namespace cpu { \
- void Function(Arg1); \
- } \
- void Function(Arg1 arg1) { \
- if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
- gpu::Function(arg1); \
- else \
- cpu::Function(arg1); \
- }
-
-#define DISPATCH2(Function, Arg1, Arg2) \
- namespace gpu { \
- void Function(Arg1, Arg2); \
- } \
- namespace cpu { \
- void Function(Arg1, Arg2); \
- } \
- static inline void Function(Arg1 arg1, Arg2 arg2) { \
- if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
- gpu::Function(arg1, arg2); \
- else \
- cpu::Function(arg1, arg2); \
- }
-
-namespace marian {
-
- DISPATCH2(Dropout, Tensor, float)
-
-}
+#pragma once
+
+
+#define DISPATCH1(Function, Arg1) \
+ namespace gpu { \
+ void Function(Arg1); \
+ } \
+ namespace cpu { \
+ void Function(Arg1); \
+ } \
+ void Function(Arg1 arg1) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1); \
+ else \
+ cpu::Function(arg1); \
+ }
+
+#define DISPATCH2(Function, Arg1, Arg2) \
+ namespace gpu { \
+ void Function(Arg1, Arg2); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2); \
+ else \
+ cpu::Function(arg1, arg2); \
+ }
+
+#define DISPATCH3(Function, Arg1, Arg2, Arg3) \
+ namespace gpu { \
+ void Function(Arg1, Arg2, Arg3); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2, Arg3); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2, arg3); \
+ else \
+ cpu::Function(arg1, arg2, arg3); \
+ }
+
+#define DISPATCH4(Function, Arg1, Arg2, Arg3, Arg4) \
+ namespace gpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2, arg3, arg4); \
+ else \
+ cpu::Function(arg1, arg2, arg3, arg4); \
+ }
+
+#define DISPATCH5(Function, Arg1, Arg2, Arg3, Arg4, Arg5) \
+ namespace gpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2, arg3, arg4, arg5); \
+ else \
+ cpu::Function(arg1, arg2, arg3, arg4, arg5); \
+ }
+
+#define DISPATCH6(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6) \
+ namespace gpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
+ else \
+ cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6); \
+ }
+
+#define DISPATCH7(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7) \
+ namespace gpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
+ else \
+ cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7); \
+ }
+
+#define DISPATCH8(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8) \
+ namespace gpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
+ else \
+ cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8); \
+ }
+
+#define DISPATCH9(Function, Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9) \
+ namespace gpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
+ } \
+ namespace cpu { \
+ void Function(Arg1, Arg2, Arg3, Arg4, Arg5, Arg6, Arg7, Arg8, Arg9); \
+ } \
+ static inline void Function(Arg1 arg1, Arg2 arg2, Arg3 arg3, Arg4 arg4, Arg5 arg5, Arg6 arg6, Arg7 arg7, Arg8 arg8, Arg9 arg9) { \
+ if(arg1->getBackend()->getDevice().type == DeviceType::gpu) \
+ gpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
+ else \
+ cpu::Function(arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9); \
+ }
+
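The DISPATCHn family declares matching gpu:: and cpu:: overloads and generates an inline wrapper that branches on the first argument's backend, so callers include one header and are routed to the right implementation at runtime. As a hand-written illustration of the expected expansion, DISPATCH2(Dropout, marian::Tensor, float) yields roughly:

    namespace gpu { void Dropout(marian::Tensor, float); }
    namespace cpu { void Dropout(marian::Tensor, float); }
    static inline void Dropout(marian::Tensor arg1, float arg2) {
      if(arg1->getBackend()->getDevice().type == DeviceType::gpu)
        gpu::Dropout(arg1, arg2);
      else
        cpu::Dropout(arg1, arg2);
    }

Note that DISPATCH1, unlike the other variants, emits its wrapper without static inline; if a DISPATCH1-generated function were ever used from more than one translation unit, that would violate the one-definition rule.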
diff --git a/src/tensors/gpu/add.h b/src/tensors/gpu/add.h
new file mode 100644
index 00000000..13ffc500
--- /dev/null
+++ b/src/tensors/gpu/add.h
@@ -0,0 +1,180 @@
+#include "gpu/shape.h"
+#include "gpu/tmp.h"
+#include "gpu/tensor.h"
+#include "functional/functional.h"
+
+namespace marian {
+
+namespace gpu {
+
+#ifdef __CUDACC__
+template <size_t K, class Functor>
+__global__ void gAddGeneric(Functor functor,
+ const gpu::Shape full,
+ gpu::Tensor<float> out,
+ gpu::Array<gpu::Tensor<float>, K> ins,
+ float scale = 1.0) {
+
+ int outLength = out.shape().elements();
+ bool same = outLength == full.elements();
+ for(int i = 0; i < K; ++i)
+ same = same && outLength == ins[i].shape().elements();
+
+ constexpr size_t N = gpu::Shape::size();
+ gpu::Array<int, N> len;
+ for(int i = 0; i < N; ++i)
+ len[i] = full[i] / out.shape()[i];
+
+ gpu::Array<int, N> dims;
+ for(int bid = 0; bid < outLength; bid += blockDim.x * gridDim.x) {
+ int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
+ if(index < outLength) {
+
+ if(same) {
+ out[index] += gpu::apply(functor, ins, index) * scale;
+ } else {
+ out.shape().dims(index, dims);
+ out[index] += gpu::loops(functor, ins, len, dims) * scale;
+ }
+
+ }
+ }
+}
+
+template <size_t K, class Functor>
+__global__ void gAddEqual(Functor functor,
+ gpu::Tensor<float> out,
+ gpu::Array<gpu::Tensor<float>, K> ins,
+ float scale,
+ bool broadcast) {
+ int length = out.shape().elements();
+ gpu::Array<int, gpu::Shape::size()> dims;
+
+ for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
+ int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
+ if(index < length) {
+ gpu::Array<int, K> indices;
+ indices.fill(index);
+
+ if(broadcast) {
+ out.shape().dims(index, dims);
+ for(size_t i = 0; i < K; ++i)
+ indices[i] = ins[i].shape().bindex(dims);
+ }
+
+ out[index] += gpu::apply(functor, ins, indices) * scale;
+ }
+ }
+}
+
+template <size_t K, class Functor>
+__global__ void gAddReduce(Functor functor,
+ const gpu::Shape full,
+ gpu::Tensor<float> out,
+ gpu::Array<gpu::Tensor<float>, K> ins,
+ float scale = 1.0) {
+
+ int rows = full.elements() / full.back();
+ int cols = full.back();
+
+ bool same = true;
+ for(int i = 0; i < K; ++i)
+ same = same && ins[i].shape().elements() == full.elements();
+
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ extern __shared__ float _share[];
+ float* _sum = _share + blockDim.x;
+
+ if(same) {
+ _sum[threadIdx.x] = 0;
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int id = tid + threadIdx.x;
+ if(id < cols)
+ _sum[threadIdx.x] += gpu::apply(functor, ins, j * cols + id);
+ }
+ } else {
+ gpu::Array<int, gpu::Shape::size()> dims;
+ _sum[threadIdx.x] = 0;
+
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int id = tid + threadIdx.x;
+ if(id < cols) {
+ full.dims(j * cols + id, dims);
+ gpu::Array<int, K> indices;
+ for(int i = 0; i < K; ++i)
+ indices[i] = ins[i].shape().bindex(dims);
+ _sum[threadIdx.x] += gpu::apply(functor, ins, indices);
+ }
+ }
+ }
+ __syncthreads();
+ int len = blockDim.x;
+ while(len != 1) {
+ __syncthreads();
+ int skip = (len + 1) >> 1;
+ if(threadIdx.x < (len >> 1)) {
+ _sum[threadIdx.x] += _sum[threadIdx.x + skip];
+ }
+ len = (len + 1) >> 1;
+ }
+ __syncthreads();
+ out[j] += _sum[0] * scale;
+ }
+ }
+}
+#endif
+
+template <class Functor, class ...Tensors>
+void Add(Functor functor,
+ float scale,
+ marian::Tensor out,
+ Tensors... tensors) {
+
+#ifdef __CUDACC__
+ cudaSetDevice(out->getDevice().no);
+
+ auto full = marian::Shape::broadcast({out, tensors...});
+
+ int length = out->shape().elements();
+
+ constexpr size_t K = sizeof...(Tensors);
+
+ gpu::Tensor<float> gOut = out;
+ gpu::Array<gpu::Tensor<float>, K> gIns = {tensors ...};
+
+ if(full.back() != 1 && out->shape().back() == 1) {
+ size_t m = full.elements() / length;
+ size_t k = full.back();
+
+ int blocks = std::min(MAX_BLOCKS, (int)m);
+ int threads = std::min(MAX_THREADS, (int)k);
+ int shared = sizeof(float) * threads * 2;
+
+ gAddReduce<<<blocks, threads, shared>>>(functor, full, gOut, gIns, scale);
+
+ } else if(out->shape() == full) {
+ int threads = std::min(MAX_THREADS, length);
+ int blocks
+ = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
+
+ bool broadcast = false;
+ for(int i = 0; i < K; ++i)
+ broadcast = broadcast || gOut.shape() != gIns[i].shape();
+
+ gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast);
+ } else {
+ int threads = std::min(MAX_THREADS, length);
+ int blocks
+ = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
+
+ gAddGeneric<<<blocks, threads>>>(functor, full, gOut, gIns, scale);
+ }
+#else
+ ABORT("Not implemented");
+#endif
+}
+
+} // namespace gpu
+} // namespace marian
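Add() selects one of three kernels: gAddReduce when the output's last dimension is collapsed relative to the full broadcast shape (a per-row shared-memory reduction), gAddEqual when the output already has the full shape (elementwise accumulation, broadcasting inputs via bindex when their shapes differ), and gAddGeneric otherwise. A hedged usage sketch with illustrative tensor names, assuming the functional placeholders _1, _2 bind to the inputs in order:

    using namespace functional;
    // grad += adj * val, reducing or broadcasting over mismatched dimensions
    gpu::Add(_1 * _2, /*scale=*/1.f, grad, adj, val);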
diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu
index b26e00e9..2742cc20 100644
--- a/src/tensors/gpu/algorithm.cu
+++ b/src/tensors/gpu/algorithm.cu
@@ -1,7 +1,7 @@
#include "tensors/gpu/algorithm.h"
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
+#include "tensors/gpu/cuda_helpers.h"
namespace marian {
namespace gpu {
@@ -34,7 +34,8 @@ namespace marian {
const std::vector<float>& values,
float* data) {
CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
- SetSparse(data, keys, values);
+ ABORT("no SetSparse");
+ //gpu::SetSparse(data, keys, values);
CUDA_CHECK(cudaStreamSynchronize(0));
}
diff --git a/src/kernels/cuda_helpers.h b/src/tensors/gpu/cuda_helpers.h
index 46d84afe..6726e127 100644
--- a/src/kernels/cuda_helpers.h
+++ b/src/tensors/gpu/cuda_helpers.h
@@ -4,9 +4,14 @@
#include "3rd_party/exception.h"
#include "common/logging.h"
+const float CUDA_FLT_MAX = 1.70141e+38;
+const int MAX_THREADS = 512;
+const int MAX_BLOCKS = 65535;
+
#define CUDA_CHECK(ans) \
{ gpuAssert((ans), __FILE__, __LINE__); }
+
inline void gpuAssert(cudaError_t code,
const char *file,
int line,
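The header also picks up the launch limits MAX_THREADS and MAX_BLOCKS used by the new header-only launchers in gpu/element.h and gpu/add.h, plus CUDA_FLT_MAX (the old #define in tensor_operators.cu is dropped further down). CUDA_CHECK wraps a CUDA runtime call and reports file and line through gpuAssert on failure, e.g.:

    CUDA_CHECK(cudaSetDevice(backend->getDevice().no));
    CUDA_CHECK(cudaStreamSynchronize(0));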
diff --git a/src/kernels/cudnn_wrappers.cu b/src/tensors/gpu/cudnn_wrappers.cu
index ca5bec8c..f773d900 100644
--- a/src/kernels/cudnn_wrappers.cu
+++ b/src/tensors/gpu/cudnn_wrappers.cu
@@ -1,4 +1,4 @@
-#include "kernels/cudnn_wrappers.h"
+#include "tensors/gpu/cudnn_wrappers.h"
namespace marian {
diff --git a/src/kernels/cudnn_wrappers.h b/src/tensors/gpu/cudnn_wrappers.h
index fca4b6e0..fca4b6e0 100644
--- a/src/kernels/cudnn_wrappers.h
+++ b/src/tensors/gpu/cudnn_wrappers.h
diff --git a/src/tensors/device.cu b/src/tensors/gpu/device.cu
index bef5491b..e15b80d7 100644
--- a/src/tensors/device.cu
+++ b/src/tensors/gpu/device.cu
@@ -2,7 +2,7 @@
#include <iostream>
#include "tensors/device.h"
-#include "kernels/cuda_helpers.h"
+#include "tensors/gpu/cuda_helpers.h"
namespace marian {
namespace gpu {
diff --git a/src/tensors/gpu/dropout.cu b/src/tensors/gpu/dropout.cu
index 4a4223a8..5bd68352 100644
--- a/src/tensors/gpu/dropout.cu
+++ b/src/tensors/gpu/dropout.cu
@@ -3,7 +3,7 @@
#include <stdio.h>
#include <stdlib.h>
-#include "tensors/dispatch.h"
+#include "tensors/tensor_operators.h"
#include "tensors/gpu/backend.h"
#define CUDA_CALL(x) \
@@ -35,7 +35,7 @@ namespace marian {
}
}
- void Dropout(Tensor tensor, float p) {
+ void Dropout(marian::Tensor tensor, float p) {
auto gpuBackend = std::static_pointer_cast<gpu::Backend>(tensor->getBackend());
curandGenerator_t gen = gpuBackend->getCurandGenerator();
int n = tensor->size();
diff --git a/src/tensors/gpu/element.h b/src/tensors/gpu/element.h
new file mode 100644
index 00000000..2136f00a
--- /dev/null
+++ b/src/tensors/gpu/element.h
@@ -0,0 +1,65 @@
+#pragma once
+
+#include "tensors/tensor.h"
+
+#ifdef __CUDACC__
+#include "tensors/gpu/cuda_helpers.h"
+#endif
+
+namespace marian {
+namespace gpu {
+
+#ifdef __CUDACC__
+template <size_t K, bool broadcast, class Functor>
+__global__ void gElement(Functor functor,
+ gpu::Array<gpu::Tensor<float>, K> tensors) {
+
+ int length = tensors[0].shape().elements();
+ gpu::Array<int, gpu::Shape::size()> dims;
+ gpu::Array<int, K> indices;
+
+ for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
+ int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
+ if(index < length) {
+
+ indices.fill(index);
+
+ if(broadcast) {
+ tensors[0].shape().dims(index, dims);
+ for(int i = 1; i < K; ++i)
+ indices[i] = tensors[i].shape().bindex(dims);
+ }
+
+ tensors[0][index] = gpu::apply(functor, tensors, indices);
+ }
+ }
+}
+#endif
+
+template <class Functor, class ...Tensors>
+void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
+#ifdef __CUDACC__
+ cudaSetDevice(out->getDevice().no);
+
+ constexpr size_t K = sizeof...(tensors) + 1;
+ gpu::Array<gpu::Tensor<float>, K> gTensors = {out, tensors...};
+
+ int length = gTensors[0].shape().elements();
+ int threads = std::min(MAX_THREADS, length);
+ int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
+
+ bool broadcast = false;
+ for(int i = 1; i < K; ++i)
+ broadcast = broadcast || gTensors[0].shape() != gTensors[i].shape();
+
+ if(broadcast)
+ gpu::gElement<K, true><<<blocks, threads>>>(functor, gTensors);
+ else
+ gpu::gElement<K, false><<<blocks, threads>>>(functor, gTensors);
+#else
+ ABORT("Not implemented");
+#endif
+}
+
+} // namespace gpu
+} // namespace marian
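Element() writes the functor's result into the first tensor (the output); when any input shape differs from the output shape, the broadcast instantiation maps each output index back to per-input indices via bindex. A hedged usage sketch with illustrative names, assuming the functional placeholders where _1 is the output and _2, _3 are the inputs:

    using namespace functional;
    // out = a + b, with b broadcast along its singleton dimensions
    gpu::Element(_2 + _3, out, a, b);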
diff --git a/src/tensors/gpu/prod.cu b/src/tensors/gpu/prod.cu
new file mode 100644
index 00000000..38278ec5
--- /dev/null
+++ b/src/tensors/gpu/prod.cu
@@ -0,0 +1,129 @@
+
+#include <cublas_v2.h>
+
+#include "tensors/gpu/prod.h"
+#include "tensors/gpu/backend.h"
+
+namespace marian {
+
+namespace gpu {
+
+void Prod(marian::Tensor C,
+ const marian::Tensor A,
+ const marian::Tensor B,
+ bool transA,
+ bool transB,
+ float beta,
+ float scalar) {
+ cudaSetDevice(C->getDevice().no);
+ float alpha = scalar;
+
+ size_t m = A->shape().elements() / A->shape().back();
+ size_t k = A->shape().back();
+ if(transA)
+ std::swap(m, k);
+
+ size_t l = B->shape().elements() / B->shape().back();
+ size_t n = B->shape().back();
+ if(transB)
+ std::swap(l, n);
+
+ size_t lda = A->shape().back();
+ size_t ldb = B->shape().back();
+ size_t ldc = B->shape().back();
+
+ if(transB)
+ ldc = B->shape().elements() / B->shape().back();
+
+ cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+ cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+ auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
+
+#if CUDA_VERSION >= 9000
+ //cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
+#endif
+
+ cublasSgemm(cublasHandle,
+ opB,
+ opA,
+ n,
+ m,
+ k,
+ &alpha,
+ B->data(),
+ ldb,
+ A->data(),
+ lda,
+ &beta,
+ C->data(),
+ ldc);
+#if CUDA_VERSION >= 9000
+ //cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
+#endif
+}
+
+void ProdBatched(marian::Tensor C,
+ const marian::Tensor A,
+ const marian::Tensor B,
+ bool transA,
+ bool transB,
+ float beta,
+ float scalar) {
+ cudaSetDevice(C->getDevice().no);
+ float alpha = scalar;
+
+ size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
+ size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]);
+
+ size_t m = A->shape()[-2];
+ size_t k = A->shape()[-1];
+ if(transA)
+ std::swap(m, k);
+
+ size_t l = B->shape()[-2];
+ size_t n = B->shape()[-1];
+ if(transB)
+ std::swap(l, n);
+
+ size_t lda = A->shape()[-1];
+ size_t ldb = B->shape()[-1];
+ size_t ldc = B->shape()[-1];
+
+ if(transB)
+ ldc = B->shape()[-2];
+
+ cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+ cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+ auto cublasHandle = std::static_pointer_cast<gpu::Backend>(C->getBackend())->getCublasHandle();
+
+#if CUDA_VERSION >= 9000
+ //cublasSetMathMode(cublasHandle, CUBLAS_TENSOR_OP_MATH);
+#endif
+ cublasSgemmStridedBatched(cublasHandle,
+ opB,
+ opA,
+ n,
+ m,
+ k,
+ &alpha,
+ B->data(),
+ ldb,
+ batchB == 1 ? 0 : n * k,
+ A->data(),
+ lda,
+ batchA == 1 ? 0 : m * k,
+ &beta,
+ C->data(),
+ ldc,
+ n * m,
+ std::max(batchA, batchB));
+#if CUDA_VERSION >= 9000
+ //cublasSetMathMode(cublasHandle, CUBLAS_DEFAULT_MATH);
+#endif
+}
+
+
+} // namespace gpu
+} // namespace marian
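cuBLAS expects column-major matrices while marian tensors are row-major, so Prod obtains the row-major product C = A·B by asking cuBLAS for the equivalent column-major product Cᵀ = Bᵀ·Aᵀ; that is why opB and B come first in the cublasSgemm call and the leading dimensions are taken from the last dimensions of the row-major shapes. A hedged call sketch with illustrative shapes:

    // C {2, 4} = A {2, 3} x B {3, 4}; beta = 0 overwrites C, scalar scales the product
    gpu::Prod(C, A, B, /*transA=*/false, /*transB=*/false, /*beta=*/0.f, /*scalar=*/1.f);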
diff --git a/src/tensors/gpu/prod.h b/src/tensors/gpu/prod.h
new file mode 100644
index 00000000..db9b62d8
--- /dev/null
+++ b/src/tensors/gpu/prod.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "tensors/tensor.h"
+
+namespace marian {
+
+namespace gpu {
+
+void Prod(marian::Tensor C,
+ const marian::Tensor A,
+ const marian::Tensor B,
+ bool transA,
+ bool transB,
+ float beta = 0,
+ float scalar = 1);
+
+void ProdBatched(marian::Tensor C,
+ const marian::Tensor A,
+ const marian::Tensor B,
+ bool transA,
+ bool transB,
+ float beta = 0,
+ float scalar = 1);
+
+} // namespace gpu
+} // namespace marian
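ProdBatched issues a single strided-batched GEMM; when one operand's batch size is 1 its stride is set to 0, so the same matrix is reused (broadcast) across every batch of the other operand. A hedged call sketch with illustrative shapes:

    // A {8, 2, 3}, B {3, 4} broadcast over the batch, C {8, 2, 4}
    gpu::ProdBatched(C, A, B, /*transA=*/false, /*transB=*/false);  // beta, scalar default to 0 and 1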
diff --git a/src/kernels/sparse.cu b/src/tensors/gpu/sparse.cu
index b5080c0c..b5080c0c 100644
--- a/src/kernels/sparse.cu
+++ b/src/tensors/gpu/sparse.cu
diff --git a/src/kernels/sparse.h b/src/tensors/gpu/sparse.h
index cffb398e..cffb398e 100644
--- a/src/kernels/sparse.h
+++ b/src/tensors/gpu/sparse.h
diff --git a/src/kernels/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu
index 8dcda559..92f07a9b 100644
--- a/src/kernels/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@@ -1,12 +1,17 @@
-#include <thrust/transform_reduce.h>
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
+//#include <thrust/transform_reduce.h>
+
+#include "tensors/gpu/cuda_helpers.h"
+#include "tensors/tensor_operators.h"
+#include "tensors/gpu/backend.h"
+
+#include "gpu/tensor.h"
+#include "functional/functional.h"
#include "3rd_party/reduce_all.h"
namespace marian {
-#define CUDA_FLT_MAX 1.70141e+38
+namespace gpu {
struct isnan_test {
__host__ __device__ bool operator()(const float a) const { return isnan(a); }
@@ -22,7 +27,7 @@ __device__ inline float stableLogit(float x) {
}
}
-bool IsNan(Tensor in) {
+bool IsNan(marian::Tensor in) {
//cudaSetDevice(in->getDevice().no);
//thrust::device_ptr<float> begin = thrust::device_pointer_cast(in->data());
//thrust::device_ptr<float> end
@@ -32,7 +37,7 @@ bool IsNan(Tensor in) {
return false;
}
-void ConcatCont(Tensor out, const std::vector<Tensor>& inputs, int axis) {
+void ConcatCont(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int axis) {
cudaSetDevice(out->getDevice().no);
int step = 1;
for(int i = 0; i < axis; ++i)
@@ -78,7 +83,7 @@ __global__ void gInsertCols(float* out,
}
}
-void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
+void Concatenate1(marian::Tensor out, const std::vector<marian::Tensor>& inputs) {
cudaSetDevice(out->getDevice().no);
int rows = out->shape().elements() / out->shape().back();
@@ -102,14 +107,14 @@ void Concatenate1(Tensor out, const std::vector<Tensor>& inputs) {
cudaStreamSynchronize(0);
}
-void Concatenate(Tensor out, const std::vector<Tensor>& inputs, int ax) {
+void Concatenate(marian::Tensor out, const std::vector<marian::Tensor>& inputs, int ax) {
if(ax == out->shape().size() - 1)
Concatenate1(out, inputs);
else
ConcatCont(out, inputs, ax);
}
-void Split1(std::vector<Tensor>& outputs, const Tensor in) {
+void Split1(std::vector<marian::Tensor>& outputs, const marian::Tensor in) {
cudaSetDevice(in->getDevice().no);
size_t offset = 0;
@@ -130,7 +135,7 @@ void Split1(std::vector<Tensor>& outputs, const Tensor in) {
cudaStreamSynchronize(0);
}
-void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
+void SplitCont(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int axis) {
cudaSetDevice(in->getDevice().no);
int step = 1;
@@ -154,7 +159,7 @@ void SplitCont(std::vector<Tensor>& outputs, const Tensor in, int axis) {
cudaStreamSynchronize(0);
}
-void Deconcatenate(std::vector<Tensor>& outputs, const Tensor in, int ax) {
+void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
if(ax == in->shape().size() - 1)
Split1(outputs, in);
else
@@ -181,7 +186,7 @@ __global__ void gTransposeND(gpu::Tensor<float> out,
}
}
-void TransposeND(Tensor out, Tensor in, const std::vector<int>& vAxis) {
+void TransposeND(marian::Tensor out, marian::Tensor in, const std::vector<int>& vAxis) {
cudaSetDevice(out->getDevice().no);
gpu::Array<int, gpu::Shape::size()> axes;
@@ -297,7 +302,7 @@ __global__ void gSoftmax(float* out,
}
}
-void Softmax(Tensor out, Tensor in, Tensor mask) {
+void Softmax(marian::Tensor out, marian::Tensor in, marian::Tensor mask) {
cudaSetDevice(out->getDevice().no);
size_t m = out->shape().elements() / out->shape().back();
@@ -385,7 +390,7 @@ __global__ void gLogSoftmax(float* out,
}
}
-void LogSoftmax(Tensor out, Tensor in) {
+void LogSoftmax(marian::Tensor out, marian::Tensor in) {
cudaSetDevice(out->getDevice().no);
size_t m = out->shape().elements() / out->shape().back();
@@ -444,7 +449,7 @@ __global__ void gSoftmaxGrad(float* grad,
}
}
-void SoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
+void SoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) {
cudaSetDevice(adj->getDevice().no);
// grad and val are both m-by-k matrices, passed as input.
// A weighted average of each row of grad (according to the weights
@@ -501,7 +506,7 @@ __global__ void gLogSoftmaxGrad(float* grad,
}
}
-void LogSoftmaxGrad(Tensor grad, Tensor adj, Tensor val) {
+void LogSoftmaxGrad(marian::Tensor grad, marian::Tensor adj, marian::Tensor val) {
cudaSetDevice(adj->getDevice().no);
// grad and val are both m-by-k matrices, passed as input.
@@ -540,119 +545,6 @@ __global__ void gArgmax(float* out,
///////////////////////////////////////////////////////
-void Prod(cublasHandle_t handle,
- Tensor C,
- const Tensor A,
- const Tensor B,
- bool transA,
- bool transB,
- float beta,
- float scalar) {
- cudaSetDevice(C->getDevice().no);
- float alpha = scalar;
-
- size_t m = A->shape().elements() / A->shape().back();
- size_t k = A->shape().back();
- if(transA)
- std::swap(m, k);
-
- size_t l = B->shape().elements() / B->shape().back();
- size_t n = B->shape().back();
- if(transB)
- std::swap(l, n);
-
- size_t lda = A->shape().back();
- size_t ldb = B->shape().back();
- size_t ldc = B->shape().back();
-
- if(transB)
- ldc = B->shape().elements() / B->shape().back();
-
- cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
- cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-#if CUDA_VERSION >= 9000
- //cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
-#endif
- cublasSgemm(handle,
- opB,
- opA,
- n,
- m,
- k,
- &alpha,
- B->data(),
- ldb,
- A->data(),
- lda,
- &beta,
- C->data(),
- ldc);
-#if CUDA_VERSION >= 9000
- //cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
-#endif
-}
-
-void ProdBatched(cublasHandle_t handle,
- Tensor C,
- const Tensor A,
- const Tensor B,
- bool transA,
- bool transB,
- float beta,
- float scalar) {
- cudaSetDevice(C->getDevice().no);
- float alpha = scalar;
-
- size_t batchA = A->shape().elements() / (A->shape()[-1] * A->shape()[-2]);
- size_t batchB = B->shape().elements() / (B->shape()[-1] * B->shape()[-2]);
-
- size_t m = A->shape()[-2];
- size_t k = A->shape()[-1];
- if(transA)
- std::swap(m, k);
-
- size_t l = B->shape()[-2];
- size_t n = B->shape()[-1];
- if(transB)
- std::swap(l, n);
-
- size_t lda = A->shape()[-1];
- size_t ldb = B->shape()[-1];
- size_t ldc = B->shape()[-1];
-
- if(transB)
- ldc = B->shape()[-2];
-
- cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
- cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
-
-#if CUDA_VERSION >= 9000
- //cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
-#endif
- cublasSgemmStridedBatched(handle,
- opB,
- opA,
- n,
- m,
- k,
- &alpha,
- B->data(),
- ldb,
- batchB == 1 ? 0 : n * k,
- A->data(),
- lda,
- batchA == 1 ? 0 : m * k,
- &beta,
- C->data(),
- ldc,
- n * m,
- std::max(batchA, batchB));
-#if CUDA_VERSION >= 9000
- //cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH);
-#endif
-}
-
__global__ void gCopyRows(float* out,
const float* in,
size_t cols,
@@ -676,7 +568,7 @@ __global__ void gCopyRows(float* out,
}
}
-void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indices) {
+void CopyRows(marian::Tensor out, const marian::Tensor in, const std::vector<size_t>& indices) {
cudaSetDevice(out->getDevice().no);
size_t cols = in->shape().back();
@@ -721,8 +613,8 @@ __global__ void gPasteRows(float* out,
}
}
-void PasteRows(Tensor out,
- const Tensor in,
+void PasteRows(marian::Tensor out,
+ const marian::Tensor in,
const std::vector<size_t>& indices) {
cudaSetDevice(out->getDevice().no);
@@ -768,7 +660,7 @@ __global__ void gCopyCols(float* out,
}
}
-void CopyCols(Tensor out, const Tensor in, const std::vector<size_t>& indices) {
+void CopyCols(marian::Tensor out, const marian::Tensor in, const std::vector<size_t>& indices) {
cudaSetDevice(out->getDevice().no);
size_t rows = in->shape().elements() / in->shape().back();
@@ -813,8 +705,8 @@ __global__ void gPasteCols(float* out,
}
}
-void PasteCols(Tensor out,
- const Tensor in,
+void PasteCols(marian::Tensor out,
+ const marian::Tensor in,
const std::vector<size_t>& indices) {
cudaSetDevice(out->getDevice().no);
@@ -879,11 +771,11 @@ __global__ void gInsert(float* out,
}
}
-void Select(Ptr<Allocator> allocator,
- Tensor out,
- const Tensor in,
+void Select(marian::Tensor out,
+ const marian::Tensor in,
int axis,
- const std::vector<size_t>& indices) {
+ const std::vector<size_t>& indices,
+ Ptr<Allocator> allocator) {
cudaSetDevice(out->getDevice().no);
int length = out->shape().elements();
@@ -905,11 +797,11 @@ void Select(Ptr<Allocator> allocator,
allocator->free(mp_indices);
}
-void Insert(Ptr<Allocator> allocator,
- Tensor out,
- const Tensor in,
+void Insert(marian::Tensor out,
+ const marian::Tensor in,
int axis,
- const std::vector<size_t>& indices) {
+ const std::vector<size_t>& indices,
+ Ptr<Allocator> allocator) {
cudaSetDevice(in->getDevice().no);
int length = in->shape().elements();
@@ -974,7 +866,7 @@ __global__ void gGRUFastForward(float* out,
}
}
-void GRUFastForward(Tensor out, std::vector<Tensor> inputs, bool final) {
+void GRUFastForward(marian::Tensor out, std::vector<marian::Tensor> inputs, bool final) {
cudaSetDevice(out->getDevice().no);
int rows = out->shape().elements() / out->shape().back();
@@ -1084,9 +976,9 @@ __global__ void gGRUFastBackward(float* outState,
}
}
-void GRUFastBackward(std::vector<Tensor> outputs,
- std::vector<Tensor> inputs,
- Tensor adj,
+void GRUFastBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj,
bool final) {
cudaSetDevice(adj->getDevice().no);
@@ -1182,7 +1074,7 @@ __global__ void gCrossEntropyPick(float* out,
}
}
-void CrossEntropyPick(Tensor out, Tensor in, Tensor pick) {
+void CrossEntropyPick(marian::Tensor out, marian::Tensor in, marian::Tensor pick) {
cudaSetDevice(out->getDevice().no);
int rows = in->shape().elements() / in->shape().back();
@@ -1268,7 +1160,7 @@ __global__ void gCrossEntropyPickBackward(float* out,
}
}
-void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) {
+void CrossEntropyPickBackward(marian::Tensor out, marian::Tensor adj, marian::Tensor a, marian::Tensor pick) {
cudaSetDevice(out->getDevice().no);
int rows = out->shape().elements() / out->shape().back();
@@ -1283,7 +1175,7 @@ void CrossEntropyPickBackward(Tensor out, Tensor adj, Tensor a, Tensor pick) {
}
-float L2Norm(Tensor in) {
+float L2Norm(marian::Tensor in) {
using namespace functional;
cudaSetDevice(in->getDevice().no);
@@ -1294,7 +1186,7 @@ float L2Norm(Tensor in) {
uint8_t* data;
cudaMalloc(&data, blocks * sizeof(float));
- Tensor out(new TensorBase(
+ marian::Tensor out(new TensorBase(
New<MemoryPiece>(data, blocks * sizeof(float)), {1, blocks}, in->getBackend()));
ReduceAll(_1 * _1, out, in);
@@ -1351,7 +1243,7 @@ __global__ void gAtt(float* out,
}
}
-void Att(Tensor out, Tensor va, Tensor context, Tensor state) {
+void Att(marian::Tensor out, marian::Tensor va, marian::Tensor context, marian::Tensor state) {
cudaSetDevice(out->getDevice().no);
size_t m = out->shape().elements() / out->shape().back();
@@ -1412,13 +1304,13 @@ __global__ void gAttBack(float* gVa,
}
}
-void AttBack(Tensor gVa,
- Tensor gContext,
- Tensor gState,
- Tensor va,
- Tensor context,
- Tensor state,
- Tensor adj) {
+void AttBack(marian::Tensor gVa,
+ marian::Tensor gContext,
+ marian::Tensor gState,
+ marian::Tensor va,
+ marian::Tensor context,
+ marian::Tensor state,
+ marian::Tensor adj) {
cudaSetDevice(adj->getDevice().no);
size_t m = adj->shape().elements() / adj->shape().back();
@@ -1517,10 +1409,10 @@ __global__ void gLNormalization(float* out,
}
}
-void LayerNormalization(Tensor out,
- Tensor in,
- Tensor gamma,
- Tensor beta,
+void LayerNormalization(marian::Tensor out,
+ marian::Tensor in,
+ marian::Tensor gamma,
+ marian::Tensor beta,
float eps) {
cudaSetDevice(out->getDevice().no);
@@ -1642,14 +1534,14 @@ __global__ void gLayerNormalizationGrad(float* gradX,
}
}
-void LayerNormalizationGrad(Tensor gradX,
- Tensor gradGamma,
- Tensor gradBeta,
- Tensor adj,
- Tensor y,
- Tensor x,
- Tensor gamma,
- Tensor beta,
+void LayerNormalizationGrad(marian::Tensor gradX,
+ marian::Tensor gradGamma,
+ marian::Tensor gradBeta,
+ marian::Tensor adj,
+ marian::Tensor y,
+ marian::Tensor x,
+ marian::Tensor gamma,
+ marian::Tensor beta,
float eps) {
cudaSetDevice(adj->getDevice().no);
int rows = y->shape().elements() / y->shape().back();
@@ -1685,9 +1577,9 @@ __global__ void gShift(float* out, const float* in, int length, int offset) {
}
}
-void Shift(Tensor out, Tensor in, Shape shift, bool invert) {
+void Shift(marian::Tensor out, marian::Tensor in, marian::Shape shift, bool invert) {
- UTIL_THROW_IF2(in->shape().size() != shift.size(), "bad dimensions");
+ ABORT_IF(in->shape().size() != shift.size(), "bad dimensions");
int offset = 0;
for(int i = 0; i < shift.size(); ++i)
@@ -1784,7 +1676,7 @@ __global__ void gLSTMCellForward(float* out,
}
}
-void LSTMCellForward(Tensor out, std::vector<Tensor> inputs) {
+void LSTMCellForward(marian::Tensor out, std::vector<marian::Tensor> inputs) {
cudaSetDevice(out->getDevice().no);
int rows = out->shape().elements() / out->shape().back();
@@ -1833,7 +1725,7 @@ __global__ void gLSTMOutputForward(float* out,
}
}
-void LSTMOutputForward(Tensor out, std::vector<Tensor> inputs) {
+void LSTMOutputForward(marian::Tensor out, std::vector<marian::Tensor> inputs) {
cudaSetDevice(out->getDevice().no);
int rows = out->shape().elements() / out->shape().back();
@@ -1927,9 +1819,9 @@ __global__ void gLSTMCellBackward(float* outCell,
}
}
-void LSTMCellBackward(std::vector<Tensor> outputs,
- std::vector<Tensor> inputs,
- Tensor adj) {
+void LSTMCellBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj) {
cudaSetDevice(adj->getDevice().no);
int rows = adj->shape().elements() / adj->shape().back();
@@ -2005,9 +1897,9 @@ __global__ void gLSTMOutputBackward(float* outCell,
}
}
-void LSTMOutputBackward(std::vector<Tensor> outputs,
- std::vector<Tensor> inputs,
- Tensor adj) {
+void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj) {
cudaSetDevice(adj->getDevice().no);
int rows = adj->shape().elements() / adj->shape().back();
@@ -2044,10 +1936,10 @@ __global__ void gHighwayForward(float* out,
}
}
-void HighwayForward(Tensor out,
- const Tensor in1,
- const Tensor in2,
- const Tensor t) {
+void HighwayForward(marian::Tensor out,
+ const marian::Tensor in1,
+ const marian::Tensor in2,
+ const marian::Tensor t) {
cudaSetDevice(out->getDevice().no);
int length = out->shape().elements();
@@ -2079,13 +1971,13 @@ __global__ void gHighwayBackward(float* out1,
}
}
-void HighwayBackward(Tensor out1,
- Tensor out2,
- Tensor outt,
- const Tensor in1,
- const Tensor in2,
- const Tensor t,
- const Tensor adj) {
+void HighwayBackward(marian::Tensor out1,
+ marian::Tensor out2,
+ marian::Tensor outt,
+ const marian::Tensor in1,
+ const marian::Tensor in2,
+ const marian::Tensor t,
+ const marian::Tensor adj) {
cudaSetDevice(out1->getDevice().no);
int length = out1->shape().elements();
@@ -2138,20 +2030,20 @@ __global__ void gMaxPoolingForward(float* out,
out[rowId + (colId * outCols)] = currentMax;
}
-void PoolingWithMaskingForward(Tensor out,
- Tensor in,
- Tensor mask,
+void PoolingWithMaskingForward(marian::Tensor out,
+ marian::Tensor in,
+ marian::Tensor mask,
int width,
bool isEven) {
int n = out->shape().elements();
int threads = std::min(n, MAX_THREADS);
int blocks = n / threads + (n % threads != 0);
- Shape& inShape = in->shape();
+ auto& inShape = in->shape();
int inRows = inShape[0] * inShape[1];
int inCols = inShape[2];
- Shape& outShape = out->shape();
+ auto& outShape = out->shape();
int outRows = outShape[2];
int outCols = outShape[0] * outShape[1];
@@ -2203,21 +2095,21 @@ __global__ void gMaxPoolingBackward(float* adj,
adjIn[(rowId * inCols) + (colId * width) + currentMaxIdx] += adj[rowId + (colId * adjCols)];
}
-void PoolingWithMaskingBackward(Tensor adj,
- Tensor adjIn,
- Tensor in,
- Tensor mask,
+void PoolingWithMaskingBackward(marian::Tensor adj,
+ marian::Tensor adjIn,
+ marian::Tensor in,
+ marian::Tensor mask,
int width,
bool isEven) {
int n = adj->shape().elements();
int threads = std::min(n, 512);
int blocks = n / threads + (n % threads != 0);
- Shape& inShape = in->shape();
+ auto& inShape = in->shape();
int inRows = inShape[0] * inShape[1];
int inCols = inShape[2];
- Shape& adjShape = adj->shape();
+ auto& adjShape = adj->shape();
int adjRows = adjShape[2];
int adjCols = adjShape[0] * adjShape[1];
@@ -2232,4 +2124,5 @@ void PoolingWithMaskingBackward(Tensor adj,
width, lastWidth);
}
+} // namespace gpu
} // namespace marian
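Most of the churn in tensor_operators.cu is mechanical: the operators move into namespace gpu and their host-side signatures spell out marian::Tensor, presumably because an unqualified Tensor inside that namespace would now resolve to the device-side gpu::Tensor template pulled in via gpu/tensor.h. Schematically:

    namespace marian {
    namespace gpu {
    // "Tensor" here names the device-side template gpu::Tensor<T>,
    // so host wrappers qualify the handle type explicitly:
    void Softmax(marian::Tensor out, marian::Tensor in, marian::Tensor mask);
    }
    }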
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
new file mode 100644
index 00000000..06f76b64
--- /dev/null
+++ b/src/tensors/tensor_operators.h
@@ -0,0 +1,213 @@
+#pragma once
+
+#include "common/definitions.h"
+#include "tensors/tensor.h"
+#include "tensors/allocator.h"
+
+#include "tensors/dispatch.h"
+
+#include "gpu/shape.h"
+#include "gpu/tmp.h"
+#include "gpu/tensor.h"
+
+#include "tensors/gpu/element.h"
+#include "tensors/gpu/add.h"
+#include "tensors/gpu/prod.h"
+
+#include "tensors/cpu/element.h"
+#include "tensors/cpu/add.h"
+
+namespace marian {
+
+ template <class Functor, class ...Tensors>
+ void Element(Functor functor, marian::Tensor out, Tensors ...tensors) {
+ if(out->getBackend()->getDevice().type == DeviceType::gpu)
+ gpu::Element(functor, out, tensors...);
+ else
+ cpu::Element(functor, out, tensors...);
+ }
+
+ template <class Functor, class ...Tensors>
+ void Add(Functor functor,
+ float scale,
+ marian::Tensor out,
+ Tensors... tensors) {
+ if(out->getBackend()->getDevice().type == DeviceType::gpu)
+ gpu::Add(functor, scale, out, tensors...);
+ else
+ cpu::Add(functor, scale, out, tensors...);
+ }
+
+ template <class Functor, class ...Tensors>
+ void Add(Functor functor,
+ marian::Tensor out,
+ Tensors... tensors) {
+ Add(functor, 1, out, tensors...);
+ }
+
+ template <class Functor, class ...Tensors>
+ void Reduce(Functor functor,
+ float scale,
+ marian::Tensor out,
+ Tensors... tensors) {
+ out->set(0);
+ Add(functor, scale, out, tensors...);
+ }
+
+ template <class Functor, class ...Tensors>
+ void Reduce(Functor functor,
+ marian::Tensor out,
+ Tensors... tensors) {
+ out->set(0);
+ Add(functor, out, tensors...);
+ }
+
+ DISPATCH7(Prod, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
+ DISPATCH7(ProdBatched, marian::Tensor, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
+
+ DISPATCH2(Dropout, marian::Tensor, float)
+
+ DISPATCH3(Softmax, marian::Tensor, marian::Tensor, marian::Tensor)
+ DISPATCH3(SoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor)
+
+ DISPATCH2(LogSoftmax, marian::Tensor, marian::Tensor)
+ DISPATCH3(LogSoftmaxGrad, marian::Tensor, marian::Tensor, marian::Tensor)
+
+ DISPATCH3(CrossEntropyPick, marian::Tensor, marian::Tensor, marian::Tensor)
+ DISPATCH4(CrossEntropyPickBackward, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
+
+ DISPATCH3(TransposeND, marian::Tensor, marian::Tensor, const std::vector<int>&)
+ DISPATCH4(Shift, marian::Tensor, marian::Tensor, marian::Shape, bool)
+
+ DISPATCH3(Concatenate, marian::Tensor, const std::vector<marian::Tensor>&, int)
+
+ namespace gpu {
+ void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
+ }
+
+ namespace cpu {
+ void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax);
+ }
+
+ static inline void Deconcatenate(std::vector<marian::Tensor>& outputs, const marian::Tensor in, int ax) {
+ if(in->getBackend()->getDevice().type == DeviceType::gpu) {
+ gpu::Deconcatenate(outputs, in, ax);
+ }
+ else {
+ cpu::Deconcatenate(outputs, in, ax);
+ }
+ }
+
+ DISPATCH5(LayerNormalization, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
+ DISPATCH9(LayerNormalizationGrad, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, float)
+
+ DISPATCH4(HighwayForward, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor)
+ DISPATCH7(HighwayBackward, marian::Tensor, marian::Tensor, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor)
+
+ DISPATCH3(CopyRows, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
+ DISPATCH3(PasteRows, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
+ DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
+ DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
+
+ DISPATCH5(Select, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
+ DISPATCH5(Insert, marian::Tensor, marian::Tensor, int, const std::vector<size_t>&, Ptr<Allocator>)
+
+
+ DISPATCH2(LSTMCellForward, marian::Tensor, std::vector<marian::Tensor>)
+ DISPATCH2(LSTMOutputForward, marian::Tensor, std::vector<marian::Tensor>);
+
+ namespace gpu {
+ void LSTMCellBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj);
+ }
+
+ namespace cpu {
+ void LSTMCellBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj);
+ }
+
+ static inline void LSTMCellBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj) {
+ if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
+ gpu::LSTMCellBackward(outputs, inputs, adj);
+ }
+ else {
+ cpu::LSTMCellBackward(outputs, inputs, adj);
+ }
+ }
+
+ namespace gpu {
+ void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj);
+ }
+
+ namespace cpu {
+ void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj);
+ }
+
+ static inline void LSTMOutputBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj) {
+ if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
+ gpu::LSTMOutputBackward(outputs, inputs, adj);
+ }
+ else {
+ cpu::LSTMOutputBackward(outputs, inputs, adj);
+ }
+ }
+
+ DISPATCH3(GRUFastForward, marian::Tensor, std::vector<marian::Tensor>, bool)
+
+ namespace gpu {
+ void GRUFastBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj,
+ bool final);
+ }
+
+ namespace cpu {
+ void GRUFastBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj,
+ bool final);
+ }
+
+ static inline void GRUFastBackward(std::vector<marian::Tensor> outputs,
+ std::vector<marian::Tensor> inputs,
+ marian::Tensor adj,
+ bool final = false) {
+ if(adj->getBackend()->getDevice().type == DeviceType::gpu) {
+ gpu::GRUFastBackward(outputs, inputs, adj, final);
+ }
+ else {
+ cpu::GRUFastBackward(outputs, inputs, adj, final);
+ }
+ }
+
+ DISPATCH4(Att, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
+ DISPATCH7(AttBack, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor)
+
+ namespace gpu {
+ float L2Norm(marian::Tensor in);
+ }
+
+ namespace cpu {
+ float L2Norm(marian::Tensor in);
+ }
+
+ static inline float L2Norm(marian::Tensor in) {
+ if(in->getBackend()->getDevice().type == DeviceType::gpu) {
+ return gpu::L2Norm(in);
+ }
+ else {
+ return cpu::L2Norm(in);
+ }
+ }
+
+} // namespace marian
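Element, Add and Reduce are function templates, so they cannot go through the DISPATCHn macros and get hand-written backend switches instead; Reduce is just a zeroing of the output followed by Add, and everything non-templated uses the macros from dispatch.h. A hedged usage sketch with illustrative names, assuming the functional placeholders:

    using namespace functional;
    Reduce(_1, rowSums, in);         // rowSums = sum of in over broadcast dimensions
    Add(_1 * _2, 0.5f, out, a, b);   // out += 0.5 * a * b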
diff --git a/src/tests/attention_tests.cpp b/src/tests/attention_tests.cpp
index b42c126d..1a7afd03 100644
--- a/src/tests/attention_tests.cpp
+++ b/src/tests/attention_tests.cpp
@@ -7,8 +7,7 @@
using namespace marian;
-TEST_CASE("Model components, Attention", "[attention]") {
-
+void tests(DeviceType type) {
auto floatApprox = [](float x, float y) { return x == Approx(y).epsilon(0.01); };
std::vector<size_t> vWords = {
@@ -37,7 +36,7 @@ TEST_CASE("Model components, Attention", "[attention]") {
Config::seed = 1234;
auto graph = New<ExpressionGraph>();
- graph->setDevice({0, DeviceType::gpu});
+ graph->setDevice({0, type});
graph->reserveWorkspaceMB(16);
std::vector<float> values;
@@ -109,3 +108,11 @@ TEST_CASE("Model components, Attention", "[attention]") {
vAligned.begin(), floatApprox) );
}
}
+
+TEST_CASE("Model components, Attention (gpu)", "[attention]") {
+ tests(DeviceType::gpu);
+}
+
+TEST_CASE("Model components, Attention (cpu)", "[attention]") {
+ tests(DeviceType::cpu);
+}
diff --git a/src/tests/dropout_test.cpp b/src/tests/dropout_test.cpp
index 5023606b..f99dbf68 100644
--- a/src/tests/dropout_test.cpp
+++ b/src/tests/dropout_test.cpp
@@ -12,7 +12,9 @@ using namespace keywords;
int main(int argc, char** argv) {
auto c = New<Config>(argc, argv);
- auto type = c->get<bool>("cpu") ? DeviceType::cpu : DeviceType::gpu;
+ auto type = c->get<bool>("cpu")
+ ? DeviceType::cpu
+ : DeviceType::gpu;
DeviceId deviceId{0, type};
auto g = New<ExpressionGraph>();
@@ -21,7 +23,11 @@ int main(int argc, char** argv) {
for(int i = 0; i < 10; ++i) {
g->clear();
- auto mask = g->dropout(0.2, {10, 3072});
+ auto mask1 = g->dropout(0.2, {10, 3072});
+ auto mask2 = g->dropout(0.3, {1, 3072});
+ auto mask = mask1 + mask2;
+ debug(mask1, "mask1");
+ debug(mask2, "mask2");
debug(mask, "mask");
g->forward();
}
diff --git a/src/tests/operator_tests.cpp b/src/tests/operator_tests.cpp
index 0c3cab5a..680d171a 100644
--- a/src/tests/operator_tests.cpp
+++ b/src/tests/operator_tests.cpp
@@ -4,33 +4,17 @@
using namespace marian;
-TEST_CASE("Expression graph supports basic math operations", "[operator]") {
-
+void tests(DeviceType device) {
auto floatApprox = [](float x, float y) { return x == Approx(y); };
auto graph = New<ExpressionGraph>();
- graph->setDevice({0, DeviceType::gpu});
+ graph->setDevice({0, device});
graph->reserveWorkspaceMB(16);
std::vector<float> vA({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12});
std::vector<float> vB({1, 2, 3, 4, 5, 6});
std::vector<float> values;
- SECTION("dot product") {
- graph->clear();
- values.clear();
- std::vector<float> vC({22, 28, 49, 64, 76, 100, 103, 136});
-
- auto A = graph->param("A", {2, 2, 3}, keywords::init = inits::from_vector(vA));
- auto B = graph->param("B", {3, 2}, keywords::init = inits::from_vector(vB));
- auto C = dot(A, B);
- graph->forward();
-
- CHECK(C->shape() == Shape({2, 2, 2}));
- C->val()->get(values);
- CHECK(values == vC);
- }
-
SECTION("scalar multiplication") {
graph->clear();
values.clear();
@@ -45,38 +29,6 @@ TEST_CASE("Expression graph supports basic math operations", "[operator]") {
CHECK(values == vB2);
}
- SECTION("softmax and logsoftmax") {
- graph->clear();
- values.clear();
- std::vector<float> in({-.2, -.3, 4.5, 5.2, -10, 101.45, -100.05, 1.05e-5});
-
- std::vector<float> smOut({ 0.52498f, 0.47502f, 0.33181f, 0.66819f,
- 0.0f, 1.0f, 0.0f, 1.0f });
-
- std::vector<float> lsmOut({ -0.6444f, -0.7444f, -1.10319f, -0.40319f,
- -111.45f, 0.0f, -100.05001f, 0.0f });
-
- auto input = graph->constant({2, 2, 2}, keywords::init = inits::from_vector(in));
-
- auto sm = softmax(input);
- auto lsm = logsoftmax(input);
-
- graph->forward();
-
- CHECK(sm->shape() == Shape({2, 2, 2}));
- CHECK(lsm->shape() == Shape({2, 2, 2}));
-
- sm->val()->get(values);
-
- CHECK( std::equal(values.begin(), values.end(),
- smOut.begin(), floatApprox) );
-
- lsm->val()->get(values);
-
- CHECK( std::equal(values.begin(), values.end(),
- lsmOut.begin(), floatApprox) );
- }
-
SECTION("elementwise binary operators with broadcasting") {
graph->clear();
values.clear();
@@ -163,6 +115,69 @@ TEST_CASE("Expression graph supports basic math operations", "[operator]") {
CHECK( values == vT5 );
}
+ SECTION("softmax and logsoftmax") {
+ graph->clear();
+ values.clear();
+ std::vector<float> in({-.2, -.3, 4.5, 5.2, -10, 101.45, -100.05, 1.05e-5});
+
+ std::vector<float> smOut({ 0.52498f, 0.47502f, 0.33181f, 0.66819f,
+ 0.0f, 1.0f, 0.0f, 1.0f });
+
+ std::vector<float> lsmOut({ -0.6444f, -0.7444f, -1.10319f, -0.40319f,
+ -111.45f, 0.0f, -100.05001f, 0.0f });
+
+ auto input = graph->constant({2, 2, 2}, keywords::init = inits::from_vector(in));
+
+ auto sm = softmax(input);
+ auto lsm = logsoftmax(input);
+
+ graph->forward();
+
+ CHECK(sm->shape() == Shape({2, 2, 2}));
+ CHECK(lsm->shape() == Shape({2, 2, 2}));
+
+ sm->val()->get(values);
+
+ CHECK( std::equal(values.begin(), values.end(),
+ smOut.begin(), floatApprox) );
+
+ lsm->val()->get(values);
+
+ CHECK( std::equal(values.begin(), values.end(),
+ lsmOut.begin(), floatApprox) );
+ }
+
+ SECTION("layer normalization") {
+ graph->clear();
+ values.clear();
+
+ Config::seed = 1234;
+
+ std::vector<float> vLn({
+ -1.20521, -0.321409, -0.0363369, 1.56296,
+ 0.332987, -0.613398, -1.17766, 1.45807,
+ -0.731601, -0.187812, -0.766431, 1.68584,
+ -1.31923, -0.059028, 1.49732, -0.119065
+ });
+
+ auto a = graph->constant({2, 2, 4}, keywords::init=inits::glorot_uniform);
+
+ auto gamma = graph->param("gamma", {1, 4}, keywords::init=inits::ones);
+ auto beta = graph->param("beta", {1, 4}, keywords::init=inits::zeros);
+
+ auto ln = layer_norm(a, gamma, beta);
+
+ graph->forward();
+
+ CHECK(ln->shape() == Shape({2, 2, 4}));
+
+
+ ln->val()->get(values);
+ CHECK( std::equal(values.begin(), values.end(),
+ vLn.begin(), floatApprox) );
+
+ }
+
SECTION("reductions") {
graph->clear();
values.clear();
@@ -261,35 +276,26 @@ TEST_CASE("Expression graph supports basic math operations", "[operator]") {
CHECK( values == vO4 );
}
- SECTION("layer normalization") {
+ SECTION("dot product") {
graph->clear();
values.clear();
+ std::vector<float> vC({22, 28, 49, 64, 76, 100, 103, 136});
- Config::seed = 1234;
-
- std::vector<float> vLn({
- -1.20521, -0.321409, -0.0363369, 1.56296,
- 0.332987, -0.613398, -1.17766, 1.45807,
- -0.731601, -0.187812, -0.766431, 1.68584,
- -1.31923, -0.059028, 1.49732, -0.119065
- });
-
- auto a = graph->constant({2, 2, 4}, keywords::init=inits::glorot_uniform);
-
- auto gamma = graph->param("gamma", {1, 4}, keywords::init=inits::ones);
- auto beta = graph->param("beta", {1, 4}, keywords::init=inits::zeros);
-
- auto ln = layer_norm(a, gamma, beta);
-
+ auto A = graph->param("A", {2, 2, 3}, keywords::init = inits::from_vector(vA));
+ auto B = graph->param("B", {3, 2}, keywords::init = inits::from_vector(vB));
+ auto C = dot(A, B);
graph->forward();
- CHECK(ln->shape() == Shape({2, 2, 4}));
-
-
- ln->val()->get(values);
- CHECK( std::equal(values.begin(), values.end(),
- vLn.begin(), floatApprox) );
-
+ CHECK(C->shape() == Shape({2, 2, 2}));
+ C->val()->get(values);
+ CHECK(values == vC);
}
+}
+
+TEST_CASE("Expression graph supports basic math operations (gpu)", "[operator]") {
+ tests(DeviceType::gpu);
+}
+TEST_CASE("Expression graph supports basic math operations (cpu)", "[operator]") {
+ tests(DeviceType::cpu);
}
diff --git a/src/tests/rnn_tests.cpp b/src/tests/rnn_tests.cpp
index 681bdbe8..b73e1abe 100644
--- a/src/tests/rnn_tests.cpp
+++ b/src/tests/rnn_tests.cpp
@@ -6,8 +6,7 @@
using namespace marian;
-TEST_CASE("Model components, RNN etc.", "[model]") {
-
+void tests(DeviceType type) {
auto floatApprox = [](float x, float y) { return x == Approx(y).epsilon(0.01); };
std::vector<size_t> vWords = {
@@ -36,7 +35,7 @@ TEST_CASE("Model components, RNN etc.", "[model]") {
Config::seed = 1234;
auto graph = New<ExpressionGraph>();
- graph->setDevice({0, DeviceType::gpu});
+ graph->setDevice({0, type});
graph->reserveWorkspaceMB(16);
std::vector<float> values;
@@ -74,7 +73,7 @@ TEST_CASE("Model components, RNN etc.", "[model]") {
Config::seed = 1234;
auto graph = New<ExpressionGraph>();
- graph->setDevice({0, DeviceType::gpu});
+ graph->setDevice({0, type});
graph->reserveWorkspaceMB(16);
std::vector<float> values;
@@ -279,3 +278,11 @@ TEST_CASE("Model components, RNN etc.", "[model]") {
// vContextSum3.begin(), floatApprox) );
}
}
+
+TEST_CASE("Model components, RNN etc. (gpu)", "[model]") {
+ tests(DeviceType::gpu);
+}
+
+TEST_CASE("Model components, RNN etc. (cpu)", "[model]") {
+ tests(DeviceType::cpu);
+}
diff --git a/src/training/dropper.cu b/src/training/dropper.cu
index b8870bb3..75e82433 100644
--- a/src/training/dropper.cu
+++ b/src/training/dropper.cu
@@ -4,8 +4,8 @@
#include <thrust/sort.h>
#include <memory>
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/gpu/cuda_helpers.h"
+#include "tensors/tensor_operators.h"
#include "training/dropper.h"
#include "training/sparse_tensor.h"
diff --git a/src/training/graph_group_async.cu b/src/training/graph_group_async.cu
index 18f3908a..faba19e9 100644
--- a/src/training/graph_group_async.cu
+++ b/src/training/graph_group_async.cu
@@ -1,6 +1,6 @@
#include "training/graph_group_async.h"
-
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
+#include "functional/functional.h"
namespace marian {
diff --git a/src/training/graph_group_async_drop.cu b/src/training/graph_group_async_drop.cu
index f92077bc..7dabc796 100644
--- a/src/training/graph_group_async_drop.cu
+++ b/src/training/graph_group_async_drop.cu
@@ -2,7 +2,7 @@
#include "training/graph_group_async_drop.h"
#include "functional/functional.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
#include "training/dropper.h"
#include "training/sparse_tensor.h"
diff --git a/src/training/graph_group_multinode.cu b/src/training/graph_group_multinode.cu
index 34aa2b5d..2437cee9 100644
--- a/src/training/graph_group_multinode.cu
+++ b/src/training/graph_group_multinode.cu
@@ -1,5 +1,6 @@
#include "training/graph_group_multinode.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
+#include "functional/functional.h"
namespace marian {
diff --git a/src/training/graph_group_singleton.cu b/src/training/graph_group_singleton.cu
index 68398af5..509e50c6 100644
--- a/src/training/graph_group_singleton.cu
+++ b/src/training/graph_group_singleton.cu
@@ -1,5 +1,6 @@
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor_operators.h"
#include "training/graph_group_singleton.h"
+#include "functional/functional.h"
namespace marian {
diff --git a/src/training/graph_group_sync.cu b/src/training/graph_group_sync.cu
index 171c0652..87ec6784 100644
--- a/src/training/graph_group_sync.cu
+++ b/src/training/graph_group_sync.cu
@@ -1,5 +1,6 @@
-#include "kernels/tensor_operators.h"
#include "training/graph_group_sync.h"
+#include "tensors/tensor_operators.h"
+#include "functional/functional.h"
namespace marian {
diff --git a/src/training/sparse_tensor.cu b/src/training/sparse_tensor.cu
index aafafa97..d3fad64c 100644
--- a/src/training/sparse_tensor.cu
+++ b/src/training/sparse_tensor.cu
@@ -4,8 +4,8 @@
#include <thrust/sort.h>
#include <memory>
-#include "kernels/cuda_helpers.h"
-#include "kernels/tensor_operators.h"
+#include "tensors/tensor.h"
+#include "tensors/tensor_operators.h"
#include "training/sparse_tensor.h"
namespace marian {
diff --git a/src/training/sparse_tensor.h b/src/training/sparse_tensor.h
index 9194748f..2c5fe911 100644
--- a/src/training/sparse_tensor.h
+++ b/src/training/sparse_tensor.h
@@ -3,6 +3,7 @@
#include <memory>
#include "common/definitions.h"
+#include "tensors/backend.h"
namespace marian {
class SparseTensorBase : public std::enable_shared_from_this<SparseTensorBase> {
diff --git a/src/translator/nth_element.cu b/src/translator/nth_element.cu
index 9ae19478..978b6a62 100644
--- a/src/translator/nth_element.cu
+++ b/src/translator/nth_element.cu
@@ -1,8 +1,7 @@
#include <iostream>
#include "translator/nth_element.h"
-
-#include "kernels/cuda_helpers.h"
+#include "tensors/gpu/cuda_helpers.h"
namespace marian {