github.com/marian-nmt/marian.git
author     Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-06-29 00:32:12 +0300
committer  Marcin Junczys-Dowmunt <junczys@amu.edu.pl>  2018-06-29 00:32:12 +0300
commit     34b16ff585055bb18ccd85db196d65ff236a1c7e (patch)
tree       b09f646b92616d8663b74cae6bf4a8cca61403d1
parent     352a437ab49ec00be944e11ed4bba0d52ac49931 (diff)
parent     54dac41e9dc044105f9e5cc4df8345c876a22c09 (diff)
resolve merge conflict
-rw-r--r--  src/functional/predicates.h            12
-rw-r--r--  src/graph/expression_operators.cpp     29
-rw-r--r--  src/graph/expression_operators.h       22
-rw-r--r--  src/graph/node_operators_binary.h      20
-rw-r--r--  src/graph/node_operators_unary.h       14
-rw-r--r--  src/layers/generic.h                   28
-rw-r--r--  src/models/encoder.h                    7
-rw-r--r--  src/models/transformer.h              641
-rw-r--r--  src/rnn/attention.h                     8
-rw-r--r--  src/rnn/cells.h                        34
-rw-r--r--  src/tensors/cpu/tensor_operators.cpp   24
-rw-r--r--  src/tensors/gpu/add.inc                 6
-rw-r--r--  src/tensors/gpu/element.inc            10
-rw-r--r--  src/tensors/gpu/tensor_operators.cu    26
-rw-r--r--  src/tests/tensor_test.cu               10
15 files changed, 393 insertions(+), 498 deletions(-)
diff --git a/src/functional/predicates.h b/src/functional/predicates.h
index e129036e..51af38ad 100644
--- a/src/functional/predicates.h
+++ b/src/functional/predicates.h
@@ -85,8 +85,8 @@ UNARY(Exp, exp, expf(x));
UNARY(Abs, abs, fabs(x));
UNARY(Sqrt, sqrt, sqrtf(x));
UNARY(Neg, operator-, -x);
-UNARY(Logit,
- logit,
+UNARY(Sigmoid,
+ sigmoid,
x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x))));
BINARY(Plus, operator+, x + y);
@@ -94,14 +94,14 @@ BINARY(Minus, operator-, x - y);
BINARY(Mult, operator*, x* y);
BINARY(Div, operator/, x / y);
-BINARY(LogSum,
- logsum,
+BINARY(LogAddExp,
+ logaddexp,
(/*if*/ (x < y) ? // Note: This may not be ideal for CUDA; cf. CNTK implementation
(y + log1pf(expf(x - y)))
/*else*/ :
(x + log1pf(expf(y - x)))));
-BINARY(Max, max, (x > y) ? x : y); // note: std::max not available on CUDA it seems
-BINARY(Min, min, (x < y) ? x : y);
+BINARY(Maximum, max, (x > y) ? x : y); // note: std::max not available on CUDA it seems
+BINARY(Minimum, min, (x < y) ? x : y);
UNARY(Negate, operator!, !x);
BINARY(Eq, operator==, x == y);
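
For reference, a minimal standalone C++ sketch of the two renamed primitives (illustrative names, not part of Marian), using the same numerically stable branches as the functors above so that expf() never overflows:

#include <cmath>
#include <cstdio>

// Stable sigmoid: for x > 0 use 1/(1+e^-x); otherwise e^x/(1+e^x).
static float stable_sigmoid(float x) {
  return x > 0 ? 1.f / (1.f + expf(-x)) : expf(x) / (1.f + expf(x));
}

// Stable log(e^x + e^y): factor out the larger argument, then log1pf of a value in (0, 1].
static float log_add_exp(float x, float y) {
  return x < y ? y + log1pf(expf(x - y)) : x + log1pf(expf(y - x));
}

int main() {
  std::printf("%g %g\n", stable_sigmoid(-1000.f), stable_sigmoid(1000.f)); // ~0 and ~1, no overflow
  std::printf("%g\n", log_add_exp(1000.f, 1000.f));                        // 1000 + log(2), no overflow
}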
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 91a7ffc2..1666357a 100644
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -15,8 +15,8 @@ Expr debug(Expr a, const std::string& message) {
return a;
}
-Expr logit(Expr a) {
- return Expression<LogitNodeOp>(a);
+Expr sigmoid(Expr a) { // logistic function. Note: scipy name is expit()
+ return Expression<SigmoidNodeOp>(a);
}
Expr relu(Expr a) {
@@ -80,17 +80,16 @@ Expr operator/(Expr a, Expr b) {
return Expression<DivNodeOp>(a, b);
}
-// on names: stay close to Python/numpy?
-Expr logsum(Expr a, Expr b) { // TODO: haggle over the name (logplus, logadd, expAddLog)
- return Expression<LogSumNodeOp>(a, b);
+Expr logaddexp(Expr a, Expr b) {
+ return Expression<LogAddExpNodeOp>(a, b);
}
-Expr max(Expr a, Expr b) { // TODO: haggle over the name (max vs. elementMax)
- return Expression<MaxNodeOp>(a, b);
+Expr maximum(Expr a, Expr b) {
+ return Expression<MaximumNodeOp>(a, b);
}
-Expr min(Expr a, Expr b) { // TODO: haggle over the name
- return Expression<MinNodeOp>(a, b);
+Expr minimum(Expr a, Expr b) {
+ return Expression<MinimumNodeOp>(a, b);
}
/*********************************************************/
@@ -387,7 +386,7 @@ Expr tanh(const std::vector<Expr>& nodes) {
return Expression<TanhNodeOp>(nodes);
}
-Expr logit(const std::vector<Expr>&) {
+Expr sigmoid(const std::vector<Expr>&) {
ABORT("Not implemented");
}
@@ -411,10 +410,10 @@ Expr square(Expr a) {
return Expression<SquareNodeOp>(a);
}
-Expr layer_norm(Expr x,
- Expr gamma,
- Expr beta /*= nullptr*/,
- float eps /*= 1e-9*/) {
+Expr layerNorm(Expr x,
+ Expr gamma,
+ Expr beta /*= nullptr*/,
+ float eps /*= 1e-9*/) {
std::vector<Expr> nodes = {x, gamma};
if(beta)
nodes.push_back(beta);
@@ -432,7 +431,7 @@ Expr highway(const std::string prefix, Expr x) {
auto g = mlp::dense(x->graph())
("prefix", prefix + "_highway_d1")
("dim", outDim)
- ("activation", mlp::act::logit)
+ ("activation", mlp::act::sigmoid)
.construct()->apply(x);
auto relued = mlp::dense(x->graph())
("prefix", prefix + "_highway_d2")
diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h
index f1bea94a..cc07dafb 100644
--- a/src/graph/expression_operators.h
+++ b/src/graph/expression_operators.h
@@ -5,10 +5,12 @@ namespace marian {
Expr debug(Expr a, const std::string& message = "");
+typedef Expr(ActivationFunction) (Expr);
+
Expr plus(const std::vector<Expr>&);
-Expr logit(Expr a); // aka sigmoid --BUGBUG: should be logistic(), not logit()
-Expr logit(const std::vector<Expr>&);
+Expr sigmoid(Expr a); // logistic function. Note: scipy name is expit()
+Expr sigmoid(const std::vector<Expr>&);
Expr swish(Expr a);
Expr swish(const std::vector<Expr>&);
@@ -60,7 +62,7 @@ Expr operator/(Expr a, float b);
// Expr pow(float a, Expr b);
// Expr pow(Expr a, float b);
-Expr logsum(Expr a, Expr b); // TODO: haggle over the name (logplus, logadd, expAddLog)
+Expr logaddexp(Expr a, Expr b);
Expr max(Expr a, Expr b); // TODO: haggle over the name (max vs. elementMax)
@@ -128,7 +130,7 @@ Expr step(Expr a, int step, int axis);
Expr sqrt(Expr a, float eps = 0.f);
Expr square(Expr a);
-Expr layer_norm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9);
+Expr layerNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9);
Expr highway(Expr y, Expr x, Expr t);
Expr highway(const std::string prefix, Expr x);
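
For reference, highway(y, x, t) declared above implements the highway connection of Srivastava et al. (2015): a sigmoid transform gate built from t mixes the transformed input y with the untransformed input x. A sketch of the intended math (the exact parameterization lives in the implementation, not in this header):

  \mathrm{highway}(y, x, t) = \sigma(t) \odot y + \bigl(1 - \sigma(t)\bigr) \odot x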
@@ -137,14 +139,18 @@ static inline Expr dropout(Expr x, Expr mask) {
return x * mask;
}
-static inline Expr dropout(Expr x, float prob, Shape shape) {
+static inline Expr dropout(Expr x, float dropProb, Shape shape) {
+ if (dropProb == 0)
+ return x;
auto graph = x->graph();
- auto mask = graph->dropout(prob, shape);
+ auto mask = graph->dropout(dropProb, shape);
return dropout(x, mask);
}
-static inline Expr dropout(Expr x, float prob) {
- return dropout(x, prob, x->shape());
+static inline Expr dropout(Expr x, float dropProb) {
+ if (dropProb == 0)
+ return x;
+ return dropout(x, dropProb, x->shape());
}
Expr shift(Expr, Shape, float padValue = 0);
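
The early return added to dropout() above skips building a mask node when dropProb is zero (e.g. at inference). A minimal sketch of the same inverted-dropout pattern outside the graph API; the 1/(1-p) scaling is the usual convention and an assumption here, since graph->dropout() itself is not shown in this diff:

#include <random>
#include <vector>

// Inverted dropout: zero each element with probability p, scale survivors by 1/(1-p),
// and return the input untouched when p == 0, mirroring the early return above.
std::vector<float> dropoutVector(const std::vector<float>& x, float p, std::mt19937& rng) {
  if (p == 0.f)
    return x;
  std::bernoulli_distribution keep(1.0 - p);
  std::vector<float> y(x.size());
  for (std::size_t i = 0; i < x.size(); ++i)
    y[i] = keep(rng) ? x[i] / (1.f - p) : 0.f;
  return y;
}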
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index cff55955..5b1f9865 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -528,29 +528,29 @@ struct DivNodeOp : public ElementBinaryNodeOp {
// const std::string type() { return "pow"; }
//};
-struct LogSumNodeOp : public ElementBinaryNodeOp {
- LogSumNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {}
+struct LogAddExpNodeOp : public ElementBinaryNodeOp {
+ LogAddExpNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {}
NodeOps forwardOps() {
using namespace functional;
return{
- NodeOp(Element(_1 = logsum(_2, _3), val_, child(0)->val(), child(1)->val())) };
+ NodeOp(Element(_1 = logaddexp(_2, _3), val_, child(0)->val(), child(1)->val())) };
}
NodeOps backwardOps() {
using namespace functional;
// d/dx ln(exp(x) + exp(y)) = exp(x) / (exp(x) + exp(y)) = 1 / (1 + exp(y-x)) = sigmoid(x-y)
- return{ NodeOp(Add(_1 * logit(_2 - _3), child(0)->grad(), adj_, child(0)->val(), child(1)->val())),
- NodeOp(Add(_1 * logit(_3 - _2), child(1)->grad(), adj_, child(0)->val(), child(1)->val())) };
+ return{ NodeOp(Add(_1 * sigmoid(_2 - _3), child(0)->grad(), adj_, child(0)->val(), child(1)->val())),
+ NodeOp(Add(_1 * sigmoid(_3 - _2), child(1)->grad(), adj_, child(0)->val(), child(1)->val())) };
}
// TODO: this is not a "type" (as in data type). It's an operator name.
- const std::string type() { return "logsum"; }
+ const std::string type() { return "logaddexp"; }
};
-struct MaxNodeOp : public ElementBinaryNodeOp {
- MaxNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {}
+struct MaximumNodeOp : public ElementBinaryNodeOp {
+ MaximumNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {}
NodeOps forwardOps() {
using namespace functional;
@@ -569,8 +569,8 @@ struct MaxNodeOp : public ElementBinaryNodeOp {
};
// TODO: lotsa code dup here!
-struct MinNodeOp : public ElementBinaryNodeOp {
- MinNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {}
+struct MinimumNodeOp : public ElementBinaryNodeOp {
+ MinimumNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {}
NodeOps forwardOps() {
using namespace functional;
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index dda4dd03..fa6d25c7 100644
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -138,12 +138,12 @@ public:
}
};
-struct LogitNodeOp : public UnaryNodeOp {
- LogitNodeOp(Expr a) : UnaryNodeOp(a) {}
+struct SigmoidNodeOp : public UnaryNodeOp {
+ SigmoidNodeOp(Expr a) : UnaryNodeOp(a) {}
NodeOps forwardOps() {
using namespace functional;
- return {NodeOp(Element(_1 = logit(_2), val_, child(0)->val()))};
+ return {NodeOp(Element(_1 = sigmoid(_2), val_, child(0)->val()))};
}
NodeOps backwardOps() {
@@ -151,7 +151,7 @@ struct LogitNodeOp : public UnaryNodeOp {
return {NodeOp(Add(_1 * _2 * (1.0f - _2), child(0)->grad(), adj_, val_))};
}
- const std::string type() { return "logit"; }
+ const std::string type() { return "sigmoid"; }
};
// struct Scalar2PowNodeOp : public UnaryNodeOp {
@@ -350,13 +350,13 @@ struct SwishNodeOp : public UnaryNodeOp {
NodeOps forwardOps() {
using namespace functional;
- return {NodeOp(Element(_1 = _2 * logit(_2), val_, child(0)->val()))};
+ return {NodeOp(Element(_1 = _2 * sigmoid(_2), val_, child(0)->val()))};
}
NodeOps backwardOps() {
using namespace functional;
// dJ/dx += dJ/df * ( f(x) + sigma(x) * (1 - f(x)) )
- return {NodeOp(Add(_1 * (_3 + logit(_2) * (1.f - _3)),
+ return {NodeOp(Add(_1 * (_3 + sigmoid(_2) * (1.f - _3)),
child(0)->grad(), // dJ/dx
adj_, // _1 := dJ/df
child(0)->val(), // _2 := x
@@ -936,8 +936,10 @@ public:
Shape outShape = a->shape();
axis_ = outShape.axis(axis);
+#if 0 // this check currently fails in translation; I think it should not fail for step==0
for(int i = 0; i < axis_; ++i)
ABORT_IF(outShape[i] != 1, "non-consecutive slices are presently not supported by step()");
+#endif
outShape.set(axis_, 1);
return outShape;
diff --git a/src/layers/generic.h b/src/layers/generic.h
index dcb9b955..8b19123b 100644
--- a/src/layers/generic.h
+++ b/src/layers/generic.h
@@ -7,7 +7,7 @@
namespace marian {
namespace mlp {
-enum struct act : int { linear, tanh, logit, ReLU, LeakyReLU, PReLU, swish };
+enum struct act : int { linear, tanh, sigmoid, ReLU, LeakyReLU, PReLU, swish };
}
}
@@ -50,8 +50,8 @@ public:
auto name = opt<std::string>("prefix");
auto dim = opt<int>("dim");
- auto layerNorm = opt<bool>("layer-normalization", false);
- auto nematusNorm = opt<bool>("nematus-normalization", false);
+ auto useLayerNorm = opt<bool>("layer-normalization", false);
+ auto useNematusNorm = opt<bool>("nematus-normalization", false);
auto activation = opt<act>("activation", act::linear);
auto g = graph_;
@@ -71,8 +71,8 @@ public:
{1, dim},
inits::zeros);
- if(layerNorm) {
- if(nematusNorm) {
+ if(useLayerNorm) {
+ if(useNematusNorm) {
auto ln_s = g->param(name + "_ln_s" + num,
{1, dim},
inits::from_value(1.f));
@@ -80,13 +80,13 @@ public:
{1, dim},
inits::zeros);
- outputs.push_back(layer_norm(affine(in, W, b), ln_s, ln_b, NEMATUS_LN_EPS));
+ outputs.push_back(layerNorm(affine(in, W, b), ln_s, ln_b, NEMATUS_LN_EPS));
} else {
auto gamma = g->param(name + "_gamma" + num,
{1, dim},
inits::from_value(1.0));
- outputs.push_back(layer_norm(dot(in, W), gamma, b));
+ outputs.push_back(layerNorm(dot(in, W), gamma, b));
}
} else {
@@ -96,14 +96,14 @@ public:
}
switch(activation) {
- case act::linear: return plus(outputs);
- case act::tanh: return tanh(outputs);
- case act::logit: return logit(outputs);
- case act::ReLU: return relu(outputs);
+ case act::linear: return plus(outputs);
+ case act::tanh: return tanh(outputs);
+ case act::sigmoid: return sigmoid(outputs);
+ case act::ReLU: return relu(outputs);
case act::LeakyReLU: return leakyrelu(outputs);
- case act::PReLU: return prelu(outputs);
- case act::swish: return swish(outputs);
- default: return plus(outputs);
+ case act::PReLU: return prelu(outputs);
+ case act::swish: return swish(outputs);
+ default: return plus(outputs);
}
};
diff --git a/src/models/encoder.h b/src/models/encoder.h
index 6a0a62f5..f23be88b 100644
--- a/src/models/encoder.h
+++ b/src/models/encoder.h
@@ -12,9 +12,10 @@ protected:
bool inference_{false};
size_t batchIndex_{0};
- virtual std::tuple<Expr, Expr> lookup(Ptr<ExpressionGraph> graph,
- Expr srcEmbeddings,
- Ptr<data::CorpusBatch> batch) {
+ //virtual --Note: This used to be virtual, but is never overridden.
+ std::tuple<Expr, Expr> lookup(Ptr<ExpressionGraph> graph,
+ Expr srcEmbeddings,
+ Ptr<data::CorpusBatch> batch) const {
using namespace keywords;
auto subBatch = (*batch)[batchIndex_];
diff --git a/src/models/transformer.h b/src/models/transformer.h
index 4601bb9b..f7a80b2a 100644
--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@@ -1,5 +1,5 @@
// TODO: This is really a .CPP file now. I kept the .H name to minimize confusing git, until this is code-reviewed.
-// This is meant to speed-up builds, and to support Ctrl-F7 to rebuild
+// This is meant to speed up builds, and to support Ctrl-F7 to rebuild.
#pragma once
@@ -14,17 +14,24 @@
namespace marian {
-// collection of subroutines for Transformer implementation
-class Transformer {
+// shared base class for transformer-based encoder and decoder
+template<class EncoderDecoderBase>
+class Transformer : public EncoderDecoderBase {
+ typedef EncoderDecoderBase Base;
+protected:
+ using Base::options_; using Base::inference_;
+ template <typename T> T opt(const std::string& key) const { Ptr<Options> options = options_; return options->get<T>(key); } // need to duplicate, since somehow using Base::opt is not working
+
+ Ptr<ExpressionGraph> graph_;
public:
- static Expr TransposeTimeBatch(Expr input) { return transpose(input, {0, 2, 1, 3}); }
+ Transformer(Ptr<Options> options)
+ : EncoderDecoderBase(options) {
+ }
- static Expr AddPositionalEmbeddings(Ptr<ExpressionGraph> graph,
- Expr input,
- int start = 0) {
- using namespace keywords;
+ static Expr transposeTimeBatch(Expr input) { return transpose(input, {0, 2, 1, 3}); }
- int dimEmb = input->shape()[-1];
+ Expr addPositionalEmbeddings(Expr input, int start = 0) const {
+ int dimEmb = input->shape()[-1];
int dimWords = input->shape()[-3];
float num_timescales = dimEmb / 2;
@@ -41,19 +48,17 @@ public:
// shared across batch entries
auto signal
- = graph->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos));
+ = graph_->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos));
return input + signal;
}
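
For context, the vPos signal added here (num_timescales = dimEmb / 2) follows the sinusoidal position encoding of Vaswani et al. (2017). In the standard formulation, for position pos and channel pair i (the exact channel layout, interleaved vs. split halves, is determined by the vPos loop elided in this hunk):

  \mathrm{PE}(\mathit{pos}, 2i)   = \sin\!\bigl(\mathit{pos} / 10000^{2i/d_{\mathrm{model}}}\bigr)
  \mathrm{PE}(\mathit{pos}, 2i+1) = \cos\!\bigl(\mathit{pos} / 10000^{2i/d_{\mathrm{model}}}\bigr)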
- Expr TriangleMask(Ptr<ExpressionGraph> graph, int length) {
- using namespace keywords;
-
+ Expr triangleMask(int length) const {
// fill triangle mask
std::vector<float> vMask(length * length, 0);
for(int i = 0; i < length; ++i)
for(int j = 0; j <= i; ++j)
vMask[i * length + j] = 1.f;
- return graph->constant({1, length, length}, inits::from_vector(vMask));
+ return graph_->constant({1, length, length}, inits::from_vector(vMask));
}
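
A worked example of the mask this builds, for length = 3 (rows are query positions, columns the positions that may be attended to; position t sees positions 1..t only):

  \begin{pmatrix} 1 & 0 & 0 \\ 1 & 1 & 0 \\ 1 & 1 & 1 \end{pmatrix}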
// convert multiplicative 1/0 mask to additive 0/-inf log mask, and transpose to match result of bdot() op in Attention()
@@ -67,14 +72,14 @@ public:
int dimModel = input->shape()[-1];
int dimSteps = input->shape()[-2];
int dimBatch = input->shape()[-3];
- int dimBeam = input->shape()[-4];
+ int dimBeam = input->shape()[-4];
int dimDepth = dimModel / dimHeads;
auto output
= reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth});
- return transpose(output, {0, 2, 1, 3});
+ return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth]
}
static Expr JoinHeads(Expr input, int dimBeam = 1) {
@@ -91,84 +96,77 @@ public:
return reshape(output, {dimBeam, dimBatch, dimSteps, dimModel});
}
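
In shape terms, SplitHeads and JoinHeads are inverse reshapes. A worked example with dimModel = 512 and dimHeads = 8 (so dimDepth = 64):

  [\,\mathit{beam}\!\cdot\!\mathit{batch},\ \mathit{steps},\ 512\,]
    \xrightarrow{\ \mathrm{SplitHeads}\ } [\,\mathit{beam}\!\cdot\!\mathit{batch},\ 8,\ \mathit{steps},\ 64\,]
    \xrightarrow{\ \mathrm{JoinHeads}\ } [\,\mathit{beam},\ \mathit{batch},\ \mathit{steps},\ 512\,]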
- static Expr PreProcess(Ptr<ExpressionGraph> graph,
- std::string prefix,
- std::string ops,
- Expr input,
- float dropProb = 0.0f) {
- using namespace keywords;
+ // like affine() but with built-in parameters, activation, and dropout
+ static inline
+ Expr dense(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f)
+ {
+ auto graph = x->graph();
+
+ auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorot_uniform);
+ auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros);
+
+ x = affine(x, W, b);
+ if (actFn)
+ x = actFn(x);
+ if (dropProb)
+ x = dropout(x, dropProb);
+ return x;
+ }
- int dimModel = input->shape()[-1];
+ Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) const {
+ int dimModel = x->shape()[-1];
+ auto scale = graph_->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones);
+ auto bias = graph_->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros);
+ return marian::layerNorm(x, scale, bias, 1e-6);
+ }
+
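
A hypothetical usage sketch of the two helpers just introduced; the prefix, dimensions, and dropout rate below are made up for illustration and do not correspond to actual parameter names in the model:

  // feed-forward sub-layer body written with the new helpers (illustrative only)
  auto hidden    = dense(x, "enc_l1_ffn", /*suffix=*/"1", /*outDim=*/2048,
                         (ActivationFunction*)relu, /*dropProb=*/0.1f);
  auto projected = dense(hidden, "enc_l1_ffn", /*suffix=*/"2", /*outDim=*/512);
  auto normed    = layerNorm(projected, "enc_l1_ffn"); // creates enc_l1_ffn_ln_scale / enc_l1_ffn_ln_bias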
+ Expr preProcess(std::string prefix, std::string ops, Expr input, float dropProb = 0.0f) const {
auto output = input;
for(auto op : ops) {
// dropout
- if(op == 'd' && dropProb > 0.0f) {
+ if (op == 'd')
output = dropout(output, dropProb);
- }
// layer normalization
- if(op == 'n') {
- auto scale = graph->param(
- prefix + "_ln_scale_pre", {1, dimModel}, inits::ones);
- auto bias = graph->param(
- prefix + "_ln_bias_pre", {1, dimModel}, inits::zeros);
- output = layer_norm(output, scale, bias, 1e-6);
- }
+ else if (op == 'n')
+ output = layerNorm(output, prefix, "_pre");
+ else
+ ABORT("Unknown pre-processing operation '%c'", op);
}
return output;
}
- static Expr PostProcess(Ptr<ExpressionGraph> graph,
- std::string prefix,
- std::string ops,
- Expr input,
- Expr prevInput,
- float dropProb = 0.0f) {
- using namespace keywords;
-
- int dimModel = input->shape()[-1];
+ Expr postProcess(std::string prefix, std::string ops, Expr input, Expr prevInput, float dropProb = 0.0f) const {
auto output = input;
for(auto op : ops) {
// dropout
- if(op == 'd' && dropProb > 0.0f) {
+ if(op == 'd')
output = dropout(output, dropProb);
- }
// skip connection
- if(op == 'a') {
+ else if(op == 'a')
output = output + prevInput;
- }
// highway connection
- if(op == 'h') {
- auto Wh = graph->param(
- prefix + "_Wh", {dimModel, dimModel}, inits::glorot_uniform);
- auto bh = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros);
-
- auto t = affine(prevInput, Wh, bh);
+ else if(op == 'h') {
+ int dimModel = input->shape()[-1];
+ auto t = dense(prevInput, prefix, /*suffix=*/"h", dimModel);
output = highway(output, prevInput, t);
}
// layer normalization
- if(op == 'n') {
- auto scale
- = graph->param(prefix + "_ln_scale", {1, dimModel}, inits::ones);
- auto bias
- = graph->param(prefix + "_ln_bias", {1, dimModel}, inits::zeros);
- output = layer_norm(output, scale, bias, 1e-6);
- }
+ else if(op == 'n')
+ output = layerNorm(output, prefix);
+ else
+      ABORT("Unknown post-processing operation '%c'", op);
}
return output;
}
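
The ops strings come from --transformer-preprocess / --transformer-postprocess. As an example, with the commonly used post-process setting "dan" (an assumption about the configuration, not something fixed by this diff), the call applies dropout, then the residual add, then layer normalization:

  output = postProcess(prefix + "_ffn", /*ops=*/"dan", output, input, dropProb); // d: dropout, a: add skip, n: layer-norm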
// determines the multiplicative-attention probability and performs the associative lookup as well
// q, k, and v have already been split into multiple heads and have undergone any desired linear transforms.
- static Expr Attention(Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
- std::string prefix,
- Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim]
- Expr k, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim]
- Expr v, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim]
- Expr mask = nullptr, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
- bool inference = false) {
- using namespace keywords;
-
+ Expr Attention(std::string prefix,
+ Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim]
+ Expr k, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim]
+ Expr v, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim]
+ Expr values, // [-4: beam depth, -3: batch size, -2: max kv length, -1: vector dim]
+ Expr mask = nullptr) const { // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
int dk = k->shape()[-1];
// softmax over batched dot product of query and keys (applied over all
@@ -179,8 +177,8 @@ public:
int dimBeamK = k->shape()[-4];
int dimBeam = dimBeamQ / dimBeamK;
if(dimBeam > 1) { // broadcast k and v into all beam elements --TODO: if we use a separate dimension, then this would be automatic at no memory cost
- k = repeat(k, dimBeam, axis = -4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim]
- v = repeat(v, dimBeam, axis = -4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim]
+ k = repeat(k, dimBeam, /*axis=*/-4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim]
+ v = repeat(v, dimBeam, /*axis=*/-4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim]
}
// now q, k, and v have the same first dims [-4: beam depth * batch size, -3: num heads, -2: max src or tgt length, -1: split vector dim]
@@ -188,39 +186,38 @@ public:
float scale = 1.0 / std::sqrt((float)dk); // scaling to avoid extreme values due to matrix multiplication
auto z = bdot(q, k, false, true, scale); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length]
+ // mask out garbage beyond end of sequences
+ z = z + mask;
+
// take softmax along src sequence axis (-1)
- auto zm = z + mask;
- auto weights = softmax(zm); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length]
+ auto weights = softmax(z); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length]
// optional dropout for attention weights
float dropProb
- = inference ? 0 : options->get<float>("transformer-dropout-attention");
-
- if(dropProb)
- weights = dropout(weights, dropProb);
+ = inference_ ? 0 : opt<float>("transformer-dropout-attention");
+ weights = dropout(weights, dropProb);
// apply attention weights to values
- return bdot(weights, v); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim]
+ auto output = bdot(weights, v); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim]
+ return output;
}
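
In equation form, the block above computes masked scaled dot-product attention per head, with M the additive 0/-inf log-mask produced by transposedLogMask and optional dropout applied to the softmax weights:

  \mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{Q K^{\top}}{\sqrt{d_k}} + M\right) V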
- static Expr MultiHead(Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
- std::string prefix,
- int dimOut,
- int dimHeads,
- Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim]
- const std::vector<Expr> &keys, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
- const std::vector<Expr> &values,
- const std::vector<Expr> &masks, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
- bool inference = false) {
+ Expr MultiHead(std::string prefix,
+ int dimOut,
+ int dimHeads,
+ Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max q length, -1: split vector dim]
+ const std::vector<Expr> &keys, // [-4: beam depth, -3: batch size, -2: max kv length, -1: vector dim]
+ const std::vector<Expr> &values, // [-4: beam depth, -3: batch size, -2: max kv length, -1: vector dim]
+ const std::vector<Expr> &masks) const { // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
using namespace keywords;
int dimModel = q->shape()[-1];
- auto Wq = graph->param(
+ auto Wq = graph_->param(
prefix + "_Wq", {dimModel, dimModel}, inits::glorot_uniform);
- auto bq = graph->param(prefix + "_bq", {1, dimModel}, inits::zeros);
+ auto bq = graph_->param(prefix + "_bq", {1, dimModel}, inits::zeros);
auto qh = affine(q, Wq, bq);
+
qh = SplitHeads(qh, dimHeads); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim]
std::vector<Expr> outputs;
@@ -229,15 +226,15 @@ public:
if(i > 0)
prefixProj += "_enc" + std::to_string(i + 1);
- auto Wk = graph->param(prefixProj + "_Wk",
+ auto Wk = graph_->param(prefixProj + "_Wk",
{dimModel, dimModel},
inits::glorot_uniform);
- auto bk = graph->param(
+ auto bk = graph_->param(
prefixProj + "_bk", {1, dimModel}, inits::zeros);
- auto Wv = graph->param(
+ auto Wv = graph_->param(
prefixProj + "_Wv", {dimModel, dimModel}, inits::glorot_uniform);
- auto bv = graph->param(prefixProj + "_bv", {1, dimModel}, inits::zeros);
+ auto bv = graph_->param(prefixProj + "_bv", {1, dimModel}, inits::zeros);
auto kh = affine(keys[i], Wk, bk); // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
auto vh = affine(values[i], Wv, bv);
@@ -247,7 +244,7 @@ public:
// apply multi-head attention to downscaled inputs
auto output
- = Attention(graph, options, prefix, qh, kh, vh, masks[i], inference); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim]
+ = Attention(prefix, qh, kh, vh, values[i], masks[i]); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim]
output = JoinHeads(output, q->shape()[-4]); // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
outputs.push_back(output);
@@ -261,244 +258,143 @@ public:
int dimAtt = output->shape()[-1];
- bool project = !options->get<bool>("transformer-no-projection");
+ bool project = !opt<bool>("transformer-no-projection");
if(project || dimAtt != dimOut) {
auto Wo
- = graph->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
- auto bo = graph->param(prefix + "_bo", {1, dimOut}, inits::zeros);
+ = graph_->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform);
+ auto bo = graph_->param(prefix + "_bo", {1, dimOut}, inits::zeros);
output = affine(output, Wo, bo);
}
return output;
}
- static Expr LayerAttention(Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
- std::string prefix,
- Expr input,
- Expr keys,
- Expr values,
- Expr mask,
- bool inference = false) {
- return LayerAttention(graph,
- options,
- prefix,
- input,
- std::vector<Expr>{keys},
- std::vector<Expr>{values},
- std::vector<Expr>{mask},
- inference);
+ // TODO: the multi-input version below is never used. Can we remove it?
+ Expr LayerAttention(std::string prefix, Expr input, Expr keys, Expr values, Expr mask) const {
+ return LayerAttention_(prefix, input, std::vector<Expr>{keys}, std::vector<Expr>{values}, std::vector<Expr>{mask});
}
- static Expr LayerAttention(Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
- std::string prefix,
- Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
- const std::vector<Expr> &keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
- const std::vector<Expr> &values,
- const std::vector<Expr> &masks, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
- bool inference = false) {
- using namespace keywords;
-
+ Expr LayerAttention_(std::string prefix,
+ Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim]
+ const std::vector<Expr> &keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+ const std::vector<Expr> &values, // ...?
+ const std::vector<Expr> &masks) const { // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length]
int dimModel = input->shape()[-1];
- float dropProb = inference ? 0 : options->get<float>("transformer-dropout");
- auto opsPre = options->get<std::string>("transformer-preprocess");
- auto output = PreProcess(graph, prefix + "_Wo", opsPre, input, dropProb);
+ float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
+ auto opsPre = opt<std::string>("transformer-preprocess");
+ auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb);
- auto heads = options->get<int>("transformer-heads");
+ auto heads = opt<int>("transformer-heads");
// multi-head self-attention over previous input
- output = MultiHead(graph,
- options,
- prefix,
- dimModel,
- heads,
- output,
- keys,
- values,
- masks,
- inference);
-
- auto opsPost = options->get<std::string>("transformer-postprocess");
- output
- = PostProcess(graph, prefix + "_Wo", opsPost, output, input, dropProb);
+ output = MultiHead(prefix, dimModel, heads, output, keys, values, masks);
+
+ auto opsPost = opt<std::string>("transformer-postprocess");
+ output = postProcess(prefix + "_Wo", opsPost, output, input, dropProb);
return output;
}
Expr DecoderLayerSelfAttention(rnn::State& decoderState,
const rnn::State& prevDecoderState,
- Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
std::string prefix,
Expr input,
Expr selfMask,
- int startPos,
- bool inference = false) {
-
- using namespace keywords;
-
+ int startPos) const {
selfMask = transposedLogMask(selfMask);
auto values = input;
if(startPos > 0) {
- values = concatenate({prevDecoderState.output, input},
- axis = -2);
+ values = concatenate({prevDecoderState.output, input}, /*axis=*/-2);
}
decoderState.output = values;
// TODO: do not recompute matrix multiplies
- return LayerAttention(graph,
- options,
- prefix,
- input,
- values,
- values,
- selfMask,
- inference);
+ return LayerAttention(prefix, input, values, values, selfMask);
}
- Expr LayerFFN(Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
- std::string prefix,
- Expr input,
- bool inference = false) {
- using namespace keywords;
+ static inline
+ std::function<Expr(Expr)> activationByName(const std::string& actName)
+ {
+ if (actName == "relu")
+ return (ActivationFunction*)relu;
+ else if (actName == "swish")
+ return (ActivationFunction*)swish;
+ ABORT("Invalid activation name '{}'", actName);
+ }
+ Expr LayerFFN(std::string prefix, Expr input) const {
int dimModel = input->shape()[-1];
- float dropProb = inference ? 0 : options->get<float>("transformer-dropout");
- auto opsPre = options->get<std::string>("transformer-preprocess");
- auto output = PreProcess(graph, prefix + "_ffn", opsPre, input, dropProb);
+ float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
+ auto opsPre = opt<std::string>("transformer-preprocess");
+ auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb);
- int dimFfn = options->get<int>("transformer-dim-ffn");
- int depthFfn = options->get<int>("transformer-ffn-depth");
- auto act = options->get<std::string>("transformer-ffn-activation");
+ int dimFfn = opt<int>("transformer-dim-ffn");
+ int depthFfn = opt<int>("transformer-ffn-depth");
+ auto actFn = activationByName(opt<std::string>("transformer-ffn-activation"));
float ffnDropProb
- = inference ? 0 : options->get<float>("transformer-dropout-ffn");
+ = inference_ ? 0 : opt<float>("transformer-dropout-ffn");
ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn);
- int i = 1;
- int dimLast = dimModel;
- for(; i < depthFfn; ++i) {
- int dimFirst = i == 1 ? dimModel : dimFfn;
- auto W = graph->param(
- prefix + "_W" + std::to_string(i), {dimFirst, dimFfn}, inits::glorot_uniform);
- auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimFfn}, inits::zeros);
-
- output = affine(output, W, b);
-
- if(act == "relu")
- output = relu(output);
- else
- output = swish(output);
-
- if(ffnDropProb)
- output = dropout(output, ffnDropProb);
-
- dimLast = dimFfn;
- }
-
- auto W = graph->param(
- prefix + "_W" + std::to_string(i), {dimLast, dimModel}, inits::glorot_uniform);
- auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimModel}, inits::zeros);
-
- output = affine(output, W, b);
+ // the stack of FF layers
+ for(int i = 1; i < depthFfn; ++i)
+ output = dense(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb);
+ output = dense(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel);
- auto opsPost = options->get<std::string>("transformer-postprocess");
+ auto opsPost = opt<std::string>("transformer-postprocess");
output
- = PostProcess(graph, prefix + "_ffn", opsPost, output, input, dropProb);
+ = postProcess(prefix + "_ffn", opsPost, output, input, dropProb);
return output;
}
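
With a depth of two (the usual --transformer-ffn-depth setting is assumed here), the dense stack above reduces to the position-wise feed-forward network of Vaswani et al., with dropout after the activation when ffnDropProb > 0:

  \mathrm{FFN}(x) = f\bigl(x W_1 + b_1\bigr) W_2 + b_2, \qquad f \in \{\mathrm{ReLU}, \mathrm{swish}\}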
- // Implementation of Average Attention Network Layer (ANN) from
+ // Implementation of Average Attention Network Layer (AAN) from
// https://arxiv.org/pdf/1805.00631.pdf
- Expr LayerAAN(Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
- std::string prefix,
- Expr x,
- Expr y,
- bool inference = false) {
- using namespace keywords;
-
+ Expr LayerAAN(std::string prefix, Expr x, Expr y) const {
int dimModel = x->shape()[-1];
- float dropProb = inference ? 0 : options->get<float>("transformer-dropout");
- auto opsPre = options->get<std::string>("transformer-preprocess");
+ float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
+ auto opsPre = opt<std::string>("transformer-preprocess");
- y = PreProcess(graph, prefix + "_ffn", opsPre, y, dropProb);
+ y = preProcess(prefix + "_ffn", opsPre, y, dropProb);
// FFN
- int dimAan = options->get<int>("transformer-dim-aan");
- int depthAan = options->get<int>("transformer-aan-depth");
- auto act = options->get<std::string>("transformer-aan-activation");
- float aanDropProb = inference ? 0 : options->get<float>("transformer-dropout-ffn");
-
- int i = 1;
- int dimLast = dimModel;
- for(; i < depthAan; ++i) {
- int dimFirst = i == 1 ? dimModel : dimAan;
- auto W = graph->param(
- prefix + "_W" + std::to_string(i), {dimFirst, dimAan}, inits::glorot_uniform);
- auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimAan}, inits::zeros);
-
- y = affine(y, W, b);
-
- if(act == "relu")
- y = relu(y);
- else
- y = swish(y);
-
- if(aanDropProb)
- y = dropout(y, aanDropProb);
-
- dimLast = dimAan;
- }
-
- if(dimLast != dimModel) {
- auto W = graph->param(
- prefix + "_W" + std::to_string(i), {dimLast, dimModel}, inits::glorot_uniform);
- auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimModel}, inits::zeros);
- y = affine(y, W, b);
- }
-
- bool noGate = options->get<bool>("transformer-aan-nogate");
+ int dimAan = opt<int>("transformer-dim-aan");
+ int depthAan = opt<int>("transformer-aan-depth");
+ auto actFn = activationByName(opt<std::string>("transformer-aan-activation"));
+ float aanDropProb = inference_ ? 0 : opt<float>("transformer-dropout-ffn");
+
+ // the stack of AAN layers
+ for(int i = 1; i < depthAan; ++i)
+ y = dense(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb);
+ if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed
+ y = dense(y, prefix, std::to_string(depthAan), dimModel);
+
+ bool noGate = opt<bool>("transformer-aan-nogate");
if(!noGate) {
- auto Wi = graph->param(prefix + "_Wi", {dimModel, dimModel}, inits::glorot_uniform);
- auto bi = graph->param(prefix + "_bi", {1, dimModel}, inits::zeros);
-
- auto Wf = graph->param(prefix + "_Wf", {dimModel, dimModel}, inits::glorot_uniform);
- auto bf = graph->param(prefix + "_bf", {1, dimModel}, inits::zeros);
-
- auto gi = logit(affine(x, Wi, bi));
- auto gf = logit(affine(y, Wf, bf));
+ auto gi = dense(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid);
+ auto gf = dense(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid);
y = gi * x + gf * y;
}
- auto opsPost = options->get<std::string>("transformer-postprocess");
- y = PostProcess(graph, prefix + "_ffn", opsPost, y, x, dropProb);
+ auto opsPost = opt<std::string>("transformer-postprocess");
+ y = postProcess(prefix + "_ffn", opsPost, y, x, dropProb);
return y;
}
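
The gate at the end of LayerAAN is the gating layer from Zhang et al. (2018): with x the layer input and \tilde{y} the FFN-transformed average computed above, the gated output (before post-processing) is

  y' = \sigma(x W_i + b_i) \odot x \;+\; \sigma(\tilde{y} W_f + b_f) \odot \tilde{y}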
- // Implementation of Average Attention Network Layer (ANN) from
+ // Implementation of Average Attention Network Layer (AAN) from
// https://arxiv.org/pdf/1805.00631.pdf
// Function wrapper using decoderState as input.
Expr DecoderLayerAAN(rnn::State& decoderState,
const rnn::State& prevDecoderState,
- Ptr<ExpressionGraph> graph,
- Ptr<Options> options,
std::string prefix,
Expr input,
Expr selfMask,
- int startPos,
- bool inference = false) {
-
- using namespace keywords;
-
+ int startPos) const {
auto output = input;
if(startPos > 0) {
// we are decoding at a position after 0
@@ -508,27 +404,28 @@ public:
// we are training or scoring, because there is no history and
// the context is larger than a single time step. We do not need
// to average batch with only single words.
- selfMask = selfMask / sum(selfMask, axis=-1);
+ selfMask = selfMask / sum(selfMask, /*axis=*/-1);
output = bdot(selfMask, output);
}
- decoderState.output = output;
+ decoderState.output = output; // BUGBUG: mutable?
- return LayerAAN(graph, options, prefix, input, output, inference);
+ return LayerAAN(prefix, input, output);
}
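
For reference, the normalized triangle mask plus bdot() above implements the cumulative average that gives the AAN its name: at target position t (training/scoring case; the startPos > 0 decoding update is elided by this hunk),

  \bar{y}_t = \frac{1}{t} \sum_{j=1}^{t} x_j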
};
-class EncoderTransformer : public EncoderBase, public Transformer {
+class EncoderTransformer : public Transformer<EncoderBase> {
public:
- EncoderTransformer(Ptr<Options> options) : EncoderBase(options) {}
+ EncoderTransformer(Ptr<Options> options) : Transformer(options) {}
- Expr WordEmbeddings(Ptr<ExpressionGraph> graph,
- Ptr<data::CorpusBatch> batch) {
+ // returns the embedding matrix based on options
+ // And based on batchIndex_.
+ Expr wordEmbeddings(int subBatchIndex) const {
// standard encoder word embeddings
- int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
+ int dimVoc = opt<std::vector<int>>("dim-vocabs")[subBatchIndex];
int dimEmb = opt<int>("dim-emb");
- auto embFactory = embedding(graph)("dimVocab", dimVoc)("dimEmb", dimEmb);
+ auto embFactory = embedding(graph_)("dimVocab", dimVoc)("dimEmb", dimEmb);
if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all"))
embFactory("prefix", "Wemb");
@@ -541,7 +438,7 @@ public:
if(options_->has("embedding-vectors")) {
auto embFiles = opt<std::vector<std::string>>("embedding-vectors");
embFactory //
- ("embFile", embFiles[batchIndex_]) //
+ ("embFile", embFiles[subBatchIndex]) //
("normalization", opt<bool>("embedding-normalization"));
}
@@ -549,19 +446,22 @@ public:
}
Ptr<EncoderState> build(Ptr<ExpressionGraph> graph,
- Ptr<data::CorpusBatch> batch) {
- using namespace keywords;
+ Ptr<data::CorpusBatch> batch) override {
+ graph_ = graph;
+ return apply(batch);
+ }
+ Ptr<EncoderState> apply(Ptr<data::CorpusBatch> batch) const {
int dimEmb = opt<int>("dim-emb");
int dimBatch = batch->size();
int dimSrcWords = (*batch)[batchIndex_]->batchWidth();
- auto embeddings = WordEmbeddings(graph, batch);
+ auto embeddings = wordEmbeddings(batchIndex_); // embedding matrix, considering tying and some other options
// embed the source words in the batch
Expr batchEmbeddings, batchMask;
std::tie(batchEmbeddings, batchMask)
- = EncoderBase::lookup(graph, embeddings, batch);
+ = EncoderBase::lookup(graph_, embeddings, batch);
// apply dropout over source words
float dropoutSrc = inference_ ? 0 : opt<float>("dropout-src");
@@ -573,45 +473,38 @@ public:
// according to paper embeddings are scaled up by \sqrt(d_m)
auto scaledEmbeddings = std::sqrt(dimEmb) * batchEmbeddings;
- scaledEmbeddings = AddPositionalEmbeddings(graph, scaledEmbeddings);
+ scaledEmbeddings = addPositionalEmbeddings(scaledEmbeddings);
// reorganize batch and timestep
scaledEmbeddings = atleast_nd(scaledEmbeddings, 4);
batchMask = atleast_nd(batchMask, 4);
- auto layer = TransposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+ auto layer = transposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
auto layerMask
- = reshape(TransposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length]
+ = reshape(transposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length]
auto opsEmb = opt<std::string>("transformer-postprocess-emb");
float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
- layer = PreProcess(graph, prefix_ + "_emb", opsEmb, layer, dropProb);
+ layer = preProcess(prefix_ + "_emb", opsEmb, layer, dropProb);
layerMask = transposedLogMask(layerMask); // [-4: batch size, -3: 1, -2: vector dim=1, -1: max length]
// apply encoder layers
auto encDepth = opt<int>("enc-depth");
for(int i = 1; i <= encDepth; ++i) {
- layer = LayerAttention(graph,
- options_,
- prefix_ + "_l" + std::to_string(i) + "_self",
- layer,
- layer,
- layer,
- layerMask,
- inference_);
-
- layer = LayerFFN(graph,
- options_,
- prefix_ + "_l" + std::to_string(i) + "_ffn",
- layer,
- inference_);
+ layer = LayerAttention(prefix_ + "_l" + std::to_string(i) + "_self",
+ layer, // query
+ layer, // keys
+ layer, // values
+ layerMask);
+
+ layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer);
}
// restore organization of batch and time steps. This is currently required
// to make RNN-based decoders and beam search work with this. We are looking
// into making this more natural.
- auto context = TransposeTimeBatch(layer); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
+ auto context = transposeTimeBatch(layer); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
return New<EncoderState>(context, batchMask, batch);
}
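
Schematically, each pass through the loop above applies one encoder block, with preProcess/postProcess inserting dropout, residual connections, and layer normalization as configured by the ops strings:

  h \leftarrow \mathrm{postProcess}\bigl(\mathrm{SelfAttention}(\mathrm{preProcess}(h)),\, h\bigr), \qquad
  h \leftarrow \mathrm{postProcess}\bigl(\mathrm{FFN}(\mathrm{preProcess}(h)),\, h\bigr)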
@@ -655,27 +548,62 @@ public:
}
};
-class DecoderTransformer : public DecoderBase, public Transformer {
-protected:
+class DecoderTransformer : public Transformer<DecoderBase> {
+private:
Ptr<mlp::MLP> output_;
+private:
+ void LazyCreateOutputLayer(std::string prefix)
+ {
+ if(output_) // create it lazily
+ return;
+
+ int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
+
+ auto layerOut = mlp::output(graph_) //
+ ("prefix", prefix_ + "_ff_logit_out") //
+ ("dim", dimTrgVoc);
+
+ if(opt<bool>("tied-embeddings") || opt<bool>("tied-embeddings-all")) {
+ std::string tiedPrefix = prefix_ + "_Wemb";
+ if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src"))
+ tiedPrefix = "Wemb";
+ layerOut.tie_transposed("W", tiedPrefix);
+ }
+
+ if(shortlist_)
+ layerOut.set_shortlist(shortlist_);
+
+ // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim]
+ // assemble layers into MLP and apply to embeddings, decoder context and
+ // aligned source context
+ output_ = mlp::mlp(graph_) //
+ .push_back(layerOut) //
+ .construct();
+ }
+
public:
- DecoderTransformer(Ptr<Options> options) : DecoderBase(options) {}
+ DecoderTransformer(Ptr<Options> options) : Transformer(options) {}
virtual Ptr<DecoderState> startState(
Ptr<ExpressionGraph> graph,
Ptr<data::CorpusBatch> batch,
- std::vector<Ptr<EncoderState>> &encStates) {
+ std::vector<Ptr<EncoderState>> &encStates) override {
+ graph_ = graph;
rnn::States startStates;
return New<TransformerState>(startStates, nullptr, encStates, batch);
}
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph,
- Ptr<DecoderState> state) {
- using namespace keywords;
+ Ptr<DecoderState> state) override {
+ ABORT_IF(graph != graph_, "An inconsistent graph parameter was passed to step().");
+ LazyCreateOutputLayer(prefix_ + "_ff_logit_out");
+ return step(state);
+ }
- auto embeddings = state->getTargetEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
- auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis
+ Ptr<DecoderState> step(Ptr<DecoderState> state) const {
+ auto embeddings = state->getTargetEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
+ auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis
// dropout target words
float dropoutTrg = inference_ ? 0 : opt<float>("dropout-trg");
@@ -700,24 +628,24 @@ public:
int startPos = state->getPosition();
scaledEmbeddings
- = AddPositionalEmbeddings(graph, scaledEmbeddings, startPos);
+ = addPositionalEmbeddings(scaledEmbeddings, startPos);
scaledEmbeddings = atleast_nd(scaledEmbeddings, 4);
// reorganize batch and timestep
- auto query = TransposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+ auto query = transposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
auto opsEmb = opt<std::string>("transformer-postprocess-emb");
float dropProb = inference_ ? 0 : opt<float>("transformer-dropout");
- query = PreProcess(graph, prefix_ + "_emb", opsEmb, query, dropProb);
+ query = preProcess(prefix_ + "_emb", opsEmb, query, dropProb);
int dimTrgWords = query->shape()[-2];
- int dimBatch = query->shape()[-3];
- auto selfMask = TriangleMask(graph, dimTrgWords); // [ (1,) 1, max length, max length]
+ int dimBatch = query->shape()[-3];
+ auto selfMask = triangleMask(dimTrgWords); // [ (1,) 1, max length, max length]
if(decoderMask) {
decoderMask = atleast_nd(decoderMask, 4); // [ 1, max length, batch size, 1 ]
- decoderMask = reshape(TransposeTimeBatch(decoderMask),// [ 1, batch size, max length, 1 ]
+ decoderMask = reshape(transposeTimeBatch(decoderMask),// [ 1, batch size, max length, 1 ]
{1, dimBatch, 1, dimTrgWords}); // [ 1, batch size, 1, max length ]
selfMask = selfMask * decoderMask;
// if(dimBeam > 1)
@@ -731,17 +659,17 @@ public:
auto encoderContext = encoderState->getContext();
auto encoderMask = encoderState->getMask();
- encoderContext = TransposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
+ encoderContext = transposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
int dimSrcWords = encoderContext->shape()[-2];
int dims = encoderMask->shape().size();
encoderMask = atleast_nd(encoderMask, 4);
- encoderMask = reshape(TransposeTimeBatch(encoderMask),
+ encoderMask = reshape(transposeTimeBatch(encoderMask),
{1, dimBatch, 1, dimSrcWords});
encoderMask = transposedLogMask(encoderMask);
if(dimBeam > 1)
- encoderMask = repeat(encoderMask, dimBeam, axis = -4);
+ encoderMask = repeat(encoderMask, dimBeam, /*axis=*/ -4);
encoderContexts.push_back(encoderContext);
encoderMasks.push_back(encoderMask);
@@ -749,97 +677,54 @@ public:
rnn::States prevDecoderStates = state->getStates();
rnn::States decoderStates;
- // apply layers
- for(int i = 1; i <= opt<int>("dec-depth"); ++i) {
+ // apply decoder layers
+ auto decDepth = opt<int>("dec-depth");
+ for(int i = 1; i <= decDepth; ++i) {
rnn::State decoderState;
rnn::State prevDecoderState;
if(prevDecoderStates.size() > 0)
prevDecoderState = prevDecoderStates[i - 1];
+ // self-attention
std::string layerType = opt<std::string>("transformer-decoder-autoreg", "self-attention");
- if(layerType == "self-attention") {
- query = DecoderLayerSelfAttention(decoderState,
- prevDecoderState,
- graph,
- options_,
- prefix_ + "_l" + std::to_string(i) + "_self",
- query,
- selfMask,
- startPos,
- inference_);
- } else if(layerType == "average-attention") {
- query = DecoderLayerAAN(decoderState,
- prevDecoderState,
- graph,
- options_,
- prefix_ + "_l" + std::to_string(i) + "_aan",
- query,
- selfMask,
- startPos,
- inference_);
- } else {
+ if(layerType == "self-attention")
+ query = DecoderLayerSelfAttention(decoderState, prevDecoderState, prefix_ + "_l" + std::to_string(i) + "_self", query, selfMask, startPos);
+ else if(layerType == "average-attention")
+ query = DecoderLayerAAN(decoderState, prevDecoderState, prefix_ + "_l" + std::to_string(i) + "_aan", query, selfMask, startPos);
+ else
ABORT("Unknown auto-regressive layer type in transformer decoder {}", layerType);
- }
decoderStates.push_back(decoderState);
+ // source-target attention
// Iterate over multiple encoders and simply stack the attention blocks
if(encoderContexts.size() > 0) {
- for(int j = 0; j < encoderContexts.size(); ++j) {
+ for(int j = 0; j < encoderContexts.size(); ++j) { // multiple encoders are applied one after another
std::string prefix
= prefix_ + "_l" + std::to_string(i) + "_context";
if(j > 0)
prefix += "_enc" + std::to_string(j + 1);
- query = LayerAttention(graph,
- options_,
- prefix,
+ query = LayerAttention(prefix,
query,
- encoderContexts[j],
- encoderContexts[j],
- encoderMasks[j],
- inference_);
+ encoderContexts[j], // keys
+ encoderContexts[j], // values
+ encoderMasks[j]);
}
}
- query = LayerFFN(graph, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
- options_,
- prefix_ + "_l" + std::to_string(i) + "_ffn",
- query,
- inference_);
+ query = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim]
}
- auto decoderContext = TransposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
+ auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim]
//************************************************************************//
- if(!output_) {
- int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
-
- auto layerOut = mlp::output(graph) //
- ("prefix", prefix_ + "_ff_logit_out") //
- ("dim", dimTrgVoc);
-
- if(opt<bool>("tied-embeddings") || opt<bool>("tied-embeddings-all")) {
- std::string tiedPrefix = prefix_ + "_Wemb";
- if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src"))
- tiedPrefix = "Wemb";
- layerOut.tie_transposed("W", tiedPrefix);
- }
-
- if(shortlist_)
- layerOut.set_shortlist(shortlist_);
+ // final feed-forward layer (output)
+ Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim]
- // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim]
- // assemble layers into MLP and apply to embeddings, decoder context and
- // aligned source context
- output_ = mlp::mlp(graph) //
- .push_back(layerOut) //
- .construct();
- }
-
- Expr logits = output_->apply(decoderContext);
+ int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
// return unormalized(!) probabilities
auto nextState = New<TransformerState>(decoderStates,
@@ -851,7 +736,9 @@ public:
}
// helper function for guided alignment
- virtual const std::vector<Expr> getAlignments(int i = 0) { return {}; }
+ virtual const std::vector<Expr> getAlignments(int i = 0) {
+ return {};
+ }
void clear() {
output_ = nullptr;
diff --git a/src/rnn/attention.h b/src/rnn/attention.h
index fc09e9b7..92a89e99 100644
--- a/src/rnn/attention.h
+++ b/src/rnn/attention.h
@@ -80,7 +80,7 @@ public:
W_comb_att_lnb_ = graph->param(
prefix + "_W_comb_att_lnb", {1, dimEncState}, inits::zeros);
- mappedContext_ = layer_norm(affine(contextDropped_, Ua_, ba_),
+ mappedContext_ = layerNorm(affine(contextDropped_, Ua_, ba_),
Wc_att_lns_,
Wc_att_lnb_,
NEMATUS_LN_EPS);
@@ -91,7 +91,7 @@ public:
prefix + "_att_gamma2", {1, dimEncState}, inits::from_value(1.0));
mappedContext_
- = layer_norm(dot(contextDropped_, Ua_), gammaContext_, ba_);
+ = layerNorm(dot(contextDropped_, Ua_), gammaContext_, ba_);
}
} else {
@@ -121,10 +121,10 @@ public:
auto mappedState = dot(recState, Wa_);
if(layerNorm_)
if(nematusNorm_)
- mappedState = layer_norm(
+ mappedState = layerNorm(
mappedState, W_comb_att_lns_, W_comb_att_lnb_, NEMATUS_LN_EPS);
else
- mappedState = layer_norm(mappedState, gammaState_);
+ mappedState = layerNorm(mappedState, gammaState_);
auto attReduce = attOps(va_, mappedContext_, mappedState);
diff --git a/src/rnn/cells.h b/src/rnn/cells.h
index b813d3ac..f3299144 100644
--- a/src/rnn/cells.h
+++ b/src/rnn/cells.h
@@ -81,7 +81,7 @@ public:
auto xW = dot(input, W_);
if(layerNorm_)
- xW = layer_norm(xW, gamma1_);
+ xW = layerNorm(xW, gamma1_);
return {xW};
}
@@ -94,7 +94,7 @@ public:
stateDropped = dropout(recState, dropMaskS_);
auto sU = dot(stateDropped, U_);
if(layerNorm_)
- sU = layer_norm(sU, gamma2_);
+ sU = layerNorm(sU, gamma2_);
Expr output;
if(xWs.empty())
@@ -207,7 +207,7 @@ public:
auto xW = dot(input, W_);
if(layerNorm_)
- xW = layer_norm(xW, gamma1_);
+ xW = layerNorm(xW, gamma1_);
return {xW};
}
@@ -222,7 +222,7 @@ public:
auto sU = dot(stateDropped, U_);
if(layerNorm_)
- sU = layer_norm(sU, gamma2_);
+ sU = layerNorm(sU, gamma2_);
Expr xW;
if(xWs.empty()) {
@@ -406,8 +406,8 @@ public:
W = affine(input, W_, b_);
Wx = affine(input, Wx_, bx_);
}
- W = layer_norm(W, W_lns_, W_lnb_, NEMATUS_LN_EPS);
- Wx = layer_norm(Wx, Wx_lns_, Wx_lnb_, NEMATUS_LN_EPS);
+ W = layerNorm(W, W_lns_, W_lnb_, NEMATUS_LN_EPS);
+ Wx = layerNorm(Wx, Wx_lns_, Wx_lnb_, NEMATUS_LN_EPS);
xW = concatenate({W, Wx}, keywords::axis = -1);
} else {
@@ -434,8 +434,8 @@ public:
Expr Ux; // Temp_2_ in Amun
if(encoder_) {
- U = layer_norm(dot(stateDropped, U_), U_lns_, U_lnb_, NEMATUS_LN_EPS);
- Ux = layer_norm(
+ U = layerNorm(dot(stateDropped, U_), U_lns_, U_lnb_, NEMATUS_LN_EPS);
+ Ux = layerNorm(
dot(stateDropped, Ux_), Ux_lns_, Ux_lnb_, NEMATUS_LN_EPS);
if(transition_) {
@@ -449,8 +449,8 @@ public:
U = dot(stateDropped, U_);
Ux = dot(stateDropped, Ux_);
}
- U = layer_norm(U, U_lns_, U_lnb_, NEMATUS_LN_EPS);
- Ux = layer_norm(Ux, Ux_lns_, Ux_lnb_, NEMATUS_LN_EPS);
+ U = layerNorm(U, U_lns_, U_lnb_, NEMATUS_LN_EPS);
+ Ux = layerNorm(Ux, Ux_lns_, Ux_lnb_, NEMATUS_LN_EPS);
}
sU = concatenate({U, Ux}, keywords::axis = -1);
@@ -555,7 +555,7 @@ public:
auto xW = dot(input, W_);
if(layerNorm_)
- xW = layer_norm(xW, gamma1_);
+ xW = layerNorm(xW, gamma1_);
return {xW};
}
@@ -573,7 +573,7 @@ public:
auto sU = dot(recStateDropped, U_);
if(layerNorm_)
- sU = layer_norm(sU, gamma2_);
+ sU = layerNorm(sU, gamma2_);
Expr xW;
if(xWs.empty()) {
@@ -648,7 +648,7 @@ public:
auto xWs = CellType::applyInput({input});
auto xWm = affine(input, Wm_, bwm_);
if(CellType::layerNorm_)
- xWm = layer_norm(xWm, gamma1m_);
+ xWm = layerNorm(xWm, gamma1m_);
xWs.push_back(xWm);
return xWs;
@@ -662,7 +662,7 @@ public:
auto sUm = affine(state.output, Um_, bm_);
if(CellType::layerNorm_)
- sUm = layer_norm(sUm, gamma2m_);
+ sUm = layerNorm(sUm, gamma2m_);
auto mstate = xWm * sUm;
@@ -757,9 +757,9 @@ public:
auto sUo = affine(recState, Uo_, bo_);
auto sUc = affine(recState, Uc_, bc_);
- auto f = logit(xWs[0] + sUf);
- auto i = logit(xWs[1] + sUi);
- auto o = logit(xWs[2] + sUo);
+ auto f = sigmoid(xWs[0] + sUf);
+ auto i = sigmoid(xWs[1] + sUi);
+ auto o = sigmoid(xWs[2] + sUo);
auto c = tanh(xWs[3] + sUc);
auto nextCellState = f * cellState + i * c;
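
The four gates above form the standard LSTM cell; in equation form (the affine terms xWs[k] + sU* play the role of the W x + U h + b inputs, and the output h_t is applied in the separate output step):

  f_t = \sigma(\cdot),\quad i_t = \sigma(\cdot),\quad o_t = \sigma(\cdot),\quad \tilde{c}_t = \tanh(\cdot), \qquad
  c_t = f_t \odot c_{t-1} + i_t \odot \tilde{c}_t, \quad h_t = o_t \odot \tanh(c_t)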
diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp
index 1a007cf7..7310102d 100644
--- a/src/tensors/cpu/tensor_operators.cpp
+++ b/src/tensors/cpu/tensor_operators.cpp
@@ -13,7 +13,7 @@ namespace marian {
namespace cpu {
-inline float stableLogit(float x) {
+inline float stableSigmoid(float x) {
if(x >= 0) {
float z = expf(-x);
return 1.0 / (1.0 + z);
@@ -458,12 +458,12 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) {
#pragma omp simd
for(int i = 0; i < cols; ++i) {
- // @TODO: stable logit
- float r = stableLogit(xWrow[i] + sUrow[i] + b[i]);
+ // @TODO: stable sigmoid
+ float r = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
- float z = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float z = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2 * cols;
float h;
@@ -515,8 +515,8 @@ void GRUFastBackward(std::vector<Tensor> outputs,
int k = i + cols;
int l = i + 2 * cols;
- float r = stableLogit(rowXW[i] + rowSU[i] + b[i]);
- float z = stableLogit(rowXW[k] + rowSU[k] + b[k]);
+ float r = stableSigmoid(rowXW[i] + rowSU[i] + b[i]);
+ float z = stableSigmoid(rowXW[k] + rowSU[k] + b[k]);
float h;
if(final)
@@ -931,10 +931,10 @@ void LSTMCellForward(Tensor out_, std::vector<Tensor> inputs) {
const float* sUrow = sU + j * cols * 4;
for(int i = 0; i < cols; ++i) {
- float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
+ float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
- float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2 * cols;
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
@@ -964,7 +964,7 @@ void LSTMOutputForward(Tensor out_, std::vector<Tensor> inputs) {
for(int i = 0; i < cols; ++i) {
int k = i + 3 * cols;
- float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
rowOut[i] = go * std::tanh(rowCell[i]);
}
@@ -1004,10 +1004,10 @@ void LSTMCellBackward(std::vector<Tensor> outputs,
const float* rowAdj = adj + j * cols;
for(int i = 0; i < cols; ++i) {
- float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
+ float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
- float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2 * cols;
float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]);
@@ -1089,7 +1089,7 @@ void LSTMOutputBackward(std::vector<Tensor> outputs,
for(int i = 0; i < cols; ++i) {
int k = i + 3 * cols;
- float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
float t = std::tanh(rowCell[i]);
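
The LSTMOutputBackward hunk is cut off by the diff context, but the quantities it loads (the gate pre-activation and tanh of the cell) are the ones needed for the standard output-gate gradients. The sketch below spells those gradients out; lstmOutputBackwardSketch is a hypothetical scalar helper based on the textbook LSTM backward pass (out = go * tanh(cell), go = sigmoid(k)), not a copy of the elided body.

    #include <cmath>

    // Scalar sketch of the standard LSTM output backward pass.
    void lstmOutputBackwardSketch(float k, float cell, float adj,
                                  float& gradK, float& gradCell) {
      float go = 1.f / (1.f + std::exp(-k));
      float t  = std::tanh(cell);
      gradK    = adj * t * go * (1.f - go);      // chain rule through sigmoid
      gradCell = adj * go * (1.f - t * t);       // chain rule through tanh
    }
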
diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc
index a026906b..27f35b95 100644
--- a/src/tensors/gpu/add.inc
+++ b/src/tensors/gpu/add.inc
@@ -8,7 +8,7 @@ template void Add<BinaryFunctor<elem::Mult, Capture, Assignee<1>>, marian::Tenso
template void Add<BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<UnaryFunctor<elem::Neg, Assignee<1>>, marian::Tensor>(UnaryFunctor<elem::Neg, Assignee<1>>, float, marian::Tensor, marian::Tensor);
-template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Logit, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Logit, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
+template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
@@ -17,8 +17,8 @@ template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
-template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
+template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
-template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
+template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
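
The two instantiations changed in this file wrap Sigmoid around a difference of the two arguments, which is likely the backward pass of logaddexp: d/dx log(e^x + e^y) = e^x / (e^x + e^y) = sigmoid(x - y). A small, hypothetical finite-difference check of that identity (none of these names exist in the repository):

    #include <cassert>
    #include <cmath>

    // d/dx logaddexp(x, y) = sigmoid(x - y); verify numerically at one point.
    int main() {
      float x = 1.3f, y = -0.4f, eps = 1e-3f;
      auto logaddexp = [](float a, float b) { return std::log(std::exp(a) + std::exp(b)); };
      float numeric  = (logaddexp(x + eps, y) - logaddexp(x - eps, y)) / (2 * eps);
      float analytic = 1.f / (1.f + std::exp(-(x - y)));
      assert(std::fabs(numeric - analytic) < 1e-3f);
      return 0;
    }
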
diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc
index 279ae7b3..02d269f3 100644
--- a/src/tensors/gpu/element.inc
+++ b/src/tensors/gpu/element.inc
@@ -1,10 +1,10 @@
using namespace functional;
-template void Element<Assign<Var<1>, BinaryFunctor<elem::Mult, Assignee<2>, UnaryFunctor<elem::Logit, Assignee<2>>>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Mult, Assignee<2>, UnaryFunctor<elem::Logit, Assignee<2>>>>, marian::Tensor, marian::Tensor);
+template void Element<Assign<Var<1>, BinaryFunctor<elem::Mult, Assignee<2>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Mult, Assignee<2>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>>, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, UnaryFunctor<elem::Log, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Log, Assignee<2>>>, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::sPReLU, Assignee<2>, Capture>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::sPReLU, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, UnaryFunctor<elem::sReLU, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::sReLU, Assignee<2>>>, marian::Tensor, marian::Tensor);
-template void Element<Assign<Var<1>, UnaryFunctor<elem::Logit, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Logit, Assignee<2>>>, marian::Tensor, marian::Tensor);
+template void Element<Assign<Var<1>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>>, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Plus, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Plus, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Div, Capture, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, Assignee<2>>, Capture>>, Assignee<3>>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Div, Capture, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, Assignee<2>>, Capture>>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor);
@@ -38,6 +38,6 @@ template void Element<Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunc
template void Element<Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<1> >, Capture>, Capture, Assignee<1> > >>(Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<1> >, Capture>, Capture, Assignee<1> > >, marian::Tensor);
template void Element<Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<2> >, Capture>, Capture, Capture> >, marian::Tensor >(Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<2> >, Capture>, Capture, Capture> >, marian::Tensor, marian::Tensor);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Clip, Assignee<2>, Capture>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Clip, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor);
-template void Element<Assign<Var<1>, BinaryFunctor<elem::LogSum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::LogSum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
-template void Element<Assign<Var<1>, BinaryFunctor<elem::Max, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Max, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
-template void Element<Assign<Var<1>, BinaryFunctor<elem::Min, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Min, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
+template void Element<Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
+template void Element<Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
+template void Element<Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor);
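
LogSum is renamed LogAddExp here, matching the numpy name. The point of the operation is that log(e^x + e^y) can be evaluated without overflow by pulling out the larger argument first. A short sketch, assuming the standard log1p formulation (logAddExpSketch is an illustrative name, not a function in the repository):

    #include <algorithm>
    #include <cmath>

    // Stable log(exp(x) + exp(y)): the remaining exponential has a
    // non-positive argument and cannot overflow.
    inline float logAddExpSketch(float x, float y) {
      float hi = std::max(x, y);
      float lo = std::min(x, y);
      return hi + std::log1p(std::exp(lo - hi));
    }
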
diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu
index e20dd8b4..87861a1c 100644
--- a/src/tensors/gpu/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@@ -17,7 +17,7 @@ struct isnan_test {
__host__ __device__ bool operator()(const float a) const { return isnan(a); }
};
-__device__ inline float stableLogit(float x) {
+__device__ inline float stableSigmoid(float x) {
if(x >= 0) {
float z = expf(-x);
return 1.0 / (1.0 + z);
@@ -847,11 +847,11 @@ __global__ void gGRUFastForward(float* out,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
- float r = stableLogit(xWrow[i] + sUrow[i] + b[i]);
+ float r = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
- float z = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float z = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2 * cols;
float h;
@@ -922,8 +922,8 @@ __global__ void gGRUFastBackward(float* outState,
int k = i + cols;
int l = i + 2 * cols;
- float r = stableLogit(rowXW[i] + rowSU[i] + b[i]);
- float z = stableLogit(rowXW[k] + rowSU[k] + b[k]);
+ float r = stableSigmoid(rowXW[i] + rowSU[i] + b[i]);
+ float z = stableSigmoid(rowXW[k] + rowSU[k] + b[k]);
float h;
if(final)
@@ -1653,10 +1653,10 @@ __global__ void gLSTMCellForward(float* out,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
- float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
+ float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
- float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2 * cols;
float gc = tanhf(xWrow[l] + sUrow[l] + b[l]);
@@ -1709,7 +1709,7 @@ __global__ void gLSTMOutputForward(float* out,
int i = tid + threadIdx.x;
if(i < cols) {
int k = i + 3 * cols;
- float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
rowOut[i] = go * tanhf(rowCell[i]);
}
@@ -1766,10 +1766,10 @@ __global__ void gLSTMCellBackward(float* outCell,
for(int tid = 0; tid < cols; tid += blockDim.x) {
int i = tid + threadIdx.x;
if(i < cols) {
- float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]);
+ float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]);
int k = i + cols;
- float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
int l = i + 2 * cols;
float gc = tanhf(xWrow[l] + sUrow[l] + b[l]);
@@ -1866,7 +1866,7 @@ __global__ void gLSTMOutputBackward(float* outCell,
int i = tid + threadIdx.x;
if(i < cols) {
int k = i + 3 * cols;
- float go = stableLogit(xWrow[k] + sUrow[k] + b[k]);
+ float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]);
float t = tanhf(rowCell[i]);
@@ -1923,7 +1923,7 @@ __global__ void gHighwayForward(float* out,
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
- float sigma = stableLogit(t[index]);
+ float sigma = stableSigmoid(t[index]);
out[index] = in1[index] * sigma + in2[index] * (1.f - sigma);
}
}
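
gHighwayForward mixes its two inputs with a sigmoid gate, out = sigmoid(t) * in1 + (1 - sigmoid(t)) * in2, exactly as the hunk above shows element-wise. A host-side sketch of the same mixing follows; highwayForwardSketch is a hypothetical CPU helper, not the CUDA kernel itself.

    #include <cmath>
    #include <vector>

    // CPU sketch of the element-wise highway mixing: each output is a convex
    // combination of in1 and in2, gated by sigmoid(t).
    std::vector<float> highwayForwardSketch(const std::vector<float>& in1,
                                            const std::vector<float>& in2,
                                            const std::vector<float>& t) {
      std::vector<float> out(in1.size());
      for(size_t i = 0; i < in1.size(); ++i) {
        float sigma = 1.f / (1.f + std::exp(-t[i]));
        out[i] = in1[i] * sigma + in2[i] * (1.f - sigma);
      }
      return out;
    }
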
@@ -1955,7 +1955,7 @@ __global__ void gHighwayBackward(float* out1,
for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) {
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < length) {
- float sigma = stableLogit(t[index]);
+ float sigma = stableSigmoid(t[index]);
out1[index] = sigma * adj[index];
out2[index] = (1.f - sigma) * adj[index];
outt[index]
diff --git a/src/tests/tensor_test.cu b/src/tests/tensor_test.cu
index 5f54ccfa..72cdc276 100644
--- a/src/tests/tensor_test.cu
+++ b/src/tests/tensor_test.cu
@@ -186,10 +186,10 @@
//}
//
//template <typename X>
-//struct Logit {
+//struct Sigmoid {
// X x;
//
-// __HD__ Logit(X _x) : x(_x) {}
+// __HD__ Sigmoid(X _x) : x(_x) {}
//
// template <typename ...Args>
// __HDI__ float operator()(Args&&... args) {
@@ -198,8 +198,8 @@
//};
//
//template <class X>
-//__HDI__ Logit<X> logit(X x) {
-// return Logit<X>(x);
+//__HDI__ Sigmoid<X> logit(X x) {
+// return Sigmoid<X>(x);
//}
//
///******************************************************************************/
@@ -392,7 +392,7 @@ __HDI__ auto simple(Mult<X, Y> f)->decltype(cut(simple(f.x) * simple(f.y))) {
//}
//
//template <typename X, int N>
-//__HDI__ auto grad(Logit<X> f, Var<N> g)->decltype(f * (C<1>() - f) * grad(f.x, g)) {
+//__HDI__ auto grad(Sigmoid<X> f, Var<N> g)->decltype(f * (C<1>() - f) * grad(f.x, g)) {
// return f * (C<1>() - f) * grad(f.x, g);
//}