diff options
-rw-r--r-- | src/functional/predicates.h | 12 | ||||
-rw-r--r-- | src/graph/expression_operators.cpp | 29 | ||||
-rw-r--r-- | src/graph/expression_operators.h | 22 | ||||
-rw-r--r-- | src/graph/node_operators_binary.h | 20 | ||||
-rw-r--r-- | src/graph/node_operators_unary.h | 14 | ||||
-rw-r--r-- | src/layers/generic.h | 28 | ||||
-rw-r--r-- | src/models/encoder.h | 7 | ||||
-rw-r--r-- | src/models/transformer.h | 641 | ||||
-rw-r--r-- | src/rnn/attention.h | 8 | ||||
-rw-r--r-- | src/rnn/cells.h | 34 | ||||
-rw-r--r-- | src/tensors/cpu/tensor_operators.cpp | 24 | ||||
-rw-r--r-- | src/tensors/gpu/add.inc | 6 | ||||
-rw-r--r-- | src/tensors/gpu/element.inc | 10 | ||||
-rw-r--r-- | src/tensors/gpu/tensor_operators.cu | 26 | ||||
-rw-r--r-- | src/tests/tensor_test.cu | 10 |
15 files changed, 393 insertions, 498 deletions
diff --git a/src/functional/predicates.h b/src/functional/predicates.h index e129036e..51af38ad 100644 --- a/src/functional/predicates.h +++ b/src/functional/predicates.h @@ -85,8 +85,8 @@ UNARY(Exp, exp, expf(x)); UNARY(Abs, abs, fabs(x)); UNARY(Sqrt, sqrt, sqrtf(x)); UNARY(Neg, operator-, -x); -UNARY(Logit, - logit, +UNARY(Sigmoid, + sigmoid, x > 0 ? (1.f / (1.f + expf(-x))) : (expf(x) / (1.f + expf(x)))); BINARY(Plus, operator+, x + y); @@ -94,14 +94,14 @@ BINARY(Minus, operator-, x - y); BINARY(Mult, operator*, x* y); BINARY(Div, operator/, x / y); -BINARY(LogSum, - logsum, +BINARY(LogAddExp, + logaddexp, (/*if*/ (x < y) ? // Note: This may not be ideal for CUDA; cf. CNTK implementation (y + log1pf(expf(x - y))) /*else*/ : (x + log1pf(expf(y - x))))); -BINARY(Max, max, (x > y) ? y : x); // note: std::max not available on CUDA it seems -BINARY(Min, min, (x < y) ? y : x); +BINARY(Maximum, max, (x > y) ? y : x); // note: std::max not available on CUDA it seems +BINARY(Minimum, min, (x < y) ? y : x); UNARY(Negate, operator!, !x); BINARY(Eq, operator==, x == y); diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 91a7ffc2..1666357a 100644 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -15,8 +15,8 @@ Expr debug(Expr a, const std::string& message) { return a; } -Expr logit(Expr a) { - return Expression<LogitNodeOp>(a); +Expr sigmoid(Expr a) { // logistic function. Note: scipy name is expit() + return Expression<SigmoidNodeOp>(a); } Expr relu(Expr a) { @@ -80,17 +80,16 @@ Expr operator/(Expr a, Expr b) { return Expression<DivNodeOp>(a, b); } -// on names: stay close to Python/numpy? -Expr logsum(Expr a, Expr b) { // TODO: haggle over the name (logplus, logadd, expAddLog) - return Expression<LogSumNodeOp>(a, b); +Expr logaddexp(Expr a, Expr b) { + return Expression<LogAddExpNodeOp>(a, b); } -Expr max(Expr a, Expr b) { // TODO: haggle over the name (max vs. 
elementMax) - return Expression<MaxNodeOp>(a, b); +Expr maximum(Expr a, Expr b) { + return Expression<MaximumNodeOp>(a, b); } -Expr min(Expr a, Expr b) { // TODO: haggle over the name - return Expression<MinNodeOp>(a, b); +Expr minimum(Expr a, Expr b) { + return Expression<MinimumNodeOp>(a, b); } /*********************************************************/ @@ -387,7 +386,7 @@ Expr tanh(const std::vector<Expr>& nodes) { return Expression<TanhNodeOp>(nodes); } -Expr logit(const std::vector<Expr>&) { +Expr sigmoid(const std::vector<Expr>&) { ABORT("Not implemented"); } @@ -411,10 +410,10 @@ Expr square(Expr a) { return Expression<SquareNodeOp>(a); } -Expr layer_norm(Expr x, - Expr gamma, - Expr beta /*= nullptr*/, - float eps /*= 1e-9*/) { +Expr layerNorm(Expr x, + Expr gamma, + Expr beta /*= nullptr*/, + float eps /*= 1e-9*/) { std::vector<Expr> nodes = {x, gamma}; if(beta) nodes.push_back(beta); @@ -432,7 +431,7 @@ Expr highway(const std::string prefix, Expr x) { auto g = mlp::dense(x->graph()) ("prefix", prefix + "_highway_d1") ("dim", outDim) - ("activation", mlp::act::logit) + ("activation", mlp::act::sigmoid) .construct()->apply(x); auto relued = mlp::dense(x->graph()) ("prefix", prefix + "_highway_d2") diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index f1bea94a..cc07dafb 100644 --- a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -5,10 +5,12 @@ namespace marian { Expr debug(Expr a, const std::string& message = ""); +typedef Expr(ActivationFunction) (Expr); + Expr plus(const std::vector<Expr>&); -Expr logit(Expr a); // aka sigmoid --BUGBUG: should be logistic(), not logit() -Expr logit(const std::vector<Expr>&); +Expr sigmoid(Expr a); // aka the logistic function; formerly misnamed logit() +Expr sigmoid(const std::vector<Expr>&); Expr swish(Expr a); Expr swish(const std::vector<Expr>&); @@ -60,7 +62,7 @@ Expr operator/(Expr a, float b); // Expr pow(float a, Expr b); // Expr pow(Expr a, float b); 
-Expr logsum(Expr a, Expr b); // TODO: haggle over the name (logplus, logadd, expAddLog) +Expr logaddexp(Expr a, Expr b); Expr max(Expr a, Expr b); // TODO: haggle over the name (max vs. elementMax) @@ -128,7 +130,7 @@ Expr step(Expr a, int step, int axis); Expr sqrt(Expr a, float eps = 0.f); Expr square(Expr a); -Expr layer_norm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); +Expr layerNorm(Expr x, Expr gamma, Expr beta = nullptr, float eps = 1e-9); Expr highway(Expr y, Expr x, Expr t); Expr highway(const std::string prefix, Expr x); @@ -137,14 +139,18 @@ static inline Expr dropout(Expr x, Expr mask) { return x * mask; } -static inline Expr dropout(Expr x, float prob, Shape shape) { +static inline Expr dropout(Expr x, float dropProb, Shape shape) { + if (dropProb == 0) + return x; auto graph = x->graph(); - auto mask = graph->dropout(prob, shape); + auto mask = graph->dropout(dropProb, shape); return dropout(x, mask); } -static inline Expr dropout(Expr x, float prob) { - return dropout(x, prob, x->shape()); +static inline Expr dropout(Expr x, float dropProb) { + if (dropProb == 0) + return x; + return dropout(x, dropProb, x->shape()); } Expr shift(Expr, Shape, float padValue = 0); diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index cff55955..5b1f9865 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -528,29 +528,29 @@ struct DivNodeOp : public ElementBinaryNodeOp { // const std::string type() { return "pow"; } //}; -struct LogSumNodeOp : public ElementBinaryNodeOp { - LogSumNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} +struct LogAddExpNodeOp : public ElementBinaryNodeOp { + LogAddExpNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} NodeOps forwardOps() { using namespace functional; return{ - NodeOp(Element(_1 = logsum(_2, _3), val_, child(0)->val(), child(1)->val())) }; + NodeOp(Element(_1 = logaddexp(_2, _3), val_, child(0)->val(), child(1)->val())) }; } NodeOps 
backwardOps() { using namespace functional; // d/dx (ln( exp(x) + (exp(y)) = exp(x) / (exp(x) + exp(y)) = 1 / (1 + exp(y-x)) = sigmoid(x-y) - return{ NodeOp(Add(_1 * logit(_2 - _3), child(0)->grad(), adj_, child(0)->val(), child(1)->val())), - NodeOp(Add(_1 * logit(_3 - _2), child(1)->grad(), adj_, child(0)->val(), child(1)->val())) }; + return{ NodeOp(Add(_1 * sigmoid(_2 - _3), child(0)->grad(), adj_, child(0)->val(), child(1)->val())), + NodeOp(Add(_1 * sigmoid(_3 - _2), child(1)->grad(), adj_, child(0)->val(), child(1)->val())) }; } // TODO: this is not a "type" (as in data type). It's an operator name. - const std::string type() { return "logsum"; } + const std::string type() { return "logaddexp"; } }; -struct MaxNodeOp : public ElementBinaryNodeOp { - MaxNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} +struct MaximumNodeOp : public ElementBinaryNodeOp { + MaximumNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} NodeOps forwardOps() { using namespace functional; @@ -569,8 +569,8 @@ struct MaxNodeOp : public ElementBinaryNodeOp { }; // TODO: lotsa code dup here! 
-struct MinNodeOp : public ElementBinaryNodeOp { - MinNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} +struct MinimumNodeOp : public ElementBinaryNodeOp { + MinimumNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} NodeOps forwardOps() { using namespace functional; diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index dda4dd03..fa6d25c7 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -138,12 +138,12 @@ public: } }; -struct LogitNodeOp : public UnaryNodeOp { - LogitNodeOp(Expr a) : UnaryNodeOp(a) {} +struct SigmoidNodeOp : public UnaryNodeOp { + SigmoidNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { using namespace functional; - return {NodeOp(Element(_1 = logit(_2), val_, child(0)->val()))}; + return {NodeOp(Element(_1 = sigmoid(_2), val_, child(0)->val()))}; } NodeOps backwardOps() { @@ -151,7 +151,7 @@ struct LogitNodeOp : public UnaryNodeOp { return {NodeOp(Add(_1 * _2 * (1.0f - _2), child(0)->grad(), adj_, val_))}; } - const std::string type() { return "logit"; } + const std::string type() { return "sigmoid"; } }; // struct Scalar2PowNodeOp : public UnaryNodeOp { @@ -350,13 +350,13 @@ struct SwishNodeOp : public UnaryNodeOp { NodeOps forwardOps() { using namespace functional; - return {NodeOp(Element(_1 = _2 * logit(_2), val_, child(0)->val()))}; + return {NodeOp(Element(_1 = _2 * sigmoid(_2), val_, child(0)->val()))}; } NodeOps backwardOps() { using namespace functional; // dJ/dx += dJ/df * ( f(x) + sigma(x) * (1 - f(x)) ) - return {NodeOp(Add(_1 * (_3 + logit(_2) * (1.f - _3)), + return {NodeOp(Add(_1 * (_3 + sigmoid(_2) * (1.f - _3)), child(0)->grad(), // dJ/dx adj_, // _1 := dJ/df child(0)->val(), // _2 := x @@ -936,8 +936,10 @@ public: Shape outShape = a->shape(); axis_ = outShape.axis(axis); +#if 0 // this check currently fails in translation; I think should not fail for step==0 for(int i = 0; i < axis_; ++i) ABORT_IF(outShape[i] != 1, "non-consecutive slices are 
presently not supported by step()"); +#endif outShape.set(axis_, 1); return outShape; diff --git a/src/layers/generic.h b/src/layers/generic.h index dcb9b955..8b19123b 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -7,7 +7,7 @@ namespace marian { namespace mlp { -enum struct act : int { linear, tanh, logit, ReLU, LeakyReLU, PReLU, swish }; +enum struct act : int { linear, tanh, sigmoid, ReLU, LeakyReLU, PReLU, swish }; } } @@ -50,8 +50,8 @@ public: auto name = opt<std::string>("prefix"); auto dim = opt<int>("dim"); - auto layerNorm = opt<bool>("layer-normalization", false); - auto nematusNorm = opt<bool>("nematus-normalization", false); + auto useLayerNorm = opt<bool>("layer-normalization", false); + auto useNematusNorm = opt<bool>("nematus-normalization", false); auto activation = opt<act>("activation", act::linear); auto g = graph_; @@ -71,8 +71,8 @@ public: {1, dim}, inits::zeros); - if(layerNorm) { - if(nematusNorm) { + if(useLayerNorm) { + if(useNematusNorm) { auto ln_s = g->param(name + "_ln_s" + num, {1, dim}, inits::from_value(1.f)); @@ -80,13 +80,13 @@ public: {1, dim}, inits::zeros); - outputs.push_back(layer_norm(affine(in, W, b), ln_s, ln_b, NEMATUS_LN_EPS)); + outputs.push_back(layerNorm(affine(in, W, b), ln_s, ln_b, NEMATUS_LN_EPS)); } else { auto gamma = g->param(name + "_gamma" + num, {1, dim}, inits::from_value(1.0)); - outputs.push_back(layer_norm(dot(in, W), gamma, b)); + outputs.push_back(layerNorm(dot(in, W), gamma, b)); } } else { @@ -96,14 +96,14 @@ public: } switch(activation) { - case act::linear: return plus(outputs); - case act::tanh: return tanh(outputs); - case act::logit: return logit(outputs); - case act::ReLU: return relu(outputs); + case act::linear: return plus(outputs); + case act::tanh: return tanh(outputs); + case act::sigmoid: return sigmoid(outputs); + case act::ReLU: return relu(outputs); case act::LeakyReLU: return leakyrelu(outputs); - case act::PReLU: return prelu(outputs); - case act::swish: return 
swish(outputs); - default: return plus(outputs); + case act::PReLU: return prelu(outputs); + case act::swish: return swish(outputs); + default: return plus(outputs); } }; diff --git a/src/models/encoder.h b/src/models/encoder.h index 6a0a62f5..f23be88b 100644 --- a/src/models/encoder.h +++ b/src/models/encoder.h @@ -12,9 +12,10 @@ protected: bool inference_{false}; size_t batchIndex_{0}; - virtual std::tuple<Expr, Expr> lookup(Ptr<ExpressionGraph> graph, - Expr srcEmbeddings, - Ptr<data::CorpusBatch> batch) { + //virtual --Note: This used to be virtual, but is never overridden. + std::tuple<Expr, Expr> lookup(Ptr<ExpressionGraph> graph, + Expr srcEmbeddings, + Ptr<data::CorpusBatch> batch) const { using namespace keywords; auto subBatch = (*batch)[batchIndex_]; diff --git a/src/models/transformer.h b/src/models/transformer.h index 4601bb9b..f7a80b2a 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -1,5 +1,5 @@ // TODO: This is really a .CPP file now. I kept the .H name to minimize confusing git, until this is code-reviewed. -// This is meant to speed-up builds, and to support Ctrl-F7 to rebuild +// This is meant to speed-up builds, and to support Ctrl-F7 to rebuild. 
#pragma once @@ -14,17 +14,24 @@ namespace marian { -// collection of subroutines for Transformer implementation -class Transformer { +// shared base class for transformer-based encoder and decoder +template<class EncoderDecoderBase> +class Transformer : public EncoderDecoderBase { + typedef EncoderDecoderBase Base; +protected: + using Base::options_; using Base::inference_; + template <typename T> T opt(const std::string& key) const { Ptr<Options> options = options_; return options->get<T>(key); } // need to duplicate, since somehow using Base::opt is not working + + Ptr<ExpressionGraph> graph_; public: - static Expr TransposeTimeBatch(Expr input) { return transpose(input, {0, 2, 1, 3}); } + Transformer(Ptr<Options> options) + : EncoderDecoderBase(options) { + } - static Expr AddPositionalEmbeddings(Ptr<ExpressionGraph> graph, - Expr input, - int start = 0) { - using namespace keywords; + static Expr transposeTimeBatch(Expr input) { return transpose(input, {0, 2, 1, 3}); } - int dimEmb = input->shape()[-1]; + Expr addPositionalEmbeddings(Expr input, int start = 0) const { + int dimEmb = input->shape()[-1]; int dimWords = input->shape()[-3]; float num_timescales = dimEmb / 2; @@ -41,19 +48,17 @@ public: // shared across batch entries auto signal - = graph->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos)); + = graph_->constant({dimWords, 1, dimEmb}, inits::from_vector(vPos)); return input + signal; } - Expr TriangleMask(Ptr<ExpressionGraph> graph, int length) { - using namespace keywords; - + Expr triangleMask(int length) const { // fill triangle mask std::vector<float> vMask(length * length, 0); for(int i = 0; i < length; ++i) for(int j = 0; j <= i; ++j) vMask[i * length + j] = 1.f; - return graph->constant({1, length, length}, inits::from_vector(vMask)); + return graph_->constant({1, length, length}, inits::from_vector(vMask)); } // convert multiplicative 1/0 mask to additive 0/-inf log mask, and transpose to match result of bdot() op in Attention() @@ 
-67,14 +72,14 @@ public: int dimModel = input->shape()[-1]; int dimSteps = input->shape()[-2]; int dimBatch = input->shape()[-3]; - int dimBeam = input->shape()[-4]; + int dimBeam = input->shape()[-4]; int dimDepth = dimModel / dimHeads; auto output = reshape(input, {dimBatch * dimBeam, dimSteps, dimHeads, dimDepth}); - return transpose(output, {0, 2, 1, 3}); + return transpose(output, {0, 2, 1, 3}); // [dimBatch*dimBeam, dimHeads, dimSteps, dimDepth] } static Expr JoinHeads(Expr input, int dimBeam = 1) { @@ -91,84 +96,77 @@ public: return reshape(output, {dimBeam, dimBatch, dimSteps, dimModel}); } - static Expr PreProcess(Ptr<ExpressionGraph> graph, - std::string prefix, - std::string ops, - Expr input, - float dropProb = 0.0f) { - using namespace keywords; + // like affine() but with built-in parameters, activation, and dropout + static inline + Expr dense(Expr x, std::string prefix, std::string suffix, int outDim, const std::function<Expr(Expr)>& actFn = nullptr, float dropProb = 0.0f) + { + auto graph = x->graph(); + + auto W = graph->param(prefix + "_W" + suffix, { x->shape()[-1], outDim }, inits::glorot_uniform); + auto b = graph->param(prefix + "_b" + suffix, { 1, outDim }, inits::zeros); + + x = affine(x, W, b); + if (actFn) + x = actFn(x); + if (dropProb) + x = dropout(x, dropProb); + return x; + } - int dimModel = input->shape()[-1]; + Expr layerNorm(Expr x, std::string prefix, std::string suffix = std::string()) const { + int dimModel = x->shape()[-1]; + auto scale = graph_->param(prefix + "_ln_scale" + suffix, { 1, dimModel }, inits::ones); + auto bias = graph_->param(prefix + "_ln_bias" + suffix, { 1, dimModel }, inits::zeros); + return marian::layerNorm(x, scale, bias, 1e-6); + } + + Expr preProcess(std::string prefix, std::string ops, Expr input, float dropProb = 0.0f) const { auto output = input; for(auto op : ops) { // dropout - if(op == 'd' && dropProb > 0.0f) { + if (op == 'd') output = dropout(output, dropProb); - } // layer normalization - 
if(op == 'n') { - auto scale = graph->param( - prefix + "_ln_scale_pre", {1, dimModel}, inits::ones); - auto bias = graph->param( - prefix + "_ln_bias_pre", {1, dimModel}, inits::zeros); - output = layer_norm(output, scale, bias, 1e-6); - } + else if (op == 'n') + output = layerNorm(output, prefix, "_pre"); + else + ABORT("Unknown pre-processing operation '%c'", op); } return output; } - static Expr PostProcess(Ptr<ExpressionGraph> graph, - std::string prefix, - std::string ops, - Expr input, - Expr prevInput, - float dropProb = 0.0f) { - using namespace keywords; - - int dimModel = input->shape()[-1]; + Expr postProcess(std::string prefix, std::string ops, Expr input, Expr prevInput, float dropProb = 0.0f) const { auto output = input; for(auto op : ops) { // dropout - if(op == 'd' && dropProb > 0.0f) { + if(op == 'd') output = dropout(output, dropProb); - } // skip connection - if(op == 'a') { + else if(op == 'a') output = output + prevInput; - } // highway connection - if(op == 'h') { - auto Wh = graph->param( - prefix + "_Wh", {dimModel, dimModel}, inits::glorot_uniform); - auto bh = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros); - - auto t = affine(prevInput, Wh, bh); + else if(op == 'h') { + int dimModel = input->shape()[-1]; + auto t = dense(prevInput, prefix, /*suffix=*/"h", dimModel); output = highway(output, prevInput, t); } // layer normalization - if(op == 'n') { - auto scale - = graph->param(prefix + "_ln_scale", {1, dimModel}, inits::ones); - auto bias - = graph->param(prefix + "_ln_bias", {1, dimModel}, inits::zeros); - output = layer_norm(output, scale, bias, 1e-6); - } + else if(op == 'n') + output = layerNorm(output, prefix); + else + ABORT("Unknown post-processing operation '%c'", op); } return output; } // determine the multiplicative-attention probability and performs the associative lookup as well // q, k, and v have already been split into multiple heads, undergone any desired linear transform. 
- static Expr Attention(Ptr<ExpressionGraph> graph, - Ptr<Options> options, - std::string prefix, - Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim] - Expr k, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim] - Expr v, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim] - Expr mask = nullptr, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] - bool inference = false) { - using namespace keywords; - + Expr Attention(std::string prefix, + Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim] + Expr k, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim] + Expr v, // [-4: batch size, -3: num heads, -2: max src length, -1: split vector dim] + Expr values, // [-4: beam depth, -3: batch size, -2: max kv length, -1: vector dim] + Expr mask = nullptr) const { // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] int dk = k->shape()[-1]; // softmax over batched dot product of query and keys (applied over all @@ -179,8 +177,8 @@ public: int dimBeamK = k->shape()[-4]; int dimBeam = dimBeamQ / dimBeamK; if(dimBeam > 1) { // broadcast k and v into all beam elements --TODO: if we use a separate dimension, then this would be automatic at no memory cost - k = repeat(k, dimBeam, axis = -4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim] - v = repeat(v, dimBeam, axis = -4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim] + k = repeat(k, dimBeam, /*axis=*/-4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim] + v = repeat(v, dimBeam, /*axis=*/-4); // [-4: beam depth * batch size, -3: num heads, -2: max src length, -1: split vector dim] } // now q, k, and v have the same first dims [-4: beam depth * batch size, -3: num 
heads, -2: max src or tgt length, -1: split vector dim] @@ -188,39 +186,38 @@ public: float scale = 1.0 / std::sqrt((float)dk); // scaling to avoid extreme values due to matrix multiplication auto z = bdot(q, k, false, true, scale); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] + // mask out garbage beyond end of sequences + z = z + mask; + // take softmax along src sequence axis (-1) - auto zm = z + mask; - auto weights = softmax(zm); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] + auto weights = softmax(z); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: max src length] // optional dropout for attention weights float dropProb - = inference ? 0 : options->get<float>("transformer-dropout-attention"); - - if(dropProb) - weights = dropout(weights, dropProb); + = inference_ ? 0 : opt<float>("transformer-dropout-attention"); + weights = dropout(weights, dropProb); // apply attention weights to values - return bdot(weights, v); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim] + auto output = bdot(weights, v); // [-4: beam depth * batch size, -3: num heads, -2: max tgt length, -1: split vector dim] + return output; } - static Expr MultiHead(Ptr<ExpressionGraph> graph, - Ptr<Options> options, - std::string prefix, - int dimOut, - int dimHeads, - Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] - const std::vector<Expr> &keys, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] - const std::vector<Expr> &values, - const std::vector<Expr> &masks, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] - bool inference = false) { + Expr MultiHead(std::string prefix, + int dimOut, + int dimHeads, + Expr q, // [-4: beam depth * batch size, -3: num heads, -2: max q length, -1: split vector dim] + const std::vector<Expr> &keys, // [-4: 
beam depth, -3: batch size, -2: max kv length, -1: vector dim] + const std::vector<Expr> &values, // [-4: beam depth, -3: batch size, -2: max kv length, -1: vector dim] + const std::vector<Expr> &masks) const { // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] using namespace keywords; int dimModel = q->shape()[-1]; - auto Wq = graph->param( + auto Wq = graph_->param( prefix + "_Wq", {dimModel, dimModel}, inits::glorot_uniform); - auto bq = graph->param(prefix + "_bq", {1, dimModel}, inits::zeros); + auto bq = graph_->param(prefix + "_bq", {1, dimModel}, inits::zeros); auto qh = affine(q, Wq, bq); + qh = SplitHeads(qh, dimHeads); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] std::vector<Expr> outputs; @@ -229,15 +226,15 @@ public: if(i > 0) prefixProj += "_enc" + std::to_string(i + 1); - auto Wk = graph->param(prefixProj + "_Wk", + auto Wk = graph_->param(prefixProj + "_Wk", {dimModel, dimModel}, inits::glorot_uniform); - auto bk = graph->param( + auto bk = graph_->param( prefixProj + "_bk", {1, dimModel}, inits::zeros); - auto Wv = graph->param( + auto Wv = graph_->param( prefixProj + "_Wv", {dimModel, dimModel}, inits::glorot_uniform); - auto bv = graph->param(prefixProj + "_bv", {1, dimModel}, inits::zeros); + auto bv = graph_->param(prefixProj + "_bv", {1, dimModel}, inits::zeros); auto kh = affine(keys[i], Wk, bk); // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] auto vh = affine(values[i], Wv, bv); @@ -247,7 +244,7 @@ public: // apply multi-head attention to downscaled inputs auto output - = Attention(graph, options, prefix, qh, kh, vh, masks[i], inference); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] + = Attention(prefix, qh, kh, vh, values[i], masks[i]); // [-4: beam depth * batch size, -3: num heads, -2: max length, -1: split vector dim] output = JoinHeads(output, q->shape()[-4]); // [-4: beam depth, -3: batch size, 
-2: max length, -1: vector dim] outputs.push_back(output); @@ -261,244 +258,143 @@ public: int dimAtt = output->shape()[-1]; - bool project = !options->get<bool>("transformer-no-projection"); + bool project = !opt<bool>("transformer-no-projection"); if(project || dimAtt != dimOut) { auto Wo - = graph->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform); - auto bo = graph->param(prefix + "_bo", {1, dimOut}, inits::zeros); + = graph_->param(prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform); + auto bo = graph_->param(prefix + "_bo", {1, dimOut}, inits::zeros); output = affine(output, Wo, bo); } return output; } - static Expr LayerAttention(Ptr<ExpressionGraph> graph, - Ptr<Options> options, - std::string prefix, - Expr input, - Expr keys, - Expr values, - Expr mask, - bool inference = false) { - return LayerAttention(graph, - options, - prefix, - input, - std::vector<Expr>{keys}, - std::vector<Expr>{values}, - std::vector<Expr>{mask}, - inference); + // TODO: the multi-input version below is never used. Can we remove it? 
+ Expr LayerAttention(std::string prefix, Expr input, Expr keys, Expr values, Expr mask) const { + return LayerAttention_(prefix, input, std::vector<Expr>{keys}, std::vector<Expr>{values}, std::vector<Expr>{mask}); } - static Expr LayerAttention(Ptr<ExpressionGraph> graph, - Ptr<Options> options, - std::string prefix, - Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] - const std::vector<Expr> &keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] - const std::vector<Expr> &values, - const std::vector<Expr> &masks, // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] - bool inference = false) { - using namespace keywords; - + Expr LayerAttention_(std::string prefix, + Expr input, // [-4: beam depth, -3: batch size, -2: max length, -1: vector dim] + const std::vector<Expr> &keys, // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + const std::vector<Expr> &values, // ...? + const std::vector<Expr> &masks) const { // [-4: batch size, -3: num heads broadcast=1, -2: max length broadcast=1, -1: max length] int dimModel = input->shape()[-1]; - float dropProb = inference ? 0 : options->get<float>("transformer-dropout"); - auto opsPre = options->get<std::string>("transformer-preprocess"); - auto output = PreProcess(graph, prefix + "_Wo", opsPre, input, dropProb); + float dropProb = inference_ ? 
0 : opt<float>("transformer-dropout"); + auto opsPre = opt<std::string>("transformer-preprocess"); + auto output = preProcess(prefix + "_Wo", opsPre, input, dropProb); - auto heads = options->get<int>("transformer-heads"); + auto heads = opt<int>("transformer-heads"); // multi-head self-attention over previous input - output = MultiHead(graph, - options, - prefix, - dimModel, - heads, - output, - keys, - values, - masks, - inference); - - auto opsPost = options->get<std::string>("transformer-postprocess"); - output - = PostProcess(graph, prefix + "_Wo", opsPost, output, input, dropProb); + output = MultiHead(prefix, dimModel, heads, output, keys, values, masks); + + auto opsPost = opt<std::string>("transformer-postprocess"); + output = postProcess(prefix + "_Wo", opsPost, output, input, dropProb); return output; } Expr DecoderLayerSelfAttention(rnn::State& decoderState, const rnn::State& prevDecoderState, - Ptr<ExpressionGraph> graph, - Ptr<Options> options, std::string prefix, Expr input, Expr selfMask, - int startPos, - bool inference = false) { - - using namespace keywords; - + int startPos) const { selfMask = transposedLogMask(selfMask); auto values = input; if(startPos > 0) { - values = concatenate({prevDecoderState.output, input}, - axis = -2); + values = concatenate({prevDecoderState.output, input}, /*axis=*/-2); } decoderState.output = values; // TODO: do not recompute matrix multiplies - return LayerAttention(graph, - options, - prefix, - input, - values, - values, - selfMask, - inference); + return LayerAttention(prefix, input, values, values, selfMask); } - Expr LayerFFN(Ptr<ExpressionGraph> graph, - Ptr<Options> options, - std::string prefix, - Expr input, - bool inference = false) { - using namespace keywords; + static inline + std::function<Expr(Expr)> activationByName(const std::string& actName) + { + if (actName == "relu") + return (ActivationFunction*)relu; + else if (actName == "swish") + return (ActivationFunction*)swish; + ABORT("Invalid 
activation name '{}'", actName); + } + Expr LayerFFN(std::string prefix, Expr input) const { int dimModel = input->shape()[-1]; - float dropProb = inference ? 0 : options->get<float>("transformer-dropout"); - auto opsPre = options->get<std::string>("transformer-preprocess"); - auto output = PreProcess(graph, prefix + "_ffn", opsPre, input, dropProb); + float dropProb = inference_ ? 0 : opt<float>("transformer-dropout"); + auto opsPre = opt<std::string>("transformer-preprocess"); + auto output = preProcess(prefix + "_ffn", opsPre, input, dropProb); - int dimFfn = options->get<int>("transformer-dim-ffn"); - int depthFfn = options->get<int>("transformer-ffn-depth"); - auto act = options->get<std::string>("transformer-ffn-activation"); + int dimFfn = opt<int>("transformer-dim-ffn"); + int depthFfn = opt<int>("transformer-ffn-depth"); + auto actFn = activationByName(opt<std::string>("transformer-ffn-activation")); float ffnDropProb - = inference ? 0 : options->get<float>("transformer-dropout-ffn"); + = inference_ ? 0 : opt<float>("transformer-dropout-ffn"); ABORT_IF(depthFfn < 1, "Filter depth {} is smaller than 1", depthFfn); - int i = 1; - int dimLast = dimModel; - for(; i < depthFfn; ++i) { - int dimFirst = i == 1 ? 
dimModel : dimFfn; - auto W = graph->param( - prefix + "_W" + std::to_string(i), {dimFirst, dimFfn}, inits::glorot_uniform); - auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimFfn}, inits::zeros); - - output = affine(output, W, b); - - if(act == "relu") - output = relu(output); - else - output = swish(output); - - if(ffnDropProb) - output = dropout(output, ffnDropProb); - - dimLast = dimFfn; - } - - auto W = graph->param( - prefix + "_W" + std::to_string(i), {dimLast, dimModel}, inits::glorot_uniform); - auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimModel}, inits::zeros); - - output = affine(output, W, b); + // the stack of FF layers + for(int i = 1; i < depthFfn; ++i) + output = dense(output, prefix, /*suffix=*/std::to_string(i), dimFfn, actFn, ffnDropProb); + output = dense(output, prefix, /*suffix=*/std::to_string(depthFfn), dimModel); - auto opsPost = options->get<std::string>("transformer-postprocess"); + auto opsPost = opt<std::string>("transformer-postprocess"); output - = PostProcess(graph, prefix + "_ffn", opsPost, output, input, dropProb); + = postProcess(prefix + "_ffn", opsPost, output, input, dropProb); return output; } - // Implementation of Average Attention Network Layer (ANN) from + // Implementation of Average Attention Network Layer (AAN) from // https://arxiv.org/pdf/1805.00631.pdf - Expr LayerAAN(Ptr<ExpressionGraph> graph, - Ptr<Options> options, - std::string prefix, - Expr x, - Expr y, - bool inference = false) { - using namespace keywords; - + Expr LayerAAN(std::string prefix, Expr x, Expr y) const { int dimModel = x->shape()[-1]; - float dropProb = inference ? 0 : options->get<float>("transformer-dropout"); - auto opsPre = options->get<std::string>("transformer-preprocess"); + float dropProb = inference_ ? 
0 : opt<float>("transformer-dropout"); + auto opsPre = opt<std::string>("transformer-preprocess"); - y = PreProcess(graph, prefix + "_ffn", opsPre, y, dropProb); + y = preProcess(prefix + "_ffn", opsPre, y, dropProb); // FFN - int dimAan = options->get<int>("transformer-dim-aan"); - int depthAan = options->get<int>("transformer-aan-depth"); - auto act = options->get<std::string>("transformer-aan-activation"); - float aanDropProb = inference ? 0 : options->get<float>("transformer-dropout-ffn"); - - int i = 1; - int dimLast = dimModel; - for(; i < depthAan; ++i) { - int dimFirst = i == 1 ? dimModel : dimAan; - auto W = graph->param( - prefix + "_W" + std::to_string(i), {dimFirst, dimAan}, inits::glorot_uniform); - auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimAan}, inits::zeros); - - y = affine(y, W, b); - - if(act == "relu") - y = relu(y); - else - y = swish(y); - - if(aanDropProb) - y = dropout(y, aanDropProb); - - dimLast = dimAan; - } - - if(dimLast != dimModel) { - auto W = graph->param( - prefix + "_W" + std::to_string(i), {dimLast, dimModel}, inits::glorot_uniform); - auto b = graph->param(prefix + "_b" + std::to_string(i), {1, dimModel}, inits::zeros); - y = affine(y, W, b); - } - - bool noGate = options->get<bool>("transformer-aan-nogate"); + int dimAan = opt<int>("transformer-dim-aan"); + int depthAan = opt<int>("transformer-aan-depth"); + auto actFn = activationByName(opt<std::string>("transformer-aan-activation")); + float aanDropProb = inference_ ? 
0 : opt<float>("transformer-dropout-ffn"); + + // the stack of AAN layers + for(int i = 1; i < depthAan; ++i) + y = dense(y, prefix, /*suffix=*/std::to_string(i), dimAan, actFn, aanDropProb); + if(y->shape()[-1] != dimModel) // bring it back to the desired dimension if needed + y = dense(y, prefix, std::to_string(depthAan), dimModel); + + bool noGate = opt<bool>("transformer-aan-nogate"); if(!noGate) { - auto Wi = graph->param(prefix + "_Wi", {dimModel, dimModel}, inits::glorot_uniform); - auto bi = graph->param(prefix + "_bi", {1, dimModel}, inits::zeros); - - auto Wf = graph->param(prefix + "_Wf", {dimModel, dimModel}, inits::glorot_uniform); - auto bf = graph->param(prefix + "_bf", {1, dimModel}, inits::zeros); - - auto gi = logit(affine(x, Wi, bi)); - auto gf = logit(affine(y, Wf, bf)); + auto gi = dense(x, prefix, /*suffix=*/"i", dimModel, (ActivationFunction*)sigmoid); + auto gf = dense(y, prefix, /*suffix=*/"f", dimModel, (ActivationFunction*)sigmoid); y = gi * x + gf * y; } - auto opsPost = options->get<std::string>("transformer-postprocess"); - y = PostProcess(graph, prefix + "_ffn", opsPost, y, x, dropProb); + auto opsPost = opt<std::string>("transformer-postprocess"); + y = postProcess(prefix + "_ffn", opsPost, y, x, dropProb); return y; } - // Implementation of Average Attention Network Layer (ANN) from + // Implementation of Average Attention Network Layer (AAN) from // https://arxiv.org/pdf/1805.00631.pdf // Function wrapper using decoderState as input. Expr DecoderLayerAAN(rnn::State& decoderState, const rnn::State& prevDecoderState, - Ptr<ExpressionGraph> graph, - Ptr<Options> options, std::string prefix, Expr input, Expr selfMask, - int startPos, - bool inference = false) { - - using namespace keywords; - + int startPos) const { auto output = input; if(startPos > 0) { // we are decoding at a position after 0 @@ -508,27 +404,28 @@ public: // we are training or scoring, because there is no history and // the context is larger than a single time step. 
We do not need // to average batch with only single words. - selfMask = selfMask / sum(selfMask, axis=-1); + selfMask = selfMask / sum(selfMask, /*axis=*/-1); output = bdot(selfMask, output); } - decoderState.output = output; + decoderState.output = output; // BUGBUG: mutable? - return LayerAAN(graph, options, prefix, input, output, inference); + return LayerAAN(prefix, input, output); } }; -class EncoderTransformer : public EncoderBase, public Transformer { +class EncoderTransformer : public Transformer<EncoderBase> { public: - EncoderTransformer(Ptr<Options> options) : EncoderBase(options) {} + EncoderTransformer(Ptr<Options> options) : Transformer(options) {} - Expr WordEmbeddings(Ptr<ExpressionGraph> graph, - Ptr<data::CorpusBatch> batch) { + // returns the embedding matrix based on options + // And based on batchIndex_. + Expr wordEmbeddings(int subBatchIndex) const { // standard encoder word embeddings - int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; + int dimVoc = opt<std::vector<int>>("dim-vocabs")[subBatchIndex]; int dimEmb = opt<int>("dim-emb"); - auto embFactory = embedding(graph)("dimVocab", dimVoc)("dimEmb", dimEmb); + auto embFactory = embedding(graph_)("dimVocab", dimVoc)("dimEmb", dimEmb); if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all")) embFactory("prefix", "Wemb"); @@ -541,7 +438,7 @@ public: if(options_->has("embedding-vectors")) { auto embFiles = opt<std::vector<std::string>>("embedding-vectors"); embFactory // - ("embFile", embFiles[batchIndex_]) // + ("embFile", embFiles[subBatchIndex]) // ("normalization", opt<bool>("embedding-normalization")); } @@ -549,19 +446,22 @@ public: } Ptr<EncoderState> build(Ptr<ExpressionGraph> graph, - Ptr<data::CorpusBatch> batch) { - using namespace keywords; + Ptr<data::CorpusBatch> batch) override { + graph_ = graph; + return apply(batch); + } + Ptr<EncoderState> apply(Ptr<data::CorpusBatch> batch) const { int dimEmb = opt<int>("dim-emb"); int dimBatch = batch->size(); 
int dimSrcWords = (*batch)[batchIndex_]->batchWidth(); - auto embeddings = WordEmbeddings(graph, batch); + auto embeddings = wordEmbeddings(batchIndex_); // embedding matrix, considering tying and some other options // embed the source words in the batch Expr batchEmbeddings, batchMask; std::tie(batchEmbeddings, batchMask) - = EncoderBase::lookup(graph, embeddings, batch); + = EncoderBase::lookup(graph_, embeddings, batch); // apply dropout over source words float dropoutSrc = inference_ ? 0 : opt<float>("dropout-src"); @@ -573,45 +473,38 @@ public: // according to paper embeddings are scaled up by \sqrt(d_m) auto scaledEmbeddings = std::sqrt(dimEmb) * batchEmbeddings; - scaledEmbeddings = AddPositionalEmbeddings(graph, scaledEmbeddings); + scaledEmbeddings = addPositionalEmbeddings(scaledEmbeddings); // reorganize batch and timestep scaledEmbeddings = atleast_nd(scaledEmbeddings, 4); batchMask = atleast_nd(batchMask, 4); - auto layer = TransposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + auto layer = transposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] auto layerMask - = reshape(TransposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length] + = reshape(transposeTimeBatch(batchMask), {1, dimBatch, 1, dimSrcWords}); // [-4: beam depth=1, -3: batch size, -2: vector dim=1, -1: max length] auto opsEmb = opt<std::string>("transformer-postprocess-emb"); float dropProb = inference_ ? 
0 : opt<float>("transformer-dropout"); - layer = PreProcess(graph, prefix_ + "_emb", opsEmb, layer, dropProb); + layer = preProcess(prefix_ + "_emb", opsEmb, layer, dropProb); layerMask = transposedLogMask(layerMask); // [-4: batch size, -3: 1, -2: vector dim=1, -1: max length] // apply encoder layers auto encDepth = opt<int>("enc-depth"); for(int i = 1; i <= encDepth; ++i) { - layer = LayerAttention(graph, - options_, - prefix_ + "_l" + std::to_string(i) + "_self", - layer, - layer, - layer, - layerMask, - inference_); - - layer = LayerFFN(graph, - options_, - prefix_ + "_l" + std::to_string(i) + "_ffn", - layer, - inference_); + layer = LayerAttention(prefix_ + "_l" + std::to_string(i) + "_self", + layer, // query + layer, // keys + layer, // values + layerMask); + + layer = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", layer); } // restore organization of batch and time steps. This is currently required // to make RNN-based decoders and beam search work with this. We are looking // into making this more natural. 
- auto context = TransposeTimeBatch(layer); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] + auto context = transposeTimeBatch(layer); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] return New<EncoderState>(context, batchMask, batch); } @@ -655,27 +548,62 @@ public: } }; -class DecoderTransformer : public DecoderBase, public Transformer { -protected: +class DecoderTransformer : public Transformer<DecoderBase> { +private: Ptr<mlp::MLP> output_; +private: + void LazyCreateOutputLayer(std::string prefix) + { + if(output_) // create it lazily + return; + + int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; + + auto layerOut = mlp::output(graph_) // + ("prefix", prefix_ + "_ff_logit_out") // + ("dim", dimTrgVoc); + + if(opt<bool>("tied-embeddings") || opt<bool>("tied-embeddings-all")) { + std::string tiedPrefix = prefix_ + "_Wemb"; + if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src")) + tiedPrefix = "Wemb"; + layerOut.tie_transposed("W", tiedPrefix); + } + + if(shortlist_) + layerOut.set_shortlist(shortlist_); + + // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim] + // assemble layers into MLP and apply to embeddings, decoder context and + // aligned source context + output_ = mlp::mlp(graph_) // + .push_back(layerOut) // + .construct(); + } + public: - DecoderTransformer(Ptr<Options> options) : DecoderBase(options) {} + DecoderTransformer(Ptr<Options> options) : Transformer(options) {} virtual Ptr<DecoderState> startState( Ptr<ExpressionGraph> graph, Ptr<data::CorpusBatch> batch, - std::vector<Ptr<EncoderState>> &encStates) { + std::vector<Ptr<EncoderState>> &encStates) override { + graph_ = graph; rnn::States startStates; return New<TransformerState>(startStates, nullptr, encStates, batch); } virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph, - Ptr<DecoderState> state) { - using namespace keywords; + Ptr<DecoderState> state) override { + ABORT_IF(graph != 
graph_, "An inconsistent graph parameter was passed to step()."); + LazyCreateOutputLayer(prefix_ + "_ff_logit_out"); + return step(state); + } - auto embeddings = state->getTargetEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] - auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis + Ptr<DecoderState> step(Ptr<DecoderState> state) const { + auto embeddings = state->getTargetEmbeddings(); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] + auto decoderMask = state->getTargetMask(); // [max length, batch size, 1] --this is a hypothesis // dropout target words float dropoutTrg = inference_ ? 0 : opt<float>("dropout-trg"); @@ -700,24 +628,24 @@ public: int startPos = state->getPosition(); scaledEmbeddings - = AddPositionalEmbeddings(graph, scaledEmbeddings, startPos); + = addPositionalEmbeddings(scaledEmbeddings, startPos); scaledEmbeddings = atleast_nd(scaledEmbeddings, 4); // reorganize batch and timestep - auto query = TransposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + auto query = transposeTimeBatch(scaledEmbeddings); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] auto opsEmb = opt<std::string>("transformer-postprocess-emb"); float dropProb = inference_ ? 
0 : opt<float>("transformer-dropout"); - query = PreProcess(graph, prefix_ + "_emb", opsEmb, query, dropProb); + query = preProcess(prefix_ + "_emb", opsEmb, query, dropProb); int dimTrgWords = query->shape()[-2]; - int dimBatch = query->shape()[-3]; - auto selfMask = TriangleMask(graph, dimTrgWords); // [ (1,) 1, max length, max length] + int dimBatch = query->shape()[-3]; + auto selfMask = triangleMask(dimTrgWords); // [ (1,) 1, max length, max length] if(decoderMask) { decoderMask = atleast_nd(decoderMask, 4); // [ 1, max length, batch size, 1 ] - decoderMask = reshape(TransposeTimeBatch(decoderMask),// [ 1, batch size, max length, 1 ] + decoderMask = reshape(transposeTimeBatch(decoderMask),// [ 1, batch size, max length, 1 ] {1, dimBatch, 1, dimTrgWords}); // [ 1, batch size, 1, max length ] selfMask = selfMask * decoderMask; // if(dimBeam > 1) @@ -731,17 +659,17 @@ public: auto encoderContext = encoderState->getContext(); auto encoderMask = encoderState->getMask(); - encoderContext = TransposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] + encoderContext = transposeTimeBatch(encoderContext); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] int dimSrcWords = encoderContext->shape()[-2]; int dims = encoderMask->shape().size(); encoderMask = atleast_nd(encoderMask, 4); - encoderMask = reshape(TransposeTimeBatch(encoderMask), + encoderMask = reshape(transposeTimeBatch(encoderMask), {1, dimBatch, 1, dimSrcWords}); encoderMask = transposedLogMask(encoderMask); if(dimBeam > 1) - encoderMask = repeat(encoderMask, dimBeam, axis = -4); + encoderMask = repeat(encoderMask, dimBeam, /*axis=*/ -4); encoderContexts.push_back(encoderContext); encoderMasks.push_back(encoderMask); @@ -749,97 +677,54 @@ public: rnn::States prevDecoderStates = state->getStates(); rnn::States decoderStates; - // apply layers - for(int i = 1; i <= opt<int>("dec-depth"); ++i) { + // apply decoder layers + auto decDepth = 
opt<int>("dec-depth"); + for(int i = 1; i <= decDepth; ++i) { rnn::State decoderState; rnn::State prevDecoderState; if(prevDecoderStates.size() > 0) prevDecoderState = prevDecoderStates[i - 1]; + // self-attention std::string layerType = opt<std::string>("transformer-decoder-autoreg", "self-attention"); - if(layerType == "self-attention") { - query = DecoderLayerSelfAttention(decoderState, - prevDecoderState, - graph, - options_, - prefix_ + "_l" + std::to_string(i) + "_self", - query, - selfMask, - startPos, - inference_); - } else if(layerType == "average-attention") { - query = DecoderLayerAAN(decoderState, - prevDecoderState, - graph, - options_, - prefix_ + "_l" + std::to_string(i) + "_aan", - query, - selfMask, - startPos, - inference_); - } else { + if(layerType == "self-attention") + query = DecoderLayerSelfAttention(decoderState, prevDecoderState, prefix_ + "_l" + std::to_string(i) + "_self", query, selfMask, startPos); + else if(layerType == "average-attention") + query = DecoderLayerAAN(decoderState, prevDecoderState, prefix_ + "_l" + std::to_string(i) + "_aan", query, selfMask, startPos); + else ABORT("Unknown auto-regressive layer type in transformer decoder {}", layerType); - } decoderStates.push_back(decoderState); + // source-target attention // Iterate over multiple encoders and simply stack the attention blocks if(encoderContexts.size() > 0) { - for(int j = 0; j < encoderContexts.size(); ++j) { + for(int j = 0; j < encoderContexts.size(); ++j) { // multiple encoders are applied one after another std::string prefix = prefix_ + "_l" + std::to_string(i) + "_context"; if(j > 0) prefix += "_enc" + std::to_string(j + 1); - query = LayerAttention(graph, - options_, - prefix, + query = LayerAttention(prefix, query, - encoderContexts[j], - encoderContexts[j], - encoderMasks[j], - inference_); + encoderContexts[j], // keys + encoderContexts[j], // values + encoderMasks[j]); } } - query = LayerFFN(graph, // [-4: beam depth=1, -3: batch size, -2: max length, 
-1: vector dim] - options_, - prefix_ + "_l" + std::to_string(i) + "_ffn", - query, - inference_); + query = LayerFFN(prefix_ + "_l" + std::to_string(i) + "_ffn", query); // [-4: beam depth=1, -3: batch size, -2: max length, -1: vector dim] } - auto decoderContext = TransposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] + auto decoderContext = transposeTimeBatch(query); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vector dim] //************************************************************************// - if(!output_) { - int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; - - auto layerOut = mlp::output(graph) // - ("prefix", prefix_ + "_ff_logit_out") // - ("dim", dimTrgVoc); - - if(opt<bool>("tied-embeddings") || opt<bool>("tied-embeddings-all")) { - std::string tiedPrefix = prefix_ + "_Wemb"; - if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src")) - tiedPrefix = "Wemb"; - layerOut.tie_transposed("W", tiedPrefix); - } - - if(shortlist_) - layerOut.set_shortlist(shortlist_); + // final feed-forward layer (output) + Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim] - // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim] - // assemble layers into MLP and apply to embeddings, decoder context and - // aligned source context - output_ = mlp::mlp(graph) // - .push_back(layerOut) // - .construct(); - } - - Expr logits = output_->apply(decoderContext); + int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; // return unormalized(!) 
probabilities auto nextState = New<TransformerState>(decoderStates, @@ -851,7 +736,9 @@ public: } // helper function for guided alignment - virtual const std::vector<Expr> getAlignments(int i = 0) { return {}; } + virtual const std::vector<Expr> getAlignments(int i = 0) { + return {}; + } void clear() { output_ = nullptr; diff --git a/src/rnn/attention.h b/src/rnn/attention.h index fc09e9b7..92a89e99 100644 --- a/src/rnn/attention.h +++ b/src/rnn/attention.h @@ -80,7 +80,7 @@ public: W_comb_att_lnb_ = graph->param( prefix + "_W_comb_att_lnb", {1, dimEncState}, inits::zeros); - mappedContext_ = layer_norm(affine(contextDropped_, Ua_, ba_), + mappedContext_ = layerNorm(affine(contextDropped_, Ua_, ba_), Wc_att_lns_, Wc_att_lnb_, NEMATUS_LN_EPS); @@ -91,7 +91,7 @@ public: prefix + "_att_gamma2", {1, dimEncState}, inits::from_value(1.0)); mappedContext_ - = layer_norm(dot(contextDropped_, Ua_), gammaContext_, ba_); + = layerNorm(dot(contextDropped_, Ua_), gammaContext_, ba_); } } else { @@ -121,10 +121,10 @@ public: auto mappedState = dot(recState, Wa_); if(layerNorm_) if(nematusNorm_) - mappedState = layer_norm( + mappedState = layerNorm( mappedState, W_comb_att_lns_, W_comb_att_lnb_, NEMATUS_LN_EPS); else - mappedState = layer_norm(mappedState, gammaState_); + mappedState = layerNorm(mappedState, gammaState_); auto attReduce = attOps(va_, mappedContext_, mappedState); diff --git a/src/rnn/cells.h b/src/rnn/cells.h index b813d3ac..f3299144 100644 --- a/src/rnn/cells.h +++ b/src/rnn/cells.h @@ -81,7 +81,7 @@ public: auto xW = dot(input, W_); if(layerNorm_) - xW = layer_norm(xW, gamma1_); + xW = layerNorm(xW, gamma1_); return {xW}; } @@ -94,7 +94,7 @@ public: stateDropped = dropout(recState, dropMaskS_); auto sU = dot(stateDropped, U_); if(layerNorm_) - sU = layer_norm(sU, gamma2_); + sU = layerNorm(sU, gamma2_); Expr output; if(xWs.empty()) @@ -207,7 +207,7 @@ public: auto xW = dot(input, W_); if(layerNorm_) - xW = layer_norm(xW, gamma1_); + xW = layerNorm(xW, 
gamma1_); return {xW}; } @@ -222,7 +222,7 @@ public: auto sU = dot(stateDropped, U_); if(layerNorm_) - sU = layer_norm(sU, gamma2_); + sU = layerNorm(sU, gamma2_); Expr xW; if(xWs.empty()) { @@ -406,8 +406,8 @@ public: W = affine(input, W_, b_); Wx = affine(input, Wx_, bx_); } - W = layer_norm(W, W_lns_, W_lnb_, NEMATUS_LN_EPS); - Wx = layer_norm(Wx, Wx_lns_, Wx_lnb_, NEMATUS_LN_EPS); + W = layerNorm(W, W_lns_, W_lnb_, NEMATUS_LN_EPS); + Wx = layerNorm(Wx, Wx_lns_, Wx_lnb_, NEMATUS_LN_EPS); xW = concatenate({W, Wx}, keywords::axis = -1); } else { @@ -434,8 +434,8 @@ public: Expr Ux; // Temp_2_ in Amun if(encoder_) { - U = layer_norm(dot(stateDropped, U_), U_lns_, U_lnb_, NEMATUS_LN_EPS); - Ux = layer_norm( + U = layerNorm(dot(stateDropped, U_), U_lns_, U_lnb_, NEMATUS_LN_EPS); + Ux = layerNorm( dot(stateDropped, Ux_), Ux_lns_, Ux_lnb_, NEMATUS_LN_EPS); if(transition_) { @@ -449,8 +449,8 @@ public: U = dot(stateDropped, U_); Ux = dot(stateDropped, Ux_); } - U = layer_norm(U, U_lns_, U_lnb_, NEMATUS_LN_EPS); - Ux = layer_norm(Ux, Ux_lns_, Ux_lnb_, NEMATUS_LN_EPS); + U = layerNorm(U, U_lns_, U_lnb_, NEMATUS_LN_EPS); + Ux = layerNorm(Ux, Ux_lns_, Ux_lnb_, NEMATUS_LN_EPS); } sU = concatenate({U, Ux}, keywords::axis = -1); @@ -555,7 +555,7 @@ public: auto xW = dot(input, W_); if(layerNorm_) - xW = layer_norm(xW, gamma1_); + xW = layerNorm(xW, gamma1_); return {xW}; } @@ -573,7 +573,7 @@ public: auto sU = dot(recStateDropped, U_); if(layerNorm_) - sU = layer_norm(sU, gamma2_); + sU = layerNorm(sU, gamma2_); Expr xW; if(xWs.empty()) { @@ -648,7 +648,7 @@ public: auto xWs = CellType::applyInput({input}); auto xWm = affine(input, Wm_, bwm_); if(CellType::layerNorm_) - xWm = layer_norm(xWm, gamma1m_); + xWm = layerNorm(xWm, gamma1m_); xWs.push_back(xWm); return xWs; @@ -662,7 +662,7 @@ public: auto sUm = affine(state.output, Um_, bm_); if(CellType::layerNorm_) - sUm = layer_norm(sUm, gamma2m_); + sUm = layerNorm(sUm, gamma2m_); auto mstate = xWm * sUm; @@ -757,9 +757,9 @@ 
public: auto sUo = affine(recState, Uo_, bo_); auto sUc = affine(recState, Uc_, bc_); - auto f = logit(xWs[0] + sUf); - auto i = logit(xWs[1] + sUi); - auto o = logit(xWs[2] + sUo); + auto f = sigmoid(xWs[0] + sUf); + auto i = sigmoid(xWs[1] + sUi); + auto o = sigmoid(xWs[2] + sUo); auto c = tanh(xWs[3] + sUc); auto nextCellState = f * cellState + i * c; diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp index 1a007cf7..7310102d 100644 --- a/src/tensors/cpu/tensor_operators.cpp +++ b/src/tensors/cpu/tensor_operators.cpp @@ -13,7 +13,7 @@ namespace marian { namespace cpu { -inline float stableLogit(float x) { +inline float stableSigmoid(float x) { if(x >= 0) { float z = expf(-x); return 1.0 / (1.0 + z); @@ -458,12 +458,12 @@ void GRUFastForward(Tensor out_, std::vector<Tensor> inputs, bool final) { #pragma omp simd for(int i = 0; i < cols; ++i) { - // @TODO: stable logit - float r = stableLogit(xWrow[i] + sUrow[i] + b[i]); + // @TODO: stable sigmoid + float r = stableSigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; - float z = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float z = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; float h; @@ -515,8 +515,8 @@ void GRUFastBackward(std::vector<Tensor> outputs, int k = i + cols; int l = i + 2 * cols; - float r = stableLogit(rowXW[i] + rowSU[i] + b[i]); - float z = stableLogit(rowXW[k] + rowSU[k] + b[k]); + float r = stableSigmoid(rowXW[i] + rowSU[i] + b[i]); + float z = stableSigmoid(rowXW[k] + rowSU[k] + b[k]); float h; if(final) @@ -931,10 +931,10 @@ void LSTMCellForward(Tensor out_, std::vector<Tensor> inputs) { const float* sUrow = sU + j * cols * 4; for(int i = 0; i < cols; ++i) { - float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]); + float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; - float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; float gc = 
std::tanh(xWrow[l] + sUrow[l] + b[l]); @@ -964,7 +964,7 @@ void LSTMOutputForward(Tensor out_, std::vector<Tensor> inputs) { for(int i = 0; i < cols; ++i) { int k = i + 3 * cols; - float go = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); rowOut[i] = go * std::tanh(rowCell[i]); } @@ -1004,10 +1004,10 @@ void LSTMCellBackward(std::vector<Tensor> outputs, const float* rowAdj = adj + j * cols; for(int i = 0; i < cols; ++i) { - float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]); + float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; - float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; float gc = std::tanh(xWrow[l] + sUrow[l] + b[l]); @@ -1089,7 +1089,7 @@ void LSTMOutputBackward(std::vector<Tensor> outputs, for(int i = 0; i < cols; ++i) { int k = i + 3 * cols; - float go = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); float t = std::tanh(rowCell[i]); diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index a026906b..27f35b95 100644 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -8,7 +8,7 @@ template void Add<BinaryFunctor<elem::Mult, Capture, Assignee<1>>, marian::Tenso template void Add<BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<1>, Capture>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Neg, Assignee<1>>, Assignee<2>>, BinaryFunctor<elem::Mult, Assignee<3>, Assignee<3>>>, float, marian::Tensor, 
marian::Tensor, marian::Tensor, marian::Tensor); template void Add<UnaryFunctor<elem::Neg, Assignee<1>>, marian::Tensor>(UnaryFunctor<elem::Neg, Assignee<1>>, float, marian::Tensor, marian::Tensor); -template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Logit, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Logit, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Plus, Assignee<3>, BinaryFunctor<elem::Mult, UnaryFunctor<elem::Sigmoid, Assignee<2>>, BinaryFunctor<elem::Minus, Capture, Assignee<3>>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::Div, Capture, Assignee<2>>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, Assignee<1>, BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, 
BinaryFunctor<elem::sPReLUBack, Assignee<2>, Capture>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -17,8 +17,8 @@ template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Assignee<1 template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2>>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Bump, Assignee<1>, Capture>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Lt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Geq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); -template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<3>, Assignee<2>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void 
Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); -template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Logit, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); +template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>); diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc index 279ae7b3..02d269f3 100644 --- a/src/tensors/gpu/element.inc +++ b/src/tensors/gpu/element.inc @@ -1,10 +1,10 @@ using namespace functional; -template void Element<Assign<Var<1>, BinaryFunctor<elem::Mult, 
Assignee<2>, UnaryFunctor<elem::Logit, Assignee<2>>>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Mult, Assignee<2>, UnaryFunctor<elem::Logit, Assignee<2>>>>, marian::Tensor, marian::Tensor); +template void Element<Assign<Var<1>, BinaryFunctor<elem::Mult, Assignee<2>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Mult, Assignee<2>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>>, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Exp, Assignee<2>>>, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, UnaryFunctor<elem::Log, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Log, Assignee<2>>>, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, BinaryFunctor<elem::sPReLU, Assignee<2>, Capture>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::sPReLU, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, UnaryFunctor<elem::sReLU, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::sReLU, Assignee<2>>>, marian::Tensor, marian::Tensor); -template void Element<Assign<Var<1>, UnaryFunctor<elem::Logit, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Logit, Assignee<2>>>, marian::Tensor, marian::Tensor); +template void Element<Assign<Var<1>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>, marian::Tensor>(Assign<Var<1>, UnaryFunctor<elem::Sigmoid, Assignee<2>>>, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, Assignee<2>>>>, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, BinaryFunctor<elem::Plus, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, 
marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Plus, Assignee<1>, BinaryFunctor<elem::Mult, Assignee<2>, Assignee<2>>>>, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Div, Capture, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, Assignee<2>>, Capture>>, Assignee<3>>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Div, Capture, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, Assignee<2>>, Capture>>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor); @@ -38,6 +38,6 @@ template void Element<Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunc template void Element<Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<1> >, Capture>, Capture, Assignee<1> > >>(Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<1> >, Capture>, Capture, Assignee<1> > >, marian::Tensor); template void Element<Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<2> >, Capture>, Capture, Capture> >, marian::Tensor >(Assign<Var<1>, TernaryFunctor<elem::IfThenElse, BinaryFunctor<elem::Leq, UnaryFunctor<elem::Abs, Assignee<2> >, Capture>, Capture, Capture> >, marian::Tensor, marian::Tensor); template void Element<Assign<Var<1>, BinaryFunctor<elem::Clip, Assignee<2>, Capture>>, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Clip, Assignee<2>, Capture>>, marian::Tensor, marian::Tensor); -template void Element<Assign<Var<1>, BinaryFunctor<elem::LogSum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::LogSum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor); -template void Element<Assign<Var<1>, BinaryFunctor<elem::Max, Assignee<2>, 
Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Max, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor); -template void Element<Assign<Var<1>, BinaryFunctor<elem::Min, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Min, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor); +template void Element<Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::LogAddExp, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor); +template void Element<Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Maximum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor); +template void Element<Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor>(Assign<Var<1>, BinaryFunctor<elem::Minimum, Assignee<2>, Assignee<3>>>, marian::Tensor, marian::Tensor, marian::Tensor); diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu index e20dd8b4..87861a1c 100644 --- a/src/tensors/gpu/tensor_operators.cu +++ b/src/tensors/gpu/tensor_operators.cu @@ -17,7 +17,7 @@ struct isnan_test { __host__ __device__ bool operator()(const float a) const { return isnan(a); } }; -__device__ inline float stableLogit(float x) { +__device__ inline float stableSigmoid(float x) { if(x >= 0) { float z = expf(-x); return 1.0 / (1.0 + z); @@ -847,11 +847,11 @@ __global__ void gGRUFastForward(float* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { - float r = stableLogit(xWrow[i] + sUrow[i] + b[i]); + float r = stableSigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; - float z = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float z = 
stableSigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; float h; @@ -922,8 +922,8 @@ __global__ void gGRUFastBackward(float* outState, int k = i + cols; int l = i + 2 * cols; - float r = stableLogit(rowXW[i] + rowSU[i] + b[i]); - float z = stableLogit(rowXW[k] + rowSU[k] + b[k]); + float r = stableSigmoid(rowXW[i] + rowSU[i] + b[i]); + float z = stableSigmoid(rowXW[k] + rowSU[k] + b[k]); float h; if(final) @@ -1653,10 +1653,10 @@ __global__ void gLSTMCellForward(float* out, for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { - float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]); + float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; - float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; float gc = tanhf(xWrow[l] + sUrow[l] + b[l]); @@ -1709,7 +1709,7 @@ __global__ void gLSTMOutputForward(float* out, int i = tid + threadIdx.x; if(i < cols) { int k = i + 3 * cols; - float go = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); rowOut[i] = go * tanhf(rowCell[i]); } @@ -1766,10 +1766,10 @@ __global__ void gLSTMCellBackward(float* outCell, for(int tid = 0; tid < cols; tid += blockDim.x) { int i = tid + threadIdx.x; if(i < cols) { - float gf = stableLogit(xWrow[i] + sUrow[i] + b[i]); + float gf = stableSigmoid(xWrow[i] + sUrow[i] + b[i]); int k = i + cols; - float gi = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float gi = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); int l = i + 2 * cols; float gc = tanhf(xWrow[l] + sUrow[l] + b[l]); @@ -1866,7 +1866,7 @@ __global__ void gLSTMOutputBackward(float* outCell, int i = tid + threadIdx.x; if(i < cols) { int k = i + 3 * cols; - float go = stableLogit(xWrow[k] + sUrow[k] + b[k]); + float go = stableSigmoid(xWrow[k] + sUrow[k] + b[k]); float t = tanhf(rowCell[i]); @@ -1923,7 +1923,7 @@ __global__ void gHighwayForward(float* out, for(int bid = 0; 
bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { - float sigma = stableLogit(t[index]); + float sigma = stableSigmoid(t[index]); out[index] = in1[index] * sigma + in2[index] * (1.f - sigma); } } @@ -1955,7 +1955,7 @@ __global__ void gHighwayBackward(float* out1, for(int bid = 0; bid < length; bid += blockDim.x * gridDim.x) { int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < length) { - float sigma = stableLogit(t[index]); + float sigma = stableSigmoid(t[index]); out1[index] = sigma * adj[index]; out2[index] = (1.f - sigma) * adj[index]; outt[index] diff --git a/src/tests/tensor_test.cu b/src/tests/tensor_test.cu index 5f54ccfa..72cdc276 100644 --- a/src/tests/tensor_test.cu +++ b/src/tests/tensor_test.cu @@ -186,10 +186,10 @@ //} // //template <typename X> -//struct Logit { +//struct Sigmoid { // X x; // -// __HD__ Logit(X _x) : x(_x) {} +// __HD__ Sigmoid(X _x) : x(_x) {} // // template <typename ...Args> // __HDI__ float operator()(Args&&... args) { @@ -198,8 +198,8 @@ //}; // //template <class X> -//__HDI__ Logit<X> logit(X x) { -// return Logit<X>(x); +//__HDI__ Sigmoid<X> logit(X x) { +// return Sigmoid<X>(x); //} // ///******************************************************************************/ @@ -392,7 +392,7 @@ __HDI__ auto simple(Mult<X, Y> f)->decltype(cut(simple(f.x) * simple(f.y))) { //} // //template <typename X, int N> -//__HDI__ auto grad(Logit<X> f, Var<N> g)->decltype(f * (C<1>() - f) * grad(f.x, g)) { +//__HDI__ auto grad(Sigmoid<X> f, Var<N> g)->decltype(f * (C<1>() - f) * grad(f.x, g)) { // return f * (C<1>() - f) * grad(f.x, g); //} |