github.com/marian-nmt/marian.git
-rw-r--r--               src/CMakeLists.txt                    1
-rwxr-xr-x               src/common/config_parser.cpp          7
-rwxr-xr-x               src/common/utils.cpp                  5
-rwxr-xr-x               src/common/utils.h                    1
-rwxr-xr-x               src/functional/tmp.h                 27
-rwxr-xr-x               src/graph/expression_operators.cpp   29
-rwxr-xr-x               src/graph/expression_operators.h      9
-rwxr-xr-x               src/graph/node_operators_binary.h     4
-rwxr-xr-x               src/graph/node_operators_unary.h    130
-rwxr-xr-x               src/layers/constructors.h             6
-rwxr-xr-x               src/layers/generic.cpp              339
-rwxr-xr-x               src/layers/generic.h                110
-rwxr-xr-x               src/layers/loss.cpp                  15
-rwxr-xr-x               src/models/decoder.h                 77
-rwxr-xr-x               src/models/s2s.h                      1
-rwxr-xr-x               src/models/transformer.h             62
-rwxr-xr-x               src/models/transformer_factory.h      1
-rwxr-xr-x               src/tensors/cpu/add.h                37
-rwxr-xr-x               src/tensors/gpu/add.cu               80
-rwxr-xr-x [-rw-r--r--]  src/tensors/gpu/add.h                 2
-rwxr-xr-x [-rw-r--r--]  src/tensors/gpu/add.inc              10
-rwxr-xr-x               src/tensors/gpu/element.inc           1
-rwxr-xr-x               src/tensors/tensor.h                  4
-rwxr-xr-x               src/tensors/tensor_operators.h       20
-rwxr-xr-x               src/tests/operator_tests.cpp         54
-rwxr-xr-x               vs/Marian.vcxproj                     1
-rwxr-xr-x               vs/Marian.vcxproj.filters             3
27 files changed, 770 insertions, 266 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 8c757f10..4c16aa1d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -47,6 +47,7 @@ add_library(marian STATIC
graph/node_initializers.cpp
layers/convolution.cpp
+ layers/generic.cpp
layers/loss.cpp
layers/weight.cpp
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index 70a30a5f..77604285 100755
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -30,6 +30,7 @@ const std::set<std::string> PATHS = {
"train-sets",
"vocabs",
"embedding-vectors",
+ "embedding-factors",
"valid-sets",
"valid-script-path",
"valid-log",
@@ -385,6 +386,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) {
"Fix source embeddings. Affects all encoders");
cli.add<bool>("--embedding-fix-trg",
"Fix target embeddings. Affects all decoders");
+ cli.add_nondefault<std::vector<std::string>>("--embedding-factors",
+ "Paths to (factor map, factor list) file for factored embeddings");
cli.add<bool>("--multi-node",
"Enable asynchronous multi-node training through MPI (and legacy sync if combined with --sync-sgd)");
@@ -466,6 +469,8 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) {
"stdout");
cli.add<std::vector<std::string>>("--vocabs,-v",
"Paths to vocabulary files have to correspond to --input");
+ cli.add_nondefault<std::vector<std::string>>("--embedding-factors",
+ "Paths to (factor map, factor list) file for factored embeddings");
// decoding options
cli.add<size_t>("--beam-size,-b",
"Beam size used during search with validating translator",
@@ -528,6 +533,8 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) {
"Paths to vocabulary files have to correspond to --train-sets. "
"If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. "
"If these files do not exists they are created");
+ cli.add_nondefault<std::vector<std::string>>("--embedding-factors",
+ "Paths to (factor map, factor list) file for factored embeddings");
cli.add<bool>("--n-best",
"Score n-best list instead of plain text corpus");
cli.add<std::string>("--n-best-feature",
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
index bde78835..252afa54 100755
--- a/src/common/utils.cpp
+++ b/src/common/utils.cpp
@@ -144,6 +144,11 @@ std::string withCommas(size_t n) {
return res;
}
+bool beginsWith(const std::string& text, const std::string& prefix) {
+ return text.size() >= prefix.size()
+ && !text.compare(0, prefix.size(), prefix);
+}
+
bool endsWith(const std::string& text, const std::string& suffix) {
return text.size() >= suffix.size()
&& !text.compare(text.size() - suffix.size(), suffix.size(), suffix);
diff --git a/src/common/utils.h b/src/common/utils.h
index 94113a0e..d76d07fa 100755
--- a/src/common/utils.h
+++ b/src/common/utils.h
@@ -36,6 +36,7 @@ std::string exec(const std::string& cmd);
std::pair<std::string, int> hostnameAndProcessId();
std::string withCommas(size_t n);
+bool beginsWith(const std::string& text, const std::string& prefix);
bool endsWith(const std::string& text, const std::string& suffix);
std::string toUpper(const std::string& s);
diff --git a/src/functional/tmp.h b/src/functional/tmp.h
index 08383660..7c8f6fa1 100755
--- a/src/functional/tmp.h
+++ b/src/functional/tmp.h
@@ -118,55 +118,56 @@ __HDI__ float apply(Functor functor,
/******************************************************************************/
+// @TODO: Rename this. It is a reduction loop.
template <size_t n, size_t N, size_t K>
struct Loop {
- template <class Functor>
+ template <class Functor, class AggFunctor>
__HDI__ static float result(
- Functor functor,
+ Functor functor, float aggInit, AggFunctor aggFunctor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
- float sum = 0;
+ float agg = aggInit;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - n]; ++i) {
for(size_t j = 0; j < K; ++j) {
acc[j] = pAcc[j] + (dim[N - n] + i) * in[j].shape().bstride(N - n);
}
- sum += Loop<n - 1, N, K>::result(functor, in, acc, length, dim);
+ agg = aggFunctor(agg, Loop<n - 1, N, K>::result(functor, aggInit, aggFunctor, in, acc, length, dim));
}
- return sum;
+ return agg;
}
};
template <size_t N, size_t K>
struct Loop<1, N, K> {
- template <class Functor>
+ template <class Functor, class AggFunctor>
__HDI__ static float result(
- Functor functor,
+ Functor functor, float aggInit, AggFunctor aggFunctor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, K>& pAcc,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
- float sum = 0;
+ float agg = aggInit;
functional::Array<int, K> acc;
for(int i = 0; i < length[N - 1]; ++i) {
for(size_t j = 0; j < K; ++j) {
acc[j] = pAcc[j] + (dim[N - 1] + i) * in[j].shape().bstride(N - 1);
}
- sum += apply<K>(functor, in, acc);
+ agg = aggFunctor(agg, apply<K>(functor, in, acc));
}
- return sum;
+ return agg;
}
};
-template <size_t N, size_t K, class Functor>
-__HDI__ float loops(Functor functor,
+template <size_t N, size_t K, class Functor, class AggFunctor>
+__HDI__ float loops(Functor functor, float aggInit, AggFunctor aggFunctor,
functional::Array<functional::Tensor<float>, K>& in,
const functional::Array<int, N>& length,
const functional::Array<int, N>& dim) {
functional::Array<int, K> acc = {0};
- return Loop<N, N, K>::result(functor, in, acc, length, dim);
+ return Loop<N, N, K>::result(functor, aggInit, aggFunctor, in, acc, length, dim);
}
} // namespace functional
} // namespace marian
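Note (not part of the commit): the change above generalizes the hard-coded sum reduction into an arbitrary aggregation driven by an initial value and a binary aggregation functor. A minimal standalone C++ sketch of the same idea, using plain lambdas instead of marian's functional framework:

    #include <algorithm>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // The loop no longer hard-codes "sum += x"; it threads an init value and a
    // binary aggregation functor through the iteration, so the same loop can
    // produce sum, max, min or product reductions.
    template <class AggFunctor>
    float aggregate(const std::vector<float>& in, float aggInit, AggFunctor aggFunctor) {
      float agg = aggInit;
      for (float x : in)
        agg = aggFunctor(agg, x);   // replaces the old "sum += x"
      return agg;
    }

    int main() {
      std::vector<float> v = {1.0f, 3.0f, 2.0f};
      float sum  = aggregate(v, 0.0f, [](float a, float b) { return a + b; });
      float mx   = aggregate(v, std::numeric_limits<float>::lowest(),
                             [](float a, float b) { return std::max(a, b); });
      float prod = aggregate(v, 1.0f, [](float a, float b) { return a * b; });
      std::printf("sum=%g max=%g prod=%g\n", sum, mx, prod);
    }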
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 826bd9f0..6a07611d 100755
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -306,11 +306,36 @@ Expr slice(Expr a, int axis, Slice slice) { // numpy __getslice__ semantics, but
}
Expr sum(Expr a, int ax) {
- return Expression<SumNodeOp>(a, ax);
+ return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::sum);
}
Expr mean(Expr a, int ax) {
- return Expression<MeanNodeOp>(a, ax);
+ return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::mean);
+}
+
+Expr std(Expr a, int ax) {
+ return Expression<ReduceNodeOp>(a - mean(a,ax), ax, ReduceNodeOpCode::rms);
+}
+
+Expr var(Expr a, int ax) {
+ return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::meanSqr);
+}
+
+Expr max(Expr a, int ax) {
+ return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::max);
+}
+
+Expr min(Expr a, int ax) {
+ return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::min);
+}
+
+Expr prod(Expr a, int ax) {
+ return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::prod);
+}
+
+// log(sum(exp(a)))
+Expr logsumexp(Expr a, int ax) {
+ return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::logSumExp);
}
Expr scalar_product(Expr a, Expr b, int ax) {
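Note (not part of the commit): logsumexp(a, ax) computes log(sum(exp(a))) along an axis as a fold with a pairwise logaddexp, which keeps the computation in log space. A standalone sketch of that fold, with an invented stable logaddexp helper, showing that large logits do not overflow:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <limits>
    #include <vector>

    // logaddexp(a,b) = log(exp(a) + exp(b)), computed without overflowing exp()
    float logaddexp(float a, float b) {
      return std::max(a, b) + std::log1p(std::exp(-std::fabs(a - b)));
    }

    // left fold with logaddexp, starting from the same init as a max-reduction
    float logsumexp(const std::vector<float>& x) {
      float agg = std::numeric_limits<float>::lowest();
      for (float xi : x)
        agg = logaddexp(agg, xi);
      return agg;
    }

    int main() {
      std::vector<float> logits = {1000.0f, 1001.0f, 999.0f};
      std::printf("%f\n", logsumexp(logits));  // ~1001.41, no overflow
    }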
diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h
index 58149bde..78aed834 100755
--- a/src/graph/expression_operators.h
+++ b/src/graph/expression_operators.h
@@ -173,6 +173,13 @@ static inline Expr narrow(Expr a, int axis, size_t start, size_t length) { // Py
/*********************************************************/
Expr sum(Expr a, int ax = 0);
+Expr mean(Expr a, int ax = 0);
+Expr std(Expr a, int ax);
+Expr var(Expr a, int ax);
+Expr max(Expr a, int ax);
+Expr min(Expr a, int ax);
+Expr prod(Expr a, int ax);
+Expr logsumexp(Expr a, int ax);
Expr softmax(Expr x, int axis = -1);
@@ -182,8 +189,6 @@ Expr softmax(Expr a, Expr zeroOneMask, int axis = -1);
Expr logsoftmax(Expr a);
-Expr mean(Expr a, int ax = 0);
-
Expr cross_entropy(Expr a, Expr b);
Expr scalar_product(Expr a, Expr b, int ax = 0);
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index 10e2ca76..7d090823 100755
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -432,8 +432,8 @@ public:
ABORT_IF(S_offsets->shape()[0] - 1 != S_shape[0],
"Sparse matrix offset vector has incorrect size");
auto outShape = D->shape();
- ABORT_IF(S_shape[transS == swapOperands ? 1 : 0] != outShape[-(int)swapOperands],
- "Matrix product requires inner dimensions to match");
+ ABORT_IF(S_shape[transS == swapOperands ? 1 : 0] != outShape[-(int)swapOperands],
+ "Matrix product requires inner dimensions to match");
outShape.set(-(int)swapOperands, S_shape[transS != swapOperands]);
return outShape;
}
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index 7dbaec46..6dd90faf 100755
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -412,20 +412,75 @@ struct LogSoftmaxNodeOp : public UnaryNodeOp {
const std::string type() override { return "logsoftmax"; }
};
-struct SumNodeOp : public UnaryNodeOp {
+enum class ReduceNodeOpCode {
+ sum, mean, rms, meanSqr, min, max, prod, logSumExp
+};
+
+struct ReduceNodeOp : public UnaryNodeOp {
int axis_;
+ ReduceNodeOpCode opCode_;
+ int reducedDim_; // dimension of axis being reduced, e.g. used in mean()
- SumNodeOp(Expr a, int axis) : UnaryNodeOp(a, newShape(a, axis)) {}
+ ReduceNodeOp(Expr a, int axis, ReduceNodeOpCode opCode)
+ : UnaryNodeOp(a, newShape(a, axis)), opCode_(opCode)
+ {
+ reducedDim_ = a->shape()[axis]; // e.g. used in mean()
+ ABORT_IF(reducedDim_ != a->shape().elements() / shape().elements(), "bug in determining reducedDim");
+ }
NodeOps forwardOps() override {
using namespace functional;
- return {NodeOp(Reduce(_1, val_, child(0)->val()))};
+ switch (opCode_) {
+ case ReduceNodeOpCode::sum:
+ return {NodeOp(Reduce(_1, val_, child(0)->val()))};
+ case ReduceNodeOpCode::mean:
+ return {NodeOp(Reduce(_1, 1.0f / (float)reducedDim_, val_, child(0)->val()))};
+ case ReduceNodeOpCode::rms:
+ return {NodeOp(Reduce(_1 * _1, 1.0f / (float)reducedDim_, val_, child(0)->val());
+ Element(_1 = sqrt(_1), val_))};
+ case ReduceNodeOpCode::meanSqr:
+ return {NodeOp(Reduce(_1 * _1, 1.0f / (float)reducedDim_, val_, child(0)->val()))};
+ case ReduceNodeOpCode::min:
+ return {NodeOp(Reduce(_1, min(_1,_2), std::numeric_limits<float>::max(), val_, child(0)->val()))};
+ case ReduceNodeOpCode::max:
+ return {NodeOp(Reduce(_1, max(_1,_2), std::numeric_limits<float>::lowest(), val_, child(0)->val()))};
+ case ReduceNodeOpCode::prod:
+ return {NodeOp(Reduce(_1, _1 * _2, 1.0f, val_, child(0)->val()))};
+ case ReduceNodeOpCode::logSumExp:
+ return {NodeOp(Reduce(_1, logaddexp(_1,_2), std::numeric_limits<float>::lowest(), val_, child(0)->val()))};
+ default:
+ ABORT("Unexpected reduction op-code {}", (int)opCode_);
+ }
}
NodeOps backwardOps() override {
using namespace functional;
- return {NodeOp(Add(_1, child(0)->grad(), adj_))};
+ switch (opCode_) {
+ case ReduceNodeOpCode::sum:
+ return {NodeOp(Add(_1, child(0)->grad(), adj_))};
+ case ReduceNodeOpCode::mean:
+ return {NodeOp(Add(_1, 1.0f / (float)reducedDim_, child(0)->grad(), adj_))};
+ case ReduceNodeOpCode::rms: // WARNING: UNTESTED!!
+ // y = (sum_j x_j^2)^0.5
+ // dJ/dx_i = dJ/dy * 0.5 (sum_j x_j^2)^-0.5 * 2 x_i = dJ/dy * x_i / y --@REVIEW: is this correct?
+ // @TODO: do we need protection against div by 0? L'hospital rule?
+ return {NodeOp(Add(_1 * _2 / _3, child(0)->grad(), adj_, child(0)->val(), val_))};
+ case ReduceNodeOpCode::meanSqr: // WARNING: UNTESTED!!
+ // y = sum_j x_j^2
+ // dJ/dx_i = dJ/dy * sum_j dx_j^2/dx_i = dJ/dy * 2 dx_i --@REVIEW: is this correct?
+ return {NodeOp(Add(_1 * 2.0f * _2, child(0)->grad(), adj_, child(0)->val()))};
+ case ReduceNodeOpCode::min: // WARNING: UNTESTED!!
+ case ReduceNodeOpCode::max: // WARNING: UNTESTED!!
+ // adj_ gets routed into the min/max value --@REVIEW: is this correct?
+ return {NodeOp(Add((_1 == _2) * _3, child(0)->grad(), child(0)->val(), val_, adj_))};
+ case ReduceNodeOpCode::logSumExp:
+ // y = log(sum_j exp(x_j))
+ // dJ/dx_i = dJ/dy * 1/(sum_j exp(x_j)) exp(x_i) = dJ/dy * exp(x_i - y)) --@REVIEW: is this correct?
+ return {NodeOp(Add(_1 * exp(_2 - _3), child(0)->grad(), adj_, child(0)->val(), val_))};
+ default:
+ ABORT("Unexpected reduction op-code {}", (int)opCode_);
+ }
}
Shape newShape(Expr a, int axis) {
@@ -436,66 +491,27 @@ struct SumNodeOp : public UnaryNodeOp {
return shape;
}
- const std::string type() override { return "sum"; }
-
- const std::string color() override { return "orange"; }
-
- virtual size_t hash() override {
- if(!hash_) {
- hash_ = NaryNodeOp::hash();
- util::hash_combine(hash_, axis_);
+ const std::string type() override {
+ switch (opCode_) {
+ case ReduceNodeOpCode::sum: return "sum";
+ case ReduceNodeOpCode::mean: return "mean";
+ case ReduceNodeOpCode::rms: return "rms";
+ case ReduceNodeOpCode::meanSqr: return "meanSqr";
+ case ReduceNodeOpCode::min: return "min";
+ case ReduceNodeOpCode::max: return "max";
+ case ReduceNodeOpCode::prod: return "prod";
+ case ReduceNodeOpCode::logSumExp: return "logSumExp";
+ default: ABORT("Unexpected reduction op-code {}", (int)opCode_);
}
- return hash_;
}
- virtual bool equal(Expr node) override {
- if(!NaryNodeOp::equal(node))
- return false;
- Ptr<SumNodeOp> cnode = std::dynamic_pointer_cast<SumNodeOp>(node);
- if(!cnode)
- return false;
- if(axis_ != cnode->axis_)
- return false;
- return true;
- }
-};
-
-struct MeanNodeOp : public UnaryNodeOp {
- int axis_;
-
- MeanNodeOp(Expr a, int axis) : UnaryNodeOp(a, newShape(a, axis)) {}
-
- NodeOps forwardOps() override {
- using namespace functional;
- int left = child(0)->shape().elements() / val_->shape().elements();
- float scale = 1.f / left;
-
- return {NodeOp(Reduce(_1, scale, val_, child(0)->val()))};
- }
-
- NodeOps backwardOps() override {
- using namespace functional;
- int left = child(0)->shape().elements() / val_->shape().elements();
- float scale = 1.f / left;
-
- return {NodeOp(Add(_1, scale, child(0)->grad(), adj_))};
- }
-
- Shape newShape(Expr a, int axis) {
- Shape shape = a->shape();
- axis_ = shape.axis(axis);
- shape.set(axis_, 1);
- return shape;
- }
-
- const std::string type() override { return "mean"; }
-
const std::string color() override { return "orange"; }
virtual size_t hash() override {
if(!hash_) {
hash_ = NaryNodeOp::hash();
util::hash_combine(hash_, axis_);
+ util::hash_combine(hash_, (int)opCode_);
}
return hash_;
}
@@ -503,10 +519,10 @@ struct MeanNodeOp : public UnaryNodeOp {
virtual bool equal(Expr node) override {
if(!NaryNodeOp::equal(node))
return false;
- Ptr<MeanNodeOp> cnode = std::dynamic_pointer_cast<MeanNodeOp>(node);
+ Ptr<ReduceNodeOp> cnode = std::dynamic_pointer_cast<ReduceNodeOp>(node);
if(!cnode)
return false;
- if(axis_ != cnode->axis_)
+ if(axis_ != cnode->axis_ || opCode_ != cnode->opCode_)
return false;
return true;
}
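Note (not part of the commit): the logSumExp backward case above claims dJ/dx_i = dJ/dy * exp(x_i - y). A standalone finite-difference check of the underlying derivative dy/dx_i = exp(x_i - y), i.e. softmax(x)_i; the test values and step size are illustrative only:

    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <vector>

    float logsumexp(const std::vector<float>& x) {
      float m = x[0];
      for (float xi : x) m = std::max(m, xi);
      float s = 0;
      for (float xi : x) s += std::exp(xi - m);
      return m + std::log(s);
    }

    int main() {
      std::vector<float> x = {0.5f, -1.0f, 2.0f};
      float y = logsumexp(x);
      const float eps = 1e-3f;
      for (size_t i = 0; i < x.size(); ++i) {
        std::vector<float> xp = x; xp[i] += eps;
        std::vector<float> xm = x; xm[i] -= eps;
        float numeric  = (logsumexp(xp) - logsumexp(xm)) / (2 * eps);  // central difference
        float analytic = std::exp(x[i] - y);                           // exp(x_i - y)
        std::printf("i=%zu numeric=%.5f analytic=%.5f\n", i, numeric, analytic);
      }
    }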
diff --git a/src/layers/constructors.h b/src/layers/constructors.h
index d0ac3487..5ed7f3f5 100755
--- a/src/layers/constructors.h
+++ b/src/layers/constructors.h
@@ -147,4 +147,10 @@ public:
// @TODO: change naming convention.
typedef Accumulator<MLPFactory> mlp;
} // namespace mlp
+
+typedef ConstructingFactory<Embedding> EmbeddingFactory;
+typedef ConstructingFactory<ULREmbedding> ULREmbeddingFactory;
+
+typedef Accumulator<EmbeddingFactory> embedding;
+typedef Accumulator<ULREmbeddingFactory> ulr_embedding;
} // namespace marian
diff --git a/src/layers/generic.cpp b/src/layers/generic.cpp
new file mode 100755
index 00000000..2941b689
--- /dev/null
+++ b/src/layers/generic.cpp
@@ -0,0 +1,339 @@
+#include "marian.h"
+
+#include "layers/generic.h"
+
+using std::size_t; // not sure why this is needed
+
+namespace marian {
+ struct CSRSparseTensor { // simplistic for now
+ Shape shape;
+ Expr values; // [k_i..k_{i+1}-1] -> value at [i,j]
+ Expr indices; // [k_i..k_{i+1}-1] -> j of non-null value
+ Expr offsets; // [i] -> k_i
+ };
+
+ class EmbeddingFactorMapping {
+ public:
+ struct CSRData {
+ Shape shape;
+ std::vector<float> weights;
+ std::vector<IndexType> indices;
+ std::vector<IndexType> offsets;
+ };
+ // mapPath = path to file with entries in order of vocab entries of the form
+ // WORD FACTOR1 FACTOR2 FACTOR3...
+ // listPath = path to file that lists all FACTOR names
+ // vocab = original vocabulary
+ // Note: The WORD field in the map file is redundant. It is required for consistency checking only.
+ // Factors are grouped
+ // - user specifies list-factor prefixes; all factors beginning with that prefix are in the same group
+  //  - factors within a group are treated as multi-class and normalized that way
+ // - groups of size 1 are interpreted as sigmoids, multiply with P(u) / P(u-1)
+ // - one prefix must not contain another
+ // - all factors not matching a prefix get lumped into yet another class (the lemmas)
+ // - factor vocab must be sorted such that all groups are consecutive
+ // - result of Output layer is nevertheless logits, not a normalized probability, due to the sigmoid entries
+ EmbeddingFactorMapping(Ptr<Options> options) : factorVocab_(New<Options>(), 0) {
+ std::vector<std::string> paths = options->get<std::vector<std::string>>("embedding-factors");
+ ABORT_IF(paths.size() != 2, "--embedding-factors expects two paths");
+ auto mapPath = paths[0];
+ auto factorVocabPath = paths[1];
+ auto vocabPath = options->get<std::string>("vocab");
+
+ // Note: We misuse the Vocab class a little.
+ // Specifically, it means that the factorVocab_ must contain </s> and "<unk>".
+ Vocab vocab(New<Options>(), 0);
+ vocab.load(vocabPath);
+ factorVocab_.load(factorVocabPath);
+ Word numFactors = (Word)factorVocab_.size();
+
+ // load and parse factorMap
+ factorMap_.resize(vocab.size());
+ factorRefCounts_.resize(numFactors);
+ std::vector<std::string> tokens;
+ io::InputFileStream in(mapPath);
+ std::string line;
+ size_t numTotalFactors = 0;
+ for (Word v = 0; io::getline(in, line); v++) {
+ tokens.clear(); // @BUGBUG: should be done in split()
+ utils::splitAny(line, tokens, " \t");
+ ABORT_IF(tokens.size() < 2 || tokens.front() != vocab[v], "Factor map must list words in same order as vocab, and have at least one factor per word", mapPath);
+ for (size_t i = 1; i < tokens.size(); i++) {
+ auto u = factorVocab_[tokens[i]];
+ auto& m = factorMap_[v];
+ m.push_back(u);
+ factorRefCounts_[u]++;
+ }
+ numTotalFactors += tokens.size() - 1;
+ }
+ LOG(info, "[embedding] Factored-embedding map read with total/unique of {}/{} factors for {} words", numTotalFactors, numFactors, vocab.size());
+
+ // form groups
+ // @TODO: hard-coded for these initial experiments
+ std::vector<std::string> groupPrefixes = {
+ "@C",
+ "@GL", "@GR"
+ };
+ groupPrefixes.insert(groupPrefixes.begin(), "(unassigned)"); // first group is fallback for normal words (the string is only used for messages)
+ size_t numGroups = groupPrefixes.size();
+ factorGroups_.resize(numFactors, 0);
+ for (size_t g = 1; g < groupPrefixes.size(); g++) { // set group labels; what does not match any prefix will stay in group 0
+ const auto& groupPrefix = groupPrefixes[g];
+ for (Word u = 0; u < numFactors; u++)
+ if (utils::beginsWith(factorVocab_[u], groupPrefix)) {
+ ABORT_IF(factorGroups_[u] != 0, "Factor {} matches multiple groups, incl. {}", factorVocab_[u], groupPrefix);
+ factorGroups_[u] = g;
+ }
+ }
+ groupRanges_.resize(numGroups, { SIZE_MAX, (size_t)0 });
+ std::vector<size_t> groupCounts(numGroups); // number of group members
+ for (Word u = 0; u < numFactors; u++) { // determine ranges; these must be non-overlapping, verified via groupCounts
+ auto g = factorGroups_[u];
+ if (groupRanges_[g].first > u)
+ groupRanges_[g].first = u;
+ if (groupRanges_[g].second < u + 1)
+ groupRanges_[g].second = u + 1;
+ groupCounts[g]++;
+ }
+ // determine if a factor needs explicit softmax normalization
+ groupNeedsNormalization_.resize(numGroups, false);
+ for (size_t g = 0; g < numGroups; g++) { // detect non-overlapping groups
+ LOG(info, "[embedding] Factor group '{}' has {} members ({})",
+ groupPrefixes[g], groupCounts[g], groupCounts[g] == 1 ? "sigmoid" : "softmax");
+ // any factor that is not referenced in all words and is not a sigmoid needs normalization
+ if (g == 0) // @TODO: For now we assume that the main factor is used in all words. Test this.
+ continue;
+ if (groupCounts[g] == 1) // sigmoid factors have no normalizer
+ continue;
+ groupNeedsNormalization_[g] = true; // needed
+ ABORT_IF(groupRanges_[g].second - groupRanges_[g].first != groupCounts[g],
+ "Factor group '{}' members should be consecutive in the factor vocabulary", groupPrefixes[g]);
+ LOG(info, "[embedding] Factor group '{}' needs needs explicit normalization ({}..{})", groupPrefixes[g], groupRanges_[g].first, groupRanges_[g].second-1);
+ }
+
+ // create the factor matrix
+ std::vector<IndexType> data(vocab.size());
+ std::iota(data.begin(), data.end(), 0);
+ factorMatrix_ = csr_rows(data); // [V x U]
+ }
+
+ size_t factorVocabSize() const { return factorVocab_.size(); }
+
+ // create a CSR matrix M[V,U] from indices[] with
+ // M[v,u] = 1/c(u) if factor u is a factor of word v, and c(u) is how often u is referenced
+ CSRData csr_rows(const std::vector<IndexType>& words) const {
+ std::vector<float> weights;
+ std::vector<IndexType> indices;
+ std::vector<IndexType> offsets;
+ offsets.reserve(words.size() + 1);
+ indices.reserve(words.size()); // (at least this many)
+ // loop over all input words, and select the corresponding set of unit indices into CSR format
+ offsets.push_back((IndexType)indices.size());
+ for (auto v : words) {
+ const auto& m = factorMap_[v];
+ for (auto u : m) {
+ indices.push_back(u);
+ weights.push_back(1.0f/*/(float)factorRefCounts_[u]*/);
+ }
+ offsets.push_back((IndexType)indices.size()); // next matrix row begins at this offset
+ }
+ return { Shape({(int)words.size(), (int)factorVocab_.size()}), weights, indices, offsets };
+ }
+
+ const CSRData& getFactorMatrix() const { return factorMatrix_; } // [v,u] (sparse) -> =1 if u is factor of v
+ private:
+ Vocab factorVocab_; // [factor name] -> factor index = row of E_
+ std::vector<std::vector<Word>> factorMap_; // [word index] -> set of factor indices
+ std::vector<int> factorRefCounts_; // [factor index] -> how often this factor is referenced in factorMap_
+ CSRData factorMatrix_; // [v,u] (sparse) -> =1 if u is factor of v
+ std::vector<size_t> factorGroups_; // [u] -> group id of factor u
+ public: // @TODO: temporarily; later factor this properly
+ std::vector<std::pair<size_t, size_t>> groupRanges_; // [group id] -> (u_begin,u_end) index range of factors u for this group. These don't overlap.
+ std::vector<bool> groupNeedsNormalization_; // [group id] -> true if explicit softmax normalization is necessary
+ };
+
+ namespace mlp {
+ /*private*/ void Output::lazyConstruct(int inputDim) {
+ // We must construct lazily since we won't know tying nor input dim in constructor.
+ if (W_)
+ return;
+
+ auto name = options_->get<std::string>("prefix");
+ auto dim = options_->get<int>("dim");
+
+ if (options_->has("embedding-factors")) {
+ ABORT_IF(shortlist_, "Shortlists are presently not compatible with factored embeddings");
+ embeddingFactorMapping_ = New<EmbeddingFactorMapping>(options_);
+ dim = (int)embeddingFactorMapping_->factorVocabSize();
+ LOG(info, "[embedding] Factored outputs enabled");
+ }
+
+ if(tiedParam_) {
+ W_ = tiedParam_;
+ transposeW_ = true;
+ } else {
+ W_ = graph_->param(name + "_W", {inputDim, dim}, inits::glorot_uniform);
+ transposeW_ = false;
+ }
+
+ b_ = graph_->param(name + "_b", {1, dim}, inits::zeros);
+ }
+
+ Expr Output::apply(Expr input) /*override*/ {
+ lazyConstruct(input->shape()[-1]);
+
+ if (shortlist_) {
+ if (!cachedShortW_) { // short versions of parameters are cached within one batch, then clear()ed
+ if(transposeW_)
+ cachedShortW_ = rows(W_, shortlist_->indices());
+ else
+ cachedShortW_ = cols(W_, shortlist_->indices());
+ cachedShortb_ = cols(b_, shortlist_->indices());
+ }
+ return affine(input, cachedShortW_, cachedShortb_, false, transposeW_);
+ }
+ else if (embeddingFactorMapping_) {
+ auto graph = input->graph();
+ auto y = affine(input, W_, b_, false, transposeW_); // [B... x U] factor logits
+
+ // denominators (only for groups that don't normalize out naturally by the final softmax())
+ const auto& groupRanges = embeddingFactorMapping_->groupRanges_; // @TODO: factor this properly
+ auto numGroups = groupRanges.size();
+ for (size_t g = 0; g < numGroups; g++) {
+ if (!embeddingFactorMapping_->groupNeedsNormalization_[g]) // @TODO: if we ever need it, we can combine multiple
+ continue;
+ auto range = groupRanges[g];
+ // y: [B... x U]
+ // m: [1 x U] // ones at positions of group members
+ auto yDim = y->shape()[-1];
+ std::vector<float> mVec(yDim, 0.0f); // @TODO: This vector should be produced by embeddingFactorMapping_
+ for (size_t i = range.first; i < range.second; i++)
+ mVec[i] = 1.0f;
+ // need to compute log denominator over y[range] and subtract it from y[range]
+ auto groupY = slice(y, Slice((int)range.first, (int)range.second), /*axis=*/-1); // [B... x Ug]
+ auto groupZ = logsumexp(groupY, /*axis=*/-1); // [B... x 1]
+ auto m = graph->constant({ 1, (int)mVec.size() }, inits::from_vector(mVec)); // [1 x U]
+ auto Z = dot(groupZ, m); // [B... x U]
+ y = y - Z;
+#if 0
+ // and a log-linear weight
+ auto name = options_->get<std::string>("prefix");
+ auto llWeight = graph->param(name + "_llWeight_" + std::to_string(g), {}, inits::from_value(1.0f));
+ y = y * ((llWeight - 1) * m + 1);
+#endif
+ }
+
+ // sum up the unit logits across factors for each target word
+ auto factorMatrix = embeddingFactorMapping_->getFactorMatrix(); // [V x U]
+ y = dot_csr(
+ y, // [B x U]
+ factorMatrix.shape,
+ graph->constant({(int)factorMatrix.weights.size()}, inits::from_vector(factorMatrix.weights), Type::float32),
+ graph->constant({(int)factorMatrix.indices.size()}, inits::from_vector(factorMatrix.indices), Type::uint32),
+ graph->constant({(int)factorMatrix.offsets.size()}, inits::from_vector(factorMatrix.offsets), Type::uint32),
+ /*transB=*/ true); // -> [B x V]
+
+ return y;
+ }
+ else
+ return affine(input, W_, b_, false, transposeW_);
+ }
+ }
+
+ Embedding::Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options) : LayerBase(graph, options) {
+ std::string name = opt<std::string>("prefix");
+ int dimVoc = opt<int>("dimVocab");
+ int dimEmb = opt<int>("dimEmb");
+
+ bool fixed = opt<bool>("fixed", false);
+
+ if (options_->has("embedding-factors")) {
+ embeddingFactorMapping_ = New<EmbeddingFactorMapping>(options_);
+ dimVoc = (int)embeddingFactorMapping_->factorVocabSize();
+ LOG(info, "[embedding] Factored embeddings enabled");
+ }
+
+ NodeInitializer initFunc = inits::glorot_uniform;
+ if (options_->has("embFile")) {
+ std::string file = opt<std::string>("embFile");
+ if (!file.empty()) {
+ bool norm = opt<bool>("normalization", false);
+ initFunc = inits::from_word2vec(file, dimVoc, dimEmb, norm);
+ }
+ }
+
+ E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed);
+ }
+
+ // helper to embed a sequence of words (given as indices) via factored embeddings
+ /*private*/ Expr Embedding::multiRows(const std::vector<IndexType>& data) const
+ {
+ auto graph = E_->graph();
+ auto factoredData = embeddingFactorMapping_->csr_rows(data);
+ // multi-hot factor vectors are represented as a sparse CSR matrix
+ // [row index = word position index] -> set of factor indices for word at this position
+ ABORT_IF(factoredData.shape != Shape({(int)factoredData.offsets.size()-1/*=rows of CSR*/, E_->shape()[0]}), "shape mismatch??");
+ return csr_dot( // the CSR matrix is passed in pieces
+ factoredData.shape,
+ graph->constant({(int)factoredData.weights.size()}, inits::from_vector(factoredData.weights), Type::float32),
+ graph->constant({(int)factoredData.indices.size()}, inits::from_vector(factoredData.indices), Type::uint32),
+ graph->constant({(int)factoredData.offsets.size()}, inits::from_vector(factoredData.offsets), Type::uint32),
+ E_);
+ }
+
+ std::tuple<Expr/*embeddings*/, Expr/*mask*/> Embedding::apply(Ptr<data::SubBatch> subBatch) const /*override final*/ {
+ auto graph = E_->graph();
+ int dimBatch = (int)subBatch->batchSize();
+ int dimEmb = E_->shape()[-1];
+ int dimWords = (int)subBatch->batchWidth();
+
+ // factored embeddings:
+ // - regular:
+ // - y = x @ E x:[B x 1ofV] ; E:[V x D] ; y:[B x D]
+ // - factored:
+ // - u = x @ M one-hot to U-dimensional multi-hot (all factors in one concatenated space)
+ // - each row of M contains the set of factors for one word => we want a CSR matrix
+ // - y = (x @ M) @ E (x:[B x 1ofV] ; M:[V x U]) ; E:[U x D] ; y:[B x D]
+ // - first compute x @ M on the CPU
+ // - (Uvalues, Uindices, Uoffsets) = csr_rows(Mvalues, Mindices, Moffsets, subBatch->data()):
+ // - shape (U, specifically) not actually needed here
+ // - foreach input x[i]
+ // - locate row M[i,*]
+ // - copy through its index values (std::vector<push_back>)
+ // - create a matching ones vector (we can keep growing)
+ // - convert to GPU-side CSR matrix. CSR matrix now has #rows equal to len(x)
+ // - CSR matrix product with E
+ // - csr_dot(Uvalues, Uindices, Uoffsets, E_, transposeU)
+ // - double-check if all dimensions are specified. Probably not for transpose (which would be like csc_dot()).
+ // - weighting:
+ // - core factors' gradients are sums over all words that use the factors;
+ // - core factors' embeddings move very fast
+ // - words will need to make up for the move; rare words cannot
+ // - so, we multiply each factor with 1/refCount
+ // - core factors get weighed down a lot
+ // - no impact on gradients, as Adam makes up for it; embeddings still move fast just as before
+ // - but forward pass weighs them down, so that all factors are in a similar numeric range
+ // - if it is required to be in a different range, the embeddings can still learn that, but more slowly
+
+ Expr chosenEmbeddings;
+ if (embeddingFactorMapping_)
+ chosenEmbeddings = multiRows(subBatch->data());
+ else
+ chosenEmbeddings = rows(E_, subBatch->data());
+
+ auto batchEmbeddings = reshape(chosenEmbeddings, { dimWords, dimBatch, dimEmb });
+ auto batchMask = graph->constant({ dimWords, dimBatch, 1 },
+ inits::from_vector(subBatch->mask()));
+ return std::make_tuple(batchEmbeddings, batchMask);
+ }
+
+ Expr Embedding::apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const /*override final*/ {
+ int dimEmb = E_->shape()[-1];
+ Expr chosenEmbeddings;
+ if (embeddingFactorMapping_)
+ chosenEmbeddings = multiRows(embIdx);
+ else
+ chosenEmbeddings = rows(E_, embIdx);
+ return reshape(chosenEmbeddings, { dimBeam, 1, dimBatch, dimEmb });
+ }
+} // namespace marian
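Note (not part of the commit): multiRows() computes each word's embedding as the weighted sum of its factors' embedding rows, i.e. one row of (x @ M) @ E with the multi-hot matrix M stored as CSR offsets/indices/weights. A toy standalone sketch of that gather-and-sum, with made-up dimensions and factor assignments:

    #include <cstdio>
    #include <vector>

    int main() {
      const int D = 2;                                // embedding dim; 4 factors in total
      std::vector<float> E = {                        // E: [U x D] factor embedding matrix, row-major
          0.1f, 0.2f,    // factor 0
          0.3f, 0.4f,    // factor 1
          0.5f, 0.6f,    // factor 2
          0.7f, 0.8f };  // factor 3

      // CSR rows for two words: word 0 -> factors {0, 2}, word 1 -> factors {1, 2, 3}
      std::vector<int>   offsets = {0, 2, 5};
      std::vector<int>   indices = {0, 2, 1, 2, 3};
      std::vector<float> weights = {1, 1, 1, 1, 1};   // or 1/refCount(u), as discussed in the comments

      for (size_t w = 0; w + 1 < offsets.size(); ++w) {
        float emb[D] = {0, 0};
        for (int k = offsets[w]; k < offsets[w + 1]; ++k)
          for (int d = 0; d < D; ++d)
            emb[d] += weights[k] * E[indices[k] * D + d];   // sum of this word's factor rows
        std::printf("word %zu -> (%.2f, %.2f)\n", w, emb[0], emb[1]);
      }
    }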
diff --git a/src/layers/generic.h b/src/layers/generic.h
index a9e2be01..d70c7f06 100755
--- a/src/layers/generic.h
+++ b/src/layers/generic.h
@@ -54,6 +54,8 @@ struct IEmbeddingLayer {
virtual Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const = 0;
};
+class EmbeddingFactorMapping;
+
namespace mlp {
class Dense : public LayerBase, public IUnaryLayer {
@@ -124,49 +126,51 @@ public:
class Output : public LayerBase, public IUnaryLayer {
private:
- Expr tiedParam_;
- Ptr<data::Shortlist> shortlist_;
-
- Expr W_;
+ Expr W_; // parameters held by this layer
Expr b_;
+ Expr cachedShortW_; // short-listed version, cached (cleared by clear())
+ Expr cachedShortb_; // these match the current value of shortlist_
+ Ptr<EmbeddingFactorMapping > embeddingFactorMapping_;
+
+ // optional parameters set/updated after construction
+ Expr tiedParam_;
bool transposeW_{false};
+ Ptr<data::Shortlist> shortlist_;
+ void lazyConstruct(int inputDim);
public:
Output(Ptr<ExpressionGraph> graph, Ptr<Options> options)
- : LayerBase(graph, options) {}
+ : LayerBase(graph, options) {
+ clear();
+ }
void tieTransposed(Expr tied) {
- tiedParam_ = tied;
+ if (W_)
+ ABORT_IF(tiedParam_.get() != tied.get(), "Tied output projection cannot be changed once weights have been created");
+ else
+ tiedParam_ = tied;
}
- void setShortlist(Ptr<data::Shortlist> shortlist) { shortlist_ = shortlist; }
-
- Expr apply(Expr input) override {
- if(!W_) {
- auto name = options_->get<std::string>("prefix");
- auto dim = options_->get<int>("dim");
-
- if(tiedParam_) {
- transposeW_ = true;
- W_ = tiedParam_;
- if(shortlist_)
- W_ = rows(W_, shortlist_->indices());
- } else {
- W_ = graph_->param(name + "_W",
- {input->shape()[-1], dim},
- inits::glorot_uniform);
- if(shortlist_)
- W_ = cols(W_, shortlist_->indices());
- }
-
- b_ = graph_->param(name + "_b", {1, dim}, inits::zeros);
- if(shortlist_)
- b_ = cols(b_, shortlist_->indices());
+ void setShortlist(Ptr<data::Shortlist> shortlist) {
+ if (shortlist_)
+ ABORT_IF(shortlist.get() != shortlist_.get(), "Output shortlist cannot be changed except after clear()");
+ else {
+ ABORT_IF(cachedShortW_ || cachedShortb_, "No shortlist but cached parameters??");
+ shortlist_ = shortlist;
}
+ // cachedShortW_ and cachedShortb_ will be created lazily inside apply()
+ }
- return affine(input, W_, b_, false, transposeW_);
+ // this is expected to be called in sync with graph->clear(), which invalidates
+ // cachedShortW_ and cachedShortb_ in the graph's short-term cache
+ void clear() {
+ shortlist_ = nullptr;
+ cachedShortW_ = nullptr;
+ cachedShortb_ = nullptr;
}
+ Expr apply(Expr input) override;
+
virtual Expr apply(const std::vector<Expr>& /*inputs*/) override {
ABORT("Not implemented");
};
@@ -176,45 +180,15 @@ public:
class Embedding : public LayerBase, public IEmbeddingLayer {
Expr E_;
+ Ptr<EmbeddingFactorMapping> embeddingFactorMapping_;
+ Expr multiRows(const std::vector<IndexType>& data) const;
public:
- Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options) : LayerBase(graph, options) {
- std::string name = opt<std::string>("prefix");
- int dimVoc = opt<int>("dimVocab");
- int dimEmb = opt<int>("dimEmb");
-
- bool fixed = opt<bool>("fixed", false);
-
- NodeInitializer initFunc = inits::glorot_uniform;
- if (options_->has("embFile")) {
- std::string file = opt<std::string>("embFile");
- if (!file.empty()) {
- bool norm = opt<bool>("normalization", false);
- initFunc = inits::from_word2vec(file, dimVoc, dimEmb, norm);
- }
- }
+ Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options);
- E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed);
- }
-
- std::tuple<Expr/*embeddings*/, Expr/*mask*/> apply(Ptr<data::SubBatch> subBatch) const override final {
- auto graph = E_->graph();
- int dimBatch = (int)subBatch->batchSize();
- int dimEmb = E_->shape()[-1];
- int dimWords = (int)subBatch->batchWidth();
- // @TODO: merge this with below. Currently can't only due to the extra beam dimension
- auto chosenEmbeddings = rows(E_, subBatch->data());
- auto batchEmbeddings = reshape(chosenEmbeddings, { dimWords, dimBatch, dimEmb });
- auto batchMask = graph->constant({ dimWords, dimBatch, 1 },
- inits::from_vector(subBatch->mask()));
- return std::make_tuple(batchEmbeddings, batchMask);
- }
+ std::tuple<Expr/*embeddings*/, Expr/*mask*/> apply(Ptr<data::SubBatch> subBatch) const override final;
// special version used in decoding
- Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const override final {
- int dimEmb = E_->shape()[-1];
- auto selectedEmbs = rows(E_, embIdx);
- return reshape(selectedEmbs, { dimBeam, 1, dimBatch, dimEmb });
- }
+ Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const override final;
};
class ULREmbedding : public LayerBase, public IEmbeddingLayer {
@@ -322,10 +296,4 @@ public:
ABORT("not implemented"); // ULR cannot be used for decoding
}
};
-
-typedef ConstructingFactory<Embedding> EmbeddingFactory;
-typedef ConstructingFactory<ULREmbedding> ULREmbeddingFactory;
-
-typedef Accumulator<EmbeddingFactory> embedding;
-typedef Accumulator<ULREmbeddingFactory> ulr_embedding;
} // namespace marian
diff --git a/src/layers/loss.cpp b/src/layers/loss.cpp
index 03b79682..d11e4384 100755
--- a/src/layers/loss.cpp
+++ b/src/layers/loss.cpp
@@ -26,13 +26,26 @@ Expr LossBase::getCrossEntropy(Expr logits,
Expr indices,
Expr mask,
Expr weights) {
- auto ce = cross_entropy(logits, indices);
+ Expr ce;
if(smoothing_ > 0) {
// @TODO: add this to CE kernels instead
+#if 0
+ ce = cross_entropy(logits, indices);
auto ceq = mean(logsoftmax(logits), /*axis=*/ -1);
ce = (1 - smoothing_) * ce - smoothing_ * ceq;
+#else // alternative that is cheaper memory-wise
+ ce = cross_entropy(logits, indices);
+ auto ceq = mean(logits, /*axis=*/ -1) - logsumexp(logits, /*axis=*/ -1);
+ ce = (1 - smoothing_) * ce - smoothing_ * ceq;
+ //auto ceq = mean(logits, /*axis=*/ -1) - Z;
+ //ce = (1 - smoothing_) * cols(logits, indices) // ce term
+ // - smoothing_ * mean(logits, /*axis=*/ -1) // smoothing term
+ // - logsumexp(logits, /*axis=*/ -1); // denominator
+#endif
}
+ else
+ ce = cross_entropy(logits, indices);
if(mask)
ce = ce * mask;
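Note (not part of the commit): the cheaper smoothing term relies on the identity mean_i(logsoftmax(x)_i) = mean_i(x_i) - logsumexp(x), since logsoftmax(x)_i = x_i - logsumexp(x) and the subtracted term is constant over i, so the full logsoftmax tensor never needs to be materialized. A tiny numeric check:

    #include <cmath>
    #include <cstdio>
    #include <vector>

    int main() {
      std::vector<float> logits = {2.0f, -1.0f, 0.5f, 3.0f};
      float lse = 0, mean = 0;
      for (float v : logits) { lse += std::exp(v); mean += v; }
      lse = std::log(lse);
      mean /= logits.size();

      float meanLogSoftmax = 0;
      for (float v : logits) meanLogSoftmax += (v - lse);  // logsoftmax element-wise
      meanLogSoftmax /= logits.size();

      std::printf("mean(logsoftmax)=%f  mean-logsumexp=%f\n", meanLogSoftmax, mean - lse);
    }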
diff --git a/src/models/decoder.h b/src/models/decoder.h
index 99fbd4ad..9b7aad9c 100755
--- a/src/models/decoder.h
+++ b/src/models/decoder.h
@@ -4,6 +4,7 @@
#include "states.h"
#include "data/shortlist.h"
+#include "layers/constructors.h"
#include "layers/generic.h"
namespace marian {
@@ -14,6 +15,7 @@ protected:
std::string prefix_{"decoder"};
bool inference_{false};
size_t batchIndex_{1};
+  std::vector<Ptr<IEmbeddingLayer>> embedding_; // @TODO: find a more grammatical name
Ptr<data::Shortlist> shortlist_;
@@ -31,37 +33,41 @@ public:
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph>, Ptr<DecoderState>) = 0;
+ void lazyCreateEmbedding(Ptr<ExpressionGraph> graph) {
+ // @TODO: code dup with EncoderTransformer
+ if (embedding_.size() <= batchIndex_ || !embedding_[batchIndex_]) { // lazy
+ if (embedding_.size() <= batchIndex_)
+ embedding_.resize(batchIndex_ + 1);
+ int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
+ int dimEmb = opt<int>("dim-emb");
+ auto embFactory = embedding()("dimVocab", dimVoc)("dimEmb", dimEmb);
+ if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all"))
+ embFactory("prefix", "Wemb");
+ else
+ embFactory("prefix", prefix_ + "_Wemb");
+ if(options_->has("embedding-fix-trg"))
+ embFactory("fixed", opt<bool>("embedding-fix-trg"));
+ if(options_->has("embedding-vectors")) {
+ auto embFiles = opt<std::vector<std::string>>("embedding-vectors");
+ embFactory("embFile", embFiles[batchIndex_]) //
+ ("normalization", opt<bool>("embedding-normalization"));
+ }
+ if (options_->has("embedding-factors")) {
+ embFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors"));
+ embFactory("vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_]);
+ }
+ embedding_[batchIndex_] = embFactory.construct(graph);
+ }
+ }
+
virtual void embeddingsFromBatch(Ptr<ExpressionGraph> graph,
Ptr<DecoderState> state,
Ptr<data::CorpusBatch> batch) {
-
- int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
- int dimEmb = opt<int>("dim-emb");
-
- auto yEmbFactory = embedding() //
- ("dimVocab", dimVoc) //
- ("dimEmb", dimEmb);
-
- if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all"))
- yEmbFactory("prefix", "Wemb");
- else
- yEmbFactory("prefix", prefix_ + "_Wemb");
-
- if(options_->has("embedding-fix-trg"))
- yEmbFactory("fixed", opt<bool>("embedding-fix-trg"));
-
- if(options_->has("embedding-vectors")) {
- auto embFiles = opt<std::vector<std::string>>("embedding-vectors");
- yEmbFactory("embFile", embFiles[batchIndex_]) //
- ("normalization", opt<bool>("embedding-normalization"));
- }
-
- auto yEmb = yEmbFactory.construct(graph);
-
auto subBatch = (*batch)[batchIndex_];
+ lazyCreateEmbedding(graph);
Expr y, yMask; std::tie
- (y, yMask) = yEmb->apply(subBatch);
+ (y, yMask) = embedding_[batchIndex_]->apply(subBatch);
Expr yData;
if(shortlist_) {
@@ -82,26 +88,13 @@ public:
const std::vector<IndexType>& embIdx,
int dimBatch,
int dimBeam) {
- int dimTrgEmb = opt<int>("dim-emb");
- int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
-
Expr selectedEmbs;
if(embIdx.empty()) {
- selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, inits::zeros);
+ int dimEmb = opt<int>("dim-emb");
+ selectedEmbs = graph->constant({1, 1, dimBatch, dimEmb}, inits::zeros);
} else {
- // embeddings are loaded from model during translation, no fixing required
- auto yEmbFactory = embedding() //
- ("dimVocab", dimTrgVoc) //
- ("dimEmb", dimTrgEmb);
-
- if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all"))
- yEmbFactory("prefix", "Wemb");
- else
- yEmbFactory("prefix", prefix_ + "_Wemb");
-
- auto yEmb = yEmbFactory.construct(graph);
-
- selectedEmbs = yEmb->apply(embIdx, dimBatch, dimBeam);
+ lazyCreateEmbedding(graph);
+ selectedEmbs = embedding_[batchIndex_]->apply(embIdx, dimBatch, dimBeam);
}
state->setTargetEmbeddings(selectedEmbs);
}
diff --git a/src/models/s2s.h b/src/models/s2s.h
index edda79bc..2f4a4579 100755
--- a/src/models/s2s.h
+++ b/src/models/s2s.h
@@ -124,6 +124,7 @@ public:
int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
int dimEmb = opt<int>("dim-emb");
+  // @TODO: code dup with Decoder and EncoderTransformer; actually diverged by now. Unify this.
auto embFactory = embedding() //
("dimVocab", dimVoc) //
("dimEmb", dimEmb);
diff --git a/src/models/transformer.h b/src/models/transformer.h
index 968d481b..f6aa4ff1 100755
--- a/src/models/transformer.h
+++ b/src/models/transformer.h
@@ -6,7 +6,6 @@
#include "marian.h"
#include "layers/constructors.h"
-#include "layers/factory.h"
#include "models/decoder.h"
#include "models/encoder.h"
#include "models/states.h"
@@ -495,7 +494,7 @@ public:
return embFactory.construct(graph_);
}
- Ptr<IEmbeddingLayer> createWordEmbeddingLayer(size_t subBatchIndex) const {
+ Ptr<IEmbeddingLayer> createSourceEmbeddingLayer(size_t subBatchIndex) const {
// standard encoder word embeddings
int dimVoc = opt<std::vector<int>>("dim-vocabs")[subBatchIndex];
int dimEmb = opt<int>("dim-emb");
@@ -511,6 +510,10 @@ public:
embFactory("embFile", embFiles[subBatchIndex])
("normalization", opt<bool>("embedding-normalization"));
}
+ if (options_->has("embedding-factors")) {
+ embFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors"));
+ embFactory("vocab", opt<std::vector<std::string>>("vocabs")[subBatchIndex]);
+ }
return embFactory.construct(graph_);
}
@@ -520,6 +523,7 @@ public:
return apply(batch);
}
+ std::vector<Ptr<IEmbeddingLayer>> embedding_; // @TODO: move away, also rename
Ptr<EncoderState> apply(Ptr<data::CorpusBatch> batch) {
int dimEmb = opt<int>("dim-emb");
int dimBatch = (int)batch->size();
@@ -527,12 +531,15 @@ public:
// create the embedding matrix, considering tying and some other options
// embed the source words in the batch
Expr batchEmbeddings, batchMask;
- Ptr<IEmbeddingLayer> embedding;
- if (options_->has("ulr") && options_->get<bool>("ulr") == true)
- embedding = createULREmbeddingLayer(); // embedding uses ULR
- else
- embedding = createWordEmbeddingLayer(batchIndex_);
- std::tie(batchEmbeddings, batchMask) = embedding->apply((*batch)[batchIndex_]);
+
+ if (embedding_.empty() || !embedding_[batchIndex_]) { // lazy
+ embedding_.resize(batch->sets());
+ if (options_->has("ulr") && options_->get<bool>("ulr") == true)
+ embedding_[batchIndex_] = createULREmbeddingLayer(); // embedding uses ULR
+ else
+ embedding_[batchIndex_] = createSourceEmbeddingLayer(batchIndex_);
+ }
+ std::tie(batchEmbeddings, batchMask) = embedding_[batchIndex_]->apply((*batch)[batchIndex_]);
// apply dropout over source words
float dropoutSrc = inference_ ? 0 : opt<float>("dropout-src");
if(dropoutSrc) {
@@ -601,17 +608,17 @@ public:
class DecoderTransformer : public Transformer<DecoderBase> {
private:
- Ptr<mlp::MLP> output_;
+ Ptr<mlp::Output> output_;
private:
- void LazyCreateOutputLayer()
+ void lazyCreateOutputLayer()
{
if(output_) // create it lazily
return;
int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
- auto layerOut = mlp::output() //
+ auto outputFactory = mlp::output() //
("prefix", prefix_ + "_ff_logit_out") //
("dim", dimTrgVoc);
@@ -619,18 +626,22 @@ private:
std::string tiedPrefix = prefix_ + "_Wemb";
if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src"))
tiedPrefix = "Wemb";
- layerOut.tieTransposed(tiedPrefix);
+ outputFactory.tieTransposed(tiedPrefix);
}
- if(shortlist_)
- layerOut.setShortlist(shortlist_);
-
- // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim]
- // assemble layers into MLP and apply to embeddings, decoder context and
- // aligned source context
- output_ = mlp::mlp() //
- .push_back(layerOut) //
- .construct(graph_);
+ if (options_->has("embedding-factors")) {
+ // factored embeddings, simplistic version (which just adds the logits, like multiplying probs)
+ // z = h @ W // h:[B x D] ; W:[D x V] -> [B x V]
+ // with factors:
+ // z = h @ W @ M' // h:[B x D] ; W:[D x U] ; M':[U x V] -> [B x V]
+ // i.e. multiOutput():
+ // output = dot_csr(output, M, transB=true)
+ // @BUGBUG: need to specify output factors separately if not tied-embeddings or tied-embeddings-all
+ outputFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors"));
+ outputFactory("vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_]);
+ }
+
+ output_ = std::dynamic_pointer_cast<mlp::Output>(outputFactory.construct(graph_)); // (construct() returns only the underlying interface)
}
public:
@@ -662,7 +673,7 @@ public:
virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph,
Ptr<DecoderState> state) override {
ABORT_IF(graph != graph_, "An inconsistent graph parameter was passed to step()");
- LazyCreateOutputLayer();
+ lazyCreateOutputLayer();
return step(state);
}
@@ -818,7 +829,9 @@ public:
//************************************************************************//
// final feed-forward layer (output)
- Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim]
+ if(shortlist_)
+ output_->setShortlist(shortlist_);
+ Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim]
// return unormalized(!) probabilities
Ptr<DecoderState> nextState;
@@ -840,7 +853,8 @@ public:
}
void clear() override {
- output_ = nullptr;
+ if (output_)
+ output_->clear();
cache_.clear();
alignments_.clear();
}
diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h
index aa31e4d1..825c32b9 100755
--- a/src/models/transformer_factory.h
+++ b/src/models/transformer_factory.h
@@ -1,3 +1,4 @@
+// @TODO: rename to transformer.h eventually. This is not a Factory as in factory.h.
#pragma once
#include "marian.h"
diff --git a/src/tensors/cpu/add.h b/src/tensors/cpu/add.h
index 38a0684d..4bae5bb5 100755
--- a/src/tensors/cpu/add.h
+++ b/src/tensors/cpu/add.h
@@ -15,8 +15,8 @@ namespace marian {
namespace cpu {
-template <size_t K, class Functor>
-void gAddGeneric(Functor functor,
+template <size_t K, class Functor, class AggFunctor>
+void gAggregateGeneric(Functor functor, float aggInit, AggFunctor aggFunctor,
const functional::Shape full,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
@@ -34,16 +34,16 @@ void gAddGeneric(Functor functor,
functional::Array<int, N> dims;
for(int index = 0; index < outLength; ++index) {
if(same) {
- out[index] += functional::apply(functor, ins, index) * scale;
+ out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * scale);
} else {
out.shape().dims(index, dims);
- out[index] += functional::loops(functor, ins, len, dims) * scale;
+ out[index] = aggFunctor(out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale);
}
}
}
-template <size_t K, class Functor>
-void gAddEqual(Functor functor,
+template <size_t K, class Functor, class AggFunctor>
+void gAggregateEqual(Functor functor, AggFunctor aggFunctor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale,
@@ -61,12 +61,12 @@ void gAddEqual(Functor functor,
indices[i] = ins[i].shape().bindex(dims);
}
- out[index] += functional::apply(functor, ins, indices) * scale;
+ out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * scale);
}
}
-template <size_t K, class Functor>
-void gAddReduce(Functor functor,
+template <size_t K, class Functor, class AggFunctor>
+void gAggregateReduce(Functor functor, float aggInit, AggFunctor aggFunctor,
const functional::Shape full,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
@@ -79,10 +79,10 @@ void gAddReduce(Functor functor,
same = same && ins[i].shape().elements() == full.elements();
for(int j = 0; j < rows; ++j) {
- float sum = 0;
+ float colSum = aggInit;
if(same) {
for(int id = 0; id < cols; ++id)
- sum += functional::apply(functor, ins, j * cols + id);
+ colSum = aggFunctor(colSum, functional::apply(functor, ins, j * cols + id));
} else {
functional::Array<int, functional::Shape::size()> dims;
for(int id = 0; id < cols; ++id) {
@@ -90,15 +90,15 @@ void gAddReduce(Functor functor,
functional::Array<int, K> indices;
for(size_t i = 0; i < K; ++i)
indices[i] = ins[i].shape().bindex(dims);
- sum += functional::apply(functor, ins, indices);
+ colSum = aggFunctor(colSum, functional::apply(functor, ins, indices));
}
}
- out[j] += sum * scale;
+ out[j] = aggFunctor(out[j], colSum * scale);
}
}
-template <class Functor, class... Tensors>
-void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
+template <class Functor, class AggFunctor, class... Tensors>
+void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) {
auto full = marian::Shape::broadcast({out, tensors...});
//int length = out->shape().elements();
@@ -111,15 +111,16 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
if(full.back() != 1 && out->shape().back() == 1) {
//size_t m = full.elements() / length;
//size_t k = full.back();
- cpu::gAddReduce(functor, full, gOut, gIns, scale);
+ cpu::gAggregateReduce(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
} else if(out->shape() == full) {
bool broadcast = false;
for(size_t i = 0; i < K; ++i)
broadcast = broadcast || gOut.shape() != gIns[i].shape();
- cpu::gAddEqual(functor, gOut, gIns, scale, broadcast);
+ cpu::gAggregateEqual(functor, aggFunctor, gOut, gIns, scale, broadcast);
} else {
- cpu::gAddGeneric(functor, full, gOut, gIns, scale);
+ cpu::gAggregateGeneric(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
}
}
+
} // namespace cpu
} // namespace marian
diff --git a/src/tensors/gpu/add.cu b/src/tensors/gpu/add.cu
index 2431948e..32c12783 100755
--- a/src/tensors/gpu/add.cu
+++ b/src/tensors/gpu/add.cu
@@ -16,8 +16,8 @@ namespace marian {
namespace gpu {
-template <size_t K, class Functor>
-__global__ void gAddGeneric(Functor functor,
+template <size_t K, class Functor, class AggFunctor>
+__global__ void gAggregateGeneric(Functor functor, float aggInit, AggFunctor aggFunctor,
const functional::Shape full,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
@@ -37,17 +37,17 @@ __global__ void gAddGeneric(Functor functor,
int index = bid + blockDim.x * blockIdx.x + threadIdx.x;
if(index < outLength) {
if(same) {
- out[index] += functional::apply(functor, ins, index) * scale;
+ out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * scale);
} else {
out.shape().dims(index, dims);
- out[index] += functional::loops(functor, ins, len, dims) * scale;
+ out[index] = aggFunctor(out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale);
}
}
}
}
-template <size_t K, class Functor>
-__global__ void gAddEqual(Functor functor,
+template <size_t K, class Functor, class AggFunctor>
+__global__ void gAggregateEqual(Functor functor, AggFunctor aggFunctor,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
float scale,
@@ -67,13 +67,13 @@ __global__ void gAddEqual(Functor functor,
indices[i] = ins[i].shape().bindex(dims);
}
- out[index] += functional::apply(functor, ins, indices) * scale;
+ out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * scale);
}
}
}
-template <size_t K, class Functor>
-__global__ void gAddReduce(Functor functor,
+template <size_t K, class Functor, class AggFunctor>
+__global__ void gAggregateReduce(Functor functor, float aggInit, AggFunctor aggFunctor,
const functional::Shape full,
functional::Tensor<float> out,
functional::Array<functional::Tensor<float>, K> ins,
@@ -92,15 +92,15 @@ __global__ void gAddReduce(Functor functor,
float* _sum = _share + blockDim.x;
if(same) {
- _sum[threadIdx.x] = 0;
+ _sum[threadIdx.x] = aggInit;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
if(id < cols)
- _sum[threadIdx.x] += functional::apply(functor, ins, j * cols + id);
+ _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::apply(functor, ins, j * cols + id));
}
} else {
functional::Array<int, functional::Shape::size()> dims;
- _sum[threadIdx.x] = 0;
+ _sum[threadIdx.x] = aggInit;
for(int tid = 0; tid < cols; tid += blockDim.x) {
int id = tid + threadIdx.x;
@@ -109,7 +109,7 @@ __global__ void gAddReduce(Functor functor,
functional::Array<int, K> indices;
for(int i = 0; i < K; ++i)
indices[i] = ins[i].shape().bindex(dims);
- _sum[threadIdx.x] += functional::apply(functor, ins, indices);
+ _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::apply(functor, ins, indices));
}
}
}
@@ -119,16 +119,58 @@ __global__ void gAddReduce(Functor functor,
__syncthreads();
int skip = (len + 1) >> 1;
if(threadIdx.x < (len >> 1)) {
- _sum[threadIdx.x] += _sum[threadIdx.x + skip];
+ _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], _sum[threadIdx.x + skip]);
}
len = (len + 1) >> 1;
}
__syncthreads();
- out[j] += _sum[0] * scale;
+ out[j] = aggFunctor(out[j], _sum[0] * scale);
}
}
}
+template <class Functor, class AggFunctor, class... Tensors>
+void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) {
+ cudaSetDevice(out->getDeviceId().no);
+
+ auto full = marian::Shape::broadcast({out, tensors...});
+
+ int length = out->shape().elements();
+
+ constexpr size_t K = sizeof...(Tensors);
+
+ functional::Tensor<float> gOut = out;
+ functional::Array<functional::Tensor<float>, K> gIns = {tensors...};
+
+ if(full.back() != 1 && out->shape().back() == 1) {
+ size_t m = full.elements() / length;
+ size_t k = full.back();
+
+ int blocks = std::min(MAX_BLOCKS, (int)m);
+ int threads = std::min(MAX_THREADS, (int)k);
+ int shared = sizeof(float) * threads * 2;
+
+ gAggregateReduce<<<blocks, threads, shared>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
+
+ } else if(out->shape() == full) {
+ int threads = std::min(MAX_THREADS, length);
+ int blocks
+ = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
+
+ bool broadcast = false;
+ for(int i = 0; i < K; ++i)
+ broadcast = broadcast || gOut.shape() != gIns[i].shape();
+ gAggregateEqual<<<blocks, threads>>>(functor, aggFunctor, gOut, gIns, scale, broadcast);
+ } else {
+ int threads = std::min(MAX_THREADS, length);
+ int blocks
+ = std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
+
+ gAggregateGeneric<<<blocks, threads>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale);
+ }
+}
+
+// @TODO: this is a duplicate; can be removed, but need to redo all the add.inc entries...
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
cudaSetDevice(out->getDeviceId().no);
@@ -142,6 +184,8 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
functional::Tensor<float> gOut = out;
functional::Array<functional::Tensor<float>, K> gIns = {tensors...};
+ auto addFunctor = functional::_1 + functional::_2;
+
if(full.back() != 1 && out->shape().back() == 1) {
size_t m = full.elements() / length;
size_t k = full.back();
@@ -150,7 +194,7 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
int threads = std::min(MAX_THREADS, (int)k);
int shared = sizeof(float) * threads * 2;
- gAddReduce<<<blocks, threads, shared>>>(functor, full, gOut, gIns, scale);
+ gAggregateReduce<<<blocks, threads, shared>>>(functor, 0, addFunctor, full, gOut, gIns, scale);
} else if(out->shape() == full) {
int threads = std::min(MAX_THREADS, length);
@@ -160,13 +204,13 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
bool broadcast = false;
for(int i = 0; i < K; ++i)
broadcast = broadcast || gOut.shape() != gIns[i].shape();
- gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast);
+ gAggregateEqual<<<blocks, threads>>>(functor, addFunctor, gOut, gIns, scale, broadcast);
} else {
int threads = std::min(MAX_THREADS, length);
int blocks
= std::min(MAX_BLOCKS, length / threads + (length % threads != 0));
- gAddGeneric<<<blocks, threads>>>(functor, full, gOut, gIns, scale);
+ gAggregateGeneric<<<blocks, threads>>>(functor, 0, addFunctor, full, gOut, gIns, scale);
}
}
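
The change above keeps the shared-memory tree reduction of the old gAddReduce but threads an arbitrary aggregation functor and its identity value (aggInit) through it, so the same kernels can compute sums, minima, maxima, products and log-sum-exp. Below is a minimal host-side C++ sketch of that fold pattern for reference; the names are illustrative only and not part of marian's API, and it mirrors the update rule rather than the GPU kernel itself.

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Illustrative stand-in for the pattern used by gAggregateReduce: reduce each
    // row of a rows x cols matrix with an arbitrary associative functor, starting
    // from that functor's identity element (aggInit).
    template <class AggFunctor>
    std::vector<float> rowReduce(const std::vector<float>& a, int rows, int cols,
                                 float aggInit, AggFunctor aggFunctor) {
      std::vector<float> out(rows, aggInit);
      for(int j = 0; j < rows; ++j)
        for(int i = 0; i < cols; ++i)
          out[j] = aggFunctor(out[j], a[j * cols + i]); // same update as _sum[threadIdx.x] = aggFunctor(...)
      return out;
    }

    int main() {
      std::vector<float> a = {1, 6, 3, 8, 5, 2, 7, 4};  // the 2x4 matrix used in operator_tests.cpp below
      auto sums = rowReduce(a, 2, 4, 0.f,    [](float x, float y) { return x + y; });
      auto maxs = rowReduce(a, 2, 4, -1e38f, [](float x, float y) { return std::max(x, y); });
      std::printf("sum: %g %g  max: %g %g\n", sums[0], sums[1], maxs[0], maxs[1]); // sum: 18 18  max: 8 7
    }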
diff --git a/src/tensors/gpu/add.h b/src/tensors/gpu/add.h
index e5e22d88..21e0bb96 100644..100755
--- a/src/tensors/gpu/add.h
+++ b/src/tensors/gpu/add.h
@@ -8,5 +8,7 @@ namespace gpu {
template <class Functor, class... Tensors>
void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors);
+template <class Functor, class AggFunctor, class... Tensors>
+void Aggregate(Functor functor, float initAgg, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors);
}
} // namespace marian
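
With the declaration above, a caller that wants something other than a plain sum passes the aggregation functor and its identity explicitly. The following is a hedged sketch of such a call, not a verified usage: it assumes the functional DSL spells the element-wise maximum as max(_1, _2), matching the elem::Maximum instantiation added to add.inc below, and it assumes out is a reduced-shape tensor on the same GPU backend as in.

    // Sketch only: fold each row of `in` into `out` with an element-wise maximum.
    #include <limits>
    #include "functional/functional.h"
    #include "tensors/tensor.h"
    #include "tensors/gpu/add.h"

    void maxOverLastAxis(marian::Tensor out, marian::Tensor in) {
      using namespace marian::functional;
      // Note: Aggregate folds into the existing contents of `out`, so `out` must be
      // pre-initialized (e.g. to a very small value) before this call.
      marian::gpu::Aggregate(_1,                                   // per-element projection of the single input
                             std::numeric_limits<float>::lowest(), // aggInit: identity element for max
                             max(_1, _2),                          // assumed spelling of the elem::Maximum functor
                             1.f,                                  // scale applied to the reduced value
                             out, in);
    }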
diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc
index 27f35b95..69244dce 100644..100755
--- a/src/tensors/gpu/add.inc
+++ b/src/tensors/gpu/add.inc
@@ -1,3 +1,4 @@
+// see element.inc for instructions on how to maintain this
using namespace functional;
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
@@ -22,3 +23,12 @@ template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>,
template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Minimum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Minimum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Maximum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Maximum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc
index 66f76301..f3cdea28 100755
--- a/src/tensors/gpu/element.inc
+++ b/src/tensors/gpu/element.inc
@@ -55,6 +55,7 @@ template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, Bin
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::NEq, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::BinaryFunctor<marian::functional::elem::Gt, marian::functional::Assignee<2>, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Lt, marian::functional::Assignee<2>, marian::functional::Assignee<3> > >, marian::functional::Capture>, marian::functional::Capture> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::NEq, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::BinaryFunctor<marian::functional::elem::Gt, marian::functional::Assignee<2>, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Lt, marian::functional::Assignee<2>, marian::functional::Assignee<3> > >, marian::functional::Capture>, marian::functional::Capture> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqrt, marian::functional::Assignee<1> > >>(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqrt, marian::functional::Assignee<1> > >, std::shared_ptr<marian::TensorBase>);
// How to add new specializations:
// When you use a new specialization, it will cause a link error of this form (example):
// .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element<marian::functional::Assign< ... > ( ... )'
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index acc7e54c..f77259cb 100755
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -140,6 +140,10 @@ public:
template <typename T>
void set(const T* begin, const T* end) {
+ ABORT_IF(end - begin != shape_.elements(),
+ "Vector size ({}) and underlying type ({}) do not match",
+ end - begin,
+ std::string(shape_));
ABORT_IF(!matchType<T>(type_),
"Requested type ({}) and underlying type ({}) do not match",
request<T>(),
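
The new check makes Tensor::set fail fast when the input range does not cover exactly shape_.elements() values, instead of silently copying a mismatched range. A small self-contained illustration of the invariant being enforced (plain C++, not marian code):

    #include <cassert>
    #include <vector>

    // Stand-in for the check added to Tensor::set: the copied range must match the
    // number of elements implied by the tensor's shape.
    template <typename T>
    void setChecked(std::vector<T>& storage, size_t shapeElements, const T* begin, const T* end) {
      assert(static_cast<size_t>(end - begin) == shapeElements
             && "vector size and tensor shape do not match");
      storage.assign(begin, end);
    }

    int main() {
      std::vector<float> data = {1, 2, 3, 4, 5, 6, 7, 8}; // a 2x4 tensor has 8 elements
      std::vector<float> storage;
      setChecked(storage, 8, data.data(), data.data() + data.size()); // ok
      // setChecked(storage, 8, data.data(), data.data() + 4);        // would trip the assertion
    }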
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
index f7de2f20..8dd17a5c 100755
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@@ -51,7 +51,7 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
gpu::Add(functor, scale, out, tensors...);
else
#endif
- cpu::Add(functor, scale, out, tensors...);
+ cpu::Aggregate(functor, 0.0f, functional::_1 + functional::_2, scale, out, tensors...);
}
template <class Functor, class... Tensors>
@@ -59,6 +59,16 @@ void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
Add(functor, 1, out, tensors...);
}
+template <class Functor, class AggFunctor, class... Tensors>
+void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, marian::Tensor out, Tensors... tensors) {
+#ifdef CUDA_FOUND
+ if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
+ gpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
+ else
+#endif
+ cpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
+}
+
template <class Functor, class... Tensors>
void Reduce(Functor functor,
float scale,
@@ -74,6 +84,14 @@ void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
Add(functor, out, tensors...);
}
+template <class Functor, class AggFunctor, class... Tensors>
+void Reduce(Functor functor, AggFunctor aggFunctor, float aggInit,
+ marian::Tensor out,
+ Tensors... tensors) {
+ out->set(aggInit);
+ Aggregate(functor, aggInit, aggFunctor, out, tensors...);
+}
+
// clang-format off
DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
DISPATCH8(ProdBatched, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
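
The new Reduce overload first fills out with the aggregation identity (out->set(aggInit)) and then folds with Aggregate, so the result does not depend on whatever out held before the call. A sketch of the call shape for a product-reduction, using only names visible in this diff (the _1 * _2 expression builds the BinaryFunctor<elem::Mult, ...> that is instantiated in add.inc above); it is an illustration, not code from the commit.

    #include "tensors/tensor_operators.h"

    // Sketch only: reduce `in` into the (smaller) shape of `out` by multiplying
    // elements together along the broadcast-reduced axes.
    void prodReduce(marian::Tensor out, marian::Tensor in) {
      using namespace marian::functional;
      marian::Reduce(/*functor=*/_1,         // per-element projection of the single input
                     /*aggFunctor=*/_1 * _2, // elem::Mult, as instantiated in add.inc
                     /*aggInit=*/1.0f,       // identity element for multiplication
                     out, in);
    }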
diff --git a/src/tests/operator_tests.cpp b/src/tests/operator_tests.cpp
index 08fdd8de..2607f41f 100755
--- a/src/tests/operator_tests.cpp
+++ b/src/tests/operator_tests.cpp
@@ -204,12 +204,19 @@ void tests(DeviceType device) {
graph->clear();
values.clear();
- std::vector<float> vA({1, 2, 3, 4, 5, 6, 7, 8});
- std::vector<float> vS1({6, 8, 10, 12});
- std::vector<float> vS2({10, 26});
-
- std::vector<float> vW({2.77778f, 6.77778f});
-
+ std::vector<float> vA({1, 6, 3, 8,
+ 5, 2, 7, 4});
+ // import numpy as np
+ // a = np.array([[1, 6, 3, 8], [5, 2, 7, 4]])
+ std::vector<float> vS1({6, 8, 10, 12}); // s1 = np.sum(a, axis=0)
+ std::vector<float> vS2({18, 18}); // np.sum(a, axis = 1)
+ std::vector<float> vS4({2.6925824f, 1.80277564f}); // np.std(a, axis = 1)
+ std::vector<float> vV5({7.25, 3.25}); // np.var(a, axis = 1)
+ std::vector<float> vM6({8, 7}); // np.max(a, axis = 1)
+ std::vector<float> vM7({1, 2}); // np.min(a, axis = 1)
+ std::vector<float> vP8({144, 280}); // np.prod(a, axis = 1)
+ std::vector<float> vL9({8.13364336f, 7.17551536f}); // np.log(np.sum(np.exp(a), axis=1))
+ std::vector<float> vW({5.0f, 4.55555556f}); // np.mean(a*s1,axis=-1) / np.mean(s1,axis=-1)
auto a = graph->constant({2, 4}, inits::from_vector(vA));
@@ -218,6 +225,14 @@ void tests(DeviceType device) {
auto m3 = mean(s1, /*axis=*/ 1);
+ auto s4 = marian::std(a, /*axis=*/ 1);
+ auto v5 = var(a, /*axis=*/ 1);
+
+ auto m6 = max(a, /*axis=*/ 1);
+ auto m7 = min(a, /*axis=*/ 1);
+ auto p8 = prod(a, /*axis=*/ 1);
+ auto l9 = logsumexp(a, /*axis=*/ 1);
+
auto sp = scalar_product(s2, s2, /*axis=*/ 0);
auto wa = weighted_average(a, s1, /*axis=*/ -1);
@@ -227,21 +242,30 @@ void tests(DeviceType device) {
CHECK(s1->shape() == Shape({1, 4}));
CHECK(s2->shape() == Shape({2, 1}));
CHECK(m3->shape() == Shape({1, 1}));
+ CHECK(s4->shape() == Shape({2, 1}));
+ CHECK(v5->shape() == Shape({2, 1}));
+ CHECK(m6->shape() == Shape({2, 1}));
+ CHECK(m7->shape() == Shape({2, 1}));
+ CHECK(p8->shape() == Shape({2, 1}));
+ CHECK(l9->shape() == Shape({2, 1}));
CHECK(sp->shape() == Shape({1, 1}));
CHECK(wa->shape() == Shape({2, 1}));
- s1->val()->get(values);
- CHECK( values == vS1 );
+ s1->val()->get(values); CHECK(values == vS1);
+ s2->val()->get(values); CHECK(values == vS2);
- s2->val()->get(values);
- CHECK( values == vS2 );
+ CHECK(m3->val()->scalar() == 9);
- CHECK( m3->val()->scalar() == 9 );
- CHECK( sp->val()->scalar() == 776 );
+ s4->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vS4.begin(), floatApprox));
+ v5->val()->get(values); CHECK(values == vV5);
+ m6->val()->get(values); CHECK(values == vM6);
+ m7->val()->get(values); CHECK(values == vM7);
+ p8->val()->get(values); CHECK(values == vP8);
+ l9->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vL9.begin(), floatApprox));
- wa->val()->get(values);
- CHECK( std::equal(values.begin(), values.end(),
- vW.begin(), floatApprox) );
+ CHECK(sp->val()->scalar() == 648);
+
+ wa->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vW.begin(), floatApprox));
}
SECTION("concatenation") {
diff --git a/vs/Marian.vcxproj b/vs/Marian.vcxproj
index a6c560d3..a82c879e 100755
--- a/vs/Marian.vcxproj
+++ b/vs/Marian.vcxproj
@@ -575,6 +575,7 @@
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
<ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
+ <ClCompile Include="..\src\layers\generic.cpp" />
<ClCompile Include="..\src\layers\loss.cpp" />
<ClCompile Include="..\src\layers\weight.cpp" />
<ClCompile Include="..\src\microsoft\quicksand.cpp">
diff --git a/vs/Marian.vcxproj.filters b/vs/Marian.vcxproj.filters
index d9e56843..a7c2331e 100755
--- a/vs/Marian.vcxproj.filters
+++ b/vs/Marian.vcxproj.filters
@@ -481,6 +481,9 @@
<ClCompile Include="..\src\examples\iris\iris.cpp">
<Filter>examples\iris</Filter>
</ClCompile>
+ <ClCompile Include="..\src\layers\generic.cpp">
+ <Filter>layers</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />