27 files changed, 770 insertions, 266 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8c757f10..4c16aa1d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,6 +47,7 @@ add_library(marian STATIC graph/node_initializers.cpp layers/convolution.cpp + layers/generic.cpp layers/loss.cpp layers/weight.cpp diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 70a30a5f..77604285 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -30,6 +30,7 @@ const std::set<std::string> PATHS = { "train-sets", "vocabs", "embedding-vectors", + "embedding-factors", "valid-sets", "valid-script-path", "valid-log", @@ -385,6 +386,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Fix source embeddings. Affects all encoders"); cli.add<bool>("--embedding-fix-trg", "Fix target embeddings. Affects all decoders"); + cli.add_nondefault<std::vector<std::string>>("--embedding-factors", + "Paths to (factor map, factor list) file for factored embeddings"); cli.add<bool>("--multi-node", "Enable asynchronous multi-node training through MPI (and legacy sync if combined with --sync-sgd)"); @@ -466,6 +469,8 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "stdout"); cli.add<std::vector<std::string>>("--vocabs,-v", "Paths to vocabulary files have to correspond to --input"); + cli.add_nondefault<std::vector<std::string>>("--embedding-factors", + "Paths to (factor map, factor list) file for factored embeddings"); // decoding options cli.add<size_t>("--beam-size,-b", "Beam size used during search with validating translator", @@ -528,6 +533,8 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { "Paths to vocabulary files have to correspond to --train-sets. " "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. " "If these files do not exists they are created"); + cli.add_nondefault<std::vector<std::string>>("--embedding-factors", + "Paths to (factor map, factor list) file for factored embeddings"); cli.add<bool>("--n-best", "Score n-best list instead of plain text corpus"); cli.add<std::string>("--n-best-feature", diff --git a/src/common/utils.cpp b/src/common/utils.cpp index bde78835..252afa54 100755 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -144,6 +144,11 @@ std::string withCommas(size_t n) { return res; } +bool beginsWith(const std::string& text, const std::string& prefix) { + return text.size() >= prefix.size() + && !text.compare(0, prefix.size(), prefix); +} + bool endsWith(const std::string& text, const std::string& suffix) { return text.size() >= suffix.size() && !text.compare(text.size() - suffix.size(), suffix.size(), suffix); diff --git a/src/common/utils.h b/src/common/utils.h index 94113a0e..d76d07fa 100755 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -36,6 +36,7 @@ std::string exec(const std::string& cmd); std::pair<std::string, int> hostnameAndProcessId(); std::string withCommas(size_t n); +bool beginsWith(const std::string& text, const std::string& prefix); bool endsWith(const std::string& text, const std::string& suffix); std::string toUpper(const std::string& s); diff --git a/src/functional/tmp.h b/src/functional/tmp.h index 08383660..7c8f6fa1 100755 --- a/src/functional/tmp.h +++ b/src/functional/tmp.h @@ -118,55 +118,56 @@ __HDI__ float apply(Functor functor, /******************************************************************************/ +// @TODO: Rename this. It is a reduction loop. 
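(Aside, for orientation: the rewrite below turns a loop that could only sum into a generic fold — every former `sum += x` becomes `agg = aggFunctor(agg, x)`, seeded with the functor's identity element `aggInit` (0 for sum, 1 for product, lowest float for max). A minimal stand-alone sketch of that pattern, with illustrative names that are not Marian code:)

#include <limits>
#include <vector>

// Generic fold: aggInit must be the identity element of aggFunctor
// (0 for sum, 1 for product, std::numeric_limits<float>::lowest() for max).
template <class AggFunctor>
float fold(const std::vector<float>& xs, float aggInit, AggFunctor aggFunctor) {
  float agg = aggInit;
  for (float x : xs)
    agg = aggFunctor(agg, x); // the old code was hard-wired to agg += x
  return agg;
}

// usage:
//   fold(v, 0.0f, [](float a, float b) { return a + b; });  // sum
//   fold(v, std::numeric_limits<float>::lowest(),
//        [](float a, float b) { return a > b ? a : b; });   // max

(The Loop/loops changes that follow are the same transformation, threaded through Marian's fixed-rank index machinery.)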
template <size_t n, size_t N, size_t K> struct Loop { - template <class Functor> + template <class Functor, class AggFunctor> __HDI__ static float result( - Functor functor, + Functor functor, float aggInit, AggFunctor aggFunctor, functional::Array<functional::Tensor<float>, K>& in, const functional::Array<int, K>& pAcc, const functional::Array<int, N>& length, const functional::Array<int, N>& dim) { - float sum = 0; + float agg = aggInit; functional::Array<int, K> acc; for(int i = 0; i < length[N - n]; ++i) { for(size_t j = 0; j < K; ++j) { acc[j] = pAcc[j] + (dim[N - n] + i) * in[j].shape().bstride(N - n); } - sum += Loop<n - 1, N, K>::result(functor, in, acc, length, dim); + agg = aggFunctor(agg, Loop<n - 1, N, K>::result(functor, aggInit, aggFunctor, in, acc, length, dim)); } - return sum; + return agg; } }; template <size_t N, size_t K> struct Loop<1, N, K> { - template <class Functor> + template <class Functor, class AggFunctor> __HDI__ static float result( - Functor functor, + Functor functor, float aggInit, AggFunctor aggFunctor, functional::Array<functional::Tensor<float>, K>& in, const functional::Array<int, K>& pAcc, const functional::Array<int, N>& length, const functional::Array<int, N>& dim) { - float sum = 0; + float agg = aggInit; functional::Array<int, K> acc; for(int i = 0; i < length[N - 1]; ++i) { for(size_t j = 0; j < K; ++j) { acc[j] = pAcc[j] + (dim[N - 1] + i) * in[j].shape().bstride(N - 1); } - sum += apply<K>(functor, in, acc); + agg = aggFunctor(agg, apply<K>(functor, in, acc)); } - return sum; + return agg; } }; -template <size_t N, size_t K, class Functor> -__HDI__ float loops(Functor functor, +template <size_t N, size_t K, class Functor, class AggFunctor> +__HDI__ float loops(Functor functor, float aggInit, AggFunctor aggFunctor, functional::Array<functional::Tensor<float>, K>& in, const functional::Array<int, N>& length, const functional::Array<int, N>& dim) { functional::Array<int, K> acc = {0}; - return Loop<N, N, K>::result(functor, in, acc, length, dim); + return Loop<N, N, K>::result(functor, aggInit, aggFunctor, in, acc, length, dim); } } // namespace functional } // namespace marian diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 826bd9f0..6a07611d 100755 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -306,11 +306,36 @@ Expr slice(Expr a, int axis, Slice slice) { // numpy __getslice__ semantics, but } Expr sum(Expr a, int ax) { - return Expression<SumNodeOp>(a, ax); + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::sum); } Expr mean(Expr a, int ax) { - return Expression<MeanNodeOp>(a, ax); + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::mean); +} + +Expr std(Expr a, int ax) { + return Expression<ReduceNodeOp>(a - mean(a,ax), ax, ReduceNodeOpCode::rms); +} + +Expr var(Expr a, int ax) { + return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::meanSqr); +} + +Expr max(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::max); +} + +Expr min(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::min); +} + +Expr prod(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::prod); +} + +// log(sum(exp(a))) +Expr logsumexp(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::logSumExp); } Expr scalar_product(Expr a, Expr b, int ax) { diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 58149bde..78aed834 100755 --- 
a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -173,6 +173,13 @@ static inline Expr narrow(Expr a, int axis, size_t start, size_t length) { // Py /*********************************************************/ Expr sum(Expr a, int ax = 0); +Expr mean(Expr a, int ax = 0); +Expr std(Expr a, int ax); +Expr var(Expr a, int ax); +Expr max(Expr a, int ax); +Expr min(Expr a, int ax); +Expr prod(Expr a, int ax); +Expr logsumexp(Expr a, int ax); Expr softmax(Expr x, int axis = -1); @@ -182,8 +189,6 @@ Expr softmax(Expr a, Expr zeroOneMask, int axis = -1); Expr logsoftmax(Expr a); -Expr mean(Expr a, int ax = 0); - Expr cross_entropy(Expr a, Expr b); Expr scalar_product(Expr a, Expr b, int ax = 0); diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 10e2ca76..7d090823 100755 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -432,8 +432,8 @@ public: ABORT_IF(S_offsets->shape()[0] - 1 != S_shape[0], "Sparse matrix offset vector has incorrect size"); auto outShape = D->shape(); - ABORT_IF(S_shape[transS == swapOperands ? 1 : 0] != outShape[-(int)swapOperands],
- "Matrix product requires inner dimensions to match");
+    ABORT_IF(S_shape[transS == swapOperands ? 1 : 0] != outShape[-(int)swapOperands],
+             "Matrix product requires inner dimensions to match");
     outShape.set(-(int)swapOperands, S_shape[transS != swapOperands]);
     return outShape;
   }
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index 7dbaec46..6dd90faf 100755
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -412,20 +412,75 @@ struct LogSoftmaxNodeOp : public UnaryNodeOp {
   const std::string type() override { return "logsoftmax"; }
 };
 
-struct SumNodeOp : public UnaryNodeOp {
+enum class ReduceNodeOpCode {
+  sum, mean, rms, meanSqr, min, max, prod, logSumExp
+};
+
+struct ReduceNodeOp : public UnaryNodeOp {
   int axis_;
+  ReduceNodeOpCode opCode_;
+  int reducedDim_; // dimension of axis being reduced, e.g. used in mean()
 
-  SumNodeOp(Expr a, int axis) : UnaryNodeOp(a, newShape(a, axis)) {}
+  ReduceNodeOp(Expr a, int axis, ReduceNodeOpCode opCode)
+      : UnaryNodeOp(a, newShape(a, axis)), opCode_(opCode) {
+    reducedDim_ = a->shape()[axis]; // e.g. used in mean()
+    ABORT_IF(reducedDim_ != a->shape().elements() / shape().elements(),
+             "bug in determining reducedDim");
+  }
 
   NodeOps forwardOps() override {
     using namespace functional;
-    return {NodeOp(Reduce(_1, val_, child(0)->val()))};
+    switch (opCode_) {
+      case ReduceNodeOpCode::sum:
+        return {NodeOp(Reduce(_1, val_, child(0)->val()))};
+      case ReduceNodeOpCode::mean:
+        return {NodeOp(Reduce(_1, 1.0f / (float)reducedDim_, val_, child(0)->val()))};
+      case ReduceNodeOpCode::rms:
+        return {NodeOp(Reduce(_1 * _1, 1.0f / (float)reducedDim_, val_, child(0)->val());
+                       Element(_1 = sqrt(_1), val_))};
+      case ReduceNodeOpCode::meanSqr:
+        return {NodeOp(Reduce(_1 * _1, 1.0f / (float)reducedDim_, val_, child(0)->val()))};
+      case ReduceNodeOpCode::min:
+        return {NodeOp(Reduce(_1, min(_1,_2), std::numeric_limits<float>::max(), val_, child(0)->val()))};
+      case ReduceNodeOpCode::max:
+        return {NodeOp(Reduce(_1, max(_1,_2), std::numeric_limits<float>::lowest(), val_, child(0)->val()))};
+      case ReduceNodeOpCode::prod:
+        return {NodeOp(Reduce(_1, _1 * _2, 1.0f, val_, child(0)->val()))};
+      case ReduceNodeOpCode::logSumExp:
+        return {NodeOp(Reduce(_1, logaddexp(_1,_2), std::numeric_limits<float>::lowest(), val_, child(0)->val()))};
+      default:
+        ABORT("Unexpected reduction op-code {}", (int)opCode_);
+    }
   }
 
   NodeOps backwardOps() override {
     using namespace functional;
-    return {NodeOp(Add(_1, child(0)->grad(), adj_))};
+    switch (opCode_) {
+      case ReduceNodeOpCode::sum:
+        return {NodeOp(Add(_1, child(0)->grad(), adj_))};
+      case ReduceNodeOpCode::mean:
+        return {NodeOp(Add(_1, 1.0f / (float)reducedDim_, child(0)->grad(), adj_))};
+      case ReduceNodeOpCode::rms: // WARNING: UNTESTED!!
+        // y = (1/n sum_j x_j^2)^0.5
+        // dJ/dx_i = dJ/dy * dy/dx_i = dJ/dy * x_i / (n y), hence the 1/n scale below
+        // @TODO: do we need protection against div by 0? L'Hospital's rule?
+        return {NodeOp(Add(_1 * _2 / _3, 1.0f / (float)reducedDim_, child(0)->grad(), adj_, child(0)->val(), val_))};
+      case ReduceNodeOpCode::meanSqr: // WARNING: UNTESTED!!
+        // y = 1/n sum_j x_j^2
+        // dJ/dx_i = dJ/dy * 2 x_i / n, hence the 1/n scale below
+        return {NodeOp(Add(_1 * 2.0f * _2, 1.0f / (float)reducedDim_, child(0)->grad(), adj_, child(0)->val()))};
+      case ReduceNodeOpCode::min: // WARNING: UNTESTED!!
+      case ReduceNodeOpCode::max: // WARNING: UNTESTED!!
+        // adj_ gets routed into the min/max value --@REVIEW: is this correct?
+        return {NodeOp(Add((_1 == _2) * _3, child(0)->grad(), child(0)->val(), val_, adj_))};
+      case ReduceNodeOpCode::logSumExp:
+        // y = log(sum_j exp(x_j))
+        // dJ/dx_i = dJ/dy * exp(x_i) / sum_j exp(x_j) = dJ/dy * exp(x_i - y)
+        return {NodeOp(Add(_1 * exp(_2 - _3), child(0)->grad(), adj_, child(0)->val(), val_))};
+      default:
+        ABORT("Unexpected reduction op-code {}", (int)opCode_);
+    }
   }
 
   Shape newShape(Expr a, int axis) {
@@ -436,66 +491,27 @@ struct SumNodeOp : public UnaryNodeOp {
     return shape;
   }
 
-  const std::string type() override { return "sum"; }
-
-  const std::string color() override { return "orange"; }
-
-  virtual size_t hash() override {
-    if(!hash_) {
-      hash_ = NaryNodeOp::hash();
-      util::hash_combine(hash_, axis_);
+  const std::string type() override {
+    switch (opCode_) {
+      case ReduceNodeOpCode::sum:       return "sum";
+      case ReduceNodeOpCode::mean:      return "mean";
+      case ReduceNodeOpCode::rms:       return "rms";
+      case ReduceNodeOpCode::meanSqr:   return "meanSqr";
+      case ReduceNodeOpCode::min:       return "min";
+      case ReduceNodeOpCode::max:       return "max";
+      case ReduceNodeOpCode::prod:      return "prod";
+      case ReduceNodeOpCode::logSumExp: return "logSumExp";
+      default: ABORT("Unexpected reduction op-code {}", (int)opCode_);
     }
-    return hash_;
   }
 
-  virtual bool equal(Expr node) override {
-    if(!NaryNodeOp::equal(node))
-      return false;
-    Ptr<SumNodeOp> cnode = std::dynamic_pointer_cast<SumNodeOp>(node);
-    if(!cnode)
-      return false;
-    if(axis_ != cnode->axis_)
-      return false;
-    return true;
-  }
-};
-
-struct MeanNodeOp : public UnaryNodeOp {
-  int axis_;
-
-  MeanNodeOp(Expr a, int axis) : UnaryNodeOp(a, newShape(a, axis)) {}
-
-  NodeOps forwardOps() override {
-    using namespace functional;
-    int left = child(0)->shape().elements() / val_->shape().elements();
-    float scale = 1.f / left;
-
-    return {NodeOp(Reduce(_1, scale, val_, child(0)->val()))};
-  }
-
-  NodeOps backwardOps() override {
-    using namespace functional;
-    int left = child(0)->shape().elements() / val_->shape().elements();
-    float scale = 1.f / left;
-
-    return {NodeOp(Add(_1, scale, child(0)->grad(), adj_))};
-  }
-
-  Shape newShape(Expr a, int axis) {
-    Shape shape = a->shape();
-    axis_ = shape.axis(axis);
-    shape.set(axis_, 1);
-    return shape;
-  }
-
-  const std::string type() override { return "mean"; }
-
   const std::string color() override { return "orange"; }
 
   virtual size_t hash() override {
     if(!hash_) {
       hash_ = NaryNodeOp::hash();
       util::hash_combine(hash_, axis_);
+      util::hash_combine(hash_, (int)opCode_);
     }
     return hash_;
   }
@@ -503,10 +519,10 @@ struct MeanNodeOp : public UnaryNodeOp {
   virtual bool equal(Expr node) override {
     if(!NaryNodeOp::equal(node))
       return false;
-    Ptr<MeanNodeOp> cnode = std::dynamic_pointer_cast<MeanNodeOp>(node);
+    Ptr<ReduceNodeOp> cnode = std::dynamic_pointer_cast<ReduceNodeOp>(node);
     if(!cnode)
      return false;
-    if(axis_ != cnode->axis_)
+    if(axis_ != cnode->axis_ || opCode_ != cnode->opCode_)
      return false;
     return true;
   }
diff --git a/src/layers/constructors.h b/src/layers/constructors.h
index d0ac3487..5ed7f3f5 100755
--- a/src/layers/constructors.h
+++ b/src/layers/constructors.h
@@ -147,4 +147,10 @@ public:
 // @TODO: change naming convention.
typedef Accumulator<MLPFactory> mlp; } // namespace mlp + +typedef ConstructingFactory<Embedding> EmbeddingFactory; +typedef ConstructingFactory<ULREmbedding> ULREmbeddingFactory; + +typedef Accumulator<EmbeddingFactory> embedding; +typedef Accumulator<ULREmbeddingFactory> ulr_embedding; } // namespace marian diff --git a/src/layers/generic.cpp b/src/layers/generic.cpp new file mode 100755 index 00000000..2941b689 --- /dev/null +++ b/src/layers/generic.cpp @@ -0,0 +1,339 @@ +#include "marian.h" + +#include "layers/generic.h" + +using std::size_t; // not sure why this is needed + +namespace marian { + struct CSRSparseTensor { // simplistic for now + Shape shape; + Expr values; // [k_i..k_{i+1}-1] -> value at [i,j] + Expr indices; // [k_i..k_{i+1}-1] -> j of non-null value + Expr offsets; // [i] -> k_i + }; +
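(Aside: the offsets/indices/values triple above is standard CSR — row i owns the entries k in [offsets[i], offsets[i+1]), with values[k] the non-zero value and indices[k] its column. A small stand-alone illustration over plain vectors; the helper name is hypothetical, not part of this commit:)

#include <cstddef>
#include <cstdio>
#include <vector>

// Prints the non-zeros of row i of a CSR matrix laid out exactly like
// CSRSparseTensor above: values[k] and indices[k] for
// k in [offsets[i], offsets[i+1]) are row i's entries and column ids.
void printCsrRow(std::size_t i,
                 const std::vector<float>& values,
                 const std::vector<unsigned>& indices,
                 const std::vector<unsigned>& offsets) {
  for (std::size_t k = offsets[i]; k < offsets[i + 1]; ++k)
    std::printf("M[%zu,%u] = %g\n", i, indices[k], values[k]);
}

(The EmbeddingFactorMapping class that follows builds exactly this layout: one row per word, one non-zero per factor.)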
+ class EmbeddingFactorMapping { + public: + struct CSRData { + Shape shape; + std::vector<float> weights; + std::vector<IndexType> indices; + std::vector<IndexType> offsets; + }; + // mapPath = path to file with entries in order of vocab entries of the form + // WORD FACTOR1 FACTOR2 FACTOR3... + // listPath = path to file that lists all FACTOR names + // vocab = original vocabulary + // Note: The WORD field in the map file is redundant. It is required for consistency checking only. + // Factors are grouped + // - user specifies list-factor prefixes; all factors beginning with that prefix are in the same group + // - factors within a group as multi-class and normalized that way + // - groups of size 1 are interpreted as sigmoids, multiply with P(u) / P(u-1) + // - one prefix must not contain another + // - all factors not matching a prefix get lumped into yet another class (the lemmas) + // - factor vocab must be sorted such that all groups are consecutive + // - result of Output layer is nevertheless logits, not a normalized probability, due to the sigmoid entries + EmbeddingFactorMapping(Ptr<Options> options) : factorVocab_(New<Options>(), 0) { + std::vector<std::string> paths = options->get<std::vector<std::string>>("embedding-factors"); + ABORT_IF(paths.size() != 2, "--embedding-factors expects two paths"); + auto mapPath = paths[0]; + auto factorVocabPath = paths[1]; + auto vocabPath = options->get<std::string>("vocab"); + + // Note: We misuse the Vocab class a little. + // Specifically, it means that the factorVocab_ must contain </s> and "<unk>". + Vocab vocab(New<Options>(), 0); + vocab.load(vocabPath); + factorVocab_.load(factorVocabPath); + Word numFactors = (Word)factorVocab_.size(); + + // load and parse factorMap + factorMap_.resize(vocab.size()); + factorRefCounts_.resize(numFactors); + std::vector<std::string> tokens; + io::InputFileStream in(mapPath); + std::string line; + size_t numTotalFactors = 0; + for (Word v = 0; io::getline(in, line); v++) { + tokens.clear(); // @BUGBUG: should be done in split() + utils::splitAny(line, tokens, " \t"); + ABORT_IF(tokens.size() < 2 || tokens.front() != vocab[v], "Factor map must list words in same order as vocab, and have at least one factor per word", mapPath); + for (size_t i = 1; i < tokens.size(); i++) { + auto u = factorVocab_[tokens[i]]; + auto& m = factorMap_[v]; + m.push_back(u); + factorRefCounts_[u]++; + } + numTotalFactors += tokens.size() - 1; + } + LOG(info, "[embedding] Factored-embedding map read with total/unique of {}/{} factors for {} words", numTotalFactors, numFactors, vocab.size()); + + // form groups + // @TODO: hard-coded for these initial experiments + std::vector<std::string> groupPrefixes = { + "@C", + "@GL", "@GR" + }; + groupPrefixes.insert(groupPrefixes.begin(), "(unassigned)"); // first group is fallback for normal words (the string is only used for messages) + size_t numGroups = groupPrefixes.size(); + factorGroups_.resize(numFactors, 0); + for (size_t g = 1; g < groupPrefixes.size(); g++) { // set group labels; what does not match any prefix will stay in group 0 + const auto& groupPrefix = groupPrefixes[g]; + for (Word u = 0; u < numFactors; u++) + if (utils::beginsWith(factorVocab_[u], groupPrefix)) { + ABORT_IF(factorGroups_[u] != 0, "Factor {} matches multiple groups, incl. 
{}", factorVocab_[u], groupPrefix); + factorGroups_[u] = g; + } + } + groupRanges_.resize(numGroups, { SIZE_MAX, (size_t)0 }); + std::vector<size_t> groupCounts(numGroups); // number of group members + for (Word u = 0; u < numFactors; u++) { // determine ranges; these must be non-overlapping, verified via groupCounts + auto g = factorGroups_[u]; + if (groupRanges_[g].first > u) + groupRanges_[g].first = u; + if (groupRanges_[g].second < u + 1) + groupRanges_[g].second = u + 1; + groupCounts[g]++; + } + // determine if a factor needs explicit softmax normalization + groupNeedsNormalization_.resize(numGroups, false); + for (size_t g = 0; g < numGroups; g++) { // detect non-overlapping groups + LOG(info, "[embedding] Factor group '{}' has {} members ({})", + groupPrefixes[g], groupCounts[g], groupCounts[g] == 1 ? "sigmoid" : "softmax"); + // any factor that is not referenced in all words and is not a sigmoid needs normalization + if (g == 0) // @TODO: For now we assume that the main factor is used in all words. Test this. + continue; + if (groupCounts[g] == 1) // sigmoid factors have no normalizer + continue; + groupNeedsNormalization_[g] = true; // needed + ABORT_IF(groupRanges_[g].second - groupRanges_[g].first != groupCounts[g], + "Factor group '{}' members should be consecutive in the factor vocabulary", groupPrefixes[g]); + LOG(info, "[embedding] Factor group '{}' needs needs explicit normalization ({}..{})", groupPrefixes[g], groupRanges_[g].first, groupRanges_[g].second-1); + } + + // create the factor matrix + std::vector<IndexType> data(vocab.size()); + std::iota(data.begin(), data.end(), 0); + factorMatrix_ = csr_rows(data); // [V x U] + } + + size_t factorVocabSize() const { return factorVocab_.size(); } + + // create a CSR matrix M[V,U] from indices[] with + // M[v,u] = 1/c(u) if factor u is a factor of word v, and c(u) is how often u is referenced + CSRData csr_rows(const std::vector<IndexType>& words) const { + std::vector<float> weights; + std::vector<IndexType> indices; + std::vector<IndexType> offsets; + offsets.reserve(words.size() + 1); + indices.reserve(words.size()); // (at least this many) + // loop over all input words, and select the corresponding set of unit indices into CSR format + offsets.push_back((IndexType)indices.size()); + for (auto v : words) { + const auto& m = factorMap_[v]; + for (auto u : m) { + indices.push_back(u); + weights.push_back(1.0f/*/(float)factorRefCounts_[u]*/); + } + offsets.push_back((IndexType)indices.size()); // next matrix row begins at this offset + } + return { Shape({(int)words.size(), (int)factorVocab_.size()}), weights, indices, offsets }; + } + + const CSRData& getFactorMatrix() const { return factorMatrix_; } // [v,u] (sparse) -> =1 if u is factor of v + private: + Vocab factorVocab_; // [factor name] -> factor index = row of E_ + std::vector<std::vector<Word>> factorMap_; // [word index] -> set of factor indices + std::vector<int> factorRefCounts_; // [factor index] -> how often this factor is referenced in factorMap_ + CSRData factorMatrix_; // [v,u] (sparse) -> =1 if u is factor of v + std::vector<size_t> factorGroups_; // [u] -> group id of factor u + public: // @TODO: temporarily; later factor this properly + std::vector<std::pair<size_t, size_t>> groupRanges_; // [group id] -> (u_begin,u_end) index range of factors u for this group. These don't overlap. 
+ std::vector<bool> groupNeedsNormalization_; // [group id] -> true if explicit softmax normalization is necessary + }; + + namespace mlp { + /*private*/ void Output::lazyConstruct(int inputDim) { + // We must construct lazily since we won't know tying nor input dim in constructor. + if (W_) + return; + + auto name = options_->get<std::string>("prefix"); + auto dim = options_->get<int>("dim"); + + if (options_->has("embedding-factors")) { + ABORT_IF(shortlist_, "Shortlists are presently not compatible with factored embeddings"); + embeddingFactorMapping_ = New<EmbeddingFactorMapping>(options_); + dim = (int)embeddingFactorMapping_->factorVocabSize(); + LOG(info, "[embedding] Factored outputs enabled"); + } + + if(tiedParam_) { + W_ = tiedParam_; + transposeW_ = true; + } else { + W_ = graph_->param(name + "_W", {inputDim, dim}, inits::glorot_uniform); + transposeW_ = false; + } + + b_ = graph_->param(name + "_b", {1, dim}, inits::zeros); + } + + Expr Output::apply(Expr input) /*override*/ { + lazyConstruct(input->shape()[-1]); + + if (shortlist_) { + if (!cachedShortW_) { // short versions of parameters are cached within one batch, then clear()ed + if(transposeW_) + cachedShortW_ = rows(W_, shortlist_->indices()); + else + cachedShortW_ = cols(W_, shortlist_->indices()); + cachedShortb_ = cols(b_, shortlist_->indices()); + } + return affine(input, cachedShortW_, cachedShortb_, false, transposeW_); + } + else if (embeddingFactorMapping_) { + auto graph = input->graph(); + auto y = affine(input, W_, b_, false, transposeW_); // [B... x U] factor logits + + // denominators (only for groups that don't normalize out naturally by the final softmax()) + const auto& groupRanges = embeddingFactorMapping_->groupRanges_; // @TODO: factor this properly + auto numGroups = groupRanges.size(); + for (size_t g = 0; g < numGroups; g++) { + if (!embeddingFactorMapping_->groupNeedsNormalization_[g]) // @TODO: if we ever need it, we can combine multiple + continue; + auto range = groupRanges[g]; + // y: [B... x U] + // m: [1 x U] // ones at positions of group members + auto yDim = y->shape()[-1]; + std::vector<float> mVec(yDim, 0.0f); // @TODO: This vector should be produced by embeddingFactorMapping_ + for (size_t i = range.first; i < range.second; i++) + mVec[i] = 1.0f; + // need to compute log denominator over y[range] and subtract it from y[range] + auto groupY = slice(y, Slice((int)range.first, (int)range.second), /*axis=*/-1); // [B... x Ug] + auto groupZ = logsumexp(groupY, /*axis=*/-1); // [B... x 1] + auto m = graph->constant({ 1, (int)mVec.size() }, inits::from_vector(mVec)); // [1 x U] + auto Z = dot(groupZ, m); // [B... 
x U] + y = y - Z; +#if 0 + // and a log-linear weight + auto name = options_->get<std::string>("prefix"); + auto llWeight = graph->param(name + "_llWeight_" + std::to_string(g), {}, inits::from_value(1.0f)); + y = y * ((llWeight - 1) * m + 1); +#endif + } + + // sum up the unit logits across factors for each target word + auto factorMatrix = embeddingFactorMapping_->getFactorMatrix(); // [V x U] + y = dot_csr( + y, // [B x U] + factorMatrix.shape, + graph->constant({(int)factorMatrix.weights.size()}, inits::from_vector(factorMatrix.weights), Type::float32), + graph->constant({(int)factorMatrix.indices.size()}, inits::from_vector(factorMatrix.indices), Type::uint32), + graph->constant({(int)factorMatrix.offsets.size()}, inits::from_vector(factorMatrix.offsets), Type::uint32), + /*transB=*/ true); // -> [B x V] + + return y; + } + else + return affine(input, W_, b_, false, transposeW_); + } + } + + Embedding::Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options) : LayerBase(graph, options) { + std::string name = opt<std::string>("prefix"); + int dimVoc = opt<int>("dimVocab"); + int dimEmb = opt<int>("dimEmb"); + + bool fixed = opt<bool>("fixed", false); + + if (options_->has("embedding-factors")) { + embeddingFactorMapping_ = New<EmbeddingFactorMapping>(options_); + dimVoc = (int)embeddingFactorMapping_->factorVocabSize(); + LOG(info, "[embedding] Factored embeddings enabled"); + } + + NodeInitializer initFunc = inits::glorot_uniform; + if (options_->has("embFile")) { + std::string file = opt<std::string>("embFile"); + if (!file.empty()) { + bool norm = opt<bool>("normalization", false); + initFunc = inits::from_word2vec(file, dimVoc, dimEmb, norm); + } + } + + E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); + } + + // helper to embed a sequence of words (given as indices) via factored embeddings + /*private*/ Expr Embedding::multiRows(const std::vector<IndexType>& data) const + { + auto graph = E_->graph(); + auto factoredData = embeddingFactorMapping_->csr_rows(data); + // multi-hot factor vectors are represented as a sparse CSR matrix + // [row index = word position index] -> set of factor indices for word at this position + ABORT_IF(factoredData.shape != Shape({(int)factoredData.offsets.size()-1/*=rows of CSR*/, E_->shape()[0]}), "shape mismatch??"); + return csr_dot( // the CSR matrix is passed in pieces + factoredData.shape, + graph->constant({(int)factoredData.weights.size()}, inits::from_vector(factoredData.weights), Type::float32), + graph->constant({(int)factoredData.indices.size()}, inits::from_vector(factoredData.indices), Type::uint32), + graph->constant({(int)factoredData.offsets.size()}, inits::from_vector(factoredData.offsets), Type::uint32), + E_); + } + + std::tuple<Expr/*embeddings*/, Expr/*mask*/> Embedding::apply(Ptr<data::SubBatch> subBatch) const /*override final*/ { + auto graph = E_->graph(); + int dimBatch = (int)subBatch->batchSize(); + int dimEmb = E_->shape()[-1]; + int dimWords = (int)subBatch->batchWidth(); + + // factored embeddings: + // - regular: + // - y = x @ E x:[B x 1ofV] ; E:[V x D] ; y:[B x D] + // - factored: + // - u = x @ M one-hot to U-dimensional multi-hot (all factors in one concatenated space) + // - each row of M contains the set of factors for one word => we want a CSR matrix + // - y = (x @ M) @ E (x:[B x 1ofV] ; M:[V x U]) ; E:[U x D] ; y:[B x D] + // - first compute x @ M on the CPU + // - (Uvalues, Uindices, Uoffsets) = csr_rows(Mvalues, Mindices, Moffsets, subBatch->data()): + // - shape (U, specifically) not actually 
needed here + // - foreach input x[i] + // - locate row M[i,*] + // - copy through its index values (std::vector<push_back>) + // - create a matching ones vector (we can keep growing) + // - convert to GPU-side CSR matrix. CSR matrix now has #rows equal to len(x) + // - CSR matrix product with E + // - csr_dot(Uvalues, Uindices, Uoffsets, E_, transposeU) + // - double-check if all dimensions are specified. Probably not for transpose (which would be like csc_dot()). + // - weighting: + // - core factors' gradients are sums over all words that use the factors; + // - core factors' embeddings move very fast + // - words will need to make up for the move; rare words cannot + // - so, we multiply each factor with 1/refCount + // - core factors get weighed down a lot + // - no impact on gradients, as Adam makes up for it; embeddings still move fast just as before + // - but forward pass weighs them down, so that all factors are in a similar numeric range + // - if it is required to be in a different range, the embeddings can still learn that, but more slowly + + Expr chosenEmbeddings; + if (embeddingFactorMapping_) + chosenEmbeddings = multiRows(subBatch->data()); + else + chosenEmbeddings = rows(E_, subBatch->data()); + + auto batchEmbeddings = reshape(chosenEmbeddings, { dimWords, dimBatch, dimEmb }); + auto batchMask = graph->constant({ dimWords, dimBatch, 1 }, + inits::from_vector(subBatch->mask())); + return std::make_tuple(batchEmbeddings, batchMask); + } + + Expr Embedding::apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const /*override final*/ { + int dimEmb = E_->shape()[-1]; + Expr chosenEmbeddings; + if (embeddingFactorMapping_) + chosenEmbeddings = multiRows(embIdx); + else + chosenEmbeddings = rows(E_, embIdx); + return reshape(chosenEmbeddings, { dimBeam, 1, dimBatch, dimEmb }); + } +} // namespace marian diff --git a/src/layers/generic.h b/src/layers/generic.h index a9e2be01..d70c7f06 100755 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -54,6 +54,8 @@ struct IEmbeddingLayer { virtual Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const = 0; }; +class EmbeddingFactorMapping; + namespace mlp { class Dense : public LayerBase, public IUnaryLayer { @@ -124,49 +126,51 @@ public: class Output : public LayerBase, public IUnaryLayer { private: - Expr tiedParam_; - Ptr<data::Shortlist> shortlist_; - - Expr W_; + Expr W_; // parameters held by this layer Expr b_; + Expr cachedShortW_; // short-listed version, cached (cleared by clear()) + Expr cachedShortb_; // these match the current value of shortlist_ + Ptr<EmbeddingFactorMapping > embeddingFactorMapping_; + + // optional parameters set/updated after construction + Expr tiedParam_; bool transposeW_{false}; + Ptr<data::Shortlist> shortlist_; + void lazyConstruct(int inputDim); public: Output(Ptr<ExpressionGraph> graph, Ptr<Options> options) - : LayerBase(graph, options) {} + : LayerBase(graph, options) { + clear(); + } void tieTransposed(Expr tied) { - tiedParam_ = tied; + if (W_) + ABORT_IF(tiedParam_.get() != tied.get(), "Tied output projection cannot be changed once weights have been created"); + else + tiedParam_ = tied; } - void setShortlist(Ptr<data::Shortlist> shortlist) { shortlist_ = shortlist; } - - Expr apply(Expr input) override { - if(!W_) { - auto name = options_->get<std::string>("prefix"); - auto dim = options_->get<int>("dim"); - - if(tiedParam_) { - transposeW_ = true; - W_ = tiedParam_; - if(shortlist_) - W_ = rows(W_, shortlist_->indices()); - } else { - W_ 
= graph_->param(name + "_W", - {input->shape()[-1], dim}, - inits::glorot_uniform); - if(shortlist_) - W_ = cols(W_, shortlist_->indices()); - } - - b_ = graph_->param(name + "_b", {1, dim}, inits::zeros); - if(shortlist_) - b_ = cols(b_, shortlist_->indices()); + void setShortlist(Ptr<data::Shortlist> shortlist) { + if (shortlist_) + ABORT_IF(shortlist.get() != shortlist_.get(), "Output shortlist cannot be changed except after clear()"); + else { + ABORT_IF(cachedShortW_ || cachedShortb_, "No shortlist but cached parameters??"); + shortlist_ = shortlist; } + // cachedShortW_ and cachedShortb_ will be created lazily inside apply() + } - return affine(input, W_, b_, false, transposeW_); + // this is expected to be called in sync with graph->clear(), which invalidates + // cachedShortW_ and cachedShortb_ in the graph's short-term cache + void clear() { + shortlist_ = nullptr; + cachedShortW_ = nullptr; + cachedShortb_ = nullptr; } + Expr apply(Expr input) override; + virtual Expr apply(const std::vector<Expr>& /*inputs*/) override { ABORT("Not implemented"); }; @@ -176,45 +180,15 @@ public: class Embedding : public LayerBase, public IEmbeddingLayer { Expr E_; + Ptr<EmbeddingFactorMapping> embeddingFactorMapping_; + Expr multiRows(const std::vector<IndexType>& data) const; public: - Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options) : LayerBase(graph, options) { - std::string name = opt<std::string>("prefix"); - int dimVoc = opt<int>("dimVocab"); - int dimEmb = opt<int>("dimEmb"); - - bool fixed = opt<bool>("fixed", false); - - NodeInitializer initFunc = inits::glorot_uniform; - if (options_->has("embFile")) { - std::string file = opt<std::string>("embFile"); - if (!file.empty()) { - bool norm = opt<bool>("normalization", false); - initFunc = inits::from_word2vec(file, dimVoc, dimEmb, norm); - } - } + Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options); - E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); - } - - std::tuple<Expr/*embeddings*/, Expr/*mask*/> apply(Ptr<data::SubBatch> subBatch) const override final { - auto graph = E_->graph(); - int dimBatch = (int)subBatch->batchSize(); - int dimEmb = E_->shape()[-1]; - int dimWords = (int)subBatch->batchWidth(); - // @TODO: merge this with below. 
Currently can't, due only to the extra beam dimension
-    auto chosenEmbeddings = rows(E_, subBatch->data());
-    auto batchEmbeddings = reshape(chosenEmbeddings, { dimWords, dimBatch, dimEmb });
-    auto batchMask = graph->constant({ dimWords, dimBatch, 1 },
-                                     inits::from_vector(subBatch->mask()));
-    return std::make_tuple(batchEmbeddings, batchMask);
-  }
+  std::tuple<Expr/*embeddings*/, Expr/*mask*/> apply(Ptr<data::SubBatch> subBatch) const override final;
 
   // special version used in decoding
-  Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const override final {
-    int dimEmb = E_->shape()[-1];
-    auto selectedEmbs = rows(E_, embIdx);
-    return reshape(selectedEmbs, { dimBeam, 1, dimBatch, dimEmb });
-  }
+  Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const override final;
 };
 
 class ULREmbedding : public LayerBase, public IEmbeddingLayer {
@@ -322,10 +296,4 @@ public:
     ABORT("not implemented"); // ULR cannot be used for decoding
   }
 };
-
-typedef ConstructingFactory<Embedding> EmbeddingFactory;
-typedef ConstructingFactory<ULREmbedding> ULREmbeddingFactory;
-
-typedef Accumulator<EmbeddingFactory> embedding;
-typedef Accumulator<ULREmbeddingFactory> ulr_embedding;
 } // namespace marian
diff --git a/src/layers/loss.cpp b/src/layers/loss.cpp
index 03b79682..d11e4384 100755
--- a/src/layers/loss.cpp
+++ b/src/layers/loss.cpp
@@ -26,13 +26,26 @@ Expr LossBase::getCrossEntropy(Expr logits,
                                Expr indices,
                                Expr mask,
                                Expr weights) {
-  auto ce = cross_entropy(logits, indices);
+  Expr ce;
   if(smoothing_ > 0) {
     // @TODO: add this to CE kernels instead
+#if 0
+    ce = cross_entropy(logits, indices);
     auto ceq = mean(logsoftmax(logits), /*axis=*/ -1);
     ce = (1 - smoothing_) * ce - smoothing_ * ceq;
+#else // alternative that is cheaper memory-wise
+    ce = cross_entropy(logits, indices);
+    auto ceq = mean(logits, /*axis=*/ -1) - logsumexp(logits, /*axis=*/ -1);
+    ce = (1 - smoothing_) * ce - smoothing_ * ceq;
+    //auto ceq = mean(logits, /*axis=*/ -1) - Z;
+    //ce = (1 - smoothing_) * cols(logits, indices) // ce term
+    //   - smoothing_ * mean(logits, /*axis=*/ -1)  // smoothing term
+    //   - logsumexp(logits, /*axis=*/ -1);         // denominator
+#endif
   }
+  else
+    ce = cross_entropy(logits, indices);
 
   if(mask)
     ce = ce * mask;
diff --git a/src/models/decoder.h b/src/models/decoder.h
index 99fbd4ad..9b7aad9c 100755
--- a/src/models/decoder.h
+++ b/src/models/decoder.h
@@ -4,6 +4,7 @@
 #include "states.h"
 
 #include "data/shortlist.h"
+#include "layers/constructors.h"
 #include "layers/generic.h"
 
 namespace marian {
@@ -14,6 +15,7 @@ protected:
   std::string prefix_{"decoder"};
   bool inference_{false};
   size_t batchIndex_{1};
+  std::vector<Ptr<IEmbeddingLayer>> embedding_; // @TODO: find a more grammatical name
 
   Ptr<data::Shortlist> shortlist_;
 
@@ -31,37 +33,41 @@ public:
 
   virtual Ptr<DecoderState> step(Ptr<ExpressionGraph>, Ptr<DecoderState>) = 0;
 
+  void lazyCreateEmbedding(Ptr<ExpressionGraph> graph) {
+    // @TODO: code dup with EncoderTransformer
+    if (embedding_.size() <= batchIndex_ || !embedding_[batchIndex_]) { // lazy
+      if (embedding_.size() <= batchIndex_)
+        embedding_.resize(batchIndex_ + 1);
+      int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
+      int dimEmb = opt<int>("dim-emb");
+      auto embFactory = embedding()("dimVocab", dimVoc)("dimEmb", dimEmb);
+      if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all"))
+        embFactory("prefix", "Wemb");
+      else
+        embFactory("prefix", prefix_ + "_Wemb");
+      if(options_->has("embedding-fix-trg"))
+        embFactory("fixed",
opt<bool>("embedding-fix-trg")); + if(options_->has("embedding-vectors")) { + auto embFiles = opt<std::vector<std::string>>("embedding-vectors"); + embFactory("embFile", embFiles[batchIndex_]) // + ("normalization", opt<bool>("embedding-normalization")); + } + if (options_->has("embedding-factors")) { + embFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors")); + embFactory("vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_]); + } + embedding_[batchIndex_] = embFactory.construct(graph); + } + } + virtual void embeddingsFromBatch(Ptr<ExpressionGraph> graph, Ptr<DecoderState> state, Ptr<data::CorpusBatch> batch) { - - int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; - int dimEmb = opt<int>("dim-emb"); - - auto yEmbFactory = embedding() // - ("dimVocab", dimVoc) // - ("dimEmb", dimEmb); - - if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all")) - yEmbFactory("prefix", "Wemb"); - else - yEmbFactory("prefix", prefix_ + "_Wemb"); - - if(options_->has("embedding-fix-trg")) - yEmbFactory("fixed", opt<bool>("embedding-fix-trg")); - - if(options_->has("embedding-vectors")) { - auto embFiles = opt<std::vector<std::string>>("embedding-vectors"); - yEmbFactory("embFile", embFiles[batchIndex_]) // - ("normalization", opt<bool>("embedding-normalization")); - } - - auto yEmb = yEmbFactory.construct(graph); - auto subBatch = (*batch)[batchIndex_]; + lazyCreateEmbedding(graph); Expr y, yMask; std::tie - (y, yMask) = yEmb->apply(subBatch); + (y, yMask) = embedding_[batchIndex_]->apply(subBatch); Expr yData; if(shortlist_) { @@ -82,26 +88,13 @@ public: const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) { - int dimTrgEmb = opt<int>("dim-emb"); - int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; - Expr selectedEmbs; if(embIdx.empty()) { - selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, inits::zeros); + int dimEmb = opt<int>("dim-emb"); + selectedEmbs = graph->constant({1, 1, dimBatch, dimEmb}, inits::zeros); } else { - // embeddings are loaded from model during translation, no fixing required - auto yEmbFactory = embedding() // - ("dimVocab", dimTrgVoc) // - ("dimEmb", dimTrgEmb); - - if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all")) - yEmbFactory("prefix", "Wemb"); - else - yEmbFactory("prefix", prefix_ + "_Wemb"); - - auto yEmb = yEmbFactory.construct(graph); - - selectedEmbs = yEmb->apply(embIdx, dimBatch, dimBeam); + lazyCreateEmbedding(graph); + selectedEmbs = embedding_[batchIndex_]->apply(embIdx, dimBatch, dimBeam); } state->setTargetEmbeddings(selectedEmbs); } diff --git a/src/models/s2s.h b/src/models/s2s.h index edda79bc..2f4a4579 100755 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -124,6 +124,7 @@ public: int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; int dimEmb = opt<int>("dim-emb"); + // @TODO: code dup with Decider and EncoderTransformer; actually diverged by now. Unify this. 
auto embFactory = embedding() // ("dimVocab", dimVoc) // ("dimEmb", dimEmb); diff --git a/src/models/transformer.h b/src/models/transformer.h index 968d481b..f6aa4ff1 100755 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -6,7 +6,6 @@ #include "marian.h" #include "layers/constructors.h" -#include "layers/factory.h" #include "models/decoder.h" #include "models/encoder.h" #include "models/states.h" @@ -495,7 +494,7 @@ public: return embFactory.construct(graph_); } - Ptr<IEmbeddingLayer> createWordEmbeddingLayer(size_t subBatchIndex) const { + Ptr<IEmbeddingLayer> createSourceEmbeddingLayer(size_t subBatchIndex) const { // standard encoder word embeddings int dimVoc = opt<std::vector<int>>("dim-vocabs")[subBatchIndex]; int dimEmb = opt<int>("dim-emb"); @@ -511,6 +510,10 @@ public: embFactory("embFile", embFiles[subBatchIndex]) ("normalization", opt<bool>("embedding-normalization")); } + if (options_->has("embedding-factors")) { + embFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors")); + embFactory("vocab", opt<std::vector<std::string>>("vocabs")[subBatchIndex]); + } return embFactory.construct(graph_); } @@ -520,6 +523,7 @@ public: return apply(batch); } + std::vector<Ptr<IEmbeddingLayer>> embedding_; // @TODO: move away, also rename Ptr<EncoderState> apply(Ptr<data::CorpusBatch> batch) { int dimEmb = opt<int>("dim-emb"); int dimBatch = (int)batch->size(); @@ -527,12 +531,15 @@ public: // create the embedding matrix, considering tying and some other options // embed the source words in the batch Expr batchEmbeddings, batchMask; - Ptr<IEmbeddingLayer> embedding; - if (options_->has("ulr") && options_->get<bool>("ulr") == true) - embedding = createULREmbeddingLayer(); // embedding uses ULR - else - embedding = createWordEmbeddingLayer(batchIndex_); - std::tie(batchEmbeddings, batchMask) = embedding->apply((*batch)[batchIndex_]); + + if (embedding_.empty() || !embedding_[batchIndex_]) { // lazy + embedding_.resize(batch->sets()); + if (options_->has("ulr") && options_->get<bool>("ulr") == true) + embedding_[batchIndex_] = createULREmbeddingLayer(); // embedding uses ULR + else + embedding_[batchIndex_] = createSourceEmbeddingLayer(batchIndex_); + } + std::tie(batchEmbeddings, batchMask) = embedding_[batchIndex_]->apply((*batch)[batchIndex_]); // apply dropout over source words float dropoutSrc = inference_ ? 
0 : opt<float>("dropout-src"); if(dropoutSrc) { @@ -601,17 +608,17 @@ public: class DecoderTransformer : public Transformer<DecoderBase> { private: - Ptr<mlp::MLP> output_; + Ptr<mlp::Output> output_; private: - void LazyCreateOutputLayer() + void lazyCreateOutputLayer() { if(output_) // create it lazily return; int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; - auto layerOut = mlp::output() // + auto outputFactory = mlp::output() // ("prefix", prefix_ + "_ff_logit_out") // ("dim", dimTrgVoc); @@ -619,18 +626,22 @@ private: std::string tiedPrefix = prefix_ + "_Wemb"; if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src")) tiedPrefix = "Wemb"; - layerOut.tieTransposed(tiedPrefix); + outputFactory.tieTransposed(tiedPrefix); } - if(shortlist_) - layerOut.setShortlist(shortlist_); - - // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim] - // assemble layers into MLP and apply to embeddings, decoder context and - // aligned source context - output_ = mlp::mlp() // - .push_back(layerOut) // - .construct(graph_); + if (options_->has("embedding-factors")) { + // factored embeddings, simplistic version (which just adds the logits, like multiplying probs) + // z = h @ W // h:[B x D] ; W:[D x V] -> [B x V] + // with factors: + // z = h @ W @ M' // h:[B x D] ; W:[D x U] ; M':[U x V] -> [B x V] + // i.e. multiOutput(): + // output = dot_csr(output, M, transB=true) + // @BUGBUG: need to specify output factors separately if not tied-embeddings or tied-embeddings-all + outputFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors")); + outputFactory("vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_]); + } + + output_ = std::dynamic_pointer_cast<mlp::Output>(outputFactory.construct(graph_)); // (construct() returns only the underlying interface) } public: @@ -662,7 +673,7 @@ public: virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph, Ptr<DecoderState> state) override { ABORT_IF(graph != graph_, "An inconsistent graph parameter was passed to step()"); - LazyCreateOutputLayer(); + lazyCreateOutputLayer(); return step(state); } @@ -818,7 +829,9 @@ public: //************************************************************************// // final feed-forward layer (output) - Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim] + if(shortlist_) + output_->setShortlist(shortlist_); + Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] // return unormalized(!) probabilities Ptr<DecoderState> nextState; @@ -840,7 +853,8 @@ public: } void clear() override { - output_ = nullptr; + if (output_) + output_->clear(); cache_.clear(); alignments_.clear(); } diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index aa31e4d1..825c32b9 100755 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -1,3 +1,4 @@ +// @TODO: rename to transformer.h eventually. This is not a Factory as in factory.h. 
#pragma once #include "marian.h" diff --git a/src/tensors/cpu/add.h b/src/tensors/cpu/add.h index 38a0684d..4bae5bb5 100755 --- a/src/tensors/cpu/add.h +++ b/src/tensors/cpu/add.h @@ -15,8 +15,8 @@ namespace marian { namespace cpu { -template <size_t K, class Functor> -void gAddGeneric(Functor functor, +template <size_t K, class Functor, class AggFunctor> +void gAggregateGeneric(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -34,16 +34,16 @@ void gAddGeneric(Functor functor, functional::Array<int, N> dims; for(int index = 0; index < outLength; ++index) { if(same) { - out[index] += functional::apply(functor, ins, index) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * scale); } else { out.shape().dims(index, dims); - out[index] += functional::loops(functor, ins, len, dims) * scale; + out[index] = aggFunctor(out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale); } } } -template <size_t K, class Functor> -void gAddEqual(Functor functor, +template <size_t K, class Functor, class AggFunctor> +void gAggregateEqual(Functor functor, AggFunctor aggFunctor, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale, @@ -61,12 +61,12 @@ void gAddEqual(Functor functor, indices[i] = ins[i].shape().bindex(dims); } - out[index] += functional::apply(functor, ins, indices) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * scale); } } -template <size_t K, class Functor> -void gAddReduce(Functor functor, +template <size_t K, class Functor, class AggFunctor> +void gAggregateReduce(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -79,10 +79,10 @@ void gAddReduce(Functor functor, same = same && ins[i].shape().elements() == full.elements(); for(int j = 0; j < rows; ++j) { - float sum = 0; + float colSum = aggInit; if(same) { for(int id = 0; id < cols; ++id) - sum += functional::apply(functor, ins, j * cols + id); + colSum = aggFunctor(colSum, functional::apply(functor, ins, j * cols + id)); } else { functional::Array<int, functional::Shape::size()> dims; for(int id = 0; id < cols; ++id) { @@ -90,15 +90,15 @@ void gAddReduce(Functor functor, functional::Array<int, K> indices; for(size_t i = 0; i < K; ++i) indices[i] = ins[i].shape().bindex(dims); - sum += functional::apply(functor, ins, indices); + colSum = aggFunctor(colSum, functional::apply(functor, ins, indices)); } } - out[j] += sum * scale; + out[j] = aggFunctor(out[j], colSum * scale); } } -template <class Functor, class... Tensors> -void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { +template <class Functor, class AggFunctor, class... Tensors> +void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) { auto full = marian::Shape::broadcast({out, tensors...}); //int length = out->shape().elements(); @@ -111,15 +111,16 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... 
tensors) { if(full.back() != 1 && out->shape().back() == 1) { //size_t m = full.elements() / length; //size_t k = full.back(); - cpu::gAddReduce(functor, full, gOut, gIns, scale); + cpu::gAggregateReduce(functor, aggInit, aggFunctor, full, gOut, gIns, scale); } else if(out->shape() == full) { bool broadcast = false; for(size_t i = 0; i < K; ++i) broadcast = broadcast || gOut.shape() != gIns[i].shape(); - cpu::gAddEqual(functor, gOut, gIns, scale, broadcast); + cpu::gAggregateEqual(functor, aggFunctor, gOut, gIns, scale, broadcast); } else { - cpu::gAddGeneric(functor, full, gOut, gIns, scale); + cpu::gAggregateGeneric(functor, aggInit, aggFunctor, full, gOut, gIns, scale); } } + } // namespace cpu } // namespace marian diff --git a/src/tensors/gpu/add.cu b/src/tensors/gpu/add.cu index 2431948e..32c12783 100755 --- a/src/tensors/gpu/add.cu +++ b/src/tensors/gpu/add.cu @@ -16,8 +16,8 @@ namespace marian { namespace gpu { -template <size_t K, class Functor> -__global__ void gAddGeneric(Functor functor, +template <size_t K, class Functor, class AggFunctor> +__global__ void gAggregateGeneric(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -37,17 +37,17 @@ __global__ void gAddGeneric(Functor functor, int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < outLength) { if(same) { - out[index] += functional::apply(functor, ins, index) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * scale); } else { out.shape().dims(index, dims); - out[index] += functional::loops(functor, ins, len, dims) * scale; + out[index] = aggFunctor(out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale); } } } } -template <size_t K, class Functor> -__global__ void gAddEqual(Functor functor, +template <size_t K, class Functor, class AggFunctor> +__global__ void gAggregateEqual(Functor functor, AggFunctor aggFunctor, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale, @@ -67,13 +67,13 @@ __global__ void gAddEqual(Functor functor, indices[i] = ins[i].shape().bindex(dims); } - out[index] += functional::apply(functor, ins, indices) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * scale); } } } -template <size_t K, class Functor> -__global__ void gAddReduce(Functor functor, +template <size_t K, class Functor, class AggFunctor> +__global__ void gAggregateReduce(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -92,15 +92,15 @@ __global__ void gAddReduce(Functor functor, float* _sum = _share + blockDim.x; if(same) { - _sum[threadIdx.x] = 0; + _sum[threadIdx.x] = aggInit; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) - _sum[threadIdx.x] += functional::apply(functor, ins, j * cols + id); + _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::apply(functor, ins, j * cols + id)); } } else { functional::Array<int, functional::Shape::size()> dims; - _sum[threadIdx.x] = 0; + _sum[threadIdx.x] = aggInit; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; @@ -109,7 +109,7 @@ __global__ void gAddReduce(Functor functor, functional::Array<int, K> indices; for(int i = 0; i < K; ++i) indices[i] = ins[i].shape().bindex(dims); - _sum[threadIdx.x] += 
functional::apply(functor, ins, indices); + _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::apply(functor, ins, indices)); } } } @@ -119,16 +119,58 @@ __global__ void gAddReduce(Functor functor, __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { - _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], _sum[threadIdx.x + skip]); } len = (len + 1) >> 1; } __syncthreads(); - out[j] += _sum[0] * scale; + out[j] = aggFunctor(out[j], _sum[0] * scale); } } } +template <class Functor, class AggFunctor, class... Tensors> +void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) { + cudaSetDevice(out->getDeviceId().no); + + auto full = marian::Shape::broadcast({out, tensors...}); + + int length = out->shape().elements(); + + constexpr size_t K = sizeof...(Tensors); + + functional::Tensor<float> gOut = out; + functional::Array<functional::Tensor<float>, K> gIns = {tensors...}; + + if(full.back() != 1 && out->shape().back() == 1) { + size_t m = full.elements() / length; + size_t k = full.back(); + + int blocks = std::min(MAX_BLOCKS, (int)m); + int threads = std::min(MAX_THREADS, (int)k); + int shared = sizeof(float) * threads * 2; + + gAggregateReduce<<<blocks, threads, shared>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale); + + } else if(out->shape() == full) { + int threads = std::min(MAX_THREADS, length); + int blocks + = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + bool broadcast = false; + for(int i = 0; i < K; ++i) + broadcast = broadcast || gOut.shape() != gIns[i].shape(); + gAggregateEqual<<<blocks, threads>>>(functor, aggFunctor, gOut, gIns, scale, broadcast); + } else { + int threads = std::min(MAX_THREADS, length); + int blocks + = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + gAggregateGeneric<<<blocks, threads>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale); + } +} + +// @TODO: this is a duplicate; can be removed, but need to redo all the add.inc entries... template <class Functor, class... Tensors> void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { cudaSetDevice(out->getDeviceId().no); @@ -142,6 +184,8 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { functional::Tensor<float> gOut = out; functional::Array<functional::Tensor<float>, K> gIns = {tensors...}; + auto addFunctor = functional::_1 + functional::_2; + if(full.back() != 1 && out->shape().back() == 1) { size_t m = full.elements() / length; size_t k = full.back(); @@ -150,7 +194,7 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { int threads = std::min(MAX_THREADS, (int)k); int shared = sizeof(float) * threads * 2; - gAddReduce<<<blocks, threads, shared>>>(functor, full, gOut, gIns, scale); + gAggregateReduce<<<blocks, threads, shared>>>(functor, 0, addFunctor, full, gOut, gIns, scale); } else if(out->shape() == full) { int threads = std::min(MAX_THREADS, length); @@ -160,13 +204,13 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... 
tensors) { bool broadcast = false; for(int i = 0; i < K; ++i) broadcast = broadcast || gOut.shape() != gIns[i].shape(); - gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast); + gAggregateEqual<<<blocks, threads>>>(functor, addFunctor, gOut, gIns, scale, broadcast); } else { int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); - gAddGeneric<<<blocks, threads>>>(functor, full, gOut, gIns, scale); + gAggregateGeneric<<<blocks, threads>>>(functor, 0, addFunctor, full, gOut, gIns, scale); } } diff --git a/src/tensors/gpu/add.h b/src/tensors/gpu/add.h index e5e22d88..21e0bb96 100644..100755 --- a/src/tensors/gpu/add.h +++ b/src/tensors/gpu/add.h @@ -8,5 +8,7 @@ namespace gpu { template <class Functor, class... Tensors> void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors); +template <class Functor, class AggFunctor, class... Tensors> +void Aggregate(Functor functor, float initAgg, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors); } } // namespace marian diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 27f35b95..69244dce 100644..100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -1,3 +1,4 @@ +// see element.inc for instructions on how to maintain this using namespace functional; template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -22,3 +23,12 @@ template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>); +template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>); 
diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc
index 27f35b95..69244dce 100644..100755
--- a/src/tensors/gpu/add.inc
+++ b/src/tensors/gpu/add.inc
@@ -1,3 +1,4 @@
+// see element.inc for instructions on how to maintain this
 using namespace functional;
 template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
 template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
@@ -22,3 +23,12 @@ template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>,
 template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
 template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
 template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Minimum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Minimum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Maximum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Maximum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
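
Note: of the aggregation functors instantiated above (Minimum, Maximum, Mult, LogAddExp), elem::LogAddExp is the only numerically delicate one: evaluating log(exp(x) + exp(y)) directly overflows float once either argument exceeds roughly 88. The standard remedy is to factor out the larger argument. A self-contained sketch of that stable formulation (illustration only; the actual elem::LogAddExp definition is not part of this diff):

    #include <algorithm>
    #include <cmath>

    // log(exp(x) + exp(y)) without overflow: after factoring out the larger
    // argument, the remaining exponential has a non-positive argument, so
    // exp() stays in [0, 1].
    inline float logAddExp(float x, float y) {
      float m = std::max(x, y);
      return m + std::log1p(std::exp(std::min(x, y) - m));
    }

Folding a row with this functor and identity -infinity yields exactly the logsumexp reduction exposed through the new expression operators.
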
diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc
index 66f76301..f3cdea28 100755
--- a/src/tensors/gpu/element.inc
+++ b/src/tensors/gpu/element.inc
@@ -55,6 +55,7 @@ template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, Bin
 template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
 template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
 template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::NEq, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::BinaryFunctor<marian::functional::elem::Gt, marian::functional::Assignee<2>, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Lt, marian::functional::Assignee<2>, marian::functional::Assignee<3> > >, marian::functional::Capture>, marian::functional::Capture> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::NEq, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::BinaryFunctor<marian::functional::elem::Gt, marian::functional::Assignee<2>, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Lt, marian::functional::Assignee<2>, marian::functional::Assignee<3> > >, marian::functional::Capture>, marian::functional::Capture> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqrt, marian::functional::Assignee<1> > >>(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqrt, marian::functional::Assignee<1> > >, std::shared_ptr<marian::TensorBase>);
 // How to add new specializations:
 // When you use a new specialization, it will cause a link error of this form (example):
 // .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element<marian::functional::Assign< ... > ( ... )'
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index acc7e54c..f77259cb 100755
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -140,6 +140,10 @@ public:
   template <typename T>
   void set(const T* begin, const T* end) {
+    ABORT_IF(end - begin != shape_.elements(),
+             "Vector size ({}) and tensor shape ({}) do not match",
+             end - begin,
+             std::string(shape_));
     ABORT_IF(!matchType<T>(type_),
              "Requested type ({}) and underlying type ({}) do not match",
              request<T>(),
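
Note: the added ABORT_IF in Tensor::set turns a silently truncated or out-of-bounds fill into an immediate, diagnosable abort whenever the input range disagrees with the tensor's element count. A hypothetical trigger, assuming a freshly allocated {2, 4} float tensor t:

    std::vector<float> v(7, 0.f);           // one element short of 2 * 4 = 8
    t->set(v.data(), v.data() + v.size());  // aborts with a message like
                                            // "Vector size (7) and tensor shape (...) do not match"
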
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
index f7de2f20..8dd17a5c 100755
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@@ -51,7 +51,7 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
     gpu::Add(functor, scale, out, tensors...);
   else
 #endif
-    cpu::Add(functor, scale, out, tensors...);
+    cpu::Aggregate(functor, 0.0f, functional::_1 + functional::_2, scale, out, tensors...);
 }
 
 template <class Functor, class... Tensors>
@@ -59,6 +59,16 @@ void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
   Add(functor, 1, out, tensors...);
 }
 
+template <class Functor, class AggFunctor, class... Tensors>
+void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, marian::Tensor out, Tensors... tensors) {
+#ifdef CUDA_FOUND
+  if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
+    gpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
+  else
+#endif
+    cpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
+}
+
 template <class Functor, class... Tensors>
 void Reduce(Functor functor,
             float scale,
@@ -74,6 +84,14 @@ void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
   Add(functor, out, tensors...);
 }
 
+template <class Functor, class AggFunctor, class... Tensors>
+void Reduce(Functor functor, AggFunctor aggFunctor, float aggInit,
+            marian::Tensor out,
+            Tensors... tensors) {
+  out->set(aggInit);
+  Aggregate(functor, aggInit, aggFunctor, out, tensors...);
+}
+
 // clang-format off
 DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
 DISPATCH8(ProdBatched, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
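
Note: the new Reduce overload ties the pieces together: it seeds the output with the fold's identity via out->set(aggInit), then folds every input element in through Aggregate. Seeding matters, because unlike the additive Reduce above it, a min or max fold started from a zeroed output would bias the result toward zero. A sketch of a row-wise minimum through this interface (illustrative call site, again assuming a min(_1, _2) combinator for elem::Minimum):

    #include <limits>
    using namespace marian::functional;
    marian::Reduce(_1,                                // map: identity
                   min(_1, _2),                       // fold: pairwise minimum
                   std::numeric_limits<float>::max(), // fold identity for min
                   out, in);                          // reduced axis of out has size 1
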
diff --git a/src/tests/operator_tests.cpp b/src/tests/operator_tests.cpp
index 08fdd8de..2607f41f 100755
--- a/src/tests/operator_tests.cpp
+++ b/src/tests/operator_tests.cpp
@@ -204,12 +204,19 @@ void tests(DeviceType device) {
     graph->clear();
     values.clear();
 
-    std::vector<float> vA({1, 2, 3, 4, 5, 6, 7, 8});
-    std::vector<float> vS1({6, 8, 10, 12});
-    std::vector<float> vS2({10, 26});
-
-    std::vector<float> vW({2.77778f, 6.77778f});
-
+    std::vector<float> vA({1, 6, 3, 8,
+                           5, 2, 7, 4});
+    // import numpy as np
+    // a = np.array([[1, 6, 3, 8], [5, 2, 7, 4]])
+    std::vector<float> vS1({6, 8, 10, 12});             // s1 = np.sum(a, axis=0)
+    std::vector<float> vS2({18, 18});                   // np.sum(a, axis = 1)
+    std::vector<float> vS4({2.6925824f, 1.80277564f});  // np.std(a, axis = 1)
+    std::vector<float> vV5({7.25, 3.25});               // np.var(a, axis = 1)
+    std::vector<float> vM6({8, 7});                     // np.max(a, axis = 1)
+    std::vector<float> vM7({1, 2});                     // np.min(a, axis = 1)
+    std::vector<float> vP8({144, 280});                 // np.prod(a, axis = 1)
+    std::vector<float> vL9({8.13364336f, 7.17551536f}); // np.log(np.sum(np.exp(a), axis=1))
+    std::vector<float> vW({5.0f, 4.55555556f});         // np.mean(a*s1,axis=-1) / np.mean(s1,axis=-1)
 
     auto a = graph->constant({2, 4}, inits::from_vector(vA));
 
@@ -218,6 +225,14 @@ void tests(DeviceType device) {
     auto m3 = mean(s1, /*axis=*/ 1);
 
+    auto s4 = marian::std(a, /*axis=*/ 1);
+    auto v5 = var(a, /*axis=*/ 1);
+
+    auto m6 = max(a, /*axis=*/ 1);
+    auto m7 = min(a, /*axis=*/ 1);
+    auto p8 = prod(a, /*axis=*/ 1);
+    auto l9 = logsumexp(a, /*axis=*/ 1);
+
     auto sp = scalar_product(s2, s2, /*axis=*/ 0);
 
     auto wa = weighted_average(a, s1, /*axis=*/ -1);
@@ -227,21 +242,30 @@ void tests(DeviceType device) {
     CHECK(s1->shape() == Shape({1, 4}));
     CHECK(s2->shape() == Shape({2, 1}));
     CHECK(m3->shape() == Shape({1, 1}));
+    CHECK(s4->shape() == Shape({2, 1}));
+    CHECK(v5->shape() == Shape({2, 1}));
+    CHECK(m6->shape() == Shape({2, 1}));
+    CHECK(m7->shape() == Shape({2, 1}));
+    CHECK(p8->shape() == Shape({2, 1}));
+    CHECK(l9->shape() == Shape({2, 1}));
     CHECK(sp->shape() == Shape({1, 1}));
     CHECK(wa->shape() == Shape({2, 1}));
 
-    s1->val()->get(values);
-    CHECK( values == vS1 );
+    s1->val()->get(values); CHECK(values == vS1);
+    s2->val()->get(values); CHECK(values == vS2);
 
-    s2->val()->get(values);
-    CHECK( values == vS2 );
+    CHECK(m3->val()->scalar() == 9);
 
-    CHECK( m3->val()->scalar() == 9 );
-    CHECK( sp->val()->scalar() == 776 );
+    s4->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vS4.begin(), floatApprox));
+    v5->val()->get(values); CHECK(values == vV5);
+    m6->val()->get(values); CHECK(values == vM6);
+    m7->val()->get(values); CHECK(values == vM7);
+    p8->val()->get(values); CHECK(values == vP8);
+    l9->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vL9.begin(), floatApprox));
 
-    wa->val()->get(values);
-    CHECK( std::equal(values.begin(), values.end(),
-                      vW.begin(), floatApprox) );
+    CHECK(sp->val()->scalar() == 648);
+
+    wa->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vW.begin(), floatApprox));
   }
 
   SECTION("concatenation") {
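
Note: the expected vectors come from numpy (see the inline comments), but the first row a0 = [1, 6, 3, 8] is easy to confirm independently: its mean is 4.5, the squared deviations are {12.25, 2.25, 2.25, 12.25}, hence var = 7.25 and std = sqrt(7.25) ≈ 2.6925824, matching vV5[0] and vS4[0]. A standalone cross-check (illustrative only, not part of the test suite):

    #include <cmath>
    #include <cstdio>

    int main() {
      const float a0[] = {1, 6, 3, 8};
      float mean = 0, var = 0, sumExp = 0;
      for (float x : a0) mean += x / 4;                     // 4.5
      for (float x : a0) var += (x - mean) * (x - mean) / 4;
      for (float x : a0) sumExp += std::exp(x);
      // prints: var=7.25 std=2.69258 logsumexp=8.13364
      std::printf("var=%g std=%g logsumexp=%g\n",
                  var, std::sqrt(var), std::log(sumExp));
    }
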
diff --git a/vs/Marian.vcxproj b/vs/Marian.vcxproj
index a6c560d3..a82c879e 100755
--- a/vs/Marian.vcxproj
+++ b/vs/Marian.vcxproj
@@ -575,6 +575,7 @@
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
+ <ClCompile Include="..\src\layers\generic.cpp" />
<ClCompile Include="..\src\layers\loss.cpp" />
<ClCompile Include="..\src\layers\weight.cpp" />
<ClCompile Include="..\src\microsoft\quicksand.cpp">
diff --git a/vs/Marian.vcxproj.filters b/vs/Marian.vcxproj.filters
index d9e56843..a7c2331e 100755
--- a/vs/Marian.vcxproj.filters
+++ b/vs/Marian.vcxproj.filters
@@ -481,6 +481,9 @@
     <ClCompile Include="..\src\examples\iris\iris.cpp">
<Filter>examples\iris</Filter>
</ClCompile>
+ <ClCompile Include="..\src\layers\generic.cpp">
+ <Filter>layers</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />