27 files changed, 770 insertions, 266 deletions
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 8c757f10..4c16aa1d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -47,6 +47,7 @@ add_library(marian STATIC graph/node_initializers.cpp layers/convolution.cpp + layers/generic.cpp layers/loss.cpp layers/weight.cpp diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp index 70a30a5f..77604285 100755 --- a/src/common/config_parser.cpp +++ b/src/common/config_parser.cpp @@ -30,6 +30,7 @@ const std::set<std::string> PATHS = { "train-sets", "vocabs", "embedding-vectors", + "embedding-factors", "valid-sets", "valid-script-path", "valid-log", @@ -385,6 +386,8 @@ void ConfigParser::addOptionsTraining(cli::CLIWrapper& cli) { "Fix source embeddings. Affects all encoders"); cli.add<bool>("--embedding-fix-trg", "Fix target embeddings. Affects all decoders"); + cli.add_nondefault<std::vector<std::string>>("--embedding-factors", + "Paths to (factor map, factor list) file for factored embeddings"); cli.add<bool>("--multi-node", "Enable asynchronous multi-node training through MPI (and legacy sync if combined with --sync-sgd)"); @@ -466,6 +469,8 @@ void ConfigParser::addOptionsTranslation(cli::CLIWrapper& cli) { "stdout"); cli.add<std::vector<std::string>>("--vocabs,-v", "Paths to vocabulary files have to correspond to --input"); + cli.add_nondefault<std::vector<std::string>>("--embedding-factors", + "Paths to (factor map, factor list) file for factored embeddings"); // decoding options cli.add<size_t>("--beam-size,-b", "Beam size used during search with validating translator", @@ -528,6 +533,8 @@ void ConfigParser::addOptionsScoring(cli::CLIWrapper& cli) { "Paths to vocabulary files have to correspond to --train-sets. " "If this parameter is not supplied we look for vocabulary files source.{yml,json} and target.{yml,json}. " "If these files do not exists they are created"); + cli.add_nondefault<std::vector<std::string>>("--embedding-factors", + "Paths to (factor map, factor list) file for factored embeddings"); cli.add<bool>("--n-best", "Score n-best list instead of plain text corpus"); cli.add<std::string>("--n-best-feature", diff --git a/src/common/utils.cpp b/src/common/utils.cpp index bde78835..252afa54 100755 --- a/src/common/utils.cpp +++ b/src/common/utils.cpp @@ -144,6 +144,11 @@ std::string withCommas(size_t n) { return res; } +bool beginsWith(const std::string& text, const std::string& prefix) { + return text.size() >= prefix.size() + && !text.compare(0, prefix.size(), prefix); +} + bool endsWith(const std::string& text, const std::string& suffix) { return text.size() >= suffix.size() && !text.compare(text.size() - suffix.size(), suffix.size(), suffix); diff --git a/src/common/utils.h b/src/common/utils.h index 94113a0e..d76d07fa 100755 --- a/src/common/utils.h +++ b/src/common/utils.h @@ -36,6 +36,7 @@ std::string exec(const std::string& cmd); std::pair<std::string, int> hostnameAndProcessId(); std::string withCommas(size_t n); +bool beginsWith(const std::string& text, const std::string& prefix); bool endsWith(const std::string& text, const std::string& suffix); std::string toUpper(const std::string& s); diff --git a/src/functional/tmp.h b/src/functional/tmp.h index 08383660..7c8f6fa1 100755 --- a/src/functional/tmp.h +++ b/src/functional/tmp.h @@ -118,55 +118,56 @@ __HDI__ float apply(Functor functor, /******************************************************************************/ +// @TODO: Rename this. It is a reduction loop. 
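(Aside, for orientation: the rewrite below turns a loop that could only sum into a generic fold — every former `sum += x` becomes `agg = aggFunctor(agg, x)`, seeded with the functor's identity element `aggInit` (0 for sum, 1 for product, lowest float for max). A minimal stand-alone sketch of that pattern, with illustrative names that are not Marian code:)

#include <limits>
#include <vector>

// Generic fold: aggInit must be the identity element of aggFunctor
// (0 for sum, 1 for product, std::numeric_limits<float>::lowest() for max).
template <class AggFunctor>
float fold(const std::vector<float>& xs, float aggInit, AggFunctor aggFunctor) {
  float agg = aggInit;
  for (float x : xs)
    agg = aggFunctor(agg, x); // the old code was hard-wired to agg += x
  return agg;
}

// usage:
//   fold(v, 0.0f, [](float a, float b) { return a + b; });  // sum
//   fold(v, std::numeric_limits<float>::lowest(),
//        [](float a, float b) { return a > b ? a : b; });   // max

(The Loop/loops changes that follow are the same transformation, threaded through Marian's fixed-rank index machinery.)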
template <size_t n, size_t N, size_t K> struct Loop { - template <class Functor> + template <class Functor, class AggFunctor> __HDI__ static float result( - Functor functor, + Functor functor, float aggInit, AggFunctor aggFunctor, functional::Array<functional::Tensor<float>, K>& in, const functional::Array<int, K>& pAcc, const functional::Array<int, N>& length, const functional::Array<int, N>& dim) { - float sum = 0; + float agg = aggInit; functional::Array<int, K> acc; for(int i = 0; i < length[N - n]; ++i) { for(size_t j = 0; j < K; ++j) { acc[j] = pAcc[j] + (dim[N - n] + i) * in[j].shape().bstride(N - n); } - sum += Loop<n - 1, N, K>::result(functor, in, acc, length, dim); + agg = aggFunctor(agg, Loop<n - 1, N, K>::result(functor, aggInit, aggFunctor, in, acc, length, dim)); } - return sum; + return agg; } }; template <size_t N, size_t K> struct Loop<1, N, K> { - template <class Functor> + template <class Functor, class AggFunctor> __HDI__ static float result( - Functor functor, + Functor functor, float aggInit, AggFunctor aggFunctor, functional::Array<functional::Tensor<float>, K>& in, const functional::Array<int, K>& pAcc, const functional::Array<int, N>& length, const functional::Array<int, N>& dim) { - float sum = 0; + float agg = aggInit; functional::Array<int, K> acc; for(int i = 0; i < length[N - 1]; ++i) { for(size_t j = 0; j < K; ++j) { acc[j] = pAcc[j] + (dim[N - 1] + i) * in[j].shape().bstride(N - 1); } - sum += apply<K>(functor, in, acc); + agg = aggFunctor(agg, apply<K>(functor, in, acc)); } - return sum; + return agg; } }; -template <size_t N, size_t K, class Functor> -__HDI__ float loops(Functor functor, +template <size_t N, size_t K, class Functor, class AggFunctor> +__HDI__ float loops(Functor functor, float aggInit, AggFunctor aggFunctor, functional::Array<functional::Tensor<float>, K>& in, const functional::Array<int, N>& length, const functional::Array<int, N>& dim) { functional::Array<int, K> acc = {0}; - return Loop<N, N, K>::result(functor, in, acc, length, dim); + return Loop<N, N, K>::result(functor, aggInit, aggFunctor, in, acc, length, dim); } } // namespace functional } // namespace marian diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp index 826bd9f0..6a07611d 100755 --- a/src/graph/expression_operators.cpp +++ b/src/graph/expression_operators.cpp @@ -306,11 +306,36 @@ Expr slice(Expr a, int axis, Slice slice) { // numpy __getslice__ semantics, but } Expr sum(Expr a, int ax) { - return Expression<SumNodeOp>(a, ax); + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::sum); } Expr mean(Expr a, int ax) { - return Expression<MeanNodeOp>(a, ax); + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::mean); +} + +Expr std(Expr a, int ax) { + return Expression<ReduceNodeOp>(a - mean(a,ax), ax, ReduceNodeOpCode::rms); +} + +Expr var(Expr a, int ax) { + return Expression<ReduceNodeOp>(a - mean(a, ax), ax, ReduceNodeOpCode::meanSqr); +} + +Expr max(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::max); +} + +Expr min(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::min); +} + +Expr prod(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::prod); +} + +// log(sum(exp(a))) +Expr logsumexp(Expr a, int ax) { + return Expression<ReduceNodeOp>(a, ax, ReduceNodeOpCode::logSumExp); } Expr scalar_product(Expr a, Expr b, int ax) { diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h index 58149bde..78aed834 100755 --- 
a/src/graph/expression_operators.h +++ b/src/graph/expression_operators.h @@ -173,6 +173,13 @@ static inline Expr narrow(Expr a, int axis, size_t start, size_t length) { // Py /*********************************************************/ Expr sum(Expr a, int ax = 0); +Expr mean(Expr a, int ax = 0); +Expr std(Expr a, int ax); +Expr var(Expr a, int ax); +Expr max(Expr a, int ax); +Expr min(Expr a, int ax); +Expr prod(Expr a, int ax); +Expr logsumexp(Expr a, int ax); Expr softmax(Expr x, int axis = -1); @@ -182,8 +189,6 @@ Expr softmax(Expr a, Expr zeroOneMask, int axis = -1); Expr logsoftmax(Expr a); -Expr mean(Expr a, int ax = 0); - Expr cross_entropy(Expr a, Expr b); Expr scalar_product(Expr a, Expr b, int ax = 0); diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index 10e2ca76..7d090823 100755 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -432,8 +432,8 @@ public: ABORT_IF(S_offsets->shape()[0] - 1 != S_shape[0], "Sparse matrix offset vector has incorrect size"); auto outShape = D->shape(); - ABORT_IF(S_shape[transS == swapOperands ? 1 : 0] != outShape[-(int)swapOperands],
- "Matrix product requires inner dimensions to match");
+    ABORT_IF(S_shape[transS == swapOperands ? 1 : 0] != outShape[-(int)swapOperands],
+             "Matrix product requires inner dimensions to match");
     outShape.set(-(int)swapOperands, S_shape[transS != swapOperands]);
     return outShape;
   }
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index 7dbaec46..6dd90faf 100755
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -412,20 +412,75 @@ struct LogSoftmaxNodeOp : public UnaryNodeOp {
   const std::string type() override { return "logsoftmax"; }
 };
 
-struct SumNodeOp : public UnaryNodeOp {
+enum class ReduceNodeOpCode {
+  sum, mean, rms, meanSqr, min, max, prod, logSumExp
+};
+
+struct ReduceNodeOp : public UnaryNodeOp {
   int axis_;
+  ReduceNodeOpCode opCode_;
+  int reducedDim_; // dimension of axis being reduced, e.g. used in mean()
 
-  SumNodeOp(Expr a, int axis) : UnaryNodeOp(a, newShape(a, axis)) {}
+  ReduceNodeOp(Expr a, int axis, ReduceNodeOpCode opCode)
+      : UnaryNodeOp(a, newShape(a, axis)), opCode_(opCode) {
+    reducedDim_ = a->shape()[axis]; // e.g. used in mean()
+    ABORT_IF(reducedDim_ != a->shape().elements() / shape().elements(),
+             "bug in determining reducedDim");
+  }
 
   NodeOps forwardOps() override {
     using namespace functional;
-    return {NodeOp(Reduce(_1, val_, child(0)->val()))};
+    switch (opCode_) {
+      case ReduceNodeOpCode::sum:
+        return {NodeOp(Reduce(_1, val_, child(0)->val()))};
+      case ReduceNodeOpCode::mean:
+        return {NodeOp(Reduce(_1, 1.0f / (float)reducedDim_, val_, child(0)->val()))};
+      case ReduceNodeOpCode::rms:
+        return {NodeOp(Reduce(_1 * _1, 1.0f / (float)reducedDim_, val_, child(0)->val());
+                       Element(_1 = sqrt(_1), val_))};
+      case ReduceNodeOpCode::meanSqr:
+        return {NodeOp(Reduce(_1 * _1, 1.0f / (float)reducedDim_, val_, child(0)->val()))};
+      case ReduceNodeOpCode::min:
+        return {NodeOp(Reduce(_1, min(_1,_2), std::numeric_limits<float>::max(), val_, child(0)->val()))};
+      case ReduceNodeOpCode::max:
+        return {NodeOp(Reduce(_1, max(_1,_2), std::numeric_limits<float>::lowest(), val_, child(0)->val()))};
+      case ReduceNodeOpCode::prod:
+        return {NodeOp(Reduce(_1, _1 * _2, 1.0f, val_, child(0)->val()))};
+      case ReduceNodeOpCode::logSumExp:
+        return {NodeOp(Reduce(_1, logaddexp(_1,_2), std::numeric_limits<float>::lowest(), val_, child(0)->val()))};
+      default:
+        ABORT("Unexpected reduction op-code {}", (int)opCode_);
+    }
   }
 
   NodeOps backwardOps() override {
     using namespace functional;
-    return {NodeOp(Add(_1, child(0)->grad(), adj_))};
+    switch (opCode_) {
+      case ReduceNodeOpCode::sum:
+        return {NodeOp(Add(_1, child(0)->grad(), adj_))};
+      case ReduceNodeOpCode::mean:
+        return {NodeOp(Add(_1, 1.0f / (float)reducedDim_, child(0)->grad(), adj_))};
+      case ReduceNodeOpCode::rms: // WARNING: UNTESTED!!
+        // y = (1/n sum_j x_j^2)^0.5
+        // dJ/dx_i = dJ/dy * dy/dx_i = dJ/dy * x_i / (n y), hence the 1/n scale below
+        // @TODO: do we need protection against div by 0? L'Hospital's rule?
+        return {NodeOp(Add(_1 * _2 / _3, 1.0f / (float)reducedDim_, child(0)->grad(), adj_, child(0)->val(), val_))};
+      case ReduceNodeOpCode::meanSqr: // WARNING: UNTESTED!!
+        // y = 1/n sum_j x_j^2
+        // dJ/dx_i = dJ/dy * 2 x_i / n, hence the 1/n scale below
+        return {NodeOp(Add(_1 * 2.0f * _2, 1.0f / (float)reducedDim_, child(0)->grad(), adj_, child(0)->val()))};
+      case ReduceNodeOpCode::min: // WARNING: UNTESTED!!
+      case ReduceNodeOpCode::max: // WARNING: UNTESTED!!
+        // adj_ gets routed into the min/max value --@REVIEW: is this correct?
+        return {NodeOp(Add((_1 == _2) * _3, child(0)->grad(), child(0)->val(), val_, adj_))};
+      case ReduceNodeOpCode::logSumExp:
+        // y = log(sum_j exp(x_j))
+        // dJ/dx_i = dJ/dy * exp(x_i) / sum_j exp(x_j) = dJ/dy * exp(x_i - y)
+        return {NodeOp(Add(_1 * exp(_2 - _3), child(0)->grad(), adj_, child(0)->val(), val_))};
+      default:
+        ABORT("Unexpected reduction op-code {}", (int)opCode_);
+    }
   }
 
   Shape newShape(Expr a, int axis) {
@@ -436,66 +491,27 @@ struct SumNodeOp : public UnaryNodeOp {
     return shape;
   }
 
-  const std::string type() override { return "sum"; }
-
-  const std::string color() override { return "orange"; }
-
-  virtual size_t hash() override {
-    if(!hash_) {
-      hash_ = NaryNodeOp::hash();
-      util::hash_combine(hash_, axis_);
+  const std::string type() override {
+    switch (opCode_) {
+      case ReduceNodeOpCode::sum:       return "sum";
+      case ReduceNodeOpCode::mean:      return "mean";
+      case ReduceNodeOpCode::rms:       return "rms";
+      case ReduceNodeOpCode::meanSqr:   return "meanSqr";
+      case ReduceNodeOpCode::min:       return "min";
+      case ReduceNodeOpCode::max:       return "max";
+      case ReduceNodeOpCode::prod:      return "prod";
+      case ReduceNodeOpCode::logSumExp: return "logSumExp";
+      default: ABORT("Unexpected reduction op-code {}", (int)opCode_);
     }
-    return hash_;
   }
 
-  virtual bool equal(Expr node) override {
-    if(!NaryNodeOp::equal(node))
-      return false;
-    Ptr<SumNodeOp> cnode = std::dynamic_pointer_cast<SumNodeOp>(node);
-    if(!cnode)
-      return false;
-    if(axis_ != cnode->axis_)
-      return false;
-    return true;
-  }
-};
-
-struct MeanNodeOp : public UnaryNodeOp {
-  int axis_;
-
-  MeanNodeOp(Expr a, int axis) : UnaryNodeOp(a, newShape(a, axis)) {}
-
-  NodeOps forwardOps() override {
-    using namespace functional;
-    int left = child(0)->shape().elements() / val_->shape().elements();
-    float scale = 1.f / left;
-
-    return {NodeOp(Reduce(_1, scale, val_, child(0)->val()))};
-  }
-
-  NodeOps backwardOps() override {
-    using namespace functional;
-    int left = child(0)->shape().elements() / val_->shape().elements();
-    float scale = 1.f / left;
-
-    return {NodeOp(Add(_1, scale, child(0)->grad(), adj_))};
-  }
-
-  Shape newShape(Expr a, int axis) {
-    Shape shape = a->shape();
-    axis_ = shape.axis(axis);
-    shape.set(axis_, 1);
-    return shape;
-  }
-
-  const std::string type() override { return "mean"; }
-
   const std::string color() override { return "orange"; }
 
   virtual size_t hash() override {
     if(!hash_) {
       hash_ = NaryNodeOp::hash();
       util::hash_combine(hash_, axis_);
+      util::hash_combine(hash_, (int)opCode_);
     }
     return hash_;
   }
@@ -503,10 +519,10 @@ struct MeanNodeOp : public UnaryNodeOp {
   virtual bool equal(Expr node) override {
     if(!NaryNodeOp::equal(node))
       return false;
-    Ptr<MeanNodeOp> cnode = std::dynamic_pointer_cast<MeanNodeOp>(node);
+    Ptr<ReduceNodeOp> cnode = std::dynamic_pointer_cast<ReduceNodeOp>(node);
     if(!cnode)
      return false;
-    if(axis_ != cnode->axis_)
+    if(axis_ != cnode->axis_ || opCode_ != cnode->opCode_)
      return false;
     return true;
   }
diff --git a/src/layers/constructors.h b/src/layers/constructors.h
index d0ac3487..5ed7f3f5 100755
--- a/src/layers/constructors.h
+++ b/src/layers/constructors.h
@@ -147,4 +147,10 @@ public:
 // @TODO: change naming convention.
typedef Accumulator<MLPFactory> mlp; } // namespace mlp + +typedef ConstructingFactory<Embedding> EmbeddingFactory; +typedef ConstructingFactory<ULREmbedding> ULREmbeddingFactory; + +typedef Accumulator<EmbeddingFactory> embedding; +typedef Accumulator<ULREmbeddingFactory> ulr_embedding; } // namespace marian diff --git a/src/layers/generic.cpp b/src/layers/generic.cpp new file mode 100755 index 00000000..2941b689 --- /dev/null +++ b/src/layers/generic.cpp @@ -0,0 +1,339 @@ +#include "marian.h" + +#include "layers/generic.h" + +using std::size_t; // not sure why this is needed + +namespace marian { + struct CSRSparseTensor { // simplistic for now + Shape shape; + Expr values; // [k_i..k_{i+1}-1] -> value at [i,j] + Expr indices; // [k_i..k_{i+1}-1] -> j of non-null value + Expr offsets; // [i] -> k_i + }; +
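(Aside: the offsets/indices/values triple above is standard CSR — row i owns the entries k in [offsets[i], offsets[i+1]), with values[k] the non-zero value and indices[k] its column. A small stand-alone illustration over plain vectors; the helper name is hypothetical, not part of this commit:)

#include <cstddef>
#include <cstdio>
#include <vector>

// Prints the non-zeros of row i of a CSR matrix laid out exactly like
// CSRSparseTensor above: values[k] and indices[k] for
// k in [offsets[i], offsets[i+1]) are row i's entries and column ids.
void printCsrRow(std::size_t i,
                 const std::vector<float>& values,
                 const std::vector<unsigned>& indices,
                 const std::vector<unsigned>& offsets) {
  for (std::size_t k = offsets[i]; k < offsets[i + 1]; ++k)
    std::printf("M[%zu,%u] = %g\n", i, indices[k], values[k]);
}

(The EmbeddingFactorMapping class that follows builds exactly this layout: one row per word, one non-zero per factor.)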
+ class EmbeddingFactorMapping { + public: + struct CSRData { + Shape shape; + std::vector<float> weights; + std::vector<IndexType> indices; + std::vector<IndexType> offsets; + }; + // mapPath = path to file with entries in order of vocab entries of the form + // WORD FACTOR1 FACTOR2 FACTOR3... + // listPath = path to file that lists all FACTOR names + // vocab = original vocabulary + // Note: The WORD field in the map file is redundant. It is required for consistency checking only. + // Factors are grouped + // - user specifies list-factor prefixes; all factors beginning with that prefix are in the same group + // - factors within a group as multi-class and normalized that way + // - groups of size 1 are interpreted as sigmoids, multiply with P(u) / P(u-1) + // - one prefix must not contain another + // - all factors not matching a prefix get lumped into yet another class (the lemmas) + // - factor vocab must be sorted such that all groups are consecutive + // - result of Output layer is nevertheless logits, not a normalized probability, due to the sigmoid entries + EmbeddingFactorMapping(Ptr<Options> options) : factorVocab_(New<Options>(), 0) { + std::vector<std::string> paths = options->get<std::vector<std::string>>("embedding-factors"); + ABORT_IF(paths.size() != 2, "--embedding-factors expects two paths"); + auto mapPath = paths[0]; + auto factorVocabPath = paths[1]; + auto vocabPath = options->get<std::string>("vocab"); + + // Note: We misuse the Vocab class a little. + // Specifically, it means that the factorVocab_ must contain </s> and "<unk>". + Vocab vocab(New<Options>(), 0); + vocab.load(vocabPath); + factorVocab_.load(factorVocabPath); + Word numFactors = (Word)factorVocab_.size(); + + // load and parse factorMap + factorMap_.resize(vocab.size()); + factorRefCounts_.resize(numFactors); + std::vector<std::string> tokens; + io::InputFileStream in(mapPath); + std::string line; + size_t numTotalFactors = 0; + for (Word v = 0; io::getline(in, line); v++) { + tokens.clear(); // @BUGBUG: should be done in split() + utils::splitAny(line, tokens, " \t"); + ABORT_IF(tokens.size() < 2 || tokens.front() != vocab[v], "Factor map must list words in same order as vocab, and have at least one factor per word", mapPath); + for (size_t i = 1; i < tokens.size(); i++) { + auto u = factorVocab_[tokens[i]]; + auto& m = factorMap_[v]; + m.push_back(u); + factorRefCounts_[u]++; + } + numTotalFactors += tokens.size() - 1; + } + LOG(info, "[embedding] Factored-embedding map read with total/unique of {}/{} factors for {} words", numTotalFactors, numFactors, vocab.size()); + + // form groups + // @TODO: hard-coded for these initial experiments + std::vector<std::string> groupPrefixes = { + "@C", + "@GL", "@GR" + }; + groupPrefixes.insert(groupPrefixes.begin(), "(unassigned)"); // first group is fallback for normal words (the string is only used for messages) + size_t numGroups = groupPrefixes.size(); + factorGroups_.resize(numFactors, 0); + for (size_t g = 1; g < groupPrefixes.size(); g++) { // set group labels; what does not match any prefix will stay in group 0 + const auto& groupPrefix = groupPrefixes[g]; + for (Word u = 0; u < numFactors; u++) + if (utils::beginsWith(factorVocab_[u], groupPrefix)) { + ABORT_IF(factorGroups_[u] != 0, "Factor {} matches multiple groups, incl. 
{}", factorVocab_[u], groupPrefix); + factorGroups_[u] = g; + } + } + groupRanges_.resize(numGroups, { SIZE_MAX, (size_t)0 }); + std::vector<size_t> groupCounts(numGroups); // number of group members + for (Word u = 0; u < numFactors; u++) { // determine ranges; these must be non-overlapping, verified via groupCounts + auto g = factorGroups_[u]; + if (groupRanges_[g].first > u) + groupRanges_[g].first = u; + if (groupRanges_[g].second < u + 1) + groupRanges_[g].second = u + 1; + groupCounts[g]++; + } + // determine if a factor needs explicit softmax normalization + groupNeedsNormalization_.resize(numGroups, false); + for (size_t g = 0; g < numGroups; g++) { // detect non-overlapping groups + LOG(info, "[embedding] Factor group '{}' has {} members ({})", + groupPrefixes[g], groupCounts[g], groupCounts[g] == 1 ? "sigmoid" : "softmax"); + // any factor that is not referenced in all words and is not a sigmoid needs normalization + if (g == 0) // @TODO: For now we assume that the main factor is used in all words. Test this. + continue; + if (groupCounts[g] == 1) // sigmoid factors have no normalizer + continue; + groupNeedsNormalization_[g] = true; // needed + ABORT_IF(groupRanges_[g].second - groupRanges_[g].first != groupCounts[g], + "Factor group '{}' members should be consecutive in the factor vocabulary", groupPrefixes[g]); + LOG(info, "[embedding] Factor group '{}' needs needs explicit normalization ({}..{})", groupPrefixes[g], groupRanges_[g].first, groupRanges_[g].second-1); + } + + // create the factor matrix + std::vector<IndexType> data(vocab.size()); + std::iota(data.begin(), data.end(), 0); + factorMatrix_ = csr_rows(data); // [V x U] + } + + size_t factorVocabSize() const { return factorVocab_.size(); } + + // create a CSR matrix M[V,U] from indices[] with + // M[v,u] = 1/c(u) if factor u is a factor of word v, and c(u) is how often u is referenced + CSRData csr_rows(const std::vector<IndexType>& words) const { + std::vector<float> weights; + std::vector<IndexType> indices; + std::vector<IndexType> offsets; + offsets.reserve(words.size() + 1); + indices.reserve(words.size()); // (at least this many) + // loop over all input words, and select the corresponding set of unit indices into CSR format + offsets.push_back((IndexType)indices.size()); + for (auto v : words) { + const auto& m = factorMap_[v]; + for (auto u : m) { + indices.push_back(u); + weights.push_back(1.0f/*/(float)factorRefCounts_[u]*/); + } + offsets.push_back((IndexType)indices.size()); // next matrix row begins at this offset + } + return { Shape({(int)words.size(), (int)factorVocab_.size()}), weights, indices, offsets }; + } + + const CSRData& getFactorMatrix() const { return factorMatrix_; } // [v,u] (sparse) -> =1 if u is factor of v + private: + Vocab factorVocab_; // [factor name] -> factor index = row of E_ + std::vector<std::vector<Word>> factorMap_; // [word index] -> set of factor indices + std::vector<int> factorRefCounts_; // [factor index] -> how often this factor is referenced in factorMap_ + CSRData factorMatrix_; // [v,u] (sparse) -> =1 if u is factor of v + std::vector<size_t> factorGroups_; // [u] -> group id of factor u + public: // @TODO: temporarily; later factor this properly + std::vector<std::pair<size_t, size_t>> groupRanges_; // [group id] -> (u_begin,u_end) index range of factors u for this group. These don't overlap. 
+ std::vector<bool> groupNeedsNormalization_; // [group id] -> true if explicit softmax normalization is necessary + }; + + namespace mlp { + /*private*/ void Output::lazyConstruct(int inputDim) { + // We must construct lazily since we won't know tying nor input dim in constructor. + if (W_) + return; + + auto name = options_->get<std::string>("prefix"); + auto dim = options_->get<int>("dim"); + + if (options_->has("embedding-factors")) { + ABORT_IF(shortlist_, "Shortlists are presently not compatible with factored embeddings"); + embeddingFactorMapping_ = New<EmbeddingFactorMapping>(options_); + dim = (int)embeddingFactorMapping_->factorVocabSize(); + LOG(info, "[embedding] Factored outputs enabled"); + } + + if(tiedParam_) { + W_ = tiedParam_; + transposeW_ = true; + } else { + W_ = graph_->param(name + "_W", {inputDim, dim}, inits::glorot_uniform); + transposeW_ = false; + } + + b_ = graph_->param(name + "_b", {1, dim}, inits::zeros); + } + + Expr Output::apply(Expr input) /*override*/ { + lazyConstruct(input->shape()[-1]); + + if (shortlist_) { + if (!cachedShortW_) { // short versions of parameters are cached within one batch, then clear()ed + if(transposeW_) + cachedShortW_ = rows(W_, shortlist_->indices()); + else + cachedShortW_ = cols(W_, shortlist_->indices()); + cachedShortb_ = cols(b_, shortlist_->indices()); + } + return affine(input, cachedShortW_, cachedShortb_, false, transposeW_); + } + else if (embeddingFactorMapping_) { + auto graph = input->graph(); + auto y = affine(input, W_, b_, false, transposeW_); // [B... x U] factor logits + + // denominators (only for groups that don't normalize out naturally by the final softmax()) + const auto& groupRanges = embeddingFactorMapping_->groupRanges_; // @TODO: factor this properly + auto numGroups = groupRanges.size(); + for (size_t g = 0; g < numGroups; g++) { + if (!embeddingFactorMapping_->groupNeedsNormalization_[g]) // @TODO: if we ever need it, we can combine multiple + continue; + auto range = groupRanges[g]; + // y: [B... x U] + // m: [1 x U] // ones at positions of group members + auto yDim = y->shape()[-1]; + std::vector<float> mVec(yDim, 0.0f); // @TODO: This vector should be produced by embeddingFactorMapping_ + for (size_t i = range.first; i < range.second; i++) + mVec[i] = 1.0f; + // need to compute log denominator over y[range] and subtract it from y[range] + auto groupY = slice(y, Slice((int)range.first, (int)range.second), /*axis=*/-1); // [B... x Ug] + auto groupZ = logsumexp(groupY, /*axis=*/-1); // [B... x 1] + auto m = graph->constant({ 1, (int)mVec.size() }, inits::from_vector(mVec)); // [1 x U] + auto Z = dot(groupZ, m); // [B... 
x U] + y = y - Z; +#if 0 + // and a log-linear weight + auto name = options_->get<std::string>("prefix"); + auto llWeight = graph->param(name + "_llWeight_" + std::to_string(g), {}, inits::from_value(1.0f)); + y = y * ((llWeight - 1) * m + 1); +#endif + } + + // sum up the unit logits across factors for each target word + auto factorMatrix = embeddingFactorMapping_->getFactorMatrix(); // [V x U] + y = dot_csr( + y, // [B x U] + factorMatrix.shape, + graph->constant({(int)factorMatrix.weights.size()}, inits::from_vector(factorMatrix.weights), Type::float32), + graph->constant({(int)factorMatrix.indices.size()}, inits::from_vector(factorMatrix.indices), Type::uint32), + graph->constant({(int)factorMatrix.offsets.size()}, inits::from_vector(factorMatrix.offsets), Type::uint32), + /*transB=*/ true); // -> [B x V] + + return y; + } + else + return affine(input, W_, b_, false, transposeW_); + } + } + + Embedding::Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options) : LayerBase(graph, options) { + std::string name = opt<std::string>("prefix"); + int dimVoc = opt<int>("dimVocab"); + int dimEmb = opt<int>("dimEmb"); + + bool fixed = opt<bool>("fixed", false); + + if (options_->has("embedding-factors")) { + embeddingFactorMapping_ = New<EmbeddingFactorMapping>(options_); + dimVoc = (int)embeddingFactorMapping_->factorVocabSize(); + LOG(info, "[embedding] Factored embeddings enabled"); + } + + NodeInitializer initFunc = inits::glorot_uniform; + if (options_->has("embFile")) { + std::string file = opt<std::string>("embFile"); + if (!file.empty()) { + bool norm = opt<bool>("normalization", false); + initFunc = inits::from_word2vec(file, dimVoc, dimEmb, norm); + } + } + + E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); + } + + // helper to embed a sequence of words (given as indices) via factored embeddings + /*private*/ Expr Embedding::multiRows(const std::vector<IndexType>& data) const + { + auto graph = E_->graph(); + auto factoredData = embeddingFactorMapping_->csr_rows(data); + // multi-hot factor vectors are represented as a sparse CSR matrix + // [row index = word position index] -> set of factor indices for word at this position + ABORT_IF(factoredData.shape != Shape({(int)factoredData.offsets.size()-1/*=rows of CSR*/, E_->shape()[0]}), "shape mismatch??"); + return csr_dot( // the CSR matrix is passed in pieces + factoredData.shape, + graph->constant({(int)factoredData.weights.size()}, inits::from_vector(factoredData.weights), Type::float32), + graph->constant({(int)factoredData.indices.size()}, inits::from_vector(factoredData.indices), Type::uint32), + graph->constant({(int)factoredData.offsets.size()}, inits::from_vector(factoredData.offsets), Type::uint32), + E_); + } + + std::tuple<Expr/*embeddings*/, Expr/*mask*/> Embedding::apply(Ptr<data::SubBatch> subBatch) const /*override final*/ { + auto graph = E_->graph(); + int dimBatch = (int)subBatch->batchSize(); + int dimEmb = E_->shape()[-1]; + int dimWords = (int)subBatch->batchWidth(); + + // factored embeddings: + // - regular: + // - y = x @ E x:[B x 1ofV] ; E:[V x D] ; y:[B x D] + // - factored: + // - u = x @ M one-hot to U-dimensional multi-hot (all factors in one concatenated space) + // - each row of M contains the set of factors for one word => we want a CSR matrix + // - y = (x @ M) @ E (x:[B x 1ofV] ; M:[V x U]) ; E:[U x D] ; y:[B x D] + // - first compute x @ M on the CPU + // - (Uvalues, Uindices, Uoffsets) = csr_rows(Mvalues, Mindices, Moffsets, subBatch->data()): + // - shape (U, specifically) not actually 
needed here + // - foreach input x[i] + // - locate row M[i,*] + // - copy through its index values (std::vector<push_back>) + // - create a matching ones vector (we can keep growing) + // - convert to GPU-side CSR matrix. CSR matrix now has #rows equal to len(x) + // - CSR matrix product with E + // - csr_dot(Uvalues, Uindices, Uoffsets, E_, transposeU) + // - double-check if all dimensions are specified. Probably not for transpose (which would be like csc_dot()). + // - weighting: + // - core factors' gradients are sums over all words that use the factors; + // - core factors' embeddings move very fast + // - words will need to make up for the move; rare words cannot + // - so, we multiply each factor with 1/refCount + // - core factors get weighed down a lot + // - no impact on gradients, as Adam makes up for it; embeddings still move fast just as before + // - but forward pass weighs them down, so that all factors are in a similar numeric range + // - if it is required to be in a different range, the embeddings can still learn that, but more slowly + + Expr chosenEmbeddings; + if (embeddingFactorMapping_) + chosenEmbeddings = multiRows(subBatch->data()); + else + chosenEmbeddings = rows(E_, subBatch->data()); + + auto batchEmbeddings = reshape(chosenEmbeddings, { dimWords, dimBatch, dimEmb }); + auto batchMask = graph->constant({ dimWords, dimBatch, 1 }, + inits::from_vector(subBatch->mask())); + return std::make_tuple(batchEmbeddings, batchMask); + } + + Expr Embedding::apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const /*override final*/ { + int dimEmb = E_->shape()[-1]; + Expr chosenEmbeddings; + if (embeddingFactorMapping_) + chosenEmbeddings = multiRows(embIdx); + else + chosenEmbeddings = rows(E_, embIdx); + return reshape(chosenEmbeddings, { dimBeam, 1, dimBatch, dimEmb }); + } +} // namespace marian diff --git a/src/layers/generic.h b/src/layers/generic.h index a9e2be01..d70c7f06 100755 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -54,6 +54,8 @@ struct IEmbeddingLayer { virtual Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const = 0; }; +class EmbeddingFactorMapping; + namespace mlp { class Dense : public LayerBase, public IUnaryLayer { @@ -124,49 +126,51 @@ public: class Output : public LayerBase, public IUnaryLayer { private: - Expr tiedParam_; - Ptr<data::Shortlist> shortlist_; - - Expr W_; + Expr W_; // parameters held by this layer Expr b_; + Expr cachedShortW_; // short-listed version, cached (cleared by clear()) + Expr cachedShortb_; // these match the current value of shortlist_ + Ptr<EmbeddingFactorMapping > embeddingFactorMapping_; + + // optional parameters set/updated after construction + Expr tiedParam_; bool transposeW_{false}; + Ptr<data::Shortlist> shortlist_; + void lazyConstruct(int inputDim); public: Output(Ptr<ExpressionGraph> graph, Ptr<Options> options) - : LayerBase(graph, options) {} + : LayerBase(graph, options) { + clear(); + } void tieTransposed(Expr tied) { - tiedParam_ = tied; + if (W_) + ABORT_IF(tiedParam_.get() != tied.get(), "Tied output projection cannot be changed once weights have been created"); + else + tiedParam_ = tied; } - void setShortlist(Ptr<data::Shortlist> shortlist) { shortlist_ = shortlist; } - - Expr apply(Expr input) override { - if(!W_) { - auto name = options_->get<std::string>("prefix"); - auto dim = options_->get<int>("dim"); - - if(tiedParam_) { - transposeW_ = true; - W_ = tiedParam_; - if(shortlist_) - W_ = rows(W_, shortlist_->indices()); - } else { - W_ 
= graph_->param(name + "_W", - {input->shape()[-1], dim}, - inits::glorot_uniform); - if(shortlist_) - W_ = cols(W_, shortlist_->indices()); - } - - b_ = graph_->param(name + "_b", {1, dim}, inits::zeros); - if(shortlist_) - b_ = cols(b_, shortlist_->indices()); + void setShortlist(Ptr<data::Shortlist> shortlist) { + if (shortlist_) + ABORT_IF(shortlist.get() != shortlist_.get(), "Output shortlist cannot be changed except after clear()"); + else { + ABORT_IF(cachedShortW_ || cachedShortb_, "No shortlist but cached parameters??"); + shortlist_ = shortlist; } + // cachedShortW_ and cachedShortb_ will be created lazily inside apply() + } - return affine(input, W_, b_, false, transposeW_); + // this is expected to be called in sync with graph->clear(), which invalidates + // cachedShortW_ and cachedShortb_ in the graph's short-term cache + void clear() { + shortlist_ = nullptr; + cachedShortW_ = nullptr; + cachedShortb_ = nullptr; } + Expr apply(Expr input) override; + virtual Expr apply(const std::vector<Expr>& /*inputs*/) override { ABORT("Not implemented"); }; @@ -176,45 +180,15 @@ public: class Embedding : public LayerBase, public IEmbeddingLayer { Expr E_; + Ptr<EmbeddingFactorMapping> embeddingFactorMapping_; + Expr multiRows(const std::vector<IndexType>& data) const; public: - Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options) : LayerBase(graph, options) { - std::string name = opt<std::string>("prefix"); - int dimVoc = opt<int>("dimVocab"); - int dimEmb = opt<int>("dimEmb"); - - bool fixed = opt<bool>("fixed", false); - - NodeInitializer initFunc = inits::glorot_uniform; - if (options_->has("embFile")) { - std::string file = opt<std::string>("embFile"); - if (!file.empty()) { - bool norm = opt<bool>("normalization", false); - initFunc = inits::from_word2vec(file, dimVoc, dimEmb, norm); - } - } + Embedding(Ptr<ExpressionGraph> graph, Ptr<Options> options); - E_ = graph_->param(name, {dimVoc, dimEmb}, initFunc, fixed); - } - - std::tuple<Expr/*embeddings*/, Expr/*mask*/> apply(Ptr<data::SubBatch> subBatch) const override final { - auto graph = E_->graph(); - int dimBatch = (int)subBatch->batchSize(); - int dimEmb = E_->shape()[-1]; - int dimWords = (int)subBatch->batchWidth(); - // @TODO: merge this with below. 
Currently can't, due only to the extra beam dimension
-    auto chosenEmbeddings = rows(E_, subBatch->data());
-    auto batchEmbeddings = reshape(chosenEmbeddings, { dimWords, dimBatch, dimEmb });
-    auto batchMask = graph->constant({ dimWords, dimBatch, 1 },
-                                     inits::from_vector(subBatch->mask()));
-    return std::make_tuple(batchEmbeddings, batchMask);
-  }
+  std::tuple<Expr/*embeddings*/, Expr/*mask*/> apply(Ptr<data::SubBatch> subBatch) const override final;
 
   // special version used in decoding
-  Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const override final {
-    int dimEmb = E_->shape()[-1];
-    auto selectedEmbs = rows(E_, embIdx);
-    return reshape(selectedEmbs, { dimBeam, 1, dimBatch, dimEmb });
-  }
+  Expr apply(const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) const override final;
 };
 
 class ULREmbedding : public LayerBase, public IEmbeddingLayer {
@@ -322,10 +296,4 @@ public:
     ABORT("not implemented"); // ULR cannot be used for decoding
   }
 };
-
-typedef ConstructingFactory<Embedding> EmbeddingFactory;
-typedef ConstructingFactory<ULREmbedding> ULREmbeddingFactory;
-
-typedef Accumulator<EmbeddingFactory> embedding;
-typedef Accumulator<ULREmbeddingFactory> ulr_embedding;
 } // namespace marian
diff --git a/src/layers/loss.cpp b/src/layers/loss.cpp
index 03b79682..d11e4384 100755
--- a/src/layers/loss.cpp
+++ b/src/layers/loss.cpp
@@ -26,13 +26,26 @@ Expr LossBase::getCrossEntropy(Expr logits,
                                Expr indices,
                                Expr mask,
                                Expr weights) {
-  auto ce = cross_entropy(logits, indices);
+  Expr ce;
   if(smoothing_ > 0) {
     // @TODO: add this to CE kernels instead
+#if 0
+    ce = cross_entropy(logits, indices);
     auto ceq = mean(logsoftmax(logits), /*axis=*/ -1);
     ce = (1 - smoothing_) * ce - smoothing_ * ceq;
+#else // alternative that is cheaper memory-wise
+    ce = cross_entropy(logits, indices);
+    auto ceq = mean(logits, /*axis=*/ -1) - logsumexp(logits, /*axis=*/ -1);
+    ce = (1 - smoothing_) * ce - smoothing_ * ceq;
+    //auto ceq = mean(logits, /*axis=*/ -1) - Z;
+    //ce = (1 - smoothing_) * cols(logits, indices) // ce term
+    //   - smoothing_ * mean(logits, /*axis=*/ -1)  // smoothing term
+    //   - logsumexp(logits, /*axis=*/ -1);         // denominator
+#endif
   }
+  else
+    ce = cross_entropy(logits, indices);
 
   if(mask)
     ce = ce * mask;
diff --git a/src/models/decoder.h b/src/models/decoder.h
index 99fbd4ad..9b7aad9c 100755
--- a/src/models/decoder.h
+++ b/src/models/decoder.h
@@ -4,6 +4,7 @@
 #include "states.h"
 
 #include "data/shortlist.h"
+#include "layers/constructors.h"
 #include "layers/generic.h"
 
 namespace marian {
@@ -14,6 +15,7 @@ protected:
   std::string prefix_{"decoder"};
   bool inference_{false};
   size_t batchIndex_{1};
+  std::vector<Ptr<IEmbeddingLayer>> embedding_; // @TODO: find a more grammatical name
 
   Ptr<data::Shortlist> shortlist_;
 
@@ -31,37 +33,41 @@ public:
 
   virtual Ptr<DecoderState> step(Ptr<ExpressionGraph>, Ptr<DecoderState>) = 0;
 
+  void lazyCreateEmbedding(Ptr<ExpressionGraph> graph) {
+    // @TODO: code dup with EncoderTransformer
+    if (embedding_.size() <= batchIndex_ || !embedding_[batchIndex_]) { // lazy
+      if (embedding_.size() <= batchIndex_)
+        embedding_.resize(batchIndex_ + 1);
+      int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_];
+      int dimEmb = opt<int>("dim-emb");
+      auto embFactory = embedding()("dimVocab", dimVoc)("dimEmb", dimEmb);
+      if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all"))
+        embFactory("prefix", "Wemb");
+      else
+        embFactory("prefix", prefix_ + "_Wemb");
+      if(options_->has("embedding-fix-trg"))
+        embFactory("fixed",
opt<bool>("embedding-fix-trg")); + if(options_->has("embedding-vectors")) { + auto embFiles = opt<std::vector<std::string>>("embedding-vectors"); + embFactory("embFile", embFiles[batchIndex_]) // + ("normalization", opt<bool>("embedding-normalization")); + } + if (options_->has("embedding-factors")) { + embFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors")); + embFactory("vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_]); + } + embedding_[batchIndex_] = embFactory.construct(graph); + } + } + virtual void embeddingsFromBatch(Ptr<ExpressionGraph> graph, Ptr<DecoderState> state, Ptr<data::CorpusBatch> batch) { - - int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; - int dimEmb = opt<int>("dim-emb"); - - auto yEmbFactory = embedding() // - ("dimVocab", dimVoc) // - ("dimEmb", dimEmb); - - if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all")) - yEmbFactory("prefix", "Wemb"); - else - yEmbFactory("prefix", prefix_ + "_Wemb"); - - if(options_->has("embedding-fix-trg")) - yEmbFactory("fixed", opt<bool>("embedding-fix-trg")); - - if(options_->has("embedding-vectors")) { - auto embFiles = opt<std::vector<std::string>>("embedding-vectors"); - yEmbFactory("embFile", embFiles[batchIndex_]) // - ("normalization", opt<bool>("embedding-normalization")); - } - - auto yEmb = yEmbFactory.construct(graph); - auto subBatch = (*batch)[batchIndex_]; + lazyCreateEmbedding(graph); Expr y, yMask; std::tie - (y, yMask) = yEmb->apply(subBatch); + (y, yMask) = embedding_[batchIndex_]->apply(subBatch); Expr yData; if(shortlist_) { @@ -82,26 +88,13 @@ public: const std::vector<IndexType>& embIdx, int dimBatch, int dimBeam) { - int dimTrgEmb = opt<int>("dim-emb"); - int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; - Expr selectedEmbs; if(embIdx.empty()) { - selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, inits::zeros); + int dimEmb = opt<int>("dim-emb"); + selectedEmbs = graph->constant({1, 1, dimBatch, dimEmb}, inits::zeros); } else { - // embeddings are loaded from model during translation, no fixing required - auto yEmbFactory = embedding() // - ("dimVocab", dimTrgVoc) // - ("dimEmb", dimTrgEmb); - - if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all")) - yEmbFactory("prefix", "Wemb"); - else - yEmbFactory("prefix", prefix_ + "_Wemb"); - - auto yEmb = yEmbFactory.construct(graph); - - selectedEmbs = yEmb->apply(embIdx, dimBatch, dimBeam); + lazyCreateEmbedding(graph); + selectedEmbs = embedding_[batchIndex_]->apply(embIdx, dimBatch, dimBeam); } state->setTargetEmbeddings(selectedEmbs); } diff --git a/src/models/s2s.h b/src/models/s2s.h index edda79bc..2f4a4579 100755 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -124,6 +124,7 @@ public: int dimVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; int dimEmb = opt<int>("dim-emb"); + // @TODO: code dup with Decider and EncoderTransformer; actually diverged by now. Unify this. 
auto embFactory = embedding() // ("dimVocab", dimVoc) // ("dimEmb", dimEmb); diff --git a/src/models/transformer.h b/src/models/transformer.h index 968d481b..f6aa4ff1 100755 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -6,7 +6,6 @@ #include "marian.h" #include "layers/constructors.h" -#include "layers/factory.h" #include "models/decoder.h" #include "models/encoder.h" #include "models/states.h" @@ -495,7 +494,7 @@ public: return embFactory.construct(graph_); } - Ptr<IEmbeddingLayer> createWordEmbeddingLayer(size_t subBatchIndex) const { + Ptr<IEmbeddingLayer> createSourceEmbeddingLayer(size_t subBatchIndex) const { // standard encoder word embeddings int dimVoc = opt<std::vector<int>>("dim-vocabs")[subBatchIndex]; int dimEmb = opt<int>("dim-emb"); @@ -511,6 +510,10 @@ public: embFactory("embFile", embFiles[subBatchIndex]) ("normalization", opt<bool>("embedding-normalization")); } + if (options_->has("embedding-factors")) { + embFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors")); + embFactory("vocab", opt<std::vector<std::string>>("vocabs")[subBatchIndex]); + } return embFactory.construct(graph_); } @@ -520,6 +523,7 @@ public: return apply(batch); } + std::vector<Ptr<IEmbeddingLayer>> embedding_; // @TODO: move away, also rename Ptr<EncoderState> apply(Ptr<data::CorpusBatch> batch) { int dimEmb = opt<int>("dim-emb"); int dimBatch = (int)batch->size(); @@ -527,12 +531,15 @@ public: // create the embedding matrix, considering tying and some other options // embed the source words in the batch Expr batchEmbeddings, batchMask; - Ptr<IEmbeddingLayer> embedding; - if (options_->has("ulr") && options_->get<bool>("ulr") == true) - embedding = createULREmbeddingLayer(); // embedding uses ULR - else - embedding = createWordEmbeddingLayer(batchIndex_); - std::tie(batchEmbeddings, batchMask) = embedding->apply((*batch)[batchIndex_]); + + if (embedding_.empty() || !embedding_[batchIndex_]) { // lazy + embedding_.resize(batch->sets()); + if (options_->has("ulr") && options_->get<bool>("ulr") == true) + embedding_[batchIndex_] = createULREmbeddingLayer(); // embedding uses ULR + else + embedding_[batchIndex_] = createSourceEmbeddingLayer(batchIndex_); + } + std::tie(batchEmbeddings, batchMask) = embedding_[batchIndex_]->apply((*batch)[batchIndex_]); // apply dropout over source words float dropoutSrc = inference_ ? 
0 : opt<float>("dropout-src"); if(dropoutSrc) { @@ -601,17 +608,17 @@ public: class DecoderTransformer : public Transformer<DecoderBase> { private: - Ptr<mlp::MLP> output_; + Ptr<mlp::Output> output_; private: - void LazyCreateOutputLayer() + void lazyCreateOutputLayer() { if(output_) // create it lazily return; int dimTrgVoc = opt<std::vector<int>>("dim-vocabs")[batchIndex_]; - auto layerOut = mlp::output() // + auto outputFactory = mlp::output() // ("prefix", prefix_ + "_ff_logit_out") // ("dim", dimTrgVoc); @@ -619,18 +626,22 @@ private: std::string tiedPrefix = prefix_ + "_Wemb"; if(opt<bool>("tied-embeddings-all") || opt<bool>("tied-embeddings-src")) tiedPrefix = "Wemb"; - layerOut.tieTransposed(tiedPrefix); + outputFactory.tieTransposed(tiedPrefix); } - if(shortlist_) - layerOut.setShortlist(shortlist_); - - // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim] - // assemble layers into MLP and apply to embeddings, decoder context and - // aligned source context - output_ = mlp::mlp() // - .push_back(layerOut) // - .construct(graph_); + if (options_->has("embedding-factors")) { + // factored embeddings, simplistic version (which just adds the logits, like multiplying probs) + // z = h @ W // h:[B x D] ; W:[D x V] -> [B x V] + // with factors: + // z = h @ W @ M' // h:[B x D] ; W:[D x U] ; M':[U x V] -> [B x V] + // i.e. multiOutput(): + // output = dot_csr(output, M, transB=true) + // @BUGBUG: need to specify output factors separately if not tied-embeddings or tied-embeddings-all + outputFactory("embedding-factors", opt<std::vector<std::string>>("embedding-factors")); + outputFactory("vocab", opt<std::vector<std::string>>("vocabs")[batchIndex_]); + } + + output_ = std::dynamic_pointer_cast<mlp::Output>(outputFactory.construct(graph_)); // (construct() returns only the underlying interface) } public: @@ -662,7 +673,7 @@ public: virtual Ptr<DecoderState> step(Ptr<ExpressionGraph> graph, Ptr<DecoderState> state) override { ABORT_IF(graph != graph_, "An inconsistent graph parameter was passed to step()"); - LazyCreateOutputLayer(); + lazyCreateOutputLayer(); return step(state); } @@ -818,7 +829,9 @@ public: //************************************************************************// // final feed-forward layer (output) - Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab dim] + if(shortlist_) + output_->setShortlist(shortlist_); + Expr logits = output_->apply(decoderContext); // [-4: beam depth=1, -3: max length, -2: batch size, -1: vocab or shortlist dim] // return unormalized(!) probabilities Ptr<DecoderState> nextState; @@ -840,7 +853,8 @@ public: } void clear() override { - output_ = nullptr; + if (output_) + output_->clear(); cache_.clear(); alignments_.clear(); } diff --git a/src/models/transformer_factory.h b/src/models/transformer_factory.h index aa31e4d1..825c32b9 100755 --- a/src/models/transformer_factory.h +++ b/src/models/transformer_factory.h @@ -1,3 +1,4 @@ +// @TODO: rename to transformer.h eventually. This is not a Factory as in factory.h. 
#pragma once #include "marian.h" diff --git a/src/tensors/cpu/add.h b/src/tensors/cpu/add.h index 38a0684d..4bae5bb5 100755 --- a/src/tensors/cpu/add.h +++ b/src/tensors/cpu/add.h @@ -15,8 +15,8 @@ namespace marian { namespace cpu { -template <size_t K, class Functor> -void gAddGeneric(Functor functor, +template <size_t K, class Functor, class AggFunctor> +void gAggregateGeneric(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -34,16 +34,16 @@ void gAddGeneric(Functor functor, functional::Array<int, N> dims; for(int index = 0; index < outLength; ++index) { if(same) { - out[index] += functional::apply(functor, ins, index) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * scale); } else { out.shape().dims(index, dims); - out[index] += functional::loops(functor, ins, len, dims) * scale; + out[index] = aggFunctor(out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale); } } } -template <size_t K, class Functor> -void gAddEqual(Functor functor, +template <size_t K, class Functor, class AggFunctor> +void gAggregateEqual(Functor functor, AggFunctor aggFunctor, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale, @@ -61,12 +61,12 @@ void gAddEqual(Functor functor, indices[i] = ins[i].shape().bindex(dims); } - out[index] += functional::apply(functor, ins, indices) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * scale); } } -template <size_t K, class Functor> -void gAddReduce(Functor functor, +template <size_t K, class Functor, class AggFunctor> +void gAggregateReduce(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -79,10 +79,10 @@ void gAddReduce(Functor functor, same = same && ins[i].shape().elements() == full.elements(); for(int j = 0; j < rows; ++j) { - float sum = 0; + float colSum = aggInit; if(same) { for(int id = 0; id < cols; ++id) - sum += functional::apply(functor, ins, j * cols + id); + colSum = aggFunctor(colSum, functional::apply(functor, ins, j * cols + id)); } else { functional::Array<int, functional::Shape::size()> dims; for(int id = 0; id < cols; ++id) { @@ -90,15 +90,15 @@ void gAddReduce(Functor functor, functional::Array<int, K> indices; for(size_t i = 0; i < K; ++i) indices[i] = ins[i].shape().bindex(dims); - sum += functional::apply(functor, ins, indices); + colSum = aggFunctor(colSum, functional::apply(functor, ins, indices)); } } - out[j] += sum * scale; + out[j] = aggFunctor(out[j], colSum * scale); } } -template <class Functor, class... Tensors> -void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { +template <class Functor, class AggFunctor, class... Tensors> +void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) { auto full = marian::Shape::broadcast({out, tensors...}); //int length = out->shape().elements(); @@ -111,15 +111,16 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... 
tensors) { if(full.back() != 1 && out->shape().back() == 1) { //size_t m = full.elements() / length; //size_t k = full.back(); - cpu::gAddReduce(functor, full, gOut, gIns, scale); + cpu::gAggregateReduce(functor, aggInit, aggFunctor, full, gOut, gIns, scale); } else if(out->shape() == full) { bool broadcast = false; for(size_t i = 0; i < K; ++i) broadcast = broadcast || gOut.shape() != gIns[i].shape(); - cpu::gAddEqual(functor, gOut, gIns, scale, broadcast); + cpu::gAggregateEqual(functor, aggFunctor, gOut, gIns, scale, broadcast); } else { - cpu::gAddGeneric(functor, full, gOut, gIns, scale); + cpu::gAggregateGeneric(functor, aggInit, aggFunctor, full, gOut, gIns, scale); } } + } // namespace cpu } // namespace marian diff --git a/src/tensors/gpu/add.cu b/src/tensors/gpu/add.cu index 2431948e..32c12783 100755 --- a/src/tensors/gpu/add.cu +++ b/src/tensors/gpu/add.cu @@ -16,8 +16,8 @@ namespace marian { namespace gpu { -template <size_t K, class Functor> -__global__ void gAddGeneric(Functor functor, +template <size_t K, class Functor, class AggFunctor> +__global__ void gAggregateGeneric(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -37,17 +37,17 @@ __global__ void gAddGeneric(Functor functor, int index = bid + blockDim.x * blockIdx.x + threadIdx.x; if(index < outLength) { if(same) { - out[index] += functional::apply(functor, ins, index) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, index) * scale); } else { out.shape().dims(index, dims); - out[index] += functional::loops(functor, ins, len, dims) * scale; + out[index] = aggFunctor(out[index], functional::loops(functor, aggInit, aggFunctor, ins, len, dims) * scale); } } } } -template <size_t K, class Functor> -__global__ void gAddEqual(Functor functor, +template <size_t K, class Functor, class AggFunctor> +__global__ void gAggregateEqual(Functor functor, AggFunctor aggFunctor, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, float scale, @@ -67,13 +67,13 @@ __global__ void gAddEqual(Functor functor, indices[i] = ins[i].shape().bindex(dims); } - out[index] += functional::apply(functor, ins, indices) * scale; + out[index] = aggFunctor(out[index], functional::apply(functor, ins, indices) * scale); } } } -template <size_t K, class Functor> -__global__ void gAddReduce(Functor functor, +template <size_t K, class Functor, class AggFunctor> +__global__ void gAggregateReduce(Functor functor, float aggInit, AggFunctor aggFunctor, const functional::Shape full, functional::Tensor<float> out, functional::Array<functional::Tensor<float>, K> ins, @@ -92,15 +92,15 @@ __global__ void gAddReduce(Functor functor, float* _sum = _share + blockDim.x; if(same) { - _sum[threadIdx.x] = 0; + _sum[threadIdx.x] = aggInit; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; if(id < cols) - _sum[threadIdx.x] += functional::apply(functor, ins, j * cols + id); + _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::apply(functor, ins, j * cols + id)); } } else { functional::Array<int, functional::Shape::size()> dims; - _sum[threadIdx.x] = 0; + _sum[threadIdx.x] = aggInit; for(int tid = 0; tid < cols; tid += blockDim.x) { int id = tid + threadIdx.x; @@ -109,7 +109,7 @@ __global__ void gAddReduce(Functor functor, functional::Array<int, K> indices; for(int i = 0; i < K; ++i) indices[i] = ins[i].shape().bindex(dims); - _sum[threadIdx.x] += 
functional::apply(functor, ins, indices); + _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], functional::apply(functor, ins, indices)); } } } @@ -119,16 +119,58 @@ __global__ void gAddReduce(Functor functor, __syncthreads(); int skip = (len + 1) >> 1; if(threadIdx.x < (len >> 1)) { - _sum[threadIdx.x] += _sum[threadIdx.x + skip]; + _sum[threadIdx.x] = aggFunctor(_sum[threadIdx.x], _sum[threadIdx.x + skip]); } len = (len + 1) >> 1; } __syncthreads(); - out[j] += _sum[0] * scale; + out[j] = aggFunctor(out[j], _sum[0] * scale); } } } +template <class Functor, class AggFunctor, class... Tensors> +void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors) { + cudaSetDevice(out->getDeviceId().no); + + auto full = marian::Shape::broadcast({out, tensors...}); + + int length = out->shape().elements(); + + constexpr size_t K = sizeof...(Tensors); + + functional::Tensor<float> gOut = out; + functional::Array<functional::Tensor<float>, K> gIns = {tensors...}; + + if(full.back() != 1 && out->shape().back() == 1) { + size_t m = full.elements() / length; + size_t k = full.back(); + + int blocks = std::min(MAX_BLOCKS, (int)m); + int threads = std::min(MAX_THREADS, (int)k); + int shared = sizeof(float) * threads * 2; + + gAggregateReduce<<<blocks, threads, shared>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale); + + } else if(out->shape() == full) { + int threads = std::min(MAX_THREADS, length); + int blocks + = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + bool broadcast = false; + for(int i = 0; i < K; ++i) + broadcast = broadcast || gOut.shape() != gIns[i].shape(); + gAggregateEqual<<<blocks, threads>>>(functor, aggFunctor, gOut, gIns, scale, broadcast); + } else { + int threads = std::min(MAX_THREADS, length); + int blocks + = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); + + gAggregateGeneric<<<blocks, threads>>>(functor, aggInit, aggFunctor, full, gOut, gIns, scale); + } +} + +// @TODO: this is a duplicate; can be removed, but need to redo all the add.inc entries... template <class Functor, class... Tensors> void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { cudaSetDevice(out->getDeviceId().no); @@ -142,6 +184,8 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { functional::Tensor<float> gOut = out; functional::Array<functional::Tensor<float>, K> gIns = {tensors...}; + auto addFunctor = functional::_1 + functional::_2; + if(full.back() != 1 && out->shape().back() == 1) { size_t m = full.elements() / length; size_t k = full.back(); @@ -150,7 +194,7 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) { int threads = std::min(MAX_THREADS, (int)k); int shared = sizeof(float) * threads * 2; - gAddReduce<<<blocks, threads, shared>>>(functor, full, gOut, gIns, scale); + gAggregateReduce<<<blocks, threads, shared>>>(functor, 0, addFunctor, full, gOut, gIns, scale); } else if(out->shape() == full) { int threads = std::min(MAX_THREADS, length); @@ -160,13 +204,13 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... 
tensors) { bool broadcast = false; for(int i = 0; i < K; ++i) broadcast = broadcast || gOut.shape() != gIns[i].shape(); - gAddEqual<<<blocks, threads>>>(functor, gOut, gIns, scale, broadcast); + gAggregateEqual<<<blocks, threads>>>(functor, addFunctor, gOut, gIns, scale, broadcast); } else { int threads = std::min(MAX_THREADS, length); int blocks = std::min(MAX_BLOCKS, length / threads + (length % threads != 0)); - gAddGeneric<<<blocks, threads>>>(functor, full, gOut, gIns, scale); + gAggregateGeneric<<<blocks, threads>>>(functor, 0, addFunctor, full, gOut, gIns, scale); } } diff --git a/src/tensors/gpu/add.h b/src/tensors/gpu/add.h index e5e22d88..21e0bb96 100644..100755 --- a/src/tensors/gpu/add.h +++ b/src/tensors/gpu/add.h @@ -8,5 +8,7 @@ namespace gpu { template <class Functor, class... Tensors> void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors); +template <class Functor, class AggFunctor, class... Tensors> +void Aggregate(Functor functor, float initAgg, AggFunctor aggFunctor, float scale, marian::Tensor out, Tensors... tensors); } } // namespace marian diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc index 27f35b95..69244dce 100644..100755 --- a/src/tensors/gpu/add.inc +++ b/src/tensors/gpu/add.inc @@ -1,3 +1,4 @@ +// see element.inc for instructions on how to maintain this using namespace functional; template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor); @@ -22,3 +23,12 @@ template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>, template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor); template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>); +template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>); 
diff --git a/src/tensors/gpu/add.inc b/src/tensors/gpu/add.inc
index 27f35b95..69244dce 100644..100755
--- a/src/tensors/gpu/add.inc
+++ b/src/tensors/gpu/add.inc
@@ -1,3 +1,4 @@
+// see element.inc for instructions on how to maintain this
 using namespace functional;
 template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<1>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
 template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Div, Capture, Assignee<1>>>, Assignee<2>>, float, marian::Tensor, marian::Tensor, marian::Tensor);
@@ -22,3 +23,12 @@ template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Gt, Assignee<2>,
 template void Add<BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, BinaryFunctor<elem::Leq, Assignee<2>, Assignee<3>>, Assignee<1>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
 template void Add<BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, marian::Tensor, marian::Tensor, marian::Tensor>(BinaryFunctor<elem::Mult, Assignee<1>, UnaryFunctor<elem::Sigmoid, BinaryFunctor<elem::Minus, Assignee<2>, Assignee<3>>>>, float, marian::Tensor, marian::Tensor, marian::Tensor, marian::Tensor);
 template void Add<BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(BinaryFunctor<elem::Div, Assignee<1>, Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<1> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Minimum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Minimum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Maximum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Maximum, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Aggregate<marian::functional::Assignee<1>, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase> >(marian::functional::Assignee<1>, float, marian::functional::BinaryFunctor<marian::functional::elem::LogAddExp, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::UnaryFunctor<marian::functional::elem::Exp, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::Assignee<2>, marian::functional::Assignee<3> > > >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Div, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Assignee<2> >, marian::functional::Assignee<3> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Add<marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::BinaryFunctor<marian::functional::elem::Mult, marian::functional::Assignee<1>, marian::functional::Capture>, marian::functional::Assignee<2> >, float, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
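
Note: of the aggregation functors instantiated above (Minimum, Maximum, Mult, LogAddExp), elem::LogAddExp is the only numerically delicate one: evaluating log(exp(x) + exp(y)) directly overflows float once either argument exceeds roughly 88. The standard remedy is to factor out the larger argument. A self-contained sketch of that stable formulation (illustration only; the actual elem::LogAddExp definition is not part of this diff):

    #include <algorithm>
    #include <cmath>

    // log(exp(x) + exp(y)) without overflow: after factoring out the larger
    // argument, the remaining exponential has a non-positive argument, so
    // exp() stays in [0, 1].
    inline float logAddExp(float x, float y) {
      float m = std::max(x, y);
      return m + std::log1p(std::exp(std::min(x, y) - m));
    }

Folding a row with this functor and identity -infinity yields exactly the logsumexp reduction exposed through the new expression operators.
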
diff --git a/src/tensors/gpu/element.inc b/src/tensors/gpu/element.inc
index 66f76301..f3cdea28 100755
--- a/src/tensors/gpu/element.inc
+++ b/src/tensors/gpu/element.inc
@@ -55,6 +55,7 @@ template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, Bin
 template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, UnaryFunctor<elem::sReLU, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Minus, Assignee<3>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<5> >, Assignee<4> >, Capture> >, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, BinaryFunctor<elem::Mult, Capture, Assignee<4> >, Assignee<4> >, Capture> > >, Capture>, BinaryFunctor<elem::Div, BinaryFunctor<elem::Mult, Assignee<4>, Assignee<4> >, Capture> > >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
 template void Element<Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(Assign<Var<1>, BinaryFunctor<elem::Minus, Assignee<1>, BinaryFunctor<elem::Mult, Capture, BinaryFunctor<elem::Plus, BinaryFunctor<elem::Div, BinaryFunctor<elem::Div, Assignee<2>, Capture>, BinaryFunctor<elem::Plus, UnaryFunctor<elem::Sqrt, BinaryFunctor<elem::Div, Assignee<3>, Capture> >, Capture> >, BinaryFunctor<elem::Mult, Capture, Assignee<1> > > > > >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
 template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::NEq, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::BinaryFunctor<marian::functional::elem::Gt, marian::functional::Assignee<2>, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Lt, marian::functional::Assignee<2>, marian::functional::Assignee<3> > >, marian::functional::Capture>, marian::functional::Capture> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase> >(marian::functional::Assign<marian::functional::Var<1>, marian::functional::BinaryFunctor<marian::functional::elem::NEq, marian::functional::BinaryFunctor<marian::functional::elem::Eq, marian::functional::BinaryFunctor<marian::functional::elem::Minus, marian::functional::BinaryFunctor<marian::functional::elem::Gt, marian::functional::Assignee<2>, marian::functional::Assignee<3> >, marian::functional::BinaryFunctor<marian::functional::elem::Lt, marian::functional::Assignee<2>, marian::functional::Assignee<3> > >, marian::functional::Capture>, marian::functional::Capture> >, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>, std::shared_ptr<marian::TensorBase>);
+template void marian::gpu::Element<marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqrt, marian::functional::Assignee<1> > >>(marian::functional::Assign<marian::functional::Var<1>, marian::functional::UnaryFunctor<marian::functional::elem::Sqrt, marian::functional::Assignee<1> > >, std::shared_ptr<marian::TensorBase>);
 // How to add new specializations:
 // When you use a new specialization, it will cause a link error of this form (example):
 // .../src/tensors/tensor_operators.h:41: undefined reference to `void marian::gpu::Element<marian::functional::Assign< ... > ( ... )'
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index acc7e54c..f77259cb 100755
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -140,6 +140,10 @@ public:
   template <typename T>
   void set(const T* begin, const T* end) {
+    ABORT_IF(end - begin != shape_.elements(),
+             "Vector size ({}) and tensor shape ({}) do not match",
+             end - begin,
+             std::string(shape_));
     ABORT_IF(!matchType<T>(type_),
              "Requested type ({}) and underlying type ({}) do not match",
              request<T>(),
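
Note: the added ABORT_IF in Tensor::set turns a silently truncated or out-of-bounds fill into an immediate, diagnosable abort whenever the input range disagrees with the tensor's element count. A hypothetical trigger, assuming a freshly allocated {2, 4} float tensor t:

    std::vector<float> v(7, 0.f);           // one element short of 2 * 4 = 8
    t->set(v.data(), v.data() + v.size());  // aborts with a message like
                                            // "Vector size (7) and tensor shape (...) do not match"
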
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
index f7de2f20..8dd17a5c 100755
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@@ -51,7 +51,7 @@ void Add(Functor functor, float scale, marian::Tensor out, Tensors... tensors) {
     gpu::Add(functor, scale, out, tensors...);
   else
 #endif
-    cpu::Add(functor, scale, out, tensors...);
+    cpu::Aggregate(functor, 0.0f, functional::_1 + functional::_2, scale, out, tensors...);
 }
 
 template <class Functor, class... Tensors>
@@ -59,6 +59,16 @@ void Add(Functor functor, marian::Tensor out, Tensors... tensors) {
   Add(functor, 1, out, tensors...);
 }
 
+template <class Functor, class AggFunctor, class... Tensors>
+void Aggregate(Functor functor, float aggInit, AggFunctor aggFunctor, marian::Tensor out, Tensors... tensors) {
+#ifdef CUDA_FOUND
+  if(out->getBackend()->getDeviceId().type == DeviceType::gpu)
+    gpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
+  else
+#endif
+    cpu::Aggregate(functor, aggInit, aggFunctor, 1.0f, out, tensors...);
+}
+
 template <class Functor, class... Tensors>
 void Reduce(Functor functor,
             float scale,
@@ -74,6 +84,14 @@ void Reduce(Functor functor, marian::Tensor out, Tensors... tensors) {
   Add(functor, out, tensors...);
 }
 
+template <class Functor, class AggFunctor, class... Tensors>
+void Reduce(Functor functor, AggFunctor aggFunctor, float aggInit,
+            marian::Tensor out,
+            Tensors... tensors) {
+  out->set(aggInit);
+  Aggregate(functor, aggInit, aggFunctor, out, tensors...);
+}
+
 // clang-format off
 DISPATCH7(Prod, marian::Tensor, const marian::Tensor&, const marian::Tensor&, bool, bool, float, float)
 DISPATCH8(ProdBatched, marian::Tensor, Ptr<Allocator>, const marian::Tensor, const marian::Tensor, bool, bool, float, float)
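
Note: the new Reduce overload ties the pieces together: it seeds the output with the fold's identity via out->set(aggInit), then folds every input element in through Aggregate. Seeding matters, because unlike the additive Reduce above it, a min or max fold started from a zeroed output would bias the result toward zero. A sketch of a row-wise minimum through this interface (illustrative call site, again assuming a min(_1, _2) combinator for elem::Minimum):

    #include <limits>
    using namespace marian::functional;
    marian::Reduce(_1,                                // map: identity
                   min(_1, _2),                       // fold: pairwise minimum
                   std::numeric_limits<float>::max(), // fold identity for min
                   out, in);                          // reduced axis of out has size 1
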
diff --git a/src/tests/operator_tests.cpp b/src/tests/operator_tests.cpp
index 08fdd8de..2607f41f 100755
--- a/src/tests/operator_tests.cpp
+++ b/src/tests/operator_tests.cpp
@@ -204,12 +204,19 @@ void tests(DeviceType device) {
     graph->clear();
     values.clear();
 
-    std::vector<float> vA({1, 2, 3, 4, 5, 6, 7, 8});
-    std::vector<float> vS1({6, 8, 10, 12});
-    std::vector<float> vS2({10, 26});
-
-    std::vector<float> vW({2.77778f, 6.77778f});
-
+    std::vector<float> vA({1, 6, 3, 8,
+                           5, 2, 7, 4});
+    // import numpy as np
+    // a = np.array([[1, 6, 3, 8], [5, 2, 7, 4]])
+    std::vector<float> vS1({6, 8, 10, 12});             // s1 = np.sum(a, axis=0)
+    std::vector<float> vS2({18, 18});                   // np.sum(a, axis = 1)
+    std::vector<float> vS4({2.6925824f, 1.80277564f});  // np.std(a, axis = 1)
+    std::vector<float> vV5({7.25, 3.25});               // np.var(a, axis = 1)
+    std::vector<float> vM6({8, 7});                     // np.max(a, axis = 1)
+    std::vector<float> vM7({1, 2});                     // np.min(a, axis = 1)
+    std::vector<float> vP8({144, 280});                 // np.prod(a, axis = 1)
+    std::vector<float> vL9({8.13364336f, 7.17551536f}); // np.log(np.sum(np.exp(a), axis=1))
+    std::vector<float> vW({5.0f, 4.55555556f});         // np.mean(a*s1,axis=-1) / np.mean(s1,axis=-1)
 
     auto a = graph->constant({2, 4}, inits::from_vector(vA));
 
@@ -218,6 +225,14 @@ void tests(DeviceType device) {
     auto m3 = mean(s1, /*axis=*/ 1);
 
+    auto s4 = marian::std(a, /*axis=*/ 1);
+    auto v5 = var(a, /*axis=*/ 1);
+
+    auto m6 = max(a, /*axis=*/ 1);
+    auto m7 = min(a, /*axis=*/ 1);
+    auto p8 = prod(a, /*axis=*/ 1);
+    auto l9 = logsumexp(a, /*axis=*/ 1);
+
     auto sp = scalar_product(s2, s2, /*axis=*/ 0);
 
     auto wa = weighted_average(a, s1, /*axis=*/ -1);
@@ -227,21 +242,30 @@ void tests(DeviceType device) {
     CHECK(s1->shape() == Shape({1, 4}));
     CHECK(s2->shape() == Shape({2, 1}));
     CHECK(m3->shape() == Shape({1, 1}));
+    CHECK(s4->shape() == Shape({2, 1}));
+    CHECK(v5->shape() == Shape({2, 1}));
+    CHECK(m6->shape() == Shape({2, 1}));
+    CHECK(m7->shape() == Shape({2, 1}));
+    CHECK(p8->shape() == Shape({2, 1}));
+    CHECK(l9->shape() == Shape({2, 1}));
     CHECK(sp->shape() == Shape({1, 1}));
     CHECK(wa->shape() == Shape({2, 1}));
 
-    s1->val()->get(values);
-    CHECK( values == vS1 );
+    s1->val()->get(values); CHECK(values == vS1);
+    s2->val()->get(values); CHECK(values == vS2);
 
-    s2->val()->get(values);
-    CHECK( values == vS2 );
+    CHECK(m3->val()->scalar() == 9);
 
-    CHECK( m3->val()->scalar() == 9 );
-    CHECK( sp->val()->scalar() == 776 );
+    s4->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vS4.begin(), floatApprox));
+    v5->val()->get(values); CHECK(values == vV5);
+    m6->val()->get(values); CHECK(values == vM6);
+    m7->val()->get(values); CHECK(values == vM7);
+    p8->val()->get(values); CHECK(values == vP8);
+    l9->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vL9.begin(), floatApprox));
 
-    wa->val()->get(values);
-    CHECK( std::equal(values.begin(), values.end(),
-                      vW.begin(), floatApprox) );
+    CHECK(sp->val()->scalar() == 648);
+
+    wa->val()->get(values); CHECK(std::equal(values.begin(), values.end(), vW.begin(), floatApprox));
   }
 
   SECTION("concatenation") {
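
Note: the expected vectors come from numpy (see the inline comments), but the first row a0 = [1, 6, 3, 8] is easy to confirm independently: its mean is 4.5, the squared deviations are {12.25, 2.25, 2.25, 12.25}, hence var = 7.25 and std = sqrt(7.25) ≈ 2.6925824, matching vV5[0] and vS4[0]. A standalone cross-check (illustrative only, not part of the test suite):

    #include <cmath>
    #include <cstdio>

    int main() {
      const float a0[] = {1, 6, 3, 8};
      float mean = 0, var = 0, sumExp = 0;
      for (float x : a0) mean += x / 4;                     // 4.5
      for (float x : a0) var += (x - mean) * (x - mean) / 4;
      for (float x : a0) sumExp += std::exp(x);
      // prints: var=7.25 std=2.69258 logsumexp=8.13364
      std::printf("var=%g std=%g logsumexp=%g\n",
                  var, std::sqrt(var), std::log(sumExp));
    }
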
diff --git a/vs/Marian.vcxproj b/vs/Marian.vcxproj
index a6c560d3..a82c879e 100755
--- a/vs/Marian.vcxproj
+++ b/vs/Marian.vcxproj
@@ -575,6 +575,7 @@
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">true</ExcludedFromBuild>
       <ExcludedFromBuild Condition="'$(Configuration)|$(Platform)'=='Release|x64'">true</ExcludedFromBuild>
</ClCompile>
+ <ClCompile Include="..\src\layers\generic.cpp" />
<ClCompile Include="..\src\layers\loss.cpp" />
<ClCompile Include="..\src\layers\weight.cpp" />
<ClCompile Include="..\src\microsoft\quicksand.cpp">
diff --git a/vs/Marian.vcxproj.filters b/vs/Marian.vcxproj.filters
index d9e56843..a7c2331e 100755
--- a/vs/Marian.vcxproj.filters
+++ b/vs/Marian.vcxproj.filters
@@ -481,6 +481,9 @@
     <ClCompile Include="..\src\examples\iris\iris.cpp">
<Filter>examples\iris</Filter>
</ClCompile>
+ <ClCompile Include="..\src\layers\generic.cpp">
+ <Filter>layers</Filter>
+ </ClCompile>
</ItemGroup>
<ItemGroup>
<ClInclude Include="..\src\marian.h" />