34 files changed, 346 insertions, 395 deletions
diff --git a/src/3rd_party/cnpy/cnpy.cpp b/src/3rd_party/cnpy/cnpy.cpp index 277ee7a5..f4df0418 100644 --- a/src/3rd_party/cnpy/cnpy.cpp +++ b/src/3rd_party/cnpy/cnpy.cpp @@ -59,9 +59,9 @@ template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const cha return lhs; } -void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& shape, unsigned int& ndims, bool& fortran_order) { +void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& shape, unsigned int& ndims, bool& fortran_order) { char buffer[256]; - size_t res = fread(buffer,sizeof(char),11,fp); + size_t res = fread(buffer,sizeof(char),11,fp); if(res != 11) throw std::runtime_error("parse_npy_header: failed fread"); std::string header = fgets(buffer,256,fp); @@ -88,7 +88,7 @@ void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& sh } //endian, word size, data type - //byte order code | stands for not applicable. + //byte order code | stands for not applicable. //not sure when this applies except for byte array loc1 = header.find("descr")+9; bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false); @@ -125,7 +125,7 @@ void cnpy::parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& globa assert(comment_len == 0); } -cnpy::NpyArray load_the_npy_file(FILE* fp) { +cnpy::NpyArrayPtr load_the_npy_file(FILE* fp) { unsigned int* shape; unsigned int ndims, word_size; bool fortran_order; @@ -134,13 +134,13 @@ cnpy::NpyArray load_the_npy_file(FILE* fp) { for(unsigned int i = 0; i < ndims; i++) size *= shape[i]; - cnpy::NpyArray arr; - arr.word_size = word_size; - arr.shape = std::vector<unsigned int>(shape, shape+ndims); + auto arr = cnpy::NpyArrayPtr(new cnpy::NpyArray()); + arr->word_size = word_size; + arr->shape = std::vector<unsigned int>(shape, shape+ndims); delete[] shape; - arr.resize(size*word_size); - arr.fortran_order = fortran_order; - size_t nread = fread(arr.data(), word_size, size,fp); + arr->resize(size*word_size); + arr->fortran_order = fortran_order; + size_t nread = fread(arr->data(), word_size, size,fp); if(nread != size) throw std::runtime_error("load_the_npy_file: failed fread"); return arr; @@ -152,7 +152,7 @@ cnpy::npz_t cnpy::npz_load(std::string fname) { if(!fp) printf("npz_load: Error! Unable to open file %s!\n",fname.c_str()); assert(fp); - cnpy::npz_t arrays; + cnpy::npz_t arrays; while(1) { std::vector<char> local_header(30); @@ -170,7 +170,7 @@ cnpy::npz_t cnpy::npz_load(std::string fname) { if(vname_res != name_len) throw std::runtime_error("npz_load: failed fread"); - //erase the lagging .npy + //erase the lagging .npy varname.erase(varname.end()-4, varname.end()); //read in the extra field @@ -186,16 +186,16 @@ cnpy::npz_t cnpy::npz_load(std::string fname) { } fclose(fp); - return arrays; + return arrays; } -cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) { +cnpy::NpyArrayPtr cnpy::npz_load(std::string fname, std::string varname) { FILE* fp = fopen(fname.c_str(),"rb"); if(!fp) { printf("npz_load: Error! 
Unable to open file %s!\n",fname.c_str()); abort(); - } + } while(1) { std::vector<char> local_header(30); @@ -209,7 +209,7 @@ cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) { //read in the variable name unsigned short name_len = *(unsigned short*) &local_header[26]; std::string vname(name_len,' '); - size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp); + size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp); if(vname_res != name_len) throw std::runtime_error("npz_load: failed fread"); vname.erase(vname.end()-4,vname.end()); //erase the lagging .npy @@ -219,7 +219,7 @@ cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) { fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field if(vname == varname) { - NpyArray array = load_the_npy_file(fp); + auto array = load_the_npy_file(fp); fclose(fp); return array; } @@ -233,30 +233,27 @@ cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) { fclose(fp); std::stringstream ss; - ss << "npz_load: Error! Variable name " - << varname - << " not found in " - << fname - << "!" + ss << "npz_load: Error! Variable name " + << varname + << " not found in " + << fname + << "!" << std::endl; throw std::runtime_error(ss.str()); } -cnpy::NpyArray cnpy::npy_load(std::string fname) { +cnpy::NpyArrayPtr cnpy::npy_load(std::string fname) { FILE* fp = fopen(fname.c_str(), "rb"); if(!fp) { printf("npy_load: Error! Unable to open file %s!\n",fname.c_str()); - abort(); + abort(); } - NpyArray arr = load_the_npy_file(fp); + auto arr = load_the_npy_file(fp); fclose(fp); return arr; } - - - diff --git a/src/3rd_party/cnpy/cnpy.h b/src/3rd_party/cnpy/cnpy.h index f78271a6..0cdd6dca 100644 --- a/src/3rd_party/cnpy/cnpy.h +++ b/src/3rd_party/cnpy/cnpy.h @@ -20,27 +20,28 @@ namespace cnpy { struct NpyArray { - std::shared_ptr<std::vector<char>> ptr; + std::vector<char> bytes; std::vector<unsigned int> shape; unsigned int word_size{1}; bool fortran_order{0}; - - NpyArray() : ptr{new std::vector<char>()} {} - + + NpyArray() {} + void resize(size_t n) { - return ptr->resize(n); + return bytes.resize(n); } - + char* data() { - return ptr->data(); + return bytes.data(); } const char* data() const { - return ptr->data(); + return bytes.data(); } }; - - typedef std::map<std::string, NpyArray> npz_t; + + typedef std::shared_ptr<NpyArray> NpyArrayPtr; + typedef std::map<std::string, NpyArrayPtr> npz_t; char BigEndianTest(); char map_type(const std::type_info& t); @@ -48,20 +49,20 @@ namespace cnpy { void parse_npy_header(FILE* fp,unsigned int& word_size, unsigned int*& shape, unsigned int& ndims, bool& fortran_order); void parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& global_header_size, unsigned int& global_header_offset); npz_t npz_load(std::string fname); - NpyArray npz_load(std::string fname, std::string varname); - NpyArray npy_load(std::string fname); + NpyArrayPtr npz_load(std::string fname, std::string varname); + NpyArrayPtr npy_load(std::string fname); template<typename T> std::vector<char>& operator+=(std::vector<char>& lhs, const T rhs) { //write in little endian for(char byte = 0; byte < sizeof(T); byte++) { - char val = *((char*)&rhs+byte); + char val = *((char*)&rhs+byte); lhs.push_back(val); } return lhs; } - template<> std::vector<char>& operator+=(std::vector<char>& lhs, const std::string rhs); - template<> std::vector<char>& operator+=(std::vector<char>& lhs, const char* rhs); + template<> std::vector<char>& operator+=(std::vector<char>& lhs, const std::string rhs); + 
template<> std::vector<char>& operator+=(std::vector<char>& lhs, const char* rhs); template<typename T> std::string tostring(T i, int pad = 0, char padval = ' ') { @@ -136,7 +137,7 @@ namespace cnpy { if(fp) { //zip file exists. we need to add a new npy file to it. //first read the footer. this gives us the offset and size of the global header - //then read and store the global header. + //then read and store the global header. //below, we will write the the new data at the start of the global header then append the global header and footer below it unsigned int global_header_size; parse_zip_footer(fp,nrecs,global_header_size,global_header_offset); @@ -202,7 +203,7 @@ namespace cnpy { footer += (unsigned int) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array footer += (unsigned short) 0; //zip file comment length - //write everything + //write everything fwrite(&local_header[0],sizeof(char),local_header.size(),fp); fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp); fwrite(data,sizeof(T),nels,fp); @@ -211,7 +212,7 @@ namespace cnpy { fclose(fp); } - template<typename T> std::vector<char> create_npy_header(const T* data, const unsigned int* shape, const unsigned int ndims) { + template<typename T> std::vector<char> create_npy_header(const T* data, const unsigned int* shape, const unsigned int ndims) { std::vector<char> dict; dict += "{'descr': '"; diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 409a84e1..38a69d9c 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -24,7 +24,7 @@ cuda_add_library(marian graph/expression_graph.cpp graph/expression_operators.cu graph/node.cpp - graph/node_operators.cu + graph/node_operators.cpp graph/node_initializers.cpp layers/convolution.cu diff --git a/src/common/config.cpp b/src/common/config.cpp index 1f297094..9eca1f10 100644 --- a/src/common/config.cpp +++ b/src/common/config.cpp @@ -54,7 +54,7 @@ void Config::loadModelParameters(const std::string& name) { void Config::GetYamlFromNpz(YAML::Node& yaml, const std::string& varName, const std::string& fName) { - yaml = YAML::Load(cnpy::npz_load(fName, varName).data()); + yaml = YAML::Load(cnpy::npz_load(fName, varName)->data()); } void Config::AddYamlToNpz(const YAML::Node& yaml, diff --git a/src/common/definitions.h b/src/common/definitions.h index 749f4fef..8fb3bbb6 100644 --- a/src/common/definitions.h +++ b/src/common/definitions.h @@ -45,19 +45,19 @@ enum class DeviceType : size_t { gpu = 0, cpu = 1 }; struct DeviceId { size_t no{0}; DeviceType type{DeviceType::gpu}; - + DeviceId() : no{0}, type{DeviceType::gpu} {} DeviceId(size_t no_, DeviceType type_) : no(no_), type(type_) {} - + friend std::ostream& operator<<(std::ostream& out, DeviceId deviceId) { out << (deviceId.type == DeviceType::gpu ? 
"gpu" : "cpu") << deviceId.no; return out; } - + friend bool operator==(DeviceId id1, DeviceId id2) { return id1.no == id2.no && id1.type == id2.type; } - + }; class TensorBase; diff --git a/src/examples/iris/iris.cpp b/src/examples/iris/iris.cpp index 9b1bb958..80a3a2a9 100644 --- a/src/examples/iris/iris.cpp +++ b/src/examples/iris/iris.cpp @@ -25,20 +25,20 @@ Expr buildIrisClassifier(Ptr<ExpressionGraph> graph, // Define the input layer auto x = graph->constant({N, NUM_FEATURES}, - init = inits::from_vector(inputData)); + inits::from_vector(inputData)); // Define the hidden layer - auto W1 = graph->param("W1", {NUM_FEATURES, 5}, init = inits::uniform()); - auto b1 = graph->param("b1", {1, 5}, init = inits::zeros); + auto W1 = graph->param("W1", {NUM_FEATURES, 5}, inits::uniform()); + auto b1 = graph->param("b1", {1, 5}, inits::zeros); auto h = tanh(affine(x, W1, b1)); // Define the output layer - auto W2 = graph->param("W2", {5, NUM_LABELS}, init = inits::uniform()); - auto b2 = graph->param("b2", {1, NUM_LABELS}, init = inits::zeros); + auto W2 = graph->param("W2", {5, NUM_LABELS}, inits::uniform()); + auto b2 = graph->param("b2", {1, NUM_LABELS}, inits::zeros); auto o = affine(h, W2, b2); if(train) { - auto y = graph->constant({N}, init = inits::from_vector(outputData)); + auto y = graph->constant({N}, inits::from_vector(outputData)); /* Define cross entropy cost on the output layer. * It can be also defined directly as: * -mean(sum(logsoftmax(o) * y, axis=1), axis=0) diff --git a/src/examples/mnist/model.h b/src/examples/mnist/model.h index 2ab5a3f5..62e5696a 100644 --- a/src/examples/mnist/model.h +++ b/src/examples/mnist/model.h @@ -79,7 +79,7 @@ protected: auto features = std::static_pointer_cast<data::DataBatch>(batch)->features(); auto x = g->constant({(int)batch->size(), dims[0]}, - init = inits::from_vector(features)); + inits::from_vector(features)); // Construct hidden layers std::vector<Expr> layers, weights, biases; @@ -104,11 +104,11 @@ protected: // Construct a weight node for the outgoing connections from layer i weights.emplace_back(g->param( - "W" + std::to_string(i), {in, out}, init = inits::uniform())); + "W" + std::to_string(i), {in, out}, inits::uniform())); // Construct a bias node. These weights are initialized to zero biases.emplace_back( - g->param("b" + std::to_string(i), {1, out}, init = inits::zeros)); + g->param("b" + std::to_string(i), {1, out}, inits::zeros)); } // Perform matrix multiplication and addition for the last layer @@ -119,7 +119,7 @@ protected: // labels auto labels = std::static_pointer_cast<data::DataBatch>(batch)->labels(); auto y = g->constant({(int)batch->size(), 1}, - init = inits::from_vector(labels)); + inits::from_vector(labels)); // Define a top-level node for training return mean(cross_entropy(last, y), axis = 0); diff --git a/src/examples/mnist/model_lenet.h b/src/examples/mnist/model_lenet.h index 968ceaf3..a91ef97d 100644 --- a/src/examples/mnist/model_lenet.h +++ b/src/examples/mnist/model_lenet.h @@ -29,7 +29,7 @@ protected: auto features = std::static_pointer_cast<data::DataBatch>(batch)->features(); auto x = g->constant({(int)batch->size(), 1, 28, 28}, - init = inits::from_vector(features)); + inits::from_vector(features)); // Construct hidden layers @@ -74,11 +74,11 @@ protected: // Construct a weight node for the outgoing connections from layer i weights.emplace_back(g->param( - "W" + std::to_string(i), {in, out}, init = inits::uniform())); + "W" + std::to_string(i), {in, out}, inits::uniform())); // Construct a bias node. 
These weights are initialized to zero biases.emplace_back( - g->param("b" + std::to_string(i), {1, out}, init = inits::zeros)); + g->param("b" + std::to_string(i), {1, out}, inits::zeros)); } // Perform matrix multiplication and addition for the last layer @@ -91,7 +91,7 @@ protected: // labels auto labels = std::static_pointer_cast<data::DataBatch>(batch)->labels(); auto y = g->constant({(int)batch->size(), 1}, - init = inits::from_vector(labels)); + inits::from_vector(labels)); // Define a top-level node for training return mean(cross_entropy(last, y), axis = 0); diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp index 520476ae..f0ae1ffa 100644 --- a/src/graph/expression_graph.cpp +++ b/src/graph/expression_graph.cpp @@ -17,12 +17,12 @@ void ExpressionGraph::setDevice(DeviceId deviceId) { } } -Expr ExpressionGraph::dropout(float prob, Shape shape) { +Expr ExpressionGraph::dropout(float prob, const Shape& shape) { return Expression<ConstantNode>(shared_from_this(), - keywords::init = [prob, this](Tensor t) { + shape, + [prob, this](Tensor t) { Dropout(t, prob); - }, - keywords::shape = shape); + }); } void ExpressionGraph::checkNan(Tensor t) { diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h index 5131daf6..ea1645ec 100644 --- a/src/graph/expression_graph.h +++ b/src/graph/expression_graph.h @@ -77,7 +77,7 @@ public: void copyParams(Ptr<ExpressionGraph> graph) { for(auto p : *graph->params()) - param(p->name(), p->shape()); + param(p->name(), p->shape(), inits::dummy); params()->allocateForward(); params()->vals()->copyFrom(graph->params()->vals()); } @@ -200,8 +200,11 @@ public: dot.close(); } - template <typename... Args> - Expr param(std::string name, Shape shape, Args... args) { + Expr param(const std::string& pname, + const Shape& shape, + const NodeInitializer& init, + bool fixed = false) { + std::string name = pname; if(!namespace_.empty()) name = namespace_ + "::" + name; @@ -214,7 +217,6 @@ public: "original shape {}", shape, name, p->shape()); - bool fixed = Get(keywords::fixed, false, args...); p->setTrainable(!fixed); add(p); return p; @@ -229,8 +231,7 @@ public: ABORT_IF(get(name), "Non-parameter with name '{}' already exists", name); // create parameter node (adds to tape) - p = Expression<ParamNode>( - shared_from_this(), keywords::shape = shape, args...); + p = Expression<ParamNode>(shared_from_this(), shape, init, fixed); // add to list of parameters p->set_name(name); @@ -238,25 +239,21 @@ public: return p; } - template <typename... Args> - Expr constant(Shape shape, Args... args) { + Expr constant(const Shape& shape, + const NodeInitializer& init) { return Expression<ConstantNode>( - shared_from_this(), keywords::shape = shape, args...); + shared_from_this(), shape, init); } - template <typename... Args> - Expr ones(Args... args) { - return Expression<ConstantNode>( - shared_from_this(), keywords::init = inits::ones, args...); + Expr ones(const Shape& shape) { + return Expression<ConstantNode>(shared_from_this(), shape, inits::ones); } - template <typename... Args> - Expr zeros(Args... 
args) { - return Expression<ConstantNode>( - shared_from_this(), keywords::init = inits::zeros, args...); + Expr zeros(const Shape& shape) { + return Expression<ConstantNode>(shared_from_this(), shape, inits::zeros); } - Expr dropout(float prob, Shape shape); + Expr dropout(float prob, const Shape& shape); Expr get(std::string name) { if(!namespace_.empty()) @@ -340,17 +337,17 @@ public: continue; Shape shape; - if(it.second.shape.size() == 1) { + if(it.second->shape.size() == 1) { shape.resize(2); shape.set(0, 1); - shape.set(1, it.second.shape[0]); + shape.set(1, it.second->shape[0]); } else { - shape.resize(it.second.shape.size()); - for(int i = 0; i < it.second.shape.size(); ++i) - shape.set(i, it.second.shape[i]); + shape.resize(it.second->shape.size()); + for(int i = 0; i < it.second->shape.size(); ++i) + shape.set(i, it.second->shape[i]); } - param(name, shape, init = inits::from_numpy(it.second)); + param(name, shape, inits::from_numpy(it.second)); } if(markReloaded) @@ -371,7 +368,7 @@ public: } std::vector<float> v; - p.second->val() >> v; + p.second->val()->get(v); auto& pShape = p.second->shape(); unsigned dim = pShape.size(); diff --git a/src/graph/node.h b/src/graph/node.h index aa450000..74af5771 100644 --- a/src/graph/node.h +++ b/src/graph/node.h @@ -13,7 +13,6 @@ namespace marian { class Node : public Chainable<Tensor>, - public keywords::Keywords, public std::enable_shared_from_this<Node> { protected: size_t id_{0}; @@ -33,11 +32,9 @@ protected: std::string debugMessage_; public: - template <typename... Args> - Node(Ptr<ExpressionGraph> graph, Args... args) - : Keywords(args...), - graph_(graph), - shape_(Get(keywords::shape, {1, 1, 1, 1})) {} + Node(Ptr<ExpressionGraph> graph, Shape shape) + : graph_(graph), + shape_(shape) {} virtual ~Node() { if(destroy_) { @@ -143,12 +140,8 @@ public: struct NaryNodeOp : public Node { size_t hash_{0}; - template <typename... Args> - NaryNodeOp(const std::vector<Expr>& nodes, Args... args) - : Node(nodes.front()->graph(), - keywords::shape - = keywords::Get(keywords::shape, nodes.front()->shape(), args...), - args...) 
{ + NaryNodeOp(const std::vector<Expr>& nodes, Shape shape) + : Node(nodes.front()->graph(), shape) { children_.resize(nodes.size()); for(int i = 0; i < nodes.size(); ++i) children_[i] = nodes[i]; @@ -158,6 +151,9 @@ struct NaryNodeOp : public Node { remove_children_from_top_nodes(); } + NaryNodeOp(const std::vector<Expr>& nodes) + : NaryNodeOp(nodes, nodes[0]->shape()) {} + virtual ~NaryNodeOp() {} std::vector<Expr>& children() { return children_; } diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp index f82b6f45..0d131c61 100644 --- a/src/graph/node_initializers.cpp +++ b/src/graph/node_initializers.cpp @@ -72,7 +72,7 @@ void xorshift(Tensor t) { std::vector<float> vals(t->size()); for(auto&& v : vals) v = xor128(); - t << vals; + t->set(vals); } void glorot_normal(Tensor t) { @@ -110,7 +110,7 @@ void ortho(Tensor t) { NodeInitializer from_vector(const std::vector<float>& v) { auto vPtr = New<std::vector<float>>(v.begin(), v.end()); return [vPtr](Tensor t) { - t->set(vPtr->data(), vPtr->data() + vPtr->size()); + t->set(vPtr->data(), vPtr->data() + vPtr->size()); }; } @@ -127,12 +127,12 @@ NodeInitializer from_sparse_vector( }; } -NodeInitializer from_numpy(const cnpy::NpyArray& np) { +NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np) { return [np](Tensor t) { size_t size = 1; - for(size_t dim : np.shape) + for(size_t dim : np->shape) size *= dim; - t->set((float*)np.data(), (float*)np.data() + size); + t->set((float*)np->data(), (float*)np->data() + size); }; } diff --git a/src/graph/node_initializers.h b/src/graph/node_initializers.h index 6bd83c12..5b069657 100644 --- a/src/graph/node_initializers.h +++ b/src/graph/node_initializers.h @@ -27,25 +27,32 @@ NodeInitializer from_value(float v); NodeInitializer diag(float val); -template <class Distribution> -void distribution(std::vector<float>& vals, float a, float b) { +template <class Distribution, class Iterator> +void distribution(Iterator begin, Iterator end, float a, float b) { std::default_random_engine engine(Config::seed++); Distribution dist(a, b); auto gen = std::bind(dist, engine); - std::generate(begin(vals), end(vals), gen); + std::generate(begin, end, gen); +} + +template <class Distribution> +void distribution(std::vector<float>& vals, float a, float b) { + distribution<Distribution>(vals.begin(), vals.end(), a, b); } template <class Distribution> void distribution(Tensor t, float a, float b) { std::vector<float> vals(t->size()); - distribution<Distribution>(vals, a, b); - t << vals; + distribution<Distribution>(vals.begin(), vals.end(), a, b); + t->set(vals); } NodeInitializer normal(float scale = 0.1, bool ortho = true); NodeInitializer uniform(float scale = 0.1); +static inline void dummy(Tensor t) {} + void ortho(Tensor t); void glorot_uniform(Tensor t); @@ -60,7 +67,7 @@ NodeInitializer from_vector(const std::vector<size_t>& v); NodeInitializer from_sparse_vector( std::pair<std::vector<size_t>, std::vector<float>>& v); -NodeInitializer from_numpy(const cnpy::NpyArray& np); +NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np); NodeInitializer from_word2vec(const std::string& file, int dimVoc, diff --git a/src/graph/node_operators.cu b/src/graph/node_operators.cpp index 76f0b1e3..76f0b1e3 100644 --- a/src/graph/node_operators.cu +++ b/src/graph/node_operators.cpp diff --git a/src/graph/node_operators.h b/src/graph/node_operators.h index b785fade..8720d0bb 100644 --- a/src/graph/node_operators.h +++ b/src/graph/node_operators.h @@ -7,12 +7,11 @@ namespace marian { struct 
ConstantNode : public Node { - template <typename... Args> - ConstantNode(Args... args) - : Node(args...), - init_(new NodeInitializer(Get(keywords::init, [](Tensor) {}))), + ConstantNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init) + : Node(graph, shape), + init_(new NodeInitializer(init)), initialized_(false) { - ABORT_IF(!Has(keywords::shape), "Constant items require shape information"); + setTrainable(false); } @@ -42,14 +41,11 @@ private: }; struct ParamNode : public Node { - template <typename... Args> - ParamNode(Args... args) - : Node(args...), - init_(new NodeInitializer(Get(keywords::init, [](Tensor) {}))), + ParamNode(Ptr<ExpressionGraph> graph, const Shape& shape, const NodeInitializer& init, bool fixed = false) + : Node(graph, shape), + init_(new NodeInitializer(init)), initialized_(false) { - ABORT_IF(!Has(keywords::shape), "Param items require shape information"); - bool fixed = Get(keywords::fixed, false); setTrainable(!fixed); } diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h index a2a47a61..c9e67cd7 100644 --- a/src/graph/node_operators_binary.h +++ b/src/graph/node_operators_binary.h @@ -16,16 +16,13 @@ private: float scalar_; public: - template <typename... Args> DotNodeOp(Expr a, Expr b, bool transA, bool transB, - float scalar, - Args... args) + float scalar) : NaryNodeOp({a, b}, - keywords::shape = newShape(a, b, transA, transB), - args...), + newShape(a, b, transA, transB)), transA_(transA), transB_(transB), scalar_(scalar) {} @@ -149,8 +146,7 @@ public: bool transA, bool transB, float scalar) - : NaryNodeOp(nodes, keywords::shape = newShape(nodes[0], nodes[1], - transA, transB)), + : NaryNodeOp(nodes, newShape(nodes[0], nodes[1], transA, transB)), transA_(transA), transB_(transB), scalar_(scalar){} @@ -278,16 +274,13 @@ private: float scalar_; public: - template <typename... Args> DotBatchedNodeOp(Expr a, Expr b, bool transA, bool transB, - float scalar, - Args... args) + float scalar) : NaryNodeOp({a, b}, - keywords::shape = newShape(a, b, transA, transB), - args...), + newShape(a, b, transA, transB)), transA_(transA), transB_(transB), scalar_(scalar) {} @@ -407,7 +400,7 @@ public: struct ScalarProductNodeOp : public NaryNodeOp { template <typename... Args> ScalarProductNodeOp(Expr a, Expr b, Args... args) - : NaryNodeOp({a, b}, keywords::shape = newShape(a, b, args...), args...) { + : NaryNodeOp({a, b}, newShape(a, b, args...)) { } template <typename... Args> @@ -440,9 +433,8 @@ struct ScalarProductNodeOp : public NaryNodeOp { }; struct ElementBinaryNodeOp : public NaryNodeOp { - template <typename... Args> - ElementBinaryNodeOp(Expr a, Expr b, Args... args) - : NaryNodeOp({a, b}, keywords::shape = newShape(a, b), args...) {} + ElementBinaryNodeOp(Expr a, Expr b) + : NaryNodeOp({a, b}, newShape(a, b)) {} Shape newShape(Expr a, Expr b) { return Shape::broadcast({a, b}); @@ -452,8 +444,7 @@ struct ElementBinaryNodeOp : public NaryNodeOp { }; struct PlusNodeOp : public ElementBinaryNodeOp { - template <typename... Args> - PlusNodeOp(Args... args) : ElementBinaryNodeOp(args...) {} + PlusNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} NodeOps forwardOps() { using namespace functional; @@ -473,8 +464,7 @@ struct PlusNodeOp : public ElementBinaryNodeOp { }; struct MinusNodeOp : public ElementBinaryNodeOp { - template <typename... Args> - MinusNodeOp(Args... args) : ElementBinaryNodeOp(args...) 
{} + MinusNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} NodeOps forwardOps() { using namespace functional; @@ -494,8 +484,7 @@ struct MinusNodeOp : public ElementBinaryNodeOp { }; struct MultNodeOp : public ElementBinaryNodeOp { - template <typename... Args> - MultNodeOp(Args... args) : ElementBinaryNodeOp(args...) {} + MultNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} NodeOps forwardOps() { using namespace functional; @@ -515,8 +504,7 @@ struct MultNodeOp : public ElementBinaryNodeOp { }; struct DivNodeOp : public ElementBinaryNodeOp { - template <typename... Args> - DivNodeOp(Args... args) : ElementBinaryNodeOp(args...) {} + DivNodeOp(Expr a, Expr b) : ElementBinaryNodeOp(a, b) {} NodeOps forwardOps() { using namespace functional; @@ -565,9 +553,8 @@ struct DivNodeOp : public ElementBinaryNodeOp { // Cross-entropy node. It computes -b*log(softmax(a)), summing rowwise. struct CrossEntropyNodeOp : public NaryNodeOp { - template <typename... Args> - CrossEntropyNodeOp(Expr a, Expr b, Args... args) - : NaryNodeOp({a, b}, keywords::shape = newShape(a), args...) {} + CrossEntropyNodeOp(Expr a, Expr b) + : NaryNodeOp({a, b}, newShape(a)) {} Shape newShape(Expr a) { Shape shape1 = a->shape(); @@ -591,10 +578,7 @@ struct CrossEntropyNodeOp : public NaryNodeOp { struct ConcatenateNodeOp : public NaryNodeOp { template <typename... Args> ConcatenateNodeOp(const std::vector<Expr>& nodes, Args... args) - : NaryNodeOp(nodes, - keywords::shape - = newShape(nodes, keywords::Get(keywords::axis, 0, args...)), - args...) {} + : NaryNodeOp(nodes, newShape(nodes, keywords::Get(keywords::axis, 0, args...))) {} Shape newShape(const std::vector<Expr>& nodes, int ax) { Shape shape = nodes.back()->shape(); diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h index 0a76471b..e857e790 100644 --- a/src/graph/node_operators_unary.h +++ b/src/graph/node_operators_unary.h @@ -14,9 +14,11 @@ namespace marian { struct UnaryNodeOp : public NaryNodeOp { - template <typename... Args> - UnaryNodeOp(Expr a, Args... args) - : NaryNodeOp({a}, keywords::shape = a->shape(), args...) {} + UnaryNodeOp(Expr a, Shape shape) + : NaryNodeOp({a}, shape) {} + + UnaryNodeOp(Expr a) + : NaryNodeOp({a}, a->shape()) {} const std::string color() { return "yellow"; } }; @@ -26,9 +28,9 @@ private: float scalar_{0}; public: - template <typename... Args> - ScalarAddNodeOp(Expr a, float scalar, Args... args) - : UnaryNodeOp(a, args...), scalar_{scalar} {} + ScalarAddNodeOp(Expr a, float scalar) + : UnaryNodeOp(a), + scalar_{scalar} {} NodeOps forwardOps() { using namespace functional; @@ -67,9 +69,8 @@ private: float scalar_{0}; public: - template <typename... Args> - ScalarMultNodeOp(Expr a, float scalar, Args... args) - : UnaryNodeOp(a, args...), scalar_{scalar} {} + ScalarMultNodeOp(Expr a, float scalar) + : UnaryNodeOp(a), scalar_{scalar} {} NodeOps forwardOps() { using namespace functional; @@ -104,8 +105,7 @@ public: }; struct LogitNodeOp : public UnaryNodeOp { - template <typename... Args> - LogitNodeOp(Args... args) : UnaryNodeOp(args...) 
{} + LogitNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { using namespace functional; @@ -164,7 +164,7 @@ struct LogitNodeOp : public UnaryNodeOp { struct TanhNodeOp : public NaryNodeOp { TanhNodeOp(const std::vector<Expr>& nodes) - : NaryNodeOp(nodes, keywords::shape = newShape(nodes)) {} + : NaryNodeOp(nodes, newShape(nodes)) {} Shape newShape(const std::vector<Expr>& nodes) { return Shape::broadcast(nodes); @@ -214,8 +214,7 @@ struct TanhNodeOp : public NaryNodeOp { struct ReLUNodeOp : public UnaryNodeOp { - template <typename... Args> - ReLUNodeOp(Args... args) : UnaryNodeOp(args...) {} + ReLUNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { // f(x) = max(0, x) @@ -265,9 +264,8 @@ struct ReLUNodeOp : public UnaryNodeOp { * \f] */ struct PReLUNodeOp : public UnaryNodeOp { - template <typename... Args> - PReLUNodeOp(float alpha, Args... args) - : UnaryNodeOp(args...), alpha_(alpha) {} + PReLUNodeOp(float alpha, Expr a) + : UnaryNodeOp(a), alpha_(alpha) {} NodeOps forwardOps() { using namespace functional; @@ -316,8 +314,7 @@ private: * */ struct SwishNodeOp : public UnaryNodeOp { - template <typename... Args> - SwishNodeOp(Args... args) : UnaryNodeOp(args...) {} + SwishNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { using namespace functional; @@ -338,14 +335,12 @@ struct SwishNodeOp : public UnaryNodeOp { const std::string type() { return "swish"; } }; -struct SoftmaxNodeOp : public NaryNodeOp { - template <typename... Args> - SoftmaxNodeOp(Expr a, Args... args) - : NaryNodeOp(a, args...), mask_(nullptr) {} +struct SoftmaxNodeOp : public UnaryNodeOp { + SoftmaxNodeOp(Expr a) + : UnaryNodeOp(a), mask_(nullptr) {} - template <typename... Args> - SoftmaxNodeOp(Expr a, Expr mask, Args... args) - : NaryNodeOp({a}, args...), mask_(mask) {} + SoftmaxNodeOp(Expr a, Expr mask) + : UnaryNodeOp(a), mask_(mask) {} Expr mask_; @@ -396,8 +391,7 @@ struct SoftmaxNodeOp : public NaryNodeOp { }; struct LogSoftmaxNodeOp : public UnaryNodeOp { - template <typename... Args> - LogSoftmaxNodeOp(Args... args) : UnaryNodeOp(args...) {} + LogSoftmaxNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { return {NodeOp(LogSoftmax(val_, child(0)->val()))}; } @@ -416,7 +410,7 @@ struct SumNodeOp : public UnaryNodeOp { template <typename... Args> SumNodeOp(Expr a, Args... args) - : UnaryNodeOp(a, keywords::shape = newShape(a, args...), args...) {} + : UnaryNodeOp(a, newShape(a, args...)) {} NodeOps forwardOps() { using namespace functional; @@ -465,7 +459,7 @@ struct MeanNodeOp : public UnaryNodeOp { template <typename... Args> MeanNodeOp(Expr a, Args... args) - : UnaryNodeOp(a, keywords::shape = newShape(a, args...), args...) {} + : UnaryNodeOp(a, newShape(a, args...)) {} NodeOps forwardOps() { using namespace functional; @@ -516,8 +510,7 @@ struct MeanNodeOp : public UnaryNodeOp { }; struct LogNodeOp : public UnaryNodeOp { - template <typename... Args> - LogNodeOp(Args... args) : UnaryNodeOp(args...) {} + LogNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { using namespace functional; @@ -534,8 +527,7 @@ struct LogNodeOp : public UnaryNodeOp { }; struct ExpNodeOp : public UnaryNodeOp { - template <typename... Args> - ExpNodeOp(Args... args) : UnaryNodeOp(args...) {} + ExpNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { using namespace functional; @@ -553,9 +545,8 @@ struct ExpNodeOp : public UnaryNodeOp { struct SqrtNodeOp : public UnaryNodeOp { float epsilon_; - template <typename... Args> - SqrtNodeOp(Expr a, float epsilon, Args... 
args) - : UnaryNodeOp(a, args...), epsilon_(epsilon) {} + SqrtNodeOp(Expr a, float epsilon) + : UnaryNodeOp(a), epsilon_(epsilon) {} NodeOps forwardOps() { using namespace functional; @@ -591,8 +582,7 @@ struct SqrtNodeOp : public UnaryNodeOp { }; struct SquareNodeOp : public UnaryNodeOp { - template <typename... Args> - SquareNodeOp(Args... args) : UnaryNodeOp(args...) {} + SquareNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { using namespace functional; @@ -609,8 +599,7 @@ struct SquareNodeOp : public UnaryNodeOp { }; struct NegNodeOp : public UnaryNodeOp { - template <typename... Args> - NegNodeOp(Args... args) : UnaryNodeOp(args...) {} + NegNodeOp(Expr a) : UnaryNodeOp(a) {} NodeOps forwardOps() { using namespace functional; @@ -626,9 +615,8 @@ struct NegNodeOp : public UnaryNodeOp { }; struct RowsNodeOp : public UnaryNodeOp { - template <typename... Args> - RowsNodeOp(Expr a, const std::vector<size_t>& indeces, Args... args) - : UnaryNodeOp(a, keywords::shape = newShape(a, indeces), args...), + RowsNodeOp(Expr a, const std::vector<size_t>& indeces) + : UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {} NodeOps forwardOps() { @@ -679,9 +667,8 @@ struct RowsNodeOp : public UnaryNodeOp { }; struct ColsNodeOp : public UnaryNodeOp { - template <typename... Args> - ColsNodeOp(Expr a, const std::vector<size_t>& indeces, Args... args) - : UnaryNodeOp(a, keywords::shape = newShape(a, indeces), args...), + ColsNodeOp(Expr a, const std::vector<size_t>& indeces) + : UnaryNodeOp(a, newShape(a, indeces)), indices_(indeces) {} NodeOps forwardOps() { @@ -731,7 +718,7 @@ struct ColsNodeOp : public UnaryNodeOp { struct SelectNodeOp : public UnaryNodeOp { SelectNodeOp(Expr a, int axis, const std::vector<size_t>& indeces) - : UnaryNodeOp(a, keywords::shape = newShape(a, axis, indeces)), + : UnaryNodeOp(a, newShape(a, axis, indeces)), indices_(indeces) {} NodeOps forwardOps() { @@ -787,7 +774,7 @@ struct TransposeNodeOp : public UnaryNodeOp { std::vector<int> axes_; TransposeNodeOp(Expr a, const std::vector<int>& axes) - : UnaryNodeOp(a, keywords::shape = newShape(a, axes)), + : UnaryNodeOp(a, newShape(a, axes)), axes_{axes} {} NodeOps forwardOps() { @@ -844,8 +831,8 @@ private: public: template <typename... Args> - ReshapeNodeOp(Expr a, Shape shape, Args... args) - : UnaryNodeOp(a, keywords::shape = shape, args...), reshapee_(a) { + ReshapeNodeOp(Expr a, Shape shape) + : UnaryNodeOp(a, shape), reshapee_(a) { Node::destroy_ = false; } @@ -909,7 +896,7 @@ private: public: StepNodeOp(Expr a, int step, int axis) - : UnaryNodeOp(a, keywords::shape = newShape(a, axis)), + : UnaryNodeOp(a, newShape(a, axis)), stepNode_(a), step_(step) { Node::destroy_ = false; @@ -981,9 +968,8 @@ public: }; struct ShiftNodeOp : public UnaryNodeOp { - template <typename... Args> - ShiftNodeOp(Expr a, Shape shift, Args... 
args) - : UnaryNodeOp(a, keywords::shape = a->shape(), args...), shift_(shift) {} + ShiftNodeOp(Expr a, Shape shift) + : UnaryNodeOp(a, a->shape()), shift_(shift) {} NodeOps forwardOps() { return {NodeOp(Shift(val_, child(0)->val(), shift_, false))}; diff --git a/src/layers/convolution.cu b/src/layers/convolution.cu index b0749450..064abedf 100644 --- a/src/layers/convolution.cu +++ b/src/layers/convolution.cu @@ -18,11 +18,11 @@ Expr Convolution::apply(Expr x) { kernelNum, kernelDims.first, kernelDims.second}, - keywords::init=inits::glorot_uniform); + inits::glorot_uniform); auto bias = graph_->param(prefix + "_conv_bias", {1, kernelNum, 1, 1}, - keywords::init=inits::zeros); + inits::zeros); std::vector<Expr> nodes = {x, kernel, bias}; return Expression<ConvolutionOp>(nodes, diff --git a/src/layers/generic.h b/src/layers/generic.h index 558280af..b9c1d100 100644 --- a/src/layers/generic.h +++ b/src/layers/generic.h @@ -79,7 +79,7 @@ public: else { W = g->param(name + "_" + nameW, {in->shape()[-1], dim}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); } Expr b; @@ -88,7 +88,7 @@ public: b = tiedParams_[nameB]; else b = g->param( - name + "_" + nameB, {1, dim}, keywords::init = inits::zeros); + name + "_" + nameB, {1, dim}, inits::zeros); params_.push_back(W); params_.push_back(b); @@ -97,17 +97,17 @@ public: if(nematusNorm) { auto ln_s = g->param(name + "_ln_s" + std::to_string(i), {1, dim}, - keywords::init = inits::from_value(1.f)); + inits::from_value(1.f)); auto ln_b = g->param(name + "_ln_b" + std::to_string(i), {1, dim}, - keywords::init = inits::zeros); + inits::zeros); outputs.push_back( layer_norm(affine(in, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS)); } else { auto gamma = g->param(name + "_gamma" + std::to_string(i), {1, dim}, - keywords::init = inits::from_value(1.0)); + inits::from_value(1.0)); params_.push_back(gamma); outputs.push_back(layer_norm(dot(in, W, false, transposeW), gamma, b)); @@ -151,14 +151,14 @@ public: else { W = g->param(name + "_" + nameW, {input->shape()[-1], dim}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); } Expr b; std::string nameB = "b"; if(tiedParams_.count(nameB)) b = tiedParams_[nameB]; else - b = g->param(name + "_" + nameB, {1, dim}, keywords::init = inits::zeros); + b = g->param(name + "_" + nameB, {1, dim}, inits::zeros); params_ = {W, b}; @@ -166,15 +166,15 @@ public: if(layerNorm) { if(nematusNorm) { auto ln_s = g->param( - name + "_ln_s", {1, dim}, keywords::init = inits::from_value(1.f)); + name + "_ln_s", {1, dim}, inits::from_value(1.f)); auto ln_b - = g->param(name + "_ln_b", {1, dim}, keywords::init = inits::zeros); + = g->param(name + "_ln_b", {1, dim}, inits::zeros); out = layer_norm(affine(input, W, b, false, transposeW), ln_s, ln_b, NEMATUS_LN_EPS); } else { auto gamma = g->param( - name + "_gamma", {1, dim}, keywords::init = inits::from_value(1.0)); + name + "_gamma", {1, dim}, inits::from_value(1.0)); params_.push_back(gamma); out = layer_norm(dot(input, W, false, transposeW), gamma, b); @@ -219,8 +219,8 @@ struct EmbeddingFactory : public Factory { return graph_->param(name, {dimVoc, dimEmb}, - keywords::init = initFunc, - keywords::fixed = fixed); + initFunc, + fixed); } }; @@ -239,7 +239,7 @@ Expr Cost(Expr logits, if(weights) ce = weights * ce; - + if(smoothing > 0) { // @TODO: add this to CE kernels instead auto ceq = mean(logsoftmax(logits), axis = -1); @@ -250,7 +250,7 @@ Expr Cost(Expr logits, ce = ce * mask; auto costSum = sum(ce, axis = -3); - + Expr cost; // 
axes: // - time axis (words): -3 diff --git a/src/layers/guided_alignment.h b/src/layers/guided_alignment.h index fb430ecc..c353f649 100644 --- a/src/layers/guided_alignment.h +++ b/src/layers/guided_alignment.h @@ -17,7 +17,7 @@ Expr guidedAlignmentCost(Ptr<ExpressionGraph> graph, auto aln = graph->constant( {dimBatch, 1, dimSrc, dimTrg}, - keywords::init = inits::from_vector(batch->getGuidedAlignment())); + inits::from_vector(batch->getGuidedAlignment())); std::string guidedCostType = options->get<std::string>("guided-alignment-cost"); diff --git a/src/models/amun.h b/src/models/amun.h index fd4160a3..8382c863 100644 --- a/src/models/amun.h +++ b/src/models/amun.h @@ -41,9 +41,9 @@ public: using namespace keywords; LOG(info, "Loading model from {}", name); - + auto numpy = cnpy::npz_load(name); - + std::map<std::string, std::string> nameMap = {{"decoder_U", "decoder_cell1_U"}, {"decoder_Ux", "decoder_cell1_Ux"}, @@ -91,38 +91,38 @@ public: {"encoder_r_bx", "encoder_bi_r_bx"}, {"encoder_r_gamma1", "encoder_bi_r_gamma1"}, {"encoder_r_gamma2", "encoder_bi_r_gamma2"}}; - + if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all")) nameMap["Wemb"] = "Wemb"; - + graph->setReloaded(false); - + for(auto it : numpy) { auto name = it.first; - + if(name == "decoder_c_tt") continue; if(name.substr(0, 8) == "special:") continue; - + Shape shape; - if(numpy[name].shape.size() == 2) { + if(numpy[name]->shape.size() == 2) { shape.resize(2); - shape.set(0, numpy[name].shape[0]); - shape.set(1, numpy[name].shape[1]); - } else if(numpy[name].shape.size() == 1) { + shape.set(0, numpy[name]->shape[0]); + shape.set(1, numpy[name]->shape[1]); + } else if(numpy[name]->shape.size() == 1) { shape.resize(2); shape.set(0, 1); - shape.set(1, numpy[name].shape[0]); + shape.set(1, numpy[name]->shape[0]); } - + std::string pName = name; if(nameMap.count(name)) pName = nameMap[name]; - - graph->param(pName, shape, init = inits::from_numpy(numpy[name])); + + graph->param(pName, shape, inits::from_numpy(numpy[name])); } - + graph->setReloaded(true); } @@ -182,7 +182,7 @@ public: for(auto p : graph->params()->getMap()) { std::vector<float> v; - p.second->val() >> v; + p.second->val()->get(v); unsigned dim; if(p.second->shape()[0] == 1) { diff --git a/src/models/encdec.h b/src/models/encdec.h index 071a9eb0..cfda0404 100644 --- a/src/models/encdec.h +++ b/src/models/encdec.h @@ -31,7 +31,7 @@ protected: auto batchEmbeddings = reshape(chosenEmbeddings, {dimWords, dimBatch, dimEmb}); auto batchMask = graph->constant( - {dimWords, dimBatch, 1}, init = inits::from_vector(subBatch->mask())); + {dimWords, dimBatch, 1}, inits::from_vector(subBatch->mask())); return std::make_tuple(batchEmbeddings, batchMask); } @@ -113,10 +113,10 @@ public: = reshape(chosenEmbeddings, {dimWords, dimBatch, opt<int>("dim-emb")}); auto yMask = graph->constant({dimWords, dimBatch, 1}, - init = inits::from_vector(subBatch->mask())); + inits::from_vector(subBatch->mask())); auto yData = graph->constant({(int)subBatch->data().size(), 1}, - init = inits::from_vector(subBatch->data())); + inits::from_vector(subBatch->data())); auto yShifted = shift(y, {1, 0, 0}); @@ -150,7 +150,7 @@ public: Expr selectedEmbs; if(embIdx.empty()) { selectedEmbs = graph->constant({1, 1, dimBatch, dimTrgEmb}, - init = inits::zeros); + inits::zeros); } else { selectedEmbs = rows(yEmb, embIdx); selectedEmbs @@ -367,7 +367,7 @@ public: weights = graph->constant( {1, dimWords, dimBatch, 1}, - keywords::init = inits::from_vector(batch->getDataWeights())); + 
inits::from_vector(batch->getDataWeights())); } auto cost = Cost(nextState->getProbs(), diff --git a/src/models/nematus.h b/src/models/nematus.h index 3b93ab52..82b77c68 100644 --- a/src/models/nematus.h +++ b/src/models/nematus.h @@ -42,21 +42,21 @@ public: continue; Shape shape; - if(numpy[name].shape.size() == 2) { + if(numpy[name]->shape.size() == 2) { shape.resize(2); - shape.set(0, numpy[name].shape[0]); - shape.set(1, numpy[name].shape[1]); - } else if(numpy[name].shape.size() == 1) { + shape.set(0, numpy[name]->shape[0]); + shape.set(1, numpy[name]->shape[1]); + } else if(numpy[name]->shape.size() == 1) { shape.resize(2); shape.set(0, 1); - shape.set(1, numpy[name].shape[0]); + shape.set(1, numpy[name]->shape[0]); } std::string pName = name; if(nameMap_.count(name)) pName = nameMap_[name]; - graph->param(pName, shape, init = inits::from_numpy(numpy[name])); + graph->param(pName, shape, inits::from_numpy(numpy[name])); } graph->setReloaded(true); @@ -76,7 +76,7 @@ public: for(auto p : graph->params()->getMap()) { std::vector<float> v; - p.second->val() >> v; + p.second->val()->get(v); unsigned dim; if(p.second->shape()[0] == 1) { diff --git a/src/models/s2s.h b/src/models/s2s.h index 2d1ee281..164c86f5 100644 --- a/src/models/s2s.h +++ b/src/models/s2s.h @@ -275,7 +275,7 @@ public: int dimBatch = batch->size(); int dimRnn = opt<int>("dim-rnn"); - start = graph->constant({dimBatch, dimRnn}, init = inits::zeros); + start = graph->constant({dimBatch, dimRnn}, inits::zeros); } rnn::States startStates(opt<size_t>("dec-depth"), {start, start}); diff --git a/src/models/transformer.h b/src/models/transformer.h index c41453db..d8999263 100644 --- a/src/models/transformer.h +++ b/src/models/transformer.h @@ -35,7 +35,7 @@ public: // shared across batch entries auto signal = graph->constant({dimWords, 1, dimEmb}, - init = inits::from_vector(vPos)); + inits::from_vector(vPos)); return input + signal; } @@ -48,7 +48,7 @@ public: for(int j = 0; j <= i; ++j) vMask[i * length + j] = 1.f; return graph->constant({1, length, length}, - init = inits::from_vector(vMask)); + inits::from_vector(vMask)); } Expr InverseMask(Expr mask) { @@ -104,9 +104,9 @@ public: // layer normalization if(op == 'n') { auto scale = graph->param( - prefix + "_ln_scale_pre", {1, dimModel}, init = inits::ones); + prefix + "_ln_scale_pre", {1, dimModel}, inits::ones); auto bias = graph->param( - prefix + "_ln_bias_pre", {1, dimModel}, init = inits::zeros); + prefix + "_ln_bias_pre", {1, dimModel}, inits::zeros); output = layer_norm(output, scale, bias, 1e-6); } } @@ -136,9 +136,9 @@ public: // highway connection if(op == 'h') { auto Wh = graph->param( - prefix + "_Wh", {dimModel, dimModel}, init = inits::glorot_uniform); + prefix + "_Wh", {dimModel, dimModel}, inits::glorot_uniform); auto bh - = graph->param(prefix + "_bh", {1, dimModel}, init = inits::zeros); + = graph->param(prefix + "_bh", {1, dimModel}, inits::zeros); auto t = affine(prevInput, Wh, bh); output = highway(output, prevInput, t); @@ -146,9 +146,9 @@ public: // layer normalization if(op == 'n') { auto scale = graph->param( - prefix + "_ln_scale", {1, dimModel}, init = inits::ones); + prefix + "_ln_scale", {1, dimModel}, inits::ones); auto bias = graph->param( - prefix + "_ln_bias", {1, dimModel}, init = inits::zeros); + prefix + "_ln_bias", {1, dimModel}, inits::zeros); output = layer_norm(output, scale, bias, 1e-6); } } @@ -211,8 +211,8 @@ public: int dimModel = q->shape()[-1]; auto Wq = graph->param( - prefix + "_Wq", {dimModel, dimModel}, init = 
inits::glorot_uniform); - auto bq = graph->param(prefix + "_bq", {1, dimModel}, init = inits::zeros); + prefix + "_Wq", {dimModel, dimModel}, inits::glorot_uniform); + auto bq = graph->param(prefix + "_bq", {1, dimModel}, inits::zeros); auto qh = affine(q, Wq, bq); qh = SplitHeads(qh, dimHeads); @@ -224,15 +224,15 @@ public: auto Wk = graph->param(prefixProj + "_Wk", {dimModel, dimModel}, - init = inits::glorot_uniform); + inits::glorot_uniform); auto bk = graph->param( - prefixProj + "_bk", {1, dimModel}, init = inits::zeros); + prefixProj + "_bk", {1, dimModel}, inits::zeros); auto Wv = graph->param(prefixProj + "_Wv", {dimModel, dimModel}, - init = inits::glorot_uniform); + inits::glorot_uniform); auto bv = graph->param( - prefixProj + "_bv", {1, dimModel}, init = inits::zeros); + prefixProj + "_bv", {1, dimModel}, inits::zeros); auto kh = affine(keys[i], Wk, bk); auto vh = affine(values[i], Wv, bv); @@ -258,8 +258,8 @@ public: int dimAtt = output->shape()[-1]; auto Wo = graph->param( - prefix + "_Wo", {dimAtt, dimOut}, init = inits::glorot_uniform); - auto bo = graph->param(prefix + "_bo", {1, dimOut}, init = inits::zeros); + prefix + "_Wo", {dimAtt, dimOut}, inits::glorot_uniform); + auto bo = graph->param(prefix + "_bo", {1, dimOut}, inits::zeros); output = affine(output, Wo, bo); return output; @@ -336,12 +336,12 @@ public: int dimFfn = options->get<int>("transformer-dim-ffn"); auto W1 = graph->param( - prefix + "_W1", {dimModel, dimFfn}, init = inits::glorot_uniform); - auto b1 = graph->param(prefix + "_b1", {1, dimFfn}, init = inits::zeros); + prefix + "_W1", {dimModel, dimFfn}, inits::glorot_uniform); + auto b1 = graph->param(prefix + "_b1", {1, dimFfn}, inits::zeros); auto W2 = graph->param( - prefix + "_W2", {dimFfn, dimModel}, init = inits::glorot_uniform); - auto b2 = graph->param(prefix + "_b2", {1, dimModel}, init = inits::zeros); + prefix + "_W2", {dimFfn, dimModel}, inits::glorot_uniform); + auto b2 = graph->param(prefix + "_b2", {1, dimModel}, inits::zeros); output = affine(output, W1, b1); output = swish(output); diff --git a/src/optimizers/optimizers.cu b/src/optimizers/optimizers.cu index 49c380e1..afec4708 100644 --- a/src/optimizers/optimizers.cu +++ b/src/optimizers/optimizers.cu @@ -50,15 +50,15 @@ void Adagrad::load(const std::string& name, auto numpy = cnpy::npz_load(name); for(auto it : numpy) { auto name = it.first; - cnpy::NpyArray& np = it.second; + auto np = it.second; // get the size of gt_ - totalSize = np.shape[1]; + totalSize = np->shape[1]; // extract data into vectors if(name == "adagrad_gt") { vGt.resize(totalSize); - std::copy((float*)np.data(), (float*)np.data() + totalSize, vGt.begin()); + std::copy((float*)np->data(), (float*)np->data() + totalSize, vGt.begin()); } } @@ -174,19 +174,19 @@ void Adam::load(const std::string& name, auto numpy = cnpy::npz_load(name); for(auto it : numpy) { auto name = it.first; - cnpy::NpyArray& np = it.second; + auto np = it.second; // get the size of mt_ and vt_, they are the same - totalSize = np.shape[1]; + totalSize = np->shape[1]; // extract data into vectors if(name == "adam_mt") { vMt.resize(totalSize); - std::copy((float*)np.data(), (float*)np.data() + totalSize, vMt.begin()); + std::copy((float*)np->data(), (float*)np->data() + totalSize, vMt.begin()); } if(name == "adam_vt") { vVt.resize(totalSize); - std::copy((float*)np.data(), (float*)np.data() + totalSize, vVt.begin()); + std::copy((float*)np->data(), (float*)np->data() + totalSize, vVt.begin()); } } diff --git a/src/rnn/attention.cu 
b/src/rnn/attention.cu index 2faa9d9a..d5e44a2f 100644 --- a/src/rnn/attention.cu +++ b/src/rnn/attention.cu @@ -9,7 +9,7 @@ namespace rnn { struct AttentionNodeOp : public NaryNodeOp { AttentionNodeOp(const std::vector<Expr>& nodes) - : NaryNodeOp(nodes, keywords::shape = newShape(nodes)) {} + : NaryNodeOp(nodes, newShape(nodes)) {} Shape newShape(const std::vector<Expr>& nodes) { Shape shape = Shape::broadcast({nodes[1], nodes[2]}); diff --git a/src/rnn/attention.h b/src/rnn/attention.h index faece60a..70337355 100644 --- a/src/rnn/attention.h +++ b/src/rnn/attention.h @@ -51,15 +51,15 @@ public: Wa_ = graph->param(prefix + "_W_comb_att", {dimDecState, dimEncState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); Ua_ = graph->param(prefix + "_Wc_att", {dimEncState, dimEncState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); va_ = graph->param(prefix + "_U_att", {dimEncState, 1}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); ba_ = graph->param( - prefix + "_b_att", {1, dimEncState}, keywords::init = inits::zeros); + prefix + "_b_att", {1, dimEncState}, inits::zeros); if(dropout_ > 0.0f) { dropMaskContext_ = graph->dropout(dropout_, {1, dimEncState}); @@ -75,17 +75,17 @@ public: // instead of gammaContext_ Wc_att_lns_ = graph->param(prefix + "_Wc_att_lns", {1, dimEncState}, - keywords::init = inits::from_value(1.f)); + inits::from_value(1.f)); Wc_att_lnb_ = graph->param(prefix + "_Wc_att_lnb", {1, dimEncState}, - keywords::init = inits::zeros); + inits::zeros); // instead of gammaState_ W_comb_att_lns_ = graph->param(prefix + "_W_comb_att_lns", {1, dimEncState}, - keywords::init = inits::from_value(1.f)); + inits::from_value(1.f)); W_comb_att_lnb_ = graph->param(prefix + "_W_comb_att_lnb", {1, dimEncState}, - keywords::init = inits::zeros); + inits::zeros); mappedContext_ = layer_norm(affine(contextDropped_, Ua_, ba_), Wc_att_lns_, @@ -94,10 +94,10 @@ public: } else { gammaContext_ = graph->param(prefix + "_att_gamma1", {1, dimEncState}, - keywords::init = inits::from_value(1.0)); + inits::from_value(1.0)); gammaState_ = graph->param(prefix + "_att_gamma2", {1, dimEncState}, - keywords::init = inits::from_value(1.0)); + inits::from_value(1.0)); mappedContext_ = layer_norm(dot(contextDropped_, Ua_), gammaContext_, ba_); @@ -144,7 +144,7 @@ public: auto alignedSource = scalar_product(encState_->getAttended(), e, axis = -3); - + contexts_.push_back(alignedSource); alignments_.push_back(e); return alignedSource; diff --git a/src/rnn/cells.cu b/src/rnn/cells.cu index 42373eab..8b38780f 100644 --- a/src/rnn/cells.cu +++ b/src/rnn/cells.cu @@ -9,9 +9,8 @@ namespace rnn { struct GRUFastNodeOp : public NaryNodeOp { bool final_; - template <typename... Args> - GRUFastNodeOp(const std::vector<Expr>& nodes, bool final, Args... args) - : NaryNodeOp(nodes, args...), final_(final) {} + GRUFastNodeOp(const std::vector<Expr>& nodes, bool final) + : NaryNodeOp(nodes), final_(final) {} NodeOps forwardOps() { std::vector<Tensor> inputs; @@ -53,9 +52,8 @@ Expr gruOps(const std::vector<Expr>& nodes, bool final) { /******************************************************************************/ struct LSTMCellNodeOp : public NaryNodeOp { - template <typename... Args> - LSTMCellNodeOp(const std::vector<Expr>& nodes, Args... args) - : NaryNodeOp(nodes, args...) 
{} + LSTMCellNodeOp(const std::vector<Expr>& nodes) + : NaryNodeOp(nodes) {} NodeOps forwardOps() { std::vector<Tensor> inputs; @@ -91,9 +89,8 @@ struct LSTMCellNodeOp : public NaryNodeOp { }; struct LSTMOutputNodeOp : public NaryNodeOp { - template <typename... Args> - LSTMOutputNodeOp(const std::vector<Expr>& nodes, Args... args) - : NaryNodeOp(nodes, args...) {} + LSTMOutputNodeOp(const std::vector<Expr>& nodes) + : NaryNodeOp(nodes) {} NodeOps forwardOps() { std::vector<Tensor> inputs; diff --git a/src/rnn/cells.h b/src/rnn/cells.h index 36fda810..2eeed6fa 100644 --- a/src/rnn/cells.h +++ b/src/rnn/cells.h @@ -36,15 +36,15 @@ public: U_ = graph->param(prefix + "_U", {dimState, dimState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); if(dimInput) W_ = graph->param(prefix + "_W", {dimInput, dimState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); b_ = graph->param( - prefix + "_b", {1, dimState}, keywords::init = inits::zeros); + prefix + "_b", {1, dimState}, inits::zeros); if(dropout_ > 0.0f) { if(dimInput) @@ -56,10 +56,10 @@ public: if(dimInput) gamma1_ = graph->param(prefix + "_gamma1", {1, 3 * dimState}, - keywords::init = inits::from_value(1.f)); + inits::from_value(1.f)); gamma2_ = graph->param(prefix + "_gamma2", {1, 3 * dimState}, - keywords::init = inits::from_value(1.f)); + inits::from_value(1.f)); } } @@ -143,35 +143,35 @@ public: auto U = graph->param(prefix + "_U", {dimState, 2 * dimState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); auto Ux = graph->param(prefix + "_Ux", {dimState, dimState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); U_ = concatenate({U, Ux}, keywords::axis = -1); if(dimInput > 0) { auto W = graph->param(prefix + "_W", {dimInput, 2 * dimState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); auto Wx = graph->param(prefix + "_Wx", {dimInput, dimState}, - keywords::init = inits::glorot_uniform); + inits::glorot_uniform); W_ = concatenate({W, Wx}, keywords::axis = -1); } auto b = graph->param( - prefix + "_b", {1, 2 * dimState}, keywords::init = inits::zeros); + prefix + "_b", {1, 2 * dimState}, inits::zeros); auto bx = graph->param( - prefix + "_bx", {1, dimState}, keywords::init = inits::zeros); + prefix + "_bx", {1, dimState}, inits::zeros); b_ = concatenate({b, bx}, keywords::axis = -1); // @TODO use this and adjust Amun model type saving and loading // U_ = graph->param(prefix + "_U", {dimState, 3 * dimState}, - // keywords::init=inits::glorot_uniform); + // (Expr a) : UnaryNodeOp(a)inits::glorot_uniform); // W_ = graph->param(prefix + "_W", {dimInput, 3 * dimState}, - // keywords::init=inits::glorot_uniform); + // (Expr a) : UnaryNodeOp(a)inits::glorot_uniform); // b_ = graph->param(prefix + "_b", {1, 3 * dimState}, - // keywords::init=inits::zeros); + // (Expr a) : UnaryNodeOp(a)inits::zeros); if(dropout_ > 0.0f) { if(dimInput) @@ -183,10 +183,10 @@ public: if(dimInput) gamma1_ = graph->param(prefix + "_gamma1", {1, 3 * dimState}, - keywords::init = inits::from_value(1.f)); + inits::from_value(1.f)); gamma2_ = graph->param(prefix + "_gamma2", {1, 3 * dimState}, - keywords::init = inits::from_value(1.f)); + inits::from_value(1.f)); } } @@ -231,7 +231,7 @@ public: if(xWs.empty()) { if(!fakeInput_ || fakeInput_->shape() != sU->shape()) fakeInput_ - = sU->graph()->constant(sU->shape(), keywords::init = inits::zeros); + = sU->graph()->constant(sU->shape(), inits::zeros); xW = fakeInput_; } else { xW = xWs.front(); @@ -299,10 +299,10 @@ 
public:
     auto U = graph->param(prefix + "_U",
                           {dimState, 2 * dimState},
-                          keywords::init = inits::glorot_uniform);
+                          inits::glorot_uniform);
     auto Ux = graph->param(prefix + "_Ux",
                            {dimState, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     if(layerNorm_) {
       U_ = U;
@@ -314,10 +314,10 @@ public:
     if(dimInput > 0) {
       auto W = graph->param(prefix + "_W",
                             {dimInput, 2 * dimState},
-                            keywords::init = inits::glorot_uniform);
+                            inits::glorot_uniform);
       auto Wx = graph->param(prefix + "_Wx",
                              {dimInput, dimState},
-                             keywords::init = inits::glorot_uniform);
+                             inits::glorot_uniform);
       if(layerNorm_) {
         W_ = W;
         Wx_ = Wx;
@@ -327,9 +327,9 @@ public:
     }
     auto b = graph->param(
-        prefix + "_b", {1, 2 * dimState}, keywords::init = inits::zeros);
+        prefix + "_b", {1, 2 * dimState}, inits::zeros);
     auto bx = graph->param(
-        prefix + "_bx", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bx", {1, dimState}, inits::zeros);
     if(layerNorm_) {
       b_ = b;
@@ -338,11 +338,11 @@ public:
       // in specific cases we need to pass bx to the kernel
       if(encoder_ && transition_) {
         auto b0
-            = graph->constant({1, 2 * dimState}, keywords::init = inits::zeros);
+            = graph->constant({1, 2 * dimState}, inits::zeros);
         bbx_ = concatenate({b0, bx}, keywords::axis = -1);
       } else {
         bbx_
-            = graph->constant({1, 3 * dimState}, keywords::init = inits::zeros);
+            = graph->constant({1, 3 * dimState}, inits::zeros);
       }
     } else {
       bbx_ = concatenate({b, bx}, keywords::axis = -1);
@@ -358,27 +358,27 @@ public:
     if(dimInput) {
       W_lns_ = graph->param(prefix + "_W_lns",
                             {1, 2 * dimState},
-                            keywords::init = inits::from_value(1.f));
+                            inits::from_value(1.f));
       W_lnb_ = graph->param(prefix + "_W_lnb",
                             {1, 2 * dimState},
-                            keywords::init = inits::zeros);
+                            inits::zeros);
       Wx_lns_ = graph->param(prefix + "_Wx_lns",
                              {1, 1 * dimState},
-                             keywords::init = inits::from_value(1.f));
+                             inits::from_value(1.f));
       Wx_lnb_ = graph->param(prefix + "_Wx_lnb",
                              {1, 1 * dimState},
-                             keywords::init = inits::zeros);
+                             inits::zeros);
     }
     U_lns_ = graph->param(prefix + "_U_lns",
                           {1, 2 * dimState},
-                          keywords::init = inits::from_value(1.f));
+                          inits::from_value(1.f));
     U_lnb_ = graph->param(
-        prefix + "_U_lnb", {1, 2 * dimState}, keywords::init = inits::zeros);
+        prefix + "_U_lnb", {1, 2 * dimState}, inits::zeros);
     Ux_lns_ = graph->param(prefix + "_Ux_lns",
                            {1, 1 * dimState},
-                           keywords::init = inits::from_value(1.f));
+                           inits::from_value(1.f));
     Ux_lnb_ = graph->param(
-        prefix + "_Ux_lnb", {1, 1 * dimState}, keywords::init = inits::zeros);
+        prefix + "_Ux_lnb", {1, 1 * dimState}, inits::zeros);
     }
   }
@@ -468,7 +468,7 @@ public:
     if(transition_) {
       if(!fakeInput_ || fakeInput_->shape() != sU->shape())
         fakeInput_
-            = sU->graph()->constant(sU->shape(), keywords::init = inits::zeros);
+            = sU->graph()->constant(sU->shape(), inits::zeros);
       xW = fakeInput_;
     } else {
       xW = xWs.front();
@@ -514,14 +514,14 @@ public:
     U_ = graph->param(prefix + "_U",
                       {dimState, 4 * dimState},
-                      keywords::init = inits::glorot_uniform);
+                      inits::glorot_uniform);
     if(dimInput)
       W_ = graph->param(prefix + "_W",
                         {dimInput, 4 * dimState},
-                        keywords::init = inits::glorot_uniform);
+                        inits::glorot_uniform);
     b_ = graph->param(
-        prefix + "_b", {1, 4 * dimState}, keywords::init = inits::zeros);
+        prefix + "_b", {1, 4 * dimState}, inits::zeros);
     if(dropout_ > 0.0f) {
       if(dimInput)
@@ -533,10 +533,10 @@ public:
       if(dimInput)
         gamma1_ = graph->param(prefix + "_gamma1",
                                {1, 4 * dimState},
-                               keywords::init = inits::from_value(1.f));
+                               inits::from_value(1.f));
       gamma2_ = graph->param(prefix + "_gamma2",
                              {1, 4 * dimState},
-                             keywords::init = inits::from_value(1.f));
+                             inits::from_value(1.f));
     }
   }
@@ -586,7 +586,7 @@ public:
     if(xWs.empty()) {
       if(!fakeInput_ || fakeInput_->shape() != sU->shape())
         fakeInput_
-            = sU->graph()->constant(sU->shape(), keywords::init = inits::zeros);
+            = sU->graph()->constant(sU->shape(), inits::zeros);
       xW = fakeInput_;
     } else {
       xW = xWs.front();
@@ -623,20 +623,20 @@ public:
     Um_ = graph->param(prefix + "_Um",
                        {dimState, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     Wm_ = graph->param(prefix + "_Wm",
                        {dimInput, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     bm_ = graph->param(
-        prefix + "_bm", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bm", {1, dimState}, inits::zeros);
     if(CellType::layerNorm_) {
       gamma1m_ = graph->param(prefix + "_gamma1m",
                               {1, dimState},
-                              keywords::init = inits::from_value(1.f));
+                              inits::from_value(1.f));
       gamma2m_ = graph->param(prefix + "_gamma2m",
                               {1, dimState},
-                              keywords::init = inits::from_value(1.f));
+                              inits::from_value(1.f));
     }
   }
@@ -697,39 +697,39 @@ public:
     Uf_ = graph->param(prefix + "_Uf",
                        {dimState, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     Wf_ = graph->param(prefix + "_Wf",
                        {dimInput, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     bf_ = graph->param(
-        prefix + "_bf", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bf", {1, dimState}, inits::zeros);
     Ui_ = graph->param(prefix + "_Ui",
                        {dimState, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     Wi_ = graph->param(prefix + "_Wi",
                        {dimInput, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     bi_ = graph->param(
-        prefix + "_bi", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bi", {1, dimState}, inits::zeros);
     Uc_ = graph->param(prefix + "_Uc",
                        {dimState, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     Wc_ = graph->param(prefix + "_Wc",
                        {dimInput, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     bc_ = graph->param(
-        prefix + "_bc", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bc", {1, dimState}, inits::zeros);
     Uo_ = graph->param(prefix + "_Uo",
                        {dimState, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     Wo_ = graph->param(prefix + "_Wo",
                        {dimInput, dimState},
-                       keywords::init = inits::glorot_uniform);
+                       inits::glorot_uniform);
     bo_ = graph->param(
-        prefix + "_bo", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bo", {1, dimState}, inits::zeros);
   }
   State apply(std::vector<Expr> inputs, State state, Expr mask = nullptr) {
@@ -791,39 +791,39 @@ public:
     auto Uf = graph->param(prefix + "_Uf",
                            {dimState, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto Wf = graph->param(prefix + "_Wf",
                            {dimInput, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto bf = graph->param(
-        prefix + "_bf", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bf", {1, dimState}, inits::zeros);
     auto Ui = graph->param(prefix + "_Ui",
                            {dimState, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto Wi = graph->param(prefix + "_Wi",
                            {dimInput, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto bi = graph->param(
-        prefix + "_bi", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bi", {1, dimState}, inits::zeros);
     auto Uc = graph->param(prefix + "_Uc",
                            {dimState, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto Wc = graph->param(prefix + "_Wc",
                            {dimInput, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto bc = graph->param(
-        prefix + "_bc", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bc", {1, dimState}, inits::zeros);
     auto Uo = graph->param(prefix + "_Uo",
                            {dimState, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto Wo = graph->param(prefix + "_Wo",
                            {dimInput, dimState},
-                           keywords::init = inits::glorot_uniform);
+                           inits::glorot_uniform);
     auto bo = graph->param(
-        prefix + "_bo", {1, dimState}, keywords::init = inits::zeros);
+        prefix + "_bo", {1, dimState}, inits::zeros);
     U_ = concatenate({Uf, Ui, Uc, Uo}, keywords::axis = -1);
     W_ = concatenate({Wf, Wi, Wc, Wo}, keywords::axis = -1);
diff --git a/src/rnn/rnn.h b/src/rnn/rnn.h
index 7374ed2a..4f1be340 100644
--- a/src/rnn/rnn.h
+++ b/src/rnn/rnn.h
@@ -101,7 +101,7 @@ private:
     int dimBatch = input->shape()[-2];
     int dimState = cell_->getOptions()->get<int>("dimState");
-    auto output = graph->zeros(keywords::shape = {1, dimBatch, dimState});
+    auto output = graph->zeros({1, dimBatch, dimState});
     Expr cell = output;
     State startState{output, cell};
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index d6924402..24ca0738 100644
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -74,7 +74,7 @@ public:
     else
       std::copy(data(), data() + size(), v.data());
   }
-
+
   void set(const float* begin, const float* end) {
     if(backend_->getDevice().type == DeviceType::gpu)
       gpu::copy(backend_, begin, end, data());
@@ -189,14 +189,4 @@ public:
 typedef std::shared_ptr<TensorBase> Tensor;
-static Tensor operator<<(Tensor t, const std::vector<float> &v) {
-  t->set(v);
-  return t;
-}
-
-static Tensor operator>>(Tensor t, std::vector<float> &v) {
-  t->get(v);
-  return t;
-}
-
 }
diff --git a/src/translator/beam_search.h b/src/translator/beam_search.h
index 5c7cda26..20b7b628 100644
--- a/src/translator/beam_search.h
+++ b/src/translator/beam_search.h
@@ -139,7 +139,7 @@ public:
       if(first) {
         // no cost
         prevCosts = graph->constant({1, 1, 1, 1},
-                                    keywords::init = inits::from_value(0));
+                                    inits::from_value(0));
       } else {
         std::vector<float> beamCosts;
@@ -164,7 +164,7 @@ public:
         prevCosts = graph->constant({(int)localBeamSize, 1, dimBatch, 1},
-                                    keywords::init = inits::from_vector(beamCosts));
+                                    inits::from_vector(beamCosts));
       }
       //**********************************************************************
diff --git a/src/translator/scorers.h b/src/translator/scorers.h
index 296431f2..94bda6e7 100644
--- a/src/translator/scorers.h
+++ b/src/translator/scorers.h
@@ -133,7 +133,7 @@ public:
     p[2] = 0;
     penalties_ = graph->constant({1, dimVocab_},
-                                 keywords::init = inits::from_vector(p));
+                                 inits::from_vector(p));
     return New<WordPenaltyState>(dimVocab_, penalties_);
   }
@@ -169,7 +169,7 @@ public:
     p[2] = 0;
     penalties_ = graph->constant({1, dimVocab_},
-                                 keywords::init = inits::from_vector(p));
+                                 inits::from_vector(p));
     return New<WordPenaltyState>(dimVocab_, penalties_);
   }
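Illustrative note (not part of the patch): across the hunks above, the keyword-argument forms are dropped in favor of positional arguments, and the stream operators on Tensor are removed in favor of direct set/get calls. A minimal sketch of the resulting call style, assuming a graph, prefix, dimensions, and a Tensor t like those appearing in these files are in scope:

// Hedged sketch of the updated call sites; graph, prefix, dimState, dimBatch,
// and t are taken from the hunks above and assumed to already exist.
auto U = graph->param(prefix + "_U",
                      {dimState, 2 * dimState},
                      inits::glorot_uniform);          // was: keywords::init = inits::glorot_uniform
auto b = graph->param(prefix + "_b", {1, 2 * dimState}, inits::zeros);
auto output = graph->zeros({1, dimBatch, dimState});   // was: keywords::shape = {1, dimBatch, dimState}

std::vector<float> v(t->size());  // assumes TensorBase exposes size(); get() does not resize
t->get(v);                        // was: t >> v
t->set(v);                        // was: t << v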