fix merge

author: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2018-07-27 20:14:21 +0300
committer: Marcin Junczys-Dowmunt <junczys@amu.edu.pl> 2018-07-27 20:14:21 +0300
commit: dceb7185d86ed8fd1994e86dc3e3c0e03740ec4a (patch)
tree: 3514f87aa2da28313043959ebd0381b3ba7de233 /src/graph
parent: 5cc8674d974bb5cae7bc8f25a51472166164a579 (diff)
parent: 8b0e2f951b5ce09a622fa7239b2e1e5bd8344fe4 (diff)
5 files changed, 72 insertions, 23 deletions
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index d901000c..199994d0 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -244,12 +244,13 @@ public:
     }
   }
 
-  void backward() {
+  void backward(bool zero = true) {
     ABORT_IF(topNodes_.size() > 1,
              "There are more than one top most node for backward step");
 
     params_->allocateBackward();
-    params_->set_zero_adjoint();
+    if(zero)
+      params_->set_zero_adjoint();
 
     for(auto&& v : topNodes_)
       v->init_dependent();
@@ -264,7 +265,7 @@ public:
       nodesBackward_.pop_back();
 
       for(auto&& child : v->children()) {
-        if(child->trainable())
+        if(child->trainable() && child->type() != "param")
           child->set_zero_adjoint();
       }
 
diff --git a/src/graph/expression_operators.cpp b/src/graph/expression_operators.cpp
index 1666357a..ea8077fa 100644
--- a/src/graph/expression_operators.cpp
+++ b/src/graph/expression_operators.cpp
@@ -313,7 +313,9 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
         if(bc != b)
           bc = rec2(bc);
 
-        std::vector<Expr> nodes = {ac, bc, bias};
+        int rows = ac->shape().elements() / ac->shape()[-1];
+        Expr ones = ac->graph()->ones({rows, 1});
+        std::vector<Expr> nodes = {ac, bc, bias, ones};
         return rec2(Expression<AffineNodeOp>(nodes, transA, transB, scale),
                     true);
       };
@@ -333,13 +335,16 @@ Expr affine(Expr a, Expr b, Expr bias, bool transA, bool transB, float scale) {
   }
   else {
     // general version, MKL, CBlas or CUDA
+
     // if clipValue > 0, the inputs will be clipped to range [-clipValue, clipValue]
     // This is meant to keep values at the same range as used during training when
     // optimizing for 8-bit integer products. Likely to be removed in the future
     // when we explore better ways to handle this.
-    std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias};
-    return Expression<AffineNodeOp>(nodes, transA, transB, scale);
 
+    int rows = a->shape().elements() / a->shape()[-1];
+    Expr ones = a->graph()->ones({rows, 1});
+    std::vector<Expr> nodes = {clip(a, clipValue), clip(b, clipValue), bias, ones};
+    return Expression<AffineNodeOp>(nodes, transA, transB, scale);
   }
 }
 
@@ -462,6 +467,7 @@ Expr shift(Expr a, Shape shift, float padValue) {
 //}
 
 #ifdef CUDA_FOUND
+#ifdef CUDNN
 
 Expr avg_pooling(Expr x,
                  int height,
@@ -526,4 +532,5 @@ Expr pooling_with_masking(Expr x, Expr mask, int width, bool isEven) {
 }
 
 #endif
+#endif
 }
diff --git a/src/graph/expression_operators.h b/src/graph/expression_operators.h
index cc07dafb..53cf5966 100644
--- a/src/graph/expression_operators.h
+++ b/src/graph/expression_operators.h
@@ -106,7 +106,6 @@ Expr flatten_2d(Expr a);
 
 Expr rows(Expr a, const std::vector<size_t>& indices);
 Expr cols(Expr a, const std::vector<size_t>& indices);
-
 Expr select(Expr a, int axis, const std::vector<size_t>& indices);
 
 /*********************************************************/
diff --git a/src/graph/node_operators_binary.h b/src/graph/node_operators_binary.h
index 5b1f9865..ea2a3dfe 100644
--- a/src/graph/node_operators_binary.h
+++ b/src/graph/node_operators_binary.h
@@ -4,9 +4,12 @@
 
 #include "functional/functional.h"
 #include "graph/node.h"
-#include "tensors/gpu/cudnn_wrappers.h"
 #include "tensors/tensor_operators.h"
 
+#ifdef CUDNN
+#include "tensors/gpu/cudnn_wrappers.h"
+#endif
+
 namespace marian {
 
 class DotNodeOp : public NaryNodeOp {
@@ -167,15 +170,17 @@ public:
 
   NodeOps forwardOps() {
     using namespace functional;
+
     return {
-      NodeOp(ProdWithBias(val_,
-                          child(0)->val(),
-                          child(1)->val(),
-                          child(2)->val(),
-                          transA_,
-                          transB_,
-                          0.f,
-                          scalar_))
+      NodeOp(Prod(val_,
+                  child(0)->val(),
+                  child(1)->val(),
+                  transA_, transB_, 0.f, scalar_);
+             Prod(val_,
+                  child(3)->val(),
+                  child(2)->val(),
+                  false, false, 1.f, 1.f)
+             )
     };
   }
 
@@ -202,7 +207,12 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Add(_1, child(2)->grad(), adj_))};
+              NodeOp(Prod(child(2)->grad(),
+                          child(3)->val(), adj_,
+                          true, false,
+                          0.f, 1.f))
+              //NodeOp(Add(_1, child(2)->grad(), adj_))
+      };
 
     if(transA_ && !transB_)
       return {NodeOp(Prod(child(0)->grad(),
@@ -219,7 +229,12 @@ public:
                           false,
                           1.0,
                           scalar_)),
-              NodeOp(Add(_1, child(2)->grad(), adj_))};
+              NodeOp(Prod(child(2)->grad(),
+                          child(3)->val(), adj_,
+                          true, false,
+                          0.f, 1.f))
+              //NodeOp(Add(_1, child(2)->grad(), adj_))
+      };
 
     if(transA_ && transB_)
       return {NodeOp(Prod(child(0)->grad(),
@@ -236,7 +251,12 @@ public:
                           true,
                           1.0,
                           scalar_)),
-              NodeOp(Add(_1, child(2)->grad(), adj_))};
+              NodeOp(Prod(child(2)->grad(),
+                          child(3)->val(), adj_,
+                          true, false,
+                          0.f, 1.f))
+              //NodeOp(Add(_1, child(2)->grad(), adj_))
+      };
 
     return {NodeOp(Prod(child(0)->grad(),
                         adj_,
@@ -252,7 +272,12 @@ public:
                         false,
                         1.0,
                         scalar_)),
-            NodeOp(Add(_1, child(2)->grad(), adj_))};
+            NodeOp(Prod(child(2)->grad(),
+                        child(3)->val(), adj_,
+                        true, false,
+                        0.f, 1.f))
+            //NodeOp(Add(_1, child(2)->grad(), adj_))
+    };
   }
 
   const std::string type() { return "affine"; }
@@ -294,6 +319,7 @@ public:
   NodeOps forwardOps() {
     // C = alpha * dot(op(A), op(B))
     return {NodeOp(ProdBatched(val_,
+                               graph()->allocator(),
                                child(0)->val(),
                                child(1)->val(),
                                transA_,
@@ -311,6 +337,7 @@ public:
 
     if(!transA_ && transB_)
       return {NodeOp(ProdBatched(child(0)->grad(),
+                                 graph()->allocator(),
                                  adj_,
                                  child(1)->val(),
                                  false,
@@ -318,6 +345,7 @@ public:
                                  1.0,
                                  scalar_)),
               NodeOp(ProdBatched(child(1)->grad(),
+                                 graph()->allocator(),
                                  adj_,
                                  child(0)->val(),
                                  true,
@@ -327,6 +355,7 @@ public:
 
     if(transA_ && !transB_)
       return {NodeOp(ProdBatched(child(0)->grad(),
+                                 graph()->allocator(),
                                  child(1)->val(),
                                  adj_,
                                  false,
@@ -334,6 +363,7 @@ public:
                                  1.0,
                                  scalar_)),
               NodeOp(ProdBatched(child(1)->grad(),
+                                 graph()->allocator(),
                                  child(0)->val(),
                                  adj_,
                                  false,
@@ -343,6 +373,7 @@ public:
 
     if(transA_ && transB_)
       return {NodeOp(ProdBatched(child(0)->grad(),
+                                 graph()->allocator(),
                                  child(1)->val(),
                                  adj_,
                                  true,
@@ -350,6 +381,7 @@ public:
                                  1.0,
                                  scalar_)),
               NodeOp(ProdBatched(child(1)->grad(),
+                                 graph()->allocator(),
                                  adj_,
                                  child(0)->val(),
                                  true,
@@ -358,6 +390,7 @@ public:
                                  scalar_))};
 
     return {NodeOp(ProdBatched(child(0)->grad(),
+                               graph()->allocator(),
                                adj_,
                                child(1)->val(),
                                false,
@@ -365,6 +398,7 @@ public:
                                1.0,
                                scalar_)),
             NodeOp(ProdBatched(child(1)->grad(),
+                               graph()->allocator(),
                                child(0)->val(),
                                adj_,
                                true,
@@ -766,6 +800,7 @@ struct HighwayNodeOp : public NaryNodeOp {
   const std::string type() { return "highway"; }
 };
 
+#ifdef CUDNN
 class ConvolutionOp : public NaryNodeOp {
 public:
   ConvolutionOp(const std::vector<Expr>& nodes,
@@ -802,4 +837,5 @@ public:
 protected:
   ConvolutionWrapper conv_;
 };
+#endif
 }
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index fa6d25c7..d7ef751d 100644
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -7,7 +7,9 @@
 #include "graph/node.h"
 #include "tensors/tensor_operators.h"
 
-//#include "tensors/gpu/cudnn_wrappers.h"
+#ifdef CUDNN
+#include "tensors/gpu/cudnn_wrappers.h"
+#endif
 
 namespace marian {
 
@@ -815,7 +817,7 @@ struct TransposeNodeOp : public UnaryNodeOp {
   }
 
   NodeOps backwardOps() {
-    return {NodeOp(TransposeND(child(0)->grad(), adj_, axes_))};
+    return {NodeOp(TransposeNDGrad(child(0)->grad(), adj_, axes_))};
   }
 
   template <class... Args>
@@ -1009,7 +1011,9 @@ struct ShiftNodeOp : public UnaryNodeOp {
   }
 
   NodeOps backwardOps() {
-    return {NodeOp(Shift(child(0)->grad(), adj_, shift_, /*padValue=*/0.f, /*invert=*/true))};
+    // NOTE: the last argument (true) presumably stands for beta=1, i.e. accumulate with += (out = in + beta * out) - confirm against ShiftGrad
+    // @TODO: check need for padValue_
+    return {NodeOp(ShiftGrad(child(0)->grad(), adj_, shift_, true))};
   }
 
   const std::string type() { return "shift"; }
@@ -1076,6 +1080,7 @@ struct ShiftNodeOp : public UnaryNodeOp {
 //  Ptr<sparse::CSR> lf_;
 //};
 
+#ifdef CUDNN
 class PoolingOp : public UnaryNodeOp {
 public:
   PoolingOp(Expr x,
@@ -1109,6 +1114,7 @@ public:
 protected:
   PoolingWrapper pooling_;
 };
+#endif
 
 class PoolingWithMaskingOp : public UnaryNodeOp {
 public:
author	Marcin Junczys-Dowmunt <junczys@amu.edu.pl>	2018-07-27 20:14:21 +0300
committer	Marcin Junczys-Dowmunt <junczys@amu.edu.pl>	2018-07-27 20:14:21 +0300
commit	dceb7185d86ed8fd1994e86dc3e3c0e03740ec4a (patch)
tree	3514f87aa2da28313043959ebd0381b3ba7de233 /src/graph
parent	5cc8674d974bb5cae7bc8f25a51472166164a579 (diff)
parent	8b0e2f951b5ce09a622fa7239b2e1e5bd8344fe4 (diff)