Add (again) layer normalization on CPU

author: Tomasz Dwojak <t.dwojak@amu.edu.pl> 2017-05-24 11:25:55 +0300
committer: Tomasz Dwojak <t.dwojak@amu.edu.pl> 2017-05-24 11:25:55 +0300
commit: a1a2ee0b21ae1b271674922135e3d2690c6c8dad (patch)
tree: fc827243b9ba4ce77def3d39ba9765379bde63ca
parent: 0f50f93cfe7e9a49704500ef3585ff9c40bc67b8 (diff)
5 files changed, 63 insertions, 36 deletions
diff --git a/src/amun/cpu/dl4mt/decoder.h b/src/amun/cpu/dl4mt/decoder.h
index b1c10e97..62dd1ba0 100644
--- a/src/amun/cpu/dl4mt/decoder.h
+++ b/src/amun/cpu/dl4mt/decoder.h
@@ -59,9 +59,14 @@ class Decoder {
 
           State = Temp2_ * w_.Wi_;
 
-          AddBiasVector<byRow>(State, w_.Bi_);
+          if (w_.Gamma_.rows()) {
+            LayerNormalization(State, w_.Gamma_);
+            AddBiasVector<byRow>(State, w_.Bi_);
+          } else {
+            AddBiasVector<byRow>(State, w_.Bi_);
+            State = blaze::forEach(State, Tanh());
+          }
 
-          State = blaze::forEach(State, Tanh());
         }
 
         void GetNextState(mblas::Matrix& NextState,
@@ -108,6 +113,9 @@ class Decoder {
         void Init(const mblas::Matrix& SourceContext) {
           using namespace mblas;
           SCU_ = SourceContext * w_.U_;
+          if (w_.Gamma_1_.rows()) {  // layer-normalize only when Gamma_1_ is non-empty
+            LayerNormalization(SCU_, w_.Gamma_1_);  // rescales SCU_ in place, before the bias is added
+          }
           AddBiasVector<byRow>(SCU_, w_.B_);
         }
 
@@ -117,11 +125,10 @@ class Decoder {
           using namespace mblas;
 
           Temp2_ = HiddenState * w_.W_;
+          if (w_.Gamma_2_.rows()) {
+            LayerNormalization(Temp2_, w_.Gamma_2_);
+          }
 
-          // For batching: create an A across different sentences,
-          // maybe by mapping and looping. In the and join different
-          // alignment matrices into one
-          // Or masking?
           Temp1_ = Broadcast<Matrix>(Tanh(), SCU_, Temp2_);
 
           A_.resize(Temp1_.rows(), 1);
@@ -171,12 +178,23 @@ class Decoder {
                   const mblas::Matrix& AlignedSourceContext) {
           using namespace mblas;
 
-          T1_ = State * w_.W1_;
-          T2_ = Embedding * w_.W2_;
-          T3_ = AlignedSourceContext * w_.W3_;
 
+          T1_ = State * w_.W1_;
+          if (w_.Gamma_1_.rows()) {
+            LayerNormalization(T1_, w_.Gamma_1_);
+          }
           AddBiasVector<byRow>(T1_, w_.B1_);
+
+          T2_ = Embedding * w_.W2_;
+          if (w_.Gamma_0_.rows()) {
+            LayerNormalization(T2_, w_.Gamma_0_);
+          }
           AddBiasVector<byRow>(T2_, w_.B2_);
+
+          T3_ = AlignedSourceContext * w_.W3_;
+          if (w_.Gamma_2_.rows()) {
+            LayerNormalization(T3_, w_.Gamma_2_);
+          }
           AddBiasVector<byRow>(T3_, w_.B3_);
 
           auto t = blaze::forEach(T1_ + T2_ + T3_, Tanh());
@@ -229,7 +247,7 @@ class Decoder {
       GetProbs(NextState, Embeddings, AlignedSourceContext_);
     }
 
-    BaseMatrix& GetProbs() {
+    mblas::ArrayMatrix& GetProbs() {  // hands back the Probs_ member by non-const reference
       return Probs_;
     }
 
diff --git a/src/amun/cpu/dl4mt/gru.h b/src/amun/cpu/dl4mt/gru.h
index 84e86357..ec301386 100644
--- a/src/amun/cpu/dl4mt/gru.h
+++ b/src/amun/cpu/dl4mt/gru.h
@@ -1,5 +1,5 @@
 #pragma once
-#include "../mblas/matrix.h"
+#include "cpu/mblas/matrix.h"
 
 namespace amunmt {
 namespace CPU {
@@ -13,66 +13,73 @@ class GRU {
       WWx_ = Concat<byColumn, Matrix>(w_.W_, w_.Wx_);
       UUx_ = Concat<byColumn, Matrix>(w_.U_, w_.Ux_);
     }
-          
+
     void GetNextState(mblas::Matrix& NextState,
                       const mblas::Matrix& State,
                       const mblas::Matrix& Context) const {
       RUH_ = Context * WWx_;
+      if (w_.Gamma_1_.rows()) {  // normalize the Context projection only when Gamma_1_ is non-empty
+        LayerNormalization(RUH_, w_.Gamma_1_);  // in place; biases are added later in ElementwiseOps
+      }
+
       Temp_ = State * UUx_;
-      
+      if (w_.Gamma_2_.rows()) {  // same guard for the State projection, scaled by Gamma_2_
+        LayerNormalization(Temp_, w_.Gamma_2_);
+      }
+
       // @TODO: once broadcasting is available
       // implement this using blaze idioms
       ElementwiseOps(NextState, State);
     }
-          
+
     void ElementwiseOps(mblas::Matrix& NextState,
                         const mblas::Matrix& State) const {
-      
+      // Fills NextState row by row from RUH_, Temp_, State and the bias rows in w_.
       using namespace mblas;
       using namespace blaze;
-      
+
       const size_t rowNo = State.rows();
       const size_t colNo = State.columns();
       NextState.resize(rowNo, colNo);
-      
+
       for(int j = 0; j < rowNo; ++j) {
         auto rowOut = row(NextState, j);
         auto rowState = row(State, j);
-        
+
         auto rowRuh = row(RUH_, j);
         auto rowT   = row(Temp_, j);
-        
+        // r reads columns [0, colNo), u reads [colNo, 2*colNo); rowH/rowT2 start at column 2*colNo.
         auto rowH   = subvector(rowRuh, 2 * colNo, colNo);
         auto rowT2  = subvector(rowT, 2 * colNo, colNo);
-        
+
         for(int i = 0; i < colNo; ++i) {
           float ev1 = expapprox(-(rowRuh[i] + w_.B_(0, i) + rowT[i]));
           float r = 1.0 / (1.0 + ev1);
-          
+
           int k = i + colNo;
           float ev2 = expapprox(-(rowRuh[k] + w_.B_(0, k) + rowT[k]));
-          float u = 1.0 / (1.0 + ev2);              
-    
+          float u = 1.0 / (1.0 + ev2);  // logistic 1/(1+e^-x), same form as r but on column k
+
           float hv = rowH[i] + w_.Bx1_(0, i);
           float t2v = rowT2[i] + w_.Bx2_(0, i);
           hv = tanhapprox(hv + r * t2v);
           rowOut[i] = (1.0 - u) * hv + u * rowState[i];
         }
       }
-      
+
     }
-    
+
     size_t GetStateLength() const {
       return w_.U_.rows();
     }
 
-    
+
   private:
     // Model matrices
-    const Weights& w_;    
+    const Weights& w_;
     mutable mblas::Matrix WWx_;
     mutable mblas::Matrix UUx_;
-    
+
     // reused to avoid allocation
     mutable mblas::Matrix RUH_;
     mutable mblas::Matrix Temp_;
diff --git a/src/amun/cpu/dl4mt/model.cpp b/src/amun/cpu/dl4mt/model.cpp
index 02e77a13..f3e40bae 100644
--- a/src/amun/cpu/dl4mt/model.cpp
+++ b/src/amun/cpu/dl4mt/model.cpp
@@ -16,7 +16,9 @@ Weights::GRU::GRU(const NpzConverter& model, const std::vector<std::string> &key
   Wx_(model[keys.at(3)]),
   Bx1_(model(keys.at(4), true)),
   Bx2_(Bx1_.rows(), Bx1_.columns()),
-  Ux_(model[keys.at(5)])
+  Ux_(model[keys.at(5)]),
+  Gamma_1_(model[keys.at(6)]),
+  Gamma_2_(model[keys.at(7)])
 {
     const_cast<mblas::Matrix&>(Bx2_) = 0.0f;
 }
diff --git a/src/amun/cpu/mblas/matrix.h b/src/amun/cpu/mblas/matrix.h
index 6251979b..6a549ccb 100644
--- a/src/amun/cpu/mblas/matrix.h
+++ b/src/amun/cpu/mblas/matrix.h
@@ -421,11 +421,12 @@ void LayerNormalization(MT& in, const MT& gamma, float eps=1e-9) {
     for (int i = 0; i < cols; ++i) {
       sigma += (in(j, i) - mean) * (in(j, i) - mean);
     }
+    sigma /= cols;
 
     sigma = sqrt(sigma + eps);
 
     for (int i = 0; i < cols; ++i) {
-      in(j, i) = gamma(0, j) * ( (in(j, i) - mean) / sigma);
+      in(j, i) = gamma(i, 0) * ( (in(j, i) - mean) / sigma);
     }
   }
 }
diff --git a/src/marian b/src/marian
-Subproject 2be517298cb5bb0aa75768c1563163c313e89f0
+Subproject 9ce67850b80b0be5fb3d2a51c645b45ac170786
author	Tomasz Dwojak <t.dwojak@amu.edu.pl>	2017-05-24 11:25:55 +0300
committer	Tomasz Dwojak <t.dwojak@amu.edu.pl>	2017-05-24 11:25:55 +0300
commit	a1a2ee0b21ae1b271674922135e3d2690c6c8dad (patch)
tree	fc827243b9ba4ce77def3d39ba9765379bde63ca
parent	0f50f93cfe7e9a49704500ef3585ff9c40bc67b8 (diff)