Welcome to the mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorTomasz Dwojak <t.dwojak@amu.edu.pl>2017-05-24 11:25:55 +0300
committerTomasz Dwojak <t.dwojak@amu.edu.pl>2017-05-24 11:25:55 +0300
commita1a2ee0b21ae1b271674922135e3d2690c6c8dad (patch)
treefc827243b9ba4ce77def3d39ba9765379bde63ca
parent0f50f93cfe7e9a49704500ef3585ff9c40bc67b8 (diff)
Add (again) layer normalization on CPU
-rw-r--r--src/amun/cpu/dl4mt/decoder.h38
-rw-r--r--src/amun/cpu/dl4mt/gru.h43
-rw-r--r--src/amun/cpu/dl4mt/model.cpp4
-rw-r--r--src/amun/cpu/mblas/matrix.h3
m---------src/marian11
5 files changed, 63 insertions, 36 deletions
diff --git a/src/amun/cpu/dl4mt/decoder.h b/src/amun/cpu/dl4mt/decoder.h
index b1c10e97..62dd1ba0 100644
--- a/src/amun/cpu/dl4mt/decoder.h
+++ b/src/amun/cpu/dl4mt/decoder.h
@@ -59,9 +59,14 @@ class Decoder {
State = Temp2_ * w_.Wi_;
- AddBiasVector<byRow>(State, w_.Bi_);
+ if (w_.Gamma_.rows()) {
+ LayerNormalization(State, w_.Gamma_);
+ AddBiasVector<byRow>(State, w_.Bi_);
+ } else {
+ AddBiasVector<byRow>(State, w_.Bi_);
+ State = blaze::forEach(State, Tanh());
+ }
- State = blaze::forEach(State, Tanh());
}
void GetNextState(mblas::Matrix& NextState,
@@ -108,6 +113,9 @@ class Decoder {
void Init(const mblas::Matrix& SourceContext) {
using namespace mblas;
SCU_ = SourceContext * w_.U_;
+ if (w_.Gamma_1_.rows()) {
+ LayerNormalization(SCU_, w_.Gamma_1_);
+ }
AddBiasVector<byRow>(SCU_, w_.B_);
}
@@ -117,11 +125,10 @@ class Decoder {
using namespace mblas;
Temp2_ = HiddenState * w_.W_;
+ if (w_.Gamma_2_.rows()) {
+ LayerNormalization(Temp2_, w_.Gamma_2_);
+ }
- // For batching: create an A across different sentences,
- // maybe by mapping and looping. In the and join different
- // alignment matrices into one
- // Or masking?
Temp1_ = Broadcast<Matrix>(Tanh(), SCU_, Temp2_);
A_.resize(Temp1_.rows(), 1);
@@ -171,12 +178,23 @@ class Decoder {
const mblas::Matrix& AlignedSourceContext) {
using namespace mblas;
- T1_ = State * w_.W1_;
- T2_ = Embedding * w_.W2_;
- T3_ = AlignedSourceContext * w_.W3_;
+ T1_ = State * w_.W1_;
+ if (w_.Gamma_1_.rows()) {
+ LayerNormalization(T1_, w_.Gamma_1_);
+ }
AddBiasVector<byRow>(T1_, w_.B1_);
+
+ T2_ = Embedding * w_.W2_;
+ if (w_.Gamma_0_.rows()) {
+ LayerNormalization(T2_, w_.Gamma_0_);
+ }
AddBiasVector<byRow>(T2_, w_.B2_);
+
+ T3_ = AlignedSourceContext * w_.W3_;
+ if (w_.Gamma_2_.rows()) {
+ LayerNormalization(T3_, w_.Gamma_2_);
+ }
AddBiasVector<byRow>(T3_, w_.B3_);
auto t = blaze::forEach(T1_ + T2_ + T3_, Tanh());
@@ -229,7 +247,7 @@ class Decoder {
GetProbs(NextState, Embeddings, AlignedSourceContext_);
}
- BaseMatrix& GetProbs() {
+ mblas::ArrayMatrix& GetProbs() {
return Probs_;
}
diff --git a/src/amun/cpu/dl4mt/gru.h b/src/amun/cpu/dl4mt/gru.h
index 84e86357..ec301386 100644
--- a/src/amun/cpu/dl4mt/gru.h
+++ b/src/amun/cpu/dl4mt/gru.h
@@ -1,5 +1,5 @@
#pragma once
-#include "../mblas/matrix.h"
+#include "cpu/mblas/matrix.h"
namespace amunmt {
namespace CPU {
@@ -13,66 +13,73 @@ class GRU {
WWx_ = Concat<byColumn, Matrix>(w_.W_, w_.Wx_);
UUx_ = Concat<byColumn, Matrix>(w_.U_, w_.Ux_);
}
-
+
void GetNextState(mblas::Matrix& NextState,
const mblas::Matrix& State,
const mblas::Matrix& Context) const {
RUH_ = Context * WWx_;
+ if (w_.Gamma_1_.rows()) {
+ LayerNormalization(RUH_, w_.Gamma_1_);
+ }
+
Temp_ = State * UUx_;
-
+ if (w_.Gamma_2_.rows()) {
+ LayerNormalization(Temp_, w_.Gamma_2_);
+ }
+
// @TODO: once broadcasting is available
// implement this using blaze idioms
ElementwiseOps(NextState, State);
}
-
+
void ElementwiseOps(mblas::Matrix& NextState,
const mblas::Matrix& State) const {
-
+
using namespace mblas;
using namespace blaze;
-
+
const size_t rowNo = State.rows();
const size_t colNo = State.columns();
NextState.resize(rowNo, colNo);
-
+
for(int j = 0; j < rowNo; ++j) {
auto rowOut = row(NextState, j);
auto rowState = row(State, j);
-
+
auto rowRuh = row(RUH_, j);
auto rowT = row(Temp_, j);
-
+
auto rowH = subvector(rowRuh, 2 * colNo, colNo);
auto rowT2 = subvector(rowT, 2 * colNo, colNo);
-
+
for(int i = 0; i < colNo; ++i) {
float ev1 = expapprox(-(rowRuh[i] + w_.B_(0, i) + rowT[i]));
float r = 1.0 / (1.0 + ev1);
-
+
int k = i + colNo;
float ev2 = expapprox(-(rowRuh[k] + w_.B_(0, k) + rowT[k]));
- float u = 1.0 / (1.0 + ev2);
-
+ float u = 1.0 / (1.0 + ev2);
+
float hv = rowH[i] + w_.Bx1_(0, i);
float t2v = rowT2[i] + w_.Bx2_(0, i);
hv = tanhapprox(hv + r * t2v);
rowOut[i] = (1.0 - u) * hv + u * rowState[i];
}
}
-
+
}
-
+
size_t GetStateLength() const {
return w_.U_.rows();
}
-
+
private:
// Model matrices
- const Weights& w_;
+ const Weights& w_;
mutable mblas::Matrix WWx_;
mutable mblas::Matrix UUx_;
-
+
// reused to avoid allocation
mutable mblas::Matrix RUH_;
mutable mblas::Matrix Temp_;
diff --git a/src/amun/cpu/dl4mt/model.cpp b/src/amun/cpu/dl4mt/model.cpp
index 02e77a13..f3e40bae 100644
--- a/src/amun/cpu/dl4mt/model.cpp
+++ b/src/amun/cpu/dl4mt/model.cpp
@@ -16,7 +16,9 @@ Weights::GRU::GRU(const NpzConverter& model, const std::vector<std::string> &key
Wx_(model[keys.at(3)]),
Bx1_(model(keys.at(4), true)),
Bx2_(Bx1_.rows(), Bx1_.columns()),
- Ux_(model[keys.at(5)])
+ Ux_(model[keys.at(5)]),
+ Gamma_1_(model[keys.at(6)]),
+ Gamma_2_(model[keys.at(7)])
{
const_cast<mblas::Matrix&>(Bx2_) = 0.0f;
}
diff --git a/src/amun/cpu/mblas/matrix.h b/src/amun/cpu/mblas/matrix.h
index 6251979b..6a549ccb 100644
--- a/src/amun/cpu/mblas/matrix.h
+++ b/src/amun/cpu/mblas/matrix.h
@@ -421,11 +421,12 @@ void LayerNormalization(MT& in, const MT& gamma, float eps=1e-9) {
for (int i = 0; i < cols; ++i) {
sigma += (in(j, i) - mean) * (in(j, i) - mean);
}
+ sigma /= cols;
sigma = sqrt(sigma + eps);
for (int i = 0; i < cols; ++i) {
- in(j, i) = gamma(0, j) * ( (in(j, i) - mean) / sigma);
+ in(j, i) = gamma(i, 0) * ( (in(j, i) - mean) / sigma);
}
}
}
diff --git a/src/marian b/src/marian
-Subproject 2be517298cb5bb0aa75768c1563163c313e89f0
+Subproject 9ce67850b80b0be5fb3d2a51c645b45ac170786