diff options
author | Tomasz Dwojak <t.dwojak@amu.edu.pl> | 2017-05-24 11:25:55 +0300 |
---|---|---|
committer | Tomasz Dwojak <t.dwojak@amu.edu.pl> | 2017-05-24 11:25:55 +0300 |
commit | a1a2ee0b21ae1b271674922135e3d2690c6c8dad (patch) | |
tree | fc827243b9ba4ce77def3d39ba9765379bde63ca | |
parent | 0f50f93cfe7e9a49704500ef3585ff9c40bc67b8 (diff) |
Add (again) layer normalization on CPU
-rw-r--r-- | src/amun/cpu/dl4mt/decoder.h | 38 | ||||
-rw-r--r-- | src/amun/cpu/dl4mt/gru.h | 43 | ||||
-rw-r--r-- | src/amun/cpu/dl4mt/model.cpp | 4 | ||||
-rw-r--r-- | src/amun/cpu/mblas/matrix.h | 3 | ||||
m--------- | src/marian | 11 |
5 files changed, 63 insertions, 36 deletions
diff --git a/src/amun/cpu/dl4mt/decoder.h b/src/amun/cpu/dl4mt/decoder.h index b1c10e97..62dd1ba0 100644 --- a/src/amun/cpu/dl4mt/decoder.h +++ b/src/amun/cpu/dl4mt/decoder.h @@ -59,9 +59,14 @@ class Decoder { State = Temp2_ * w_.Wi_; - AddBiasVector<byRow>(State, w_.Bi_); + if (w_.Gamma_.rows()) { + LayerNormalization(State, w_.Gamma_); + AddBiasVector<byRow>(State, w_.Bi_); + } else { + AddBiasVector<byRow>(State, w_.Bi_); + State = blaze::forEach(State, Tanh()); + } - State = blaze::forEach(State, Tanh()); } void GetNextState(mblas::Matrix& NextState, @@ -108,6 +113,9 @@ class Decoder { void Init(const mblas::Matrix& SourceContext) { using namespace mblas; SCU_ = SourceContext * w_.U_; + if (w_.Gamma_1_.rows()) { + LayerNormalization(SCU_, w_.Gamma_1_); + } AddBiasVector<byRow>(SCU_, w_.B_); } @@ -117,11 +125,10 @@ class Decoder { using namespace mblas; Temp2_ = HiddenState * w_.W_; + if (w_.Gamma_2_.rows()) { + LayerNormalization(Temp2_, w_.Gamma_2_); + } - // For batching: create an A across different sentences, - // maybe by mapping and looping. In the and join different - // alignment matrices into one - // Or masking? Temp1_ = Broadcast<Matrix>(Tanh(), SCU_, Temp2_); A_.resize(Temp1_.rows(), 1); @@ -171,12 +178,23 @@ class Decoder { const mblas::Matrix& AlignedSourceContext) { using namespace mblas; - T1_ = State * w_.W1_; - T2_ = Embedding * w_.W2_; - T3_ = AlignedSourceContext * w_.W3_; + T1_ = State * w_.W1_; + if (w_.Gamma_1_.rows()) { + LayerNormalization(T1_, w_.Gamma_1_); + } AddBiasVector<byRow>(T1_, w_.B1_); + + T2_ = Embedding * w_.W2_; + if (w_.Gamma_0_.rows()) { + LayerNormalization(T2_, w_.Gamma_0_); + } AddBiasVector<byRow>(T2_, w_.B2_); + + T3_ = AlignedSourceContext * w_.W3_; + if (w_.Gamma_2_.rows()) { + LayerNormalization(T3_, w_.Gamma_2_); + } AddBiasVector<byRow>(T3_, w_.B3_); auto t = blaze::forEach(T1_ + T2_ + T3_, Tanh()); @@ -229,7 +247,7 @@ class Decoder { GetProbs(NextState, Embeddings, AlignedSourceContext_); } - BaseMatrix& GetProbs() { + mblas::ArrayMatrix& GetProbs() { return Probs_; } diff --git a/src/amun/cpu/dl4mt/gru.h b/src/amun/cpu/dl4mt/gru.h index 84e86357..ec301386 100644 --- a/src/amun/cpu/dl4mt/gru.h +++ b/src/amun/cpu/dl4mt/gru.h @@ -1,5 +1,5 @@ #pragma once -#include "../mblas/matrix.h" +#include "cpu/mblas/matrix.h" namespace amunmt { namespace CPU { @@ -13,66 +13,73 @@ class GRU { WWx_ = Concat<byColumn, Matrix>(w_.W_, w_.Wx_); UUx_ = Concat<byColumn, Matrix>(w_.U_, w_.Ux_); } - + void GetNextState(mblas::Matrix& NextState, const mblas::Matrix& State, const mblas::Matrix& Context) const { RUH_ = Context * WWx_; + if (w_.Gamma_1_.rows()) { + LayerNormalization(RUH_, w_.Gamma_1_); + } + Temp_ = State * UUx_; - + if (w_.Gamma_2_.rows()) { + LayerNormalization(Temp_, w_.Gamma_2_); + } + // @TODO: once broadcasting is available // implement this using blaze idioms ElementwiseOps(NextState, State); } - + void ElementwiseOps(mblas::Matrix& NextState, const mblas::Matrix& State) const { - + using namespace mblas; using namespace blaze; - + const size_t rowNo = State.rows(); const size_t colNo = State.columns(); NextState.resize(rowNo, colNo); - + for(int j = 0; j < rowNo; ++j) { auto rowOut = row(NextState, j); auto rowState = row(State, j); - + auto rowRuh = row(RUH_, j); auto rowT = row(Temp_, j); - + auto rowH = subvector(rowRuh, 2 * colNo, colNo); auto rowT2 = subvector(rowT, 2 * colNo, colNo); - + for(int i = 0; i < colNo; ++i) { float ev1 = expapprox(-(rowRuh[i] + w_.B_(0, i) + rowT[i])); float r = 1.0 / (1.0 + ev1); - + int k = i + colNo; float ev2 = expapprox(-(rowRuh[k] + w_.B_(0, k) + rowT[k])); - float u = 1.0 / (1.0 + ev2); - + float u = 1.0 / (1.0 + ev2); + float hv = rowH[i] + w_.Bx1_(0, i); float t2v = rowT2[i] + w_.Bx2_(0, i); hv = tanhapprox(hv + r * t2v); rowOut[i] = (1.0 - u) * hv + u * rowState[i]; } } - + } - + size_t GetStateLength() const { return w_.U_.rows(); } - + private: // Model matrices - const Weights& w_; + const Weights& w_; mutable mblas::Matrix WWx_; mutable mblas::Matrix UUx_; - + // reused to avoid allocation mutable mblas::Matrix RUH_; mutable mblas::Matrix Temp_; diff --git a/src/amun/cpu/dl4mt/model.cpp b/src/amun/cpu/dl4mt/model.cpp index 02e77a13..f3e40bae 100644 --- a/src/amun/cpu/dl4mt/model.cpp +++ b/src/amun/cpu/dl4mt/model.cpp @@ -16,7 +16,9 @@ Weights::GRU::GRU(const NpzConverter& model, const std::vector<std::string> &key Wx_(model[keys.at(3)]), Bx1_(model(keys.at(4), true)), Bx2_(Bx1_.rows(), Bx1_.columns()), - Ux_(model[keys.at(5)]) + Ux_(model[keys.at(5)]), + Gamma_1_(model[keys.at(6)]), + Gamma_2_(model[keys.at(7)]) { const_cast<mblas::Matrix&>(Bx2_) = 0.0f; } diff --git a/src/amun/cpu/mblas/matrix.h b/src/amun/cpu/mblas/matrix.h index 6251979b..6a549ccb 100644 --- a/src/amun/cpu/mblas/matrix.h +++ b/src/amun/cpu/mblas/matrix.h @@ -421,11 +421,12 @@ void LayerNormalization(MT& in, const MT& gamma, float eps=1e-9) { for (int i = 0; i < cols; ++i) { sigma += (in(j, i) - mean) * (in(j, i) - mean); } + sigma /= cols; sigma = sqrt(sigma + eps); for (int i = 0; i < cols; ++i) { - in(j, i) = gamma(0, j) * ( (in(j, i) - mean) / sigma); + in(j, i) = gamma(i, 0) * ( (in(j, i) - mean) / sigma); } } } diff --git a/src/marian b/src/marian -Subproject 2be517298cb5bb0aa75768c1563163c313e89f0 +Subproject 9ce67850b80b0be5fb3d2a51c645b45ac170786 |