github.com/marian-nmt/marian.git
-rw-r--r--  .gitignore                             2
-rw-r--r--  CMakeLists.txt                        24
-rw-r--r--  scripts/vocab2txt.py                   8
-rw-r--r--  src/CMakeLists.txt                    28
-rw-r--r--  src/bahdanau.h                         5
-rw-r--r--  src/bahdanau/decoder.h               256
-rw-r--r--  src/bahdanau/encoder.h               118
-rw-r--r--  src/bahdanau/model.h                 169
-rw-r--r--  src/cnpy/CMakeLists.txt               24
-rw-r--r--  src/cnpy/LICENSE                      21
-rw-r--r--  src/cnpy/README                       37
-rw-r--r--  src/cnpy/cnpy.cpp                    251
-rw-r--r--  src/cnpy/cnpy.h                      241
-rw-r--r--  src/cnpy/example1.cpp                 61
-rw-r--r--  src/common/npz_converter.h            86
-rw-r--r--  src/common/states.h                  103
-rw-r--r--  src/common/utils.cpp                  23
-rw-r--r--  src/common/utils.h                     7
-rw-r--r--  src/common/vocab.h                    53
-rw-r--r--  src/decoder.bah/decoder_main.cu       95
-rw-r--r--  src/decoder.bah/hypothesis.h          32
-rw-r--r--  src/decoder.bah/hypothesis_manager.h  75
-rw-r--r--  src/decoder.bah/nmt_decoder.h        175
-rw-r--r--  src/decoder.bah/result.h              15
-rw-r--r--  src/decoder/decoder_main.cu          180
-rw-r--r--  src/dl4mt.h                            5
-rw-r--r--  src/dl4mt/decoder.h                  263
-rw-r--r--  src/dl4mt/encoder.h                   95
-rw-r--r--  src/dl4mt/gru.h                      172
-rw-r--r--  src/dl4mt/model.h                    191
-rw-r--r--  src/mblas/base_matrix.h               13
-rw-r--r--  src/mblas/bind.hpp                   511
-rw-r--r--  src/mblas/expression.cpp              69
-rw-r--r--  src/mblas/matrix.h                   720
-rw-r--r--  src/mblas/phoenix_functions.h         42
-rw-r--r--  src/mblas/strided_iterator.h         160
-rw-r--r--  src/mblas/thrust_functions.h          77
-rw-r--r--  src/plugin/nmt.cu                    259
-rw-r--r--  src/plugin/nmt.h                      95
-rw-r--r--  src/rescorer/nbest.cpp               128
-rw-r--r--  src/rescorer/nbest.h                  57
-rw-r--r--  src/rescorer/rescorer.h               90
-rw-r--r--  src/rescorer/rescorer_main.cu        105
-rw-r--r--  src/test/test.cu                     102
-rw-r--r--  src/test/test.dl4mt.cu               100
45 files changed, 5343 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
index b8bd0267..a2c0da8c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -26,3 +26,5 @@
*.exe
*.out
*.app
+
+build
\ No newline at end of file
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 00000000..76e90b45
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,24 @@
+cmake_minimum_required(VERSION 3.1.0)
+set(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake)
+
+project(amunn CXX)
+SET(CMAKE_CXX_FLAGS " -std=c++11 -g -O3 -funroll-loops -Wno-unused-result -Wno-deprecated")
+#SET(CUDA_PROPAGATE_HOST_FLAGS OFF)
+SET(CUDA_NVCC_FLAGS " -std=c++11 -g -O3 -arch=sm_35 -lineinfo --use_fast_math")
+#SET(CUDA_VERBOSE_BUILD ON)
+
+include_directories(${amunn_SOURCE_DIR})
+set(EXT_LIBS)
+
+find_package(CUDA REQUIRED)
+
+find_package(Boost COMPONENTS system filesystem program_options timer)
+if(Boost_FOUND)
+ include_directories(${Boost_INCLUDE_DIRS})
+ set(EXT_LIBS ${EXT_LIBS} ${Boost_LIBRARIES})
+else(Boost_FOUND)
+ message(SEND_ERROR "Cannot find Boost libraries. Terminating." )
+endif(Boost_FOUND)
+
+include_directories(${amunn_SOURCE_DIR}/src)
+add_subdirectory(src)
diff --git a/scripts/vocab2txt.py b/scripts/vocab2txt.py
new file mode 100644
index 00000000..fb054fa2
--- /dev/null
+++ b/scripts/vocab2txt.py
@@ -0,0 +1,8 @@
+import sys
+import cPickle
+import operator
+
+d = cPickle.load(open(sys.argv[1], 'r'))
+sorted_d = sorted(d.items(), key=operator.itemgetter(1))
+for p in sorted_d:
+ print p[0]
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
new file mode 100644
index 00000000..18e6692c
--- /dev/null
+++ b/src/CMakeLists.txt
@@ -0,0 +1,28 @@
+
+include_directories(.)
+include_directories(common)
+include_directories(dl4mt)
+
+add_library(libamunn OBJECT
+ cnpy/cnpy.cpp
+ rescorer/nbest.cpp
+ common/utils.cpp
+)
+
+cuda_add_executable(
+ amunn
+ decoder/decoder_main.cu
+ $<TARGET_OBJECTS:libamunn>
+)
+
+cuda_add_executable(
+ amunn_rescorer
+ rescorer/rescorer_main.cu
+ $<TARGET_OBJECTS:libamunn>
+)
+
+foreach(exec amunn amunn_rescorer)
+ target_link_libraries(${exec} ${EXT_LIBS})
+ cuda_add_cublas_to_target(${exec})
+ set_target_properties(${exec} PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/bin")
+endforeach(exec)
diff --git a/src/bahdanau.h b/src/bahdanau.h
new file mode 100644
index 00000000..f9629f3d
--- /dev/null
+++ b/src/bahdanau.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "bahdanau/model.h"
+#include "bahdanau/encoder.h"
+#include "bahdanau/decoder.h"
diff --git a/src/bahdanau/decoder.h b/src/bahdanau/decoder.h
new file mode 100644
index 00000000..aebedeeb
--- /dev/null
+++ b/src/bahdanau/decoder.h
@@ -0,0 +1,256 @@
+#pragma once
+
+#include "mblas/matrix.h"
+#include "bahdanau/model.h"
+
+class Decoder {
+ private:
+ template <class Weights>
+ class Embeddings {
+ public:
+ Embeddings(const Weights& model)
+ : w_(model)
+ {}
+
+ void Lookup(mblas::Matrix& Rows, const std::vector<size_t>& ids) {
+ using namespace mblas;
+ Assemble(Rows, w_.E_, ids);
+ Broadcast(_1 + _2, Rows, w_.EB_);
+ }
+
+ private:
+ const Weights& w_;
+ };
+
+ template <class Weights>
+ class RNN {
+ public:
+ RNN(const Weights& model)
+ : w_(model) {}
+
+ void InitializeState(mblas::Matrix& State,
+ const mblas::Matrix& SourceContext,
+ const size_t batchSize = 1) {
+ using namespace mblas;
+ CopyRow(Temp1_, SourceContext, 0, 1000);
+ Temp2_.Clear();
+ Temp2_.Resize(batchSize, 1000, 0.0);
+ Broadcast(_1 + _2, Temp2_, Temp1_);
+ Prod(State, Temp2_, w_.Ws_);
+ Broadcast(Tanh(_1 + _2), State, w_.WsB_);
+ }
+
+ mblas::Matrix& GetNextState(mblas::Matrix& State,
+ const mblas::Matrix& Embd,
+ const mblas::Matrix& PrevState,
+ const mblas::Matrix& Context) {
+ using namespace mblas;
+
+ Prod(Z_, Embd, w_.Wz_);
+ Prod(Temp1_, PrevState, w_.Uz_);
+ Prod(Temp2_, Context, w_.Cz_);
+ Element(Logit(_1 + _2 + _3),
+ Z_, Temp1_, Temp2_);
+
+ Prod(R_, Embd, w_.Wr_);
+ Prod(Temp1_, PrevState, w_.Ur_);
+ Prod(Temp2_, Context, w_.Cr_);
+ Element(Logit(_1 + _2 + _3),
+ R_, Temp1_, Temp2_);
+
+ Prod(S_, Embd, w_.W_);
+ Broadcast(_1 + _2, S_, w_.B_); // Broadcasting row-wise
+ Prod(Temp1_, Element(_1 * _2, R_, PrevState), w_.U_);
+ Prod(Temp2_, Context, w_.C_);
+
+ Element(Tanh(_1 + _2 + _3), S_, Temp1_, Temp2_);
+
+ Element((1.0 - _1) * _2 + _1 * _3,
+ Z_, PrevState, S_);
+
+ State.Resize(Z_.Rows(), Z_.Cols());
+ Swap(State, Z_);
+
+ return State;
+ }
+
+ private:
+ // Model matrices
+ const Weights& w_;
+
+ // reused to avoid allocation
+ mblas::Matrix Z_;
+ mblas::Matrix R_;
+ mblas::Matrix S_;
+
+ mblas::Matrix Temp1_;
+ mblas::Matrix Temp2_;
+ };
+
+ template <class Weights>
+ class Alignment {
+ public:
+ Alignment(const Weights& model)
+ : w_(model)
+ {}
+
+ void GetContext(mblas::Matrix& Context,
+ const mblas::Matrix& SourceContext,
+ const mblas::Matrix& PrevState) {
+ using namespace mblas;
+
+ Prod(Temp1_, SourceContext, w_.Ua_);
+ Prod(Temp2_, PrevState, w_.Wa_);
+
+ Broadcast(Tanh(_1 + _2), Temp1_, Temp2_);
+
+ Prod(A_, w_.Va_, Temp1_, false, true);
+ size_t rows1 = SourceContext.Rows();
+ size_t rows2 = PrevState.Rows();
+ A_.Reshape(rows2, rows1); // due to broadcasting above
+
+ mblas::Softmax(A_);
+ Prod(Context, A_, SourceContext);
+ }
+
+ private:
+ const Weights& w_;
+
+ mblas::Matrix Temp1_;
+ mblas::Matrix Temp2_;
+ mblas::Matrix A_;
+
+ mblas::Matrix Ones_;
+ mblas::Matrix Sums_;
+ };
+
+ template <class Weights>
+ class Softmax {
+ public:
+ Softmax(const Weights& model)
+ : w_(model), filtered_(false)
+ {}
+
+ void GetProbs(mblas::Matrix& Probs,
+ const mblas::Matrix& PrevState,
+ const mblas::Matrix& PrevEmbd,
+ const mblas::Matrix& Context) {
+
+ using namespace mblas;
+
+ Prod(T_, PrevState, w_.Uo_);
+
+ Prod(Temp1_, PrevEmbd, w_.Vo_);
+ Prod(Temp2_, Context, w_.Co_);
+ Element(_1 + _2 + _3, T_, Temp1_, Temp2_);
+ Broadcast(_1 + _2, T_, w_.UoB_); // Broadcasting row-wise
+ PairwiseReduce(Max(_1, _2), T_);
+
+ if(filtered_) { // use only filtered vocabulary for SoftMax
+ Prod(Probs, T_, FilteredWo_);
+ Broadcast(_1 + _2, Probs, FilteredWoB_); // Broadcasting row-wise
+ }
+ else {
+ Prod(Probs, T_, w_.Wo_);
+ Broadcast(_1 + _2, Probs, w_.WoB_); // Broadcasting row-wise
+ }
+ mblas::Softmax(Probs);
+ }
+
+ void Filter(const std::vector<size_t>& ids) {
+ using namespace mblas;
+
+ Matrix TempWo;
+ Transpose(TempWo, w_.Wo_);
+ Assemble(FilteredWo_, TempWo, ids);
+ Transpose(FilteredWo_);
+
+ Matrix TempWoB;
+ Transpose(TempWoB, w_.WoB_);
+ Assemble(FilteredWoB_, TempWoB, ids);
+ Transpose(FilteredWoB_);
+
+ filtered_ = true;
+ }
+
+ private:
+ const Weights& w_;
+
+ bool filtered_;
+ mblas::Matrix FilteredWo_;
+ mblas::Matrix FilteredWoB_;
+
+ mblas::Matrix T_;
+ mblas::Matrix Temp1_;
+ mblas::Matrix Temp2_;
+
+ mblas::Matrix Ones_;
+ mblas::Matrix Sums_;
+ };
+
+ public:
+ Decoder(const Weights& model)
+ : embeddings_(model.decEmbeddings_),
+ rnn_(model.decRnn_), alignment_(model.decAlignment_),
+ softmax_(model.decSoftmax_)
+ {}
+
+ void EmptyState(mblas::Matrix& State, const mblas::Matrix& SourceContext,
+ size_t batchSize = 1) {
+ State.Resize(batchSize, 1000);
+ rnn_.InitializeState(State, SourceContext, batchSize);
+ }
+
+ void EmptyEmbedding(mblas::Matrix& Embedding, size_t batchSize = 1) {
+ Embedding.Clear();
+ Embedding.Resize(batchSize, 620, 0);
+ }
+
+ void MakeStep(mblas::Matrix& NextState,
+ mblas::Matrix& NextEmbeddings,
+ mblas::Matrix& Probs,
+ const std::vector<size_t>& batch,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Embeddings,
+ const mblas::Matrix& SourceContext) {
+ GetProbs(Probs, AlignedSourceContext_,
+ State, Embeddings, SourceContext);
+ Lookup(NextEmbeddings, batch);
+ GetNextState(NextState, NextEmbeddings,
+ State, AlignedSourceContext_);
+ }
+
+ //private:
+
+ void Filter(const std::vector<size_t>& ids) {
+ softmax_.Filter(ids);
+ }
+
+ void GetProbs(mblas::Matrix& Probs,
+ mblas::Matrix& AlignedSourceContext,
+ const mblas::Matrix& PrevState,
+ const mblas::Matrix& PrevEmbedding,
+ const mblas::Matrix& SourceContext) {
+ alignment_.GetContext(AlignedSourceContext, SourceContext, PrevState);
+ softmax_.GetProbs(Probs, PrevState, PrevEmbedding, AlignedSourceContext);
+ }
+
+ void Lookup(mblas::Matrix& Embedding, const std::vector<size_t>& w) {
+ embeddings_.Lookup(Embedding, w);
+ }
+
+ void GetNextState(mblas::Matrix& State,
+ const mblas::Matrix& Embedding,
+ const mblas::Matrix& PrevState,
+ const mblas::Matrix& AlignedSourceContext) {
+ rnn_.GetNextState(State, Embedding, PrevState, AlignedSourceContext);
+ }
+
+ private:
+ mblas::Matrix AlignedSourceContext_;
+
+ Embeddings<Weights::DecEmbeddings> embeddings_;
+ RNN<Weights::DecRnn> rnn_;
+ Alignment<Weights::DecAlignment> alignment_;
+ Softmax<Weights::DecSoftmax> softmax_;
+};
diff --git a/src/bahdanau/encoder.h b/src/bahdanau/encoder.h
new file mode 100644
index 00000000..94533427
--- /dev/null
+++ b/src/bahdanau/encoder.h
@@ -0,0 +1,118 @@
+#pragma once
+
+#include "mblas/matrix.h"
+#include "bahdanau/model.h"
+
+class Encoder {
+ private:
+ template <class Weights>
+ class Embeddings {
+ public:
+ Embeddings(const Weights& model)
+ : w_(model)
+ {}
+
+ void Lookup(mblas::Matrix& Row, size_t i) {
+ using namespace mblas;
+ CopyRow(Row, w_.E_, i);
+ Element(_1 + _2,
+ Row, w_.EB_);
+ }
+
+ private:
+ const Weights& w_;
+ };
+
+ template <class Weights>
+ class RNN {
+ public:
+ RNN(const Weights& model)
+ : w_(model) {}
+
+ void InitializeState(size_t batchSize = 1) {
+ State_.Clear();
+ State_.Resize(batchSize, 1000, 0.0);
+ }
+
+ void GetNextState(mblas::Matrix& State,
+ const mblas::Matrix& Embd,
+ const mblas::Matrix& PrevState) {
+ using namespace mblas;
+
+ Prod(Za_, Embd, w_.Wz_);
+ Prod(Temp_, PrevState, w_.Uz_);
+ Element(Logit(_1 + _2), Za_, Temp_);
+
+ Prod(Ra_, Embd, w_.Wr_);
+ Prod(Temp_, PrevState, w_.Ur_);
+ Element(Logit(_1 + _2), Ra_, Temp_);
+
+ Prod(Ha_, Embd, w_.W_);
+ Prod(Temp_, Element(_1 * _2, Ra_, PrevState), w_.U_);
+ Element(_1 + _2, Ha_, w_.B_); // Broadcasting row-wise
+ Element(Tanh(_1 + _2), Ha_, Temp_);
+
+ Element((1.0 - _1) * _2 + _1 * _3, Za_, PrevState, Ha_);
+
+ Swap(State, Za_);
+ }
+
+ template <class It>
+ void GetContext(It it, It end,
+ mblas::Matrix& Context, bool invert) {
+ InitializeState();
+
+ size_t n = std::distance(it, end);
+ size_t i = 0;
+ while(it != end) {
+ GetNextState(State_, *it++, State_);
+ if(invert)
+ mblas::PasteRow(Context, State_, n - i - 1, 1000);
+ else
+ mblas::PasteRow(Context, State_, i, 0);
+ ++i;
+ }
+ }
+
+ private:
+ // Model matrices
+ const Weights& w_;
+
+ // reused to avoid allocation
+ mblas::Matrix Za_;
+ mblas::Matrix Ra_;
+ mblas::Matrix Ha_;
+ mblas::Matrix Temp_;
+ mblas::Matrix State_;
+ };
+
+ public:
+ Encoder(const Weights& model)
+ : embeddings_(model.encEmbeddings_),
+ forwardRnn_(model.encForwardRnn_),
+ backwardRnn_(model.encBackwardRnn_)
+ {}
+
+ void GetContext(const std::vector<size_t>& words,
+ mblas::Matrix& Context) {
+ std::vector<mblas::Matrix> embeddedWords;
+
+ Context.Resize(words.size(), 2000);
+ for(auto& w : words) {
+ embeddedWords.emplace_back();
+ embeddings_.Lookup(embeddedWords.back(), w);
+ }
+
+ forwardRnn_.GetContext(embeddedWords.begin(),
+ embeddedWords.end(),
+ Context, false);
+ backwardRnn_.GetContext(embeddedWords.rbegin(),
+ embeddedWords.rend(),
+ Context, true);
+ }
+
+ private:
+ Embeddings<Weights::EncEmbeddings> embeddings_;
+ RNN<Weights::EncForwardRnn> forwardRnn_;
+ RNN<Weights::EncBackwardRnn> backwardRnn_;
+};
diff --git a/src/bahdanau/model.h b/src/bahdanau/model.h
new file mode 100644
index 00000000..ae4445d9
--- /dev/null
+++ b/src/bahdanau/model.h
@@ -0,0 +1,169 @@
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "mblas/matrix.h"
+#include "npz_converter.h"
+
+struct Weights {
+
+ //////////////////////////////////////////////////////////////////////////////
+
+ struct EncEmbeddings {
+ EncEmbeddings(const NpzConverter& model)
+ : E_(model["W_0_enc_approx_embdr"]),
+ EB_(model("b_0_enc_approx_embdr", true))
+ {}
+
+ const mblas::Matrix E_;
+ const mblas::Matrix EB_;
+ };
+
+ struct EncForwardRnn {
+ EncForwardRnn(const NpzConverter& model)
+ : W_(model["W_0_enc_input_embdr_0"]),
+ B_(model("b_0_enc_input_embdr_0", true)),
+ U_(model["W_enc_transition_0"]),
+ Wz_(model["W_0_enc_update_embdr_0"]),
+ Uz_(model["G_enc_transition_0"]),
+ Wr_(model["W_0_enc_reset_embdr_0"]),
+ Ur_(model["R_enc_transition_0"])
+ {}
+
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix Wz_;
+ const mblas::Matrix Uz_;
+ const mblas::Matrix Wr_;
+ const mblas::Matrix Ur_;
+ };
+
+ struct EncBackwardRnn {
+ EncBackwardRnn(const NpzConverter& model)
+ : W_(model["W_0_back_enc_input_embdr_0"]),
+ B_(model("b_0_back_enc_input_embdr_0", true)),
+ U_(model["W_back_enc_transition_0"]),
+ Wz_(model["W_0_back_enc_update_embdr_0"]),
+ Uz_(model["G_back_enc_transition_0"]),
+ Wr_(model["W_0_back_enc_reset_embdr_0"]),
+ Ur_(model["R_back_enc_transition_0"])
+ {}
+
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix Wz_;
+ const mblas::Matrix Uz_;
+ const mblas::Matrix Wr_;
+ const mblas::Matrix Ur_;
+ };
+
+ //////////////////////////////////////////////////////////////////////////////
+
+ struct DecEmbeddings {
+ DecEmbeddings(const NpzConverter& model)
+ : E_(model["W_0_dec_approx_embdr"]),
+ EB_(model("b_0_dec_approx_embdr", true))
+ {}
+
+ const mblas::Matrix E_;
+ const mblas::Matrix EB_;
+ };
+
+ struct DecRnn {
+ DecRnn(const NpzConverter& model)
+ : Ws_(model["W_0_dec_initializer_0"]),
+ WsB_(model("b_0_dec_initializer_0", true)),
+
+ W_(model["W_0_dec_input_embdr_0"]),
+ B_(model("b_0_dec_input_embdr_0", true)),
+ U_(model["W_dec_transition_0"]),
+ C_(model["W_0_dec_dec_inputter_0"]),
+
+ Wz_(model["W_0_dec_update_embdr_0"]),
+ Uz_(model["G_dec_transition_0"]),
+ Cz_(model["W_0_dec_dec_updater_0"]),
+
+ Wr_(model["W_0_dec_reset_embdr_0"]),
+ Ur_(model["R_dec_transition_0"]),
+ Cr_(model["W_0_dec_dec_reseter_0"])
+ {}
+
+ const mblas::Matrix Ws_;
+ const mblas::Matrix WsB_;
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix C_;
+ const mblas::Matrix Wz_;
+ const mblas::Matrix Uz_;
+ const mblas::Matrix Cz_;
+ const mblas::Matrix Wr_;
+ const mblas::Matrix Ur_;
+ const mblas::Matrix Cr_;
+ };
+
+ struct DecAlignment {
+ DecAlignment(const NpzConverter& model)
+ : Va_(model("D_dec_transition_0", true)),
+ Wa_(model["B_dec_transition_0"]),
+ Ua_(model["A_dec_transition_0"])
+ {}
+
+ const mblas::Matrix Va_;
+ const mblas::Matrix Wa_;
+ const mblas::Matrix Ua_;
+ };
+
+ struct DecSoftmax {
+ DecSoftmax(const NpzConverter& model)
+ : WoB_(model("b_dec_deep_softmax", true)),
+ Uo_(model["W_0_dec_hid_readout_0"]),
+ UoB_(model("b_0_dec_hid_readout_0", true)),
+ Vo_(model["W_0_dec_prev_readout_0"]),
+ Co_(model["W_0_dec_repr_readout"])
+ {
+ const mblas::Matrix Wo1_(model["W1_dec_deep_softmax"]);
+ const mblas::Matrix Wo2_(model["W2_dec_deep_softmax"]);
+ mblas::Prod(const_cast<mblas::Matrix&>(Wo_), Wo1_, Wo2_);
+ }
+
+ const mblas::Matrix Wo_;
+ const mblas::Matrix WoB_;
+ const mblas::Matrix Uo_;
+ const mblas::Matrix UoB_;
+ const mblas::Matrix Vo_;
+ const mblas::Matrix Co_;
+ };
+
+ Weights(const std::string& npzFile, size_t device = 0)
+ : Weights(NpzConverter(npzFile), device)
+ {}
+
+ Weights(const NpzConverter& model, size_t device = 0)
+ : encEmbeddings_(model),
+ decEmbeddings_(model),
+ encForwardRnn_(model),
+ encBackwardRnn_(model),
+ decRnn_(model),
+ decAlignment_(model),
+ decSoftmax_(model),
+ device_(device)
+ {}
+
+ size_t GetDevice() {
+ return device_;
+ }
+
+ const EncEmbeddings encEmbeddings_;
+ const DecEmbeddings decEmbeddings_;
+ const EncForwardRnn encForwardRnn_;
+ const EncBackwardRnn encBackwardRnn_;
+ const DecRnn decRnn_;
+ const DecAlignment decAlignment_;
+ const DecSoftmax decSoftmax_;
+
+ const size_t device_;
+};
diff --git a/src/cnpy/CMakeLists.txt b/src/cnpy/CMakeLists.txt
new file mode 100644
index 00000000..5a7cdd30
--- /dev/null
+++ b/src/cnpy/CMakeLists.txt
@@ -0,0 +1,24 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
+if(COMMAND cmake_policy)
+ cmake_policy(SET CMP0003 NEW)
+endif(COMMAND cmake_policy)
+
+project(CNPY)
+
+option(ENABLE_STATIC "Build static (.a) library" ON)
+
+add_library(cnpy SHARED "cnpy.cpp")
+target_link_libraries(cnpy z)
+install(TARGETS "cnpy" LIBRARY DESTINATION lib PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
+
+if(ENABLE_STATIC)
+ add_library(cnpy-static STATIC "cnpy.cpp")
+ set_target_properties(cnpy-static PROPERTIES OUTPUT_NAME "cnpy")
+ install(TARGETS "cnpy-static" ARCHIVE DESTINATION lib)
+endif(ENABLE_STATIC)
+
+install(FILES "cnpy.h" DESTINATION include)
+install(FILES "mat2npz" "npy2mat" "npz2mat" DESTINATION bin PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE)
+
+add_executable(example1 example1.cpp)
+target_link_libraries(example1 cnpy)
diff --git a/src/cnpy/LICENSE b/src/cnpy/LICENSE
new file mode 100644
index 00000000..e60eadbc
--- /dev/null
+++ b/src/cnpy/LICENSE
@@ -0,0 +1,21 @@
+The MIT License
+
+Copyright (c) Carl Rogers, 2011
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
diff --git a/src/cnpy/README b/src/cnpy/README
new file mode 100644
index 00000000..117ca564
--- /dev/null
+++ b/src/cnpy/README
@@ -0,0 +1,37 @@
+Purpose:
+
+Numpy offers the save method for easy saving of arrays into .npy and savez for zipping multiple .npy arrays together into a .npz file. cnpy lets you read and write to these formats in C++. The motivation comes from scientific programming where large amounts of data are generated in C++ and analyzed in Python. Writing to .npy has the advantage of using low-level C++ I/O (fread and fwrite) for speed and binary format for size. The .npy file header takes care of specifying the size, shape, and data type of the array, so specifying the format of the data is unnecessary. Loading data written in numpy formats into C++ is equally simple, but requires you to type-cast the loaded data to the type of your choice.
+
+Installation:
+
+Default installation directory is /usr/local. To specify a different directory, add -DCMAKE_INSTALL_PREFIX=/path/to/install/dir to the cmake invocation in step 4.
+
+1. get cmake at www.cmake.org
+2. create a build directory, say $HOME/build
+3. cd $HOME/build
+4. cmake /path/to/cnpy
+5. make
+6. make install
+
+Using:
+
+To use, #include"cnpy.h" in your source code. Compile the source code mycode.cpp as
+
+g++ -o mycode mycode.cpp -L/path/to/install/dir -lcnpy
+
+Description:
+
+There are two functions for writing data: npy_save, npz_save.
+
+There are three functions for reading: npy_load will load a .npy file. npz_load(fname) will load a .npz and return a dictionary of NpyArray structures. npz_load(fname,varname) will load and return the NpyArray for data varname from the specified .npz file.
+Note that NpyArray allocates char* data using new[] and *will not* delete the data upon the NpyArray destruction. You are responsible for deleting the data yourself.
+
+The data structure for loaded data is below. Data is loaded into a raw byte array. The array shape and word size are read from the npy header. You are responsible for casting/copying the data to its intended data type.
+
+struct NpyArray {
+ char* data;
+ std::vector<unsigned int> shape;
+ unsigned int word_size;
+};
+
+See example1.cpp for examples of how to use the library. example1 will also be built during the cmake installation.
diff --git a/src/cnpy/cnpy.cpp b/src/cnpy/cnpy.cpp
new file mode 100644
index 00000000..85978dc8
--- /dev/null
+++ b/src/cnpy/cnpy.cpp
@@ -0,0 +1,251 @@
+//Copyright (C) 2011 Carl Rogers
+//Released under MIT License
+//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
+
+#include"cnpy.h"
+#include<complex>
+#include<cstdlib>
+#include<algorithm>
+#include<cstring>
+#include<iomanip>
+
+char cnpy::BigEndianTest() {
+ unsigned char x[] = {1,0};
+ short y = *(short*) x;
+ return y == 1 ? '<' : '>';
+}
+
+char cnpy::map_type(const std::type_info& t)
+{
+ if(t == typeid(float) ) return 'f';
+ if(t == typeid(double) ) return 'f';
+ if(t == typeid(long double) ) return 'f';
+
+ if(t == typeid(int) ) return 'i';
+ if(t == typeid(char) ) return 'i';
+ if(t == typeid(short) ) return 'i';
+ if(t == typeid(long) ) return 'i';
+ if(t == typeid(long long) ) return 'i';
+
+ if(t == typeid(unsigned char) ) return 'u';
+ if(t == typeid(unsigned short) ) return 'u';
+ if(t == typeid(unsigned long) ) return 'u';
+ if(t == typeid(unsigned long long) ) return 'u';
+ if(t == typeid(unsigned int) ) return 'u';
+
+ if(t == typeid(bool) ) return 'b';
+
+ if(t == typeid(std::complex<float>) ) return 'c';
+ if(t == typeid(std::complex<double>) ) return 'c';
+ if(t == typeid(std::complex<long double>) ) return 'c';
+
+ else return '?';
+}
+
+template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const std::string rhs) {
+ lhs.insert(lhs.end(),rhs.begin(),rhs.end());
+ return lhs;
+}
+
+template<> std::vector<char>& cnpy::operator+=(std::vector<char>& lhs, const char* rhs) {
+ //write in little endian
+ size_t len = strlen(rhs);
+ lhs.reserve(len);
+ for(size_t byte = 0; byte < len; byte++) {
+ lhs.push_back(rhs[byte]);
+ }
+ return lhs;
+}
+
+void cnpy::parse_npy_header(FILE* fp, unsigned int& word_size, unsigned int*& shape, unsigned int& ndims, bool& fortran_order) {
+ char buffer[256];
+ size_t res = fread(buffer,sizeof(char),11,fp);
+ if(res != 11)
+ throw std::runtime_error("parse_npy_header: failed fread");
+ std::string header = fgets(buffer,256,fp);
+ assert(header[header.size()-1] == '\n');
+
+ int loc1, loc2;
+
+ //fortran order
+ loc1 = header.find("fortran_order")+16;
+ fortran_order = (header.substr(loc1,5) == "True" ? true : false);
+
+ //shape
+ loc1 = header.find("(");
+ loc2 = header.find(")");
+ std::string str_shape = header.substr(loc1+1,loc2-loc1-1);
+ if(str_shape.length() == 0) ndims = 0;
+ else if(str_shape[str_shape.size()-1] == ',') ndims = 1;
+ else ndims = std::count(str_shape.begin(),str_shape.end(),',')+1;
+ shape = new unsigned int[ndims];
+ for(unsigned int i = 0;i < ndims;i++) {
+ loc1 = str_shape.find(",");
+ shape[i] = atoi(str_shape.substr(0,loc1).c_str());
+ str_shape = str_shape.substr(loc1+1);
+ }
+
+ //endian, word size, data type
+ //byte order code | stands for not applicable.
+ //not sure when this applies except for byte array
+ loc1 = header.find("descr")+9;
+ bool littleEndian = (header[loc1] == '<' || header[loc1] == '|' ? true : false);
+ assert(littleEndian);
+
+ //char type = header[loc1+1];
+ //assert(type == map_type(T));
+
+ std::string str_ws = header.substr(loc1+2);
+ loc2 = str_ws.find("'");
+ word_size = atoi(str_ws.substr(0,loc2).c_str());
+}
+
+void cnpy::parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& global_header_size, unsigned int& global_header_offset)
+{
+ std::vector<char> footer(22);
+ fseek(fp,-22,SEEK_END);
+ size_t res = fread(&footer[0],sizeof(char),22,fp);
+ if(res != 22)
+ throw std::runtime_error("parse_zip_footer: failed fread");
+
+ unsigned short disk_no, disk_start, nrecs_on_disk, comment_len;
+ disk_no = *(unsigned short*) &footer[4];
+ disk_start = *(unsigned short*) &footer[6];
+ nrecs_on_disk = *(unsigned short*) &footer[8];
+ nrecs = *(unsigned short*) &footer[10];
+ global_header_size = *(unsigned int*) &footer[12];
+ global_header_offset = *(unsigned int*) &footer[16];
+ comment_len = *(unsigned short*) &footer[20];
+
+ assert(disk_no == 0);
+ assert(disk_start == 0);
+ assert(nrecs_on_disk == nrecs);
+ assert(comment_len == 0);
+}
+
+cnpy::NpyArray load_the_npy_file(FILE* fp) {
+ unsigned int* shape;
+ unsigned int ndims, word_size;
+ bool fortran_order;
+ cnpy::parse_npy_header(fp,word_size,shape,ndims,fortran_order);
+ unsigned long long size = 1; //long long so no overflow when multiplying by word_size
+ for(unsigned int i = 0;i < ndims;i++) size *= shape[i];
+
+ cnpy::NpyArray arr;
+ arr.word_size = word_size;
+ arr.shape = std::vector<unsigned int>(shape,shape+ndims);
+ delete[] shape;
+ arr.data = new char[size*word_size];
+ arr.fortran_order = fortran_order;
+ size_t nread = fread(arr.data,word_size,size,fp);
+ if(nread != size)
+ throw std::runtime_error("load_the_npy_file: failed fread");
+ return arr;
+}
+
+cnpy::npz_t cnpy::npz_load(std::string fname) {
+ FILE* fp = fopen(fname.c_str(),"rb");
+
+ if(!fp) printf("npz_load: Error! Unable to open file %s!\n",fname.c_str());
+ assert(fp);
+
+ cnpy::npz_t arrays;
+
+ while(1) {
+ std::vector<char> local_header(30);
+ size_t headerres = fread(&local_header[0],sizeof(char),30,fp);
+ if(headerres != 30)
+ throw std::runtime_error("npz_load: failed fread");
+
+ //if we've reached the global header, stop reading
+ if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
+
+ //read in the variable name
+ unsigned short name_len = *(unsigned short*) &local_header[26];
+ std::string varname(name_len,' ');
+ size_t vname_res = fread(&varname[0],sizeof(char),name_len,fp);
+ if(vname_res != name_len)
+ throw std::runtime_error("npz_load: failed fread");
+
+ //erase the lagging .npy
+ varname.erase(varname.end()-4,varname.end());
+
+ //read in the extra field
+ unsigned short extra_field_len = *(unsigned short*) &local_header[28];
+ if(extra_field_len > 0) {
+ std::vector<char> buff(extra_field_len);
+ size_t efield_res = fread(&buff[0],sizeof(char),extra_field_len,fp);
+ if(efield_res != extra_field_len)
+ throw std::runtime_error("npz_load: failed fread");
+ }
+
+ arrays[varname] = load_the_npy_file(fp);
+ }
+
+ fclose(fp);
+ return arrays;
+}
+
+cnpy::NpyArray cnpy::npz_load(std::string fname, std::string varname) {
+ FILE* fp = fopen(fname.c_str(),"rb");
+
+ if(!fp) {
+ printf("npz_load: Error! Unable to open file %s!\n",fname.c_str());
+ abort();
+ }
+
+ while(1) {
+ std::vector<char> local_header(30);
+ size_t header_res = fread(&local_header[0],sizeof(char),30,fp);
+ if(header_res != 30)
+ throw std::runtime_error("npz_load: failed fread");
+
+ //if we've reached the global header, stop reading
+ if(local_header[2] != 0x03 || local_header[3] != 0x04) break;
+
+ //read in the variable name
+ unsigned short name_len = *(unsigned short*) &local_header[26];
+ std::string vname(name_len,' ');
+ size_t vname_res = fread(&vname[0],sizeof(char),name_len,fp);
+ if(vname_res != name_len)
+ throw std::runtime_error("npz_load: failed fread");
+ vname.erase(vname.end()-4,vname.end()); //erase the lagging .npy
+
+ //read in the extra field
+ unsigned short extra_field_len = *(unsigned short*) &local_header[28];
+ fseek(fp,extra_field_len,SEEK_CUR); //skip past the extra field
+
+ if(vname == varname) {
+ NpyArray array = load_the_npy_file(fp);
+ fclose(fp);
+ return array;
+ }
+ else {
+ //skip past the data
+ unsigned int size = *(unsigned int*) &local_header[22];
+ fseek(fp,size,SEEK_CUR);
+ }
+ }
+
+ fclose(fp);
+ printf("npz_load: Error! Variable name %s not found in %s!\n",varname.c_str(),fname.c_str());
+ abort();
+}
+
+cnpy::NpyArray cnpy::npy_load(std::string fname) {
+
+ FILE* fp = fopen(fname.c_str(), "rb");
+
+ if(!fp) {
+ printf("npy_load: Error! Unable to open file %s!\n",fname.c_str());
+ abort();
+ }
+
+ NpyArray arr = load_the_npy_file(fp);
+
+ fclose(fp);
+ return arr;
+}
+
+
+
diff --git a/src/cnpy/cnpy.h b/src/cnpy/cnpy.h
new file mode 100644
index 00000000..b11013b9
--- /dev/null
+++ b/src/cnpy/cnpy.h
@@ -0,0 +1,241 @@
+//Copyright (C) 2011 Carl Rogers
+//Released under MIT License
+//license available in LICENSE file, or at http://www.opensource.org/licenses/mit-license.php
+
+#ifndef LIBCNPY_H_
+#define LIBCNPY_H_
+
+#include<string>
+#include<stdexcept>
+#include<sstream>
+#include<vector>
+#include<cstdio>
+#include<typeinfo>
+#include<iostream>
+#include<cassert>
+#include<zlib.h>
+#include<map>
+
+namespace cnpy {
+
+ struct NpyArray {
+ char* data;
+ std::vector<unsigned int> shape;
+ unsigned int word_size;
+ bool fortran_order;
+ void destruct() {delete[] data;}
+ };
+
+ struct npz_t : public std::map<std::string, NpyArray>
+ {
+ void destruct()
+ {
+ npz_t::iterator it = this->begin();
+ for(; it != this->end(); ++it) (*it).second.destruct();
+ }
+ };
+
+ char BigEndianTest();
+ char map_type(const std::type_info& t);
+ template<typename T> std::vector<char> create_npy_header(const T* data, const unsigned int* shape, const unsigned int ndims);
+ void parse_npy_header(FILE* fp,unsigned int& word_size, unsigned int*& shape, unsigned int& ndims, bool& fortran_order);
+ void parse_zip_footer(FILE* fp, unsigned short& nrecs, unsigned int& global_header_size, unsigned int& global_header_offset);
+ npz_t npz_load(std::string fname);
+ NpyArray npz_load(std::string fname, std::string varname);
+ NpyArray npy_load(std::string fname);
+
+ template<typename T> std::vector<char>& operator+=(std::vector<char>& lhs, const T rhs) {
+ //write in little endian
+ for(char byte = 0; byte < sizeof(T); byte++) {
+ char val = *((char*)&rhs+byte);
+ lhs.push_back(val);
+ }
+ return lhs;
+ }
+
+ template<> std::vector<char>& operator+=(std::vector<char>& lhs, const std::string rhs);
+ template<> std::vector<char>& operator+=(std::vector<char>& lhs, const char* rhs);
+
+
+ template<typename T> std::string tostring(T i, int pad = 0, char padval = ' ') {
+ std::stringstream s;
+ s << i;
+ return s.str();
+ }
+
+ template<typename T> void npy_save(std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w") {
+ FILE* fp = NULL;
+
+ if(mode == "a") fp = fopen(fname.c_str(),"r+b");
+
+ if(fp) {
+ //file exists. we need to append to it. read the header, modify the array size
+ unsigned int word_size, tmp_dims;
+ unsigned int* tmp_shape = 0;
+ bool fortran_order;
+ parse_npy_header(fp,word_size,tmp_shape,tmp_dims,fortran_order);
+ assert(!fortran_order);
+
+ if(word_size != sizeof(T)) {
+ std::cout<<"libnpy error: "<<fname<<" has word size "<<word_size<<" but npy_save appending data sized "<<sizeof(T)<<"\n";
+ assert( word_size == sizeof(T) );
+ }
+ if(tmp_dims != ndims) {
+ std::cout<<"libnpy error: npy_save attempting to append misdimensioned data to "<<fname<<"\n";
+ assert(tmp_dims == ndims);
+ }
+
+ for(int i = 1; i < ndims; i++) {
+ if(shape[i] != tmp_shape[i]) {
+ std::cout<<"libnpy error: npy_save attempting to append misshaped data to "<<fname<<"\n";
+ assert(shape[i] == tmp_shape[i]);
+ }
+ }
+ tmp_shape[0] += shape[0];
+
+ fseek(fp,0,SEEK_SET);
+ std::vector<char> header = create_npy_header(data,tmp_shape,ndims);
+ fwrite(&header[0],sizeof(char),header.size(),fp);
+ fseek(fp,0,SEEK_END);
+
+ delete[] tmp_shape;
+ }
+ else {
+ fp = fopen(fname.c_str(),"wb");
+ std::vector<char> header = create_npy_header(data,shape,ndims);
+ fwrite(&header[0],sizeof(char),header.size(),fp);
+ }
+
+ unsigned int nels = 1;
+ for(int i = 0;i < ndims;i++) nels *= shape[i];
+
+ fwrite(data,sizeof(T),nels,fp);
+ fclose(fp);
+ }
+
+ template<typename T> void npz_save(std::string zipname, std::string fname, const T* data, const unsigned int* shape, const unsigned int ndims, std::string mode = "w")
+ {
+ //first, append a .npy to the fname
+ fname += ".npy";
+
+ //now, on with the show
+ FILE* fp = NULL;
+ unsigned short nrecs = 0;
+ unsigned int global_header_offset = 0;
+ std::vector<char> global_header;
+
+ if(mode == "a") fp = fopen(zipname.c_str(),"r+b");
+
+ if(fp) {
+ //zip file exists. we need to add a new npy file to it.
+ //first read the footer. this gives us the offset and size of the global header
+ //then read and store the global header.
+ //below, we will write the the new data at the start of the global header then append the global header and footer below it
+ unsigned int global_header_size;
+ parse_zip_footer(fp,nrecs,global_header_size,global_header_offset);
+ fseek(fp,global_header_offset,SEEK_SET);
+ global_header.resize(global_header_size);
+ size_t res = fread(&global_header[0],sizeof(char),global_header_size,fp);
+ if(res != global_header_size){
+ throw std::runtime_error("npz_save: header read error while adding to existing zip");
+ }
+ fseek(fp,global_header_offset,SEEK_SET);
+ }
+ else {
+ fp = fopen(zipname.c_str(),"wb");
+ }
+
+ std::vector<char> npy_header = create_npy_header(data,shape,ndims);
+
+ unsigned long nels = 1;
+ for (int m=0; m<ndims; m++ ) nels *= shape[m];
+ int nbytes = nels*sizeof(T) + npy_header.size();
+
+ //get the CRC of the data to be added
+ unsigned int crc = crc32(0L,(unsigned char*)&npy_header[0],npy_header.size());
+ crc = crc32(crc,(unsigned char*)data,nels*sizeof(T));
+
+ //build the local header
+ std::vector<char> local_header;
+ local_header += "PK"; //first part of sig
+ local_header += (unsigned short) 0x0403; //second part of sig
+ local_header += (unsigned short) 20; //min version to extract
+ local_header += (unsigned short) 0; //general purpose bit flag
+ local_header += (unsigned short) 0; //compression method
+ local_header += (unsigned short) 0; //file last mod time
+ local_header += (unsigned short) 0; //file last mod date
+ local_header += (unsigned int) crc; //crc
+ local_header += (unsigned int) nbytes; //compressed size
+ local_header += (unsigned int) nbytes; //uncompressed size
+ local_header += (unsigned short) fname.size(); //fname length
+ local_header += (unsigned short) 0; //extra field length
+ local_header += fname;
+
+ //build global header
+ global_header += "PK"; //first part of sig
+ global_header += (unsigned short) 0x0201; //second part of sig
+ global_header += (unsigned short) 20; //version made by
+ global_header.insert(global_header.end(),local_header.begin()+4,local_header.begin()+30);
+ global_header += (unsigned short) 0; //file comment length
+ global_header += (unsigned short) 0; //disk number where file starts
+ global_header += (unsigned short) 0; //internal file attributes
+ global_header += (unsigned int) 0; //external file attributes
+ global_header += (unsigned int) global_header_offset; //relative offset of local file header, since it begins where the global header used to begin
+ global_header += fname;
+
+ //build footer
+ std::vector<char> footer;
+ footer += "PK"; //first part of sig
+ footer += (unsigned short) 0x0605; //second part of sig
+ footer += (unsigned short) 0; //number of this disk
+ footer += (unsigned short) 0; //disk where footer starts
+ footer += (unsigned short) (nrecs+1); //number of records on this disk
+ footer += (unsigned short) (nrecs+1); //total number of records
+ footer += (unsigned int) global_header.size(); //nbytes of global headers
+ footer += (unsigned int) (global_header_offset + nbytes + local_header.size()); //offset of start of global headers, since global header now starts after newly written array
+ footer += (unsigned short) 0; //zip file comment length
+
+ //write everything
+ fwrite(&local_header[0],sizeof(char),local_header.size(),fp);
+ fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp);
+ fwrite(data,sizeof(T),nels,fp);
+ fwrite(&global_header[0],sizeof(char),global_header.size(),fp);
+ fwrite(&footer[0],sizeof(char),footer.size(),fp);
+ fclose(fp);
+ }
+
+ template<typename T> std::vector<char> create_npy_header(const T* data, const unsigned int* shape, const unsigned int ndims) {
+
+ std::vector<char> dict;
+ dict += "{'descr': '";
+ dict += BigEndianTest();
+ dict += map_type(typeid(T));
+ dict += tostring(sizeof(T));
+ dict += "', 'fortran_order': False, 'shape': (";
+ dict += tostring(shape[0]);
+ for(int i = 1;i < ndims;i++) {
+ dict += ", ";
+ dict += tostring(shape[i]);
+ }
+ if(ndims == 1) dict += ",";
+ dict += "), }";
+ //pad with spaces so that preamble+dict is modulo 16 bytes. preamble is 10 bytes. dict needs to end with \n
+ int remainder = 16 - (10 + dict.size()) % 16;
+ dict.insert(dict.end(),remainder,' ');
+ dict.back() = '\n';
+
+ std::vector<char> header;
+ header += (char) 0x93;
+ header += "NUMPY";
+ header += (char) 0x01; //major version of numpy format
+ header += (char) 0x00; //minor version of numpy format
+ header += (unsigned short) dict.size();
+ header.insert(header.end(),dict.begin(),dict.end());
+
+ return header;
+ }
+
+
+}
+
+#endif
diff --git a/src/cnpy/example1.cpp b/src/cnpy/example1.cpp
new file mode 100644
index 00000000..7f1ab5d6
--- /dev/null
+++ b/src/cnpy/example1.cpp
@@ -0,0 +1,61 @@
+#include"cnpy.h"
+#include<complex>
+#include<cstdlib>
+#include<iostream>
+#include<map>
+#include<string>
+
+const int Nx = 128;
+const int Ny = 64;
+const int Nz = 32;
+
+int main()
+{
+ //create random data
+ std::complex<double>* data = new std::complex<double>[Nx*Ny*Nz];
+ for(int i = 0;i < Nx*Ny*Nz;i++) data[i] = std::complex<double>(rand(),rand());
+
+ //save it to file
+ const unsigned int shape[] = {Nz,Ny,Nx};
+ cnpy::npy_save("arr1.npy",data,shape,3,"w");
+
+ //load it into a new array
+ cnpy::NpyArray arr = cnpy::npy_load("arr1.npy");
+ std::complex<double>* loaded_data = reinterpret_cast<std::complex<double>*>(arr.data);
+
+ //make sure the loaded data matches the saved data
+ assert(arr.word_size == sizeof(std::complex<double>));
+ assert(arr.shape.size() == 3 && arr.shape[0] == Nz && arr.shape[1] == Ny && arr.shape[2] == Nx);
+ for(int i = 0; i < Nx*Ny*Nz;i++) assert(data[i] == loaded_data[i]);
+
+ //append the same data to file
+ //npy array on file now has shape (Nz+Nz,Ny,Nx)
+ cnpy::npy_save("arr1.npy",data,shape,3,"a");
+
+ //now write to an npz file
+ //non-array variables are treated as 1D arrays with 1 element
+ double myVar1 = 1.2;
+ char myVar2 = 'a';
+ unsigned int shape2[] = {1};
+ cnpy::npz_save("out.npz","myVar1",&myVar1,shape2,1,"w"); //"w" overwrites any existing file
+ cnpy::npz_save("out.npz","myVar2",&myVar2,shape2,1,"a"); //"a" appends to the file we created above
+ cnpy::npz_save("out.npz","arr1",data,shape,3,"a"); //"a" appends to the file we created above
+
+ //load a single var from the npz file
+ cnpy::NpyArray arr2 = cnpy::npz_load("out.npz","arr1");
+
+ //load the entire npz file
+ cnpy::npz_t my_npz = cnpy::npz_load("out.npz");
+
+ //check that the loaded myVar1 matches myVar1
+ cnpy::NpyArray arr_mv1 = my_npz["myVar1"];
+ double* mv1 = reinterpret_cast<double*>(arr_mv1.data);
+ assert(arr_mv1.shape.size() == 1 && arr_mv1.shape[0] == 1);
+ assert(mv1[0] == myVar1);
+
+ //cleanup: note that we are responsible for deleting all loaded data
+ delete[] data;
+ delete[] loaded_data;
+ arr2.destruct();
+ my_npz.destruct();
+}
diff --git a/src/common/npz_converter.h b/src/common/npz_converter.h
new file mode 100644
index 00000000..fe478aaf
--- /dev/null
+++ b/src/common/npz_converter.h
@@ -0,0 +1,86 @@
+#pragma once
+
+#include "cnpy/cnpy.h"
+#include "mblas/matrix.h"
+
+class NpzConverter {
+ private:
+ class NpyMatrixWrapper {
+ public:
+ NpyMatrixWrapper(const cnpy::NpyArray& npy)
+ : npy_(npy) {}
+
+ size_t size() const {
+ return size1() * size2();
+ }
+
+ float* data() const {
+ return (float*)npy_.data;
+ }
+
+ float operator()(size_t i, size_t j) const {
+ return ((float*)npy_.data)[i * size2() + j];
+ }
+
+ size_t size1() const {
+ return npy_.shape[0];
+ }
+
+ size_t size2() const {
+ if(npy_.shape.size() == 1)
+ return 1;
+ else
+ return npy_.shape[1];
+ }
+
+ private:
+ const cnpy::NpyArray& npy_;
+ };
+
+ public:
+ NpzConverter(const std::string& file)
+ : model_(cnpy::npz_load(file)),
+ destructed_(false) {
+ }
+
+ ~NpzConverter() {
+ if(!destructed_)
+ model_.destruct();
+ }
+
+ void Destruct() {
+ model_.destruct();
+ destructed_ = true;
+ }
+
+ mblas::Matrix operator[](const std::string& key) const {
+ mblas::Matrix matrix;
+ auto it = model_.find(key);
+ if(it != model_.end()) {
+ NpyMatrixWrapper np(it->second);
+ matrix.Resize(np.size1(), np.size2());
+ lib::copy(np.data(), np.data() + np.size(), matrix.begin());
+ }
+ else {
+ std::cerr << "Missing " << key << std::endl;
+ }
+ return std::move(matrix);
+ }
+
+ mblas::Matrix operator()(const std::string& key,
+ bool transpose) const {
+ mblas::Matrix matrix;
+ auto it = model_.find(key);
+ if(it != model_.end()) {
+ NpyMatrixWrapper np(it->second);
+ matrix.Resize(np.size1(), np.size2());
+ lib::copy(np.data(), np.data() + np.size(), matrix.begin());
+ }
+ mblas::Transpose(matrix);
+ return std::move(matrix);
+ }
+
+ private:
+ cnpy::npz_t model_;
+ bool destructed_;
+};
diff --git a/src/common/states.h b/src/common/states.h
new file mode 100644
index 00000000..abb03329
--- /dev/null
+++ b/src/common/states.h
@@ -0,0 +1,103 @@
+#pragma once
+
+#include <cmath>
+#include <boost/shared_ptr.hpp>
+#include <queue>
+#include <iostream>
+#include <sstream>
+#include "mblas/matrix.h"
+
+class States;
+
+class StateInfo {
+ public:
+ StateInfo(size_t rowNo, States* states)
+ : rowNo_(rowNo), states_(states) { }
+
+ ~StateInfo();
+
+ size_t GetRowNo() {
+ return rowNo_;
+ }
+
+ friend std::ostream& operator<<(std::ostream& o, const StateInfo& s);
+
+ private:
+ size_t rowNo_;
+ States* states_;
+};
+
+typedef boost::shared_ptr<StateInfo> StateInfoPtr;
+
+class States {
+ public:
+ void ConstructStates(mblas::Matrix& Out, const std::vector<StateInfoPtr>& infos) {
+ mblas::RowPairs rowPairs;
+ size_t j = 0;
+ for(auto i : infos)
+ rowPairs.emplace_back(j++, i->GetRowNo());
+ Out.Resize(rowPairs.size(), States_.Cols());
+ mblas::CopyRows(Out, States_, rowPairs);
+ }
+
+ void SaveStates(std::vector<StateInfoPtr>& infos, const mblas::Matrix& In) {
+ mblas::RowPairs rowPairs;
+ size_t append = States_.Rows();
+ for(size_t i = 0; i < In.Rows(); ++i) {
+ if(freeRows_.empty()) {
+ rowPairs.emplace_back(append, i);
+ infos.push_back(StateInfoPtr(new StateInfo(append, this)));
+ append++;
+ }
+ else {
+ size_t rowNo = freeRows_.top();
+ freeRows_.pop();
+ rowPairs.emplace_back(rowNo, i);
+ infos.push_back(StateInfoPtr(new StateInfo(rowNo, this)));
+ }
+ }
+ if(append > States_.Rows())
+ States_.Resize(append, In.Cols());
+ mblas::CopyRows(States_, In, rowPairs);
+ //std::cerr << "States: " << In.Rows() << " " << freeRows_.size() << " -> " << States_.Rows() << std::endl;
+ }
+
+ std::string ToString(size_t rowNo) {
+ std::stringstream ss;
+ ss << rowNo << " : ";
+ for(size_t i = 0; i < 5; ++i) {
+ ss << States_(rowNo, i) << " ";
+ }
+ return ss.str();
+ }
+
+ void Clear() {
+ std::priority_queue<size_t> empty;
+ freeRows_.swap(empty);
+
+ mblas::Matrix emptyMatrix;
+ mblas::Swap(States_, emptyMatrix);
+ }
+
+ private:
+
+ friend class StateInfo;
+
+ void Free(size_t rowNo) {
+ freeRows_.push(rowNo);
+ }
+
+ mblas::Matrix States_;
+ std::priority_queue<size_t> freeRows_;
+};
+
+//----------------------------------------------------------------------------//
+
+StateInfo::~StateInfo() {
+ states_->Free(rowNo_);
+}
+
+std::ostream& operator<<(std::ostream& o, const StateInfo& s) {
+ return o << s.states_->ToString(s.rowNo_);
+}
+
diff --git a/src/common/utils.cpp b/src/common/utils.cpp
new file mode 100644
index 00000000..c5a052f8
--- /dev/null
+++ b/src/common/utils.cpp
@@ -0,0 +1,23 @@
+#include "utils.h"
+#include <iostream>
+
+void Trim(std::string& s) {
+ boost::trim_if(s, boost::is_any_of(" \t\n"));
+}
+
+void Split(const std::string& line, std::vector<std::string>& pieces, const std::string del) {
+ size_t begin = 0;
+ size_t pos = 0;
+ std::string token;
+ while ((pos = line.find(del, begin)) != std::string::npos) {
+ if (pos > begin) {
+ token = line.substr(begin, pos-begin);
+ pieces.push_back(token);
+ }
+ begin = pos + del.size();
+ }
+ if (begin < line.size()) {
+ token = line.substr(begin);
+ pieces.push_back(token);
+ }
+}
diff --git a/src/common/utils.h b/src/common/utils.h
new file mode 100644
index 00000000..0a7cbbfd
--- /dev/null
+++ b/src/common/utils.h
@@ -0,0 +1,7 @@
+#pragma once
+#include <string>
+#include <boost/algorithm/string.hpp>
+
+void Trim(std::string& s);
+
+void Split(const std::string& line, std::vector<std::string>& pieces, const std::string del=" ");
diff --git a/src/common/vocab.h b/src/common/vocab.h
new file mode 100644
index 00000000..f70c3069
--- /dev/null
+++ b/src/common/vocab.h
@@ -0,0 +1,53 @@
+#pragma once
+
+#include <map>
+#include <string>
+#include <vector>
+#include <fstream>
+
+class Vocab {
+ public:
+ Vocab(const std::string& txt) {
+ std::ifstream in(txt.c_str());
+ size_t c = 0;
+ std::string line;
+ while(std::getline(in, line)) {
+ str2id_[line] = c++;
+ id2str_.push_back(line);
+ }
+ //str2id_["</s>"] = c;
+ //id2str_.push_back("</s>");
+ }
+
+ size_t operator[](const std::string& word) const {
+ auto it = str2id_.find(word);
+ if(it != str2id_.end())
+ return it->second;
+ else
+ return 1;
+ }
+
+ inline std::vector<size_t> Encode(const std::vector<std::string>& sentence, bool addEOS=false) const {
+ std::vector<size_t> indexes;
+ for (auto& word: sentence) {
+ indexes.push_back((*this)[word]);
+ }
+ if (addEOS) {
+ indexes.push_back((*this)["</s>"]);
+ }
+ return indexes;
+ }
+
+
+ const std::string& operator[](size_t id) const {
+ return id2str_[id];
+ }
+
+ size_t size() {
+ return id2str_.size();
+ }
+
+ private:
+ std::map<std::string, size_t> str2id_;
+ std::vector<std::string> id2str_;
+};
diff --git a/src/decoder.bah/decoder_main.cu b/src/decoder.bah/decoder_main.cu
new file mode 100644
index 00000000..360ee02f
--- /dev/null
+++ b/src/decoder.bah/decoder_main.cu
@@ -0,0 +1,95 @@
+#include <cstdlib>
+#include <iostream>
+#include <string>
+#include <algorithm>
+#include <memory>
+#include <boost/timer/timer.hpp>
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/lexical_cast.hpp>
+
+#include "bahdanau/model.h"
+#include "vocab.h"
+#include "decoder/nmt_decoder.h"
+
+
+void ProgramOptions(int argc, char *argv[],
+ std::string& modelPath,
+ std::string& svPath,
+ std::string& tvPath,
+ size_t& beamsize,
+ size_t& device) {
+ bool help = false;
+
+ namespace po = boost::program_options;
+ po::options_description cmdline_options("Allowed options");
+ cmdline_options.add_options()
+ ("beamsize,b", po::value(&beamsize)->default_value(10),
+ "Beam size")
+ ("device,d", po::value(&device)->default_value(0),
+ "CUDA Device")
+ ("model,m", po::value(&modelPath)->required(),
+ "Path to a model")
+ ("source,s", po::value(&svPath)->required(),
+ "Path to a source vocab file.")
+ ("target,t", po::value(&tvPath)->required(),
+ "Path to a target vocab file.")
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false),
+ "Print this help message and exit.")
+ ;
+ po::variables_map vm;
+ try {
+ po::store(po::command_line_parser(argc, argv).
+ options(cmdline_options).run(), vm);
+ po::notify(vm);
+ } catch (std::exception& e) {
+ std::cout << "Error: " << e.what() << std::endl << std::endl;
+
+ std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl;
+ std::cout << cmdline_options << std::endl;
+ exit(0);
+ }
+
+ if (help) {
+ std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl;
+ std::cout << cmdline_options << std::endl;
+ exit(0);
+ }
+}
+
+int main(int argc, char* argv[]) {
+ std::string modelPath, srcVocabPath, trgVocabPath;
+ size_t device = 0;
+ size_t beamsize = 10;
+ ProgramOptions(argc, argv, modelPath, srcVocabPath, trgVocabPath, beamsize, device);
+ std::cerr << "Using device GPU" << device << std::endl;;
+ cudaSetDevice(device);
+ std::cerr << "Loading model... ";
+ std::shared_ptr<Weights> model(new Weights(modelPath));
+ std::shared_ptr<Vocab> srcVocab(new Vocab(srcVocabPath));
+ std::shared_ptr<Vocab> trgVocab(new Vocab(trgVocabPath));
+ std::cerr << "done." << std::endl;
+
+ NMTDecoder decoder(model, srcVocab, trgVocab, beamsize);
+
+ std::cerr << "Start translating...\n";
+
+ std::ios_base::sync_with_stdio(false);
+
+ std::string line;
+ boost::timer::cpu_timer timer;
+ while(std::getline(std::cin, line)) {
+ auto result = decoder.translate(line);
+ for (auto it = result.rbegin(); it != result.rend(); ++it) {
+ std::string word = (*trgVocab)[*it];
+ if(it != result.rbegin())
+ std::cout << " ";
+ if(word != "</s>")
+ std::cout << word;
+ }
+ std::cout << std::endl;
+ }
+ std::cerr << timer.format() << std::endl;
+ return 0;
+}
diff --git a/src/decoder.bah/hypothesis.h b/src/decoder.bah/hypothesis.h
new file mode 100644
index 00000000..92a14c42
--- /dev/null
+++ b/src/decoder.bah/hypothesis.h
@@ -0,0 +1,32 @@
+#pragma once
+
+#include <cstddef>
+#include <vector>
+#include <iostream>
+
+class Hypothesis {
+ public:
+ Hypothesis(size_t word, size_t prev, float cost)
+ : prev_(prev),
+ word_(word),
+ cost_(cost) {
+ }
+
+ size_t GetWord() const {
+ return word_;
+ }
+
+ size_t GetPrevStateIndex() const {
+ return prev_;
+ }
+
+ float GetCost() const {
+ return cost_;
+ }
+
+ private:
+ const size_t prev_;
+ const size_t word_;
+ const float cost_;
+};
+
diff --git a/src/decoder.bah/hypothesis_manager.h b/src/decoder.bah/hypothesis_manager.h
new file mode 100644
index 00000000..f81ebece
--- /dev/null
+++ b/src/decoder.bah/hypothesis_manager.h
@@ -0,0 +1,75 @@
+#pragma once
+
+#include <vector>
+#include <iostream>
+
+#include "decoder/hypothesis.h"
+
+class HypothesisManager {
+ using Hypotheses = std::vector<Hypothesis>;
+ public:
+ HypothesisManager(size_t beamSize, size_t EOSIndex)
+ : beamSize_(beamSize),
+ EOSIndex_(EOSIndex),
+ baseIndex_(0) {
+ hypotheses_.emplace_back(0, 0, 0);
+ }
+
+ void AddHypotheses(const Hypotheses& hypos) {
+ size_t nextBaseIndex = hypotheses_.size();
+ for (const auto& hypo : hypos) {
+ if (hypo.GetWord() == EOSIndex_) {
+ completedHypotheses_.emplace_back(hypo.GetWord(),
+ hypo.GetPrevStateIndex() + baseIndex_,
+ hypo.GetCost());
+ } else {
+ hypotheses_.emplace_back(hypo.GetWord(), hypo.GetPrevStateIndex() + baseIndex_,
+ hypo.GetCost());
+ }
+ }
+ baseIndex_ = nextBaseIndex;
+ }
+
+ std::vector<size_t> GetBestTranslation() {
+ size_t bestHypoId = 0;
+ for (size_t i = 0; i < completedHypotheses_.size(); ++i) {
+ if (completedHypotheses_[bestHypoId].GetCost()
+ < completedHypotheses_[i].GetCost()) {
+ bestHypoId = i;
+ }
+ }
+
+
+ // for (auto hypo : completedHypotheses_) {
+ // std::vector<size_t> words;
+ // words.push_back(hypo.GetWord());
+ // size_t state = hypo.GetPrevStateIndex();
+ // while (state > 0) {
+ // words.push_back(hypotheses_[state].GetWord());
+ // state = hypotheses_[state].GetPrevStateIndex();
+ // }
+ // for (auto it = words.rbegin(); it != words.rend(); ++it) std::cerr << *it << " ";
+ // std::cerr << hypo.GetCost() << std::endl;
+ // }
+
+ std::vector<size_t> bestSentence;
+ bestSentence.push_back(completedHypotheses_[bestHypoId].GetWord());
+ size_t state = completedHypotheses_[bestHypoId].GetPrevStateIndex();
+
+ while (state > 0) {
+ bestSentence.push_back(hypotheses_[state].GetWord());
+ state = hypotheses_[state].GetPrevStateIndex();
+ }
+
+ return bestSentence;
+ }
+
+ private:
+ Hypotheses hypotheses_;
+ size_t beamSize_;
+ Hypotheses completedHypotheses_;
+ const size_t EOSIndex_;
+ size_t baseIndex_;
+};
+
+
diff --git a/src/decoder.bah/nmt_decoder.h b/src/decoder.bah/nmt_decoder.h
new file mode 100644
index 00000000..4bcaa316
--- /dev/null
+++ b/src/decoder.bah/nmt_decoder.h
@@ -0,0 +1,175 @@
+#pragma once
+
+#include <string>
+#include <memory>
+#include <algorithm>
+#include <functional>
+#include <numeric>
+
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_ptr.h>
+#include <thrust/extrema.h>
+#include <thrust/sort.h>
+#include <thrust/sequence.h>
+
+#include "common/vocab.h"
+#include "bahdanau/encoder.h"
+#include "bahdanau/decoder.h"
+#include "bahdanau/model.h"
+#include "common/utils.h"
+#include "mblas/matrix.h"
+#include "decoder/hypothesis_manager.h"
+
+
+using namespace thrust::placeholders;
+
+class NMTDecoder {
+ using Words = std::vector<size_t>;
+ using Hypotheses = std::vector<Hypothesis>;
+ public:
+ NMTDecoder(
+ std::shared_ptr<Weights> model,
+ std::shared_ptr<Vocab> srcVocab,
+ std::shared_ptr<Vocab> trgVocab,
+ const size_t beamSize=1)
+ : model_(model),
+ srcVocab_(srcVocab),
+ trgVocab_(trgVocab),
+ encoder_(new Encoder(*model_)),
+ decoder_(new Decoder(*model_)),
+ beamSize_(beamSize),
+ Costs_() {
+ }
+
+ Words translate(std::string& sentence) {
+ size_t sourceSentenceLength = prepareSourceSentence(sentence);
+ prepareDecoder();
+
+ size_t batchSize = beamSize_;
+ Costs_.Clear();
+ Costs_.Resize(batchSize, 1, 0.0);
+ HypothesisManager hypoManager(batchSize, (*trgVocab_)["</s>"]);
+
+ mblas::Matrix Probs;
+
+ for(size_t len = 0; len < 3 * sourceSentenceLength; ++len) {
+ std::vector<size_t> bestWordIndices, bestWordHyps;
+ decoder_->GetProbs(Probs, AlignedSourceContext_,
+ PrevState_, PrevEmbedding_, SourceContext_);
+
+ // Moved here; maybe the decoder should do this.
+ Element(Log(_1), Probs);
+
+ // Ugly, but GH has this too; it helps a bit with a larger
+ // beam size, but there is still a problem somewhere.
+ if(len < sourceSentenceLength * 0.5) {
+ size_t eol = (*trgVocab_)["</s>"];
+ for(size_t i = 0; i < Probs.Rows(); ++i) {
+ Probs.Set(i, eol, std::numeric_limits<float>::lowest());
+ }
+ }
+
+ auto bestHypos = GetBestExtensions(Probs, batchSize);
+ hypoManager.AddHypotheses(bestHypos);
+
+ size_t cidx = 0;
+ std::vector<size_t> costIndeces;
+ for (auto& best: bestHypos) {
+ if (best.GetWord() != (*trgVocab_)["</s>"]) {
+ bestWordIndices.push_back(best.GetWord());
+ bestWordHyps.push_back(best.GetPrevStateIndex());
+ costIndeces.push_back(cidx);
+ } else {
+ //std::cerr << "Finshed at " << Costs_(0, cidx) << std::endl;
+ --batchSize;
+ }
+ cidx++;
+ }
+
+ if (batchSize <= 0)
+ break;
+
+ // Make this conditional
+ mblas::Matrix CostsTemp;
+ mblas::Assemble(CostsTemp, Costs_, costIndeces);
+ mblas::Swap(Costs_, CostsTemp);
+ //mblas::debug1(Costs_);
+
+ decoder_->Lookup(Embedding_, bestWordIndices);
+ Assemble(BestState_, PrevState_, bestWordHyps);
+ decoder_->GetNextState(State_, Embedding_,
+ BestState_, AlignedSourceContext_);
+
+ mblas::Swap(State_, PrevState_);
+ mblas::Swap(Embedding_, PrevEmbedding_);
+ }
+
+ return hypoManager.GetBestTranslation();
+ }
+
+ private:
+ size_t prepareSourceSentence(std::string& sentence) {
+ Trim(sentence);
+ std::vector<std::string> tokens;
+ Split(sentence, tokens, " ");
+ auto encoded_tokens = srcVocab_->Encode(tokens, true);
+ encoder_->GetContext(encoded_tokens, SourceContext_);
+ return encoded_tokens.size();
+ }
+
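+  // Adds the accumulated hypothesis costs to the log-probabilities, sorts the
+  // flattened batch x vocabulary scores, and returns the batchSize best
+  // (word, previous hypothesis, cost) extensions.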
+ Hypotheses GetBestExtensions(mblas::Matrix& Probs, size_t batchSize) {
+ Hypotheses hypos;
+
+    // This should be a single kernel; certainly not two transposes of a huge batchSize x vocab matrix.
+ Costs_.Reshape(1, batchSize);
+ Broadcast(_1 + _2, Transpose(Probs), Costs_);
+ Costs_.Reshape(batchSize, 1);
+ Transpose(Probs);
+
+ size_t probSize = Probs.Cols() * Probs.Rows();
+ thrust::device_vector<int> keys(probSize);
+ thrust::sequence(keys.begin(), keys.end());
+
+    // It would be worth sorting in reverse order; that would save the index juggling below.
+ thrust::sort_by_key(Probs.begin(), Probs.end(), keys.begin());
+    // OK, this probably uses thrust::copy internally? Verify.
+ thrust::host_vector<int> bestKeys(keys.end() - batchSize, keys.end());
+
+ HypothesisManager hypoManager(batchSize, (*trgVocab_)["</s>"]);
+
+    // Use thrust::copy to make two copies: one into Costs_, one into a host vector; do the second step on the CPU.
+ for (size_t i = 0; i < bestKeys.size(); ++i) {
+ Costs_.GetVec()[i] = Probs.GetVec()[probSize - batchSize + i];
+ hypos.emplace_back(bestKeys[i] % Probs.Cols(), bestKeys[i] / Probs.Cols(), Probs.GetVec()[probSize - batchSize + i]);
+ }
+
+ return hypos;
+
+ }
+
+ void prepareDecoder() {
+ decoder_->EmptyState(PrevState_, SourceContext_, 1);
+ decoder_->EmptyEmbedding(PrevEmbedding_, 1);
+ }
+
+ protected:
+ std::shared_ptr<Weights> model_;
+ std::shared_ptr<Vocab> srcVocab_;
+ std::shared_ptr<Vocab> trgVocab_;
+ std::shared_ptr<Encoder> encoder_;
+ std::shared_ptr<Decoder> decoder_;
+ const size_t beamSize_;
+ mblas::Matrix SourceContext_;
+ mblas::Matrix PrevState_;
+ mblas::Matrix PrevEmbedding_;
+ mblas::Matrix BestState_;
+ mblas::Matrix Costs_;
+
+ mblas::Matrix AlignedSourceContext_;
+
+ mblas::Matrix State_;
+ mblas::Matrix Embedding_;
+
+};
diff --git a/src/decoder.bah/result.h b/src/decoder.bah/result.h
new file mode 100644
index 00000000..dda0e1f2
--- /dev/null
+++ b/src/decoder.bah/result.h
@@ -0,0 +1,15 @@
+#pragma once
+
+#include <cstddef>
+
+struct Result {
+ Result(const size_t state, const size_t word, const float score)
+ : state(state),
+ word(word),
+ score(score) {
+ }
+
+ size_t state;
+ size_t word;
+ float score;
+};
diff --git a/src/decoder/decoder_main.cu b/src/decoder/decoder_main.cu
new file mode 100644
index 00000000..7c7f24d3
--- /dev/null
+++ b/src/decoder/decoder_main.cu
@@ -0,0 +1,180 @@
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <limits>
+#include <boost/timer/timer.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include <thrust/functional.h>
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include <thrust/device_ptr.h>
+#include <thrust/extrema.h>
+#include <thrust/sort.h>
+#include <thrust/sequence.h>
+
+
+#include "mblas/matrix.h"
+#include "dl4mt.h"
+#include "vocab.h"
+
+using namespace mblas;
+
+typedef std::tuple<size_t, size_t, float> Hypothesis;
+typedef std::vector<Hypothesis> Beam;
+typedef std::vector<Beam> History;
+
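+// Scores all extensions of the surviving hypotheses: adds each hypothesis'
+// accumulated cost to the log-probabilities of its row, sorts the flattened
+// scores in descending order and keeps the top beamSize (word, hypothesis,
+// cost) triples.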
+void BestHyps(Beam& bestHyps, const Beam& prevHyps, mblas::Matrix& Probs, const size_t beamSize) {
+ mblas::Matrix Costs(Probs.Rows(), 1);
+ thrust::host_vector<float> vCosts;
+ for(const Hypothesis& h : prevHyps)
+ vCosts.push_back(std::get<2>(h));
+ thrust::copy(vCosts.begin(), vCosts.end(), Costs.begin());
+
+ mblas::BroadcastVecColumn(Log(_1) + _2, Probs, Costs);
+
+ thrust::device_vector<unsigned> keys(Probs.size());
+ thrust::sequence(keys.begin(), keys.end());
+
+ // Here it would be nice to have a partial sort instead of full sort
+ thrust::sort_by_key(Probs.begin(), Probs.end(),
+ keys.begin(), thrust::greater<float>());
+
+ thrust::host_vector<unsigned> bestKeys(beamSize);
+ thrust::copy_n(keys.begin(), beamSize, bestKeys.begin());
+ thrust::host_vector<float> bestCosts(beamSize);
+ thrust::copy_n(Probs.begin(), beamSize, bestCosts.begin());
+
+ for(size_t i = 0; i < beamSize; i++) {
+ size_t wordIndex = bestKeys[i] % Probs.Cols();
+ size_t hypIndex = bestKeys[i] / Probs.Cols();
+ float cost = bestCosts[i];
+ bestHyps.emplace_back(wordIndex, hypIndex, cost);
+ }
+}
+
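+// Walks the beam history backwards: picks the best hypothesis that ended in
+// </s> (or any hypothesis in the final step), then follows the stored
+// back-pointers to reconstruct and print the target sentence.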
+void FindBest(const History& history, const Vocab& vcb) {
+ std::vector<size_t> targetWords;
+
+ size_t best = 0;
+ size_t beamSize = 0;
+ float bestCost = std::numeric_limits<float>::lowest();
+
+ for(auto b = history.rbegin(); b != history.rend(); b++) {
+ if(b->size() > beamSize) {
+ beamSize = b->size();
+ for(size_t i = 0; i < beamSize; ++i) {
+ if(b == history.rbegin() || std::get<0>((*b)[i]) == vcb["</s>"]) {
+ if(std::get<2>((*b)[i]) > bestCost) {
+ best = i;
+ bestCost = std::get<2>((*b)[i]);
+ targetWords.clear();
+ }
+ }
+ }
+ }
+
+ auto& bestHyp = (*b)[best];
+ targetWords.push_back(std::get<0>(bestHyp));
+ best = std::get<1>(bestHyp);
+ }
+
+ std::reverse(targetWords.begin(), targetWords.end());
+ for(size_t i = 0; i < targetWords.size(); ++i) {
+ if(vcb[targetWords[i]] != "</s>") {
+ if(i > 0) {
+ std::cout << " ";
+ }
+ std::cout << vcb[targetWords[i]];
+ }
+ }
+ std::cout << std::endl;
+}
+
+int main(int argc, char** argv) {
+ size_t device = 0;
+
+ if(argc > 1) {
+ if(std::string(argv[1]) == "1")
+ device = 1;
+ else if(std::string(argv[1]) == "2")
+ device = 2;
+ }
+
+ cudaSetDevice(device);
+
+ Weights weights("/home/marcinj/Badania/mosesNMT/moses/FF/NMT/testmodel/model.npz", device);
+ Vocab svcb("/home/marcinj/Badania/mosesNMT/moses/FF/NMT/testmodel/vocab.en.txt");
+ Vocab tvcb("/home/marcinj/Badania/mosesNMT/moses/FF/NMT/testmodel/vocab.de.txt");
+
+ std::cerr << "Creating encoder" << std::endl;
+ Encoder encoder(weights);
+
+ std::cerr << "Creating decoder" << std::endl;
+ Decoder decoder(weights);
+
+ mblas::Matrix State, NextState, BeamState;
+ mblas::Matrix Embeddings, NextEmbeddings;
+ mblas::Matrix Probs;
+
+ std::string source;
+ boost::timer::auto_cpu_timer timer;
+
+ while(std::getline(std::cin, source)) {
+ std::vector<std::string> sourceSplit;
+ boost::split(sourceSplit, source, boost::is_any_of(" "),
+ boost::token_compress_on);
+
+ std::vector<size_t> sourceWords(sourceSplit.size());
+ std::transform(sourceSplit.begin(), sourceSplit.end(), sourceWords.begin(),
+ [&](const std::string& w) { return svcb[w]; });
+ sourceWords.push_back(svcb["</s>"]);
+
+ mblas::Matrix SourceContext;
+ encoder.GetContext(sourceWords, SourceContext);
+
+ size_t beamSize = 12;
+
+ decoder.EmptyState(State, SourceContext, 1);
+ decoder.EmptyEmbedding(Embeddings, 1);
+
+ History history;
+
+ Beam prevHyps;
+ prevHyps.emplace_back(0, 0, 0.0);
+
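+    // Expand the beam until every hypothesis has produced </s> or the output
+    // grows to three times the source length.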
+ do {
+ decoder.MakeStep(NextState, Probs, State, Embeddings, SourceContext);
+
+ Beam hyps;
+ BestHyps(hyps, prevHyps, Probs, beamSize);
+ history.push_back(hyps);
+
+ Beam survivors;
+ std::vector<size_t> beamWords;
+ std::vector<size_t> beamStateIds;
+ for(auto& h : hyps) {
+ if(std::get<0>(h) != tvcb["</s>"]) {
+ survivors.push_back(h);
+ beamWords.push_back(std::get<0>(h));
+ beamStateIds.push_back(std::get<1>(h));
+ }
+ }
+ beamSize = survivors.size();
+
+ if(beamSize == 0)
+ break;
+
+ decoder.Lookup(NextEmbeddings, beamWords);
+ mblas::Assemble(BeamState, NextState, beamStateIds);
+
+ mblas::Swap(Embeddings, NextEmbeddings);
+ mblas::Swap(State, BeamState);
+ prevHyps.swap(survivors);
+
+ } while(history.size() < sourceWords.size() * 3);
+
+ FindBest(history, tvcb);
+ }
+} \ No newline at end of file
diff --git a/src/dl4mt.h b/src/dl4mt.h
new file mode 100644
index 00000000..380928e1
--- /dev/null
+++ b/src/dl4mt.h
@@ -0,0 +1,5 @@
+#pragma once
+
+#include "dl4mt/model.h"
+#include "dl4mt/encoder.h"
+#include "dl4mt/decoder.h"
diff --git a/src/dl4mt/decoder.h b/src/dl4mt/decoder.h
new file mode 100644
index 00000000..8f2f972e
--- /dev/null
+++ b/src/dl4mt/decoder.h
@@ -0,0 +1,263 @@
+#pragma once
+
+#include "mblas/matrix.h"
+#include "dl4mt/model.h"
+#include "dl4mt/gru.h"
+
+class Decoder {
+ private:
+ template <class Weights>
+ class Embeddings {
+ public:
+ Embeddings(const Weights& model)
+ : w_(model)
+ {}
+
+ void Lookup(mblas::Matrix& Rows, const std::vector<size_t>& ids) {
+ using namespace mblas;
+ Assemble(Rows, w_.E_, ids);
+ }
+
+ size_t GetDim() {
+ return w_.E_.Cols();
+ }
+
+ private:
+ const Weights& w_;
+ };
+
+ template <class Weights1, class Weights2>
+ class RNNHidden {
+ public:
+ RNNHidden(const Weights1& initModel, const Weights2& gruModel)
+ : w_(initModel), gru_(gruModel) {}
+
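+    // Initial decoder state: tanh(mean(SourceContext) * Wi + Bi), replicated
+    // over the batch.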
+ void InitializeState(mblas::Matrix& State,
+ const mblas::Matrix& SourceContext,
+ const size_t batchSize = 1) {
+ using namespace mblas;
+
+ Mean(Temp1_, SourceContext);
+ Temp2_.Clear();
+ Temp2_.Resize(batchSize, SourceContext.Cols(), 0.0);
+ BroadcastVec(_1 + _2, Temp2_, Temp1_);
+ Prod(State, Temp2_, w_.Wi_);
+ BroadcastVec(Tanh(_1 + _2), State, w_.Bi_);
+ }
+
+ void GetNextState(mblas::Matrix& NextState,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Context) {
+ gru_.GetNextState(NextState, State, Context);
+ }
+
+ private:
+ const Weights1& w_;
+ const GRU<Weights2> gru_;
+
+ mblas::Matrix Temp1_;
+ mblas::Matrix Temp2_;
+ };
+
+ template <class Weights>
+ class RNNFinal {
+ public:
+ RNNFinal(const Weights& model)
+ : gru_(model) {}
+
+ void GetNextState(mblas::Matrix& NextState,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Context) {
+ gru_.GetNextState(NextState, State, Context);
+ }
+
+ private:
+ const GRU<Weights> gru_;
+ };
+
+ template <class Weights>
+ class Alignment {
+ public:
+ Alignment(const Weights& model)
+ : w_(model)
+ {
+ for(int i = 0; i < 2; ++i) {
+ cudaStreamCreate(&s_[i]);
+ cublasCreate(&h_[i]);
+ cublasSetStream(h_[i], s_[i]);
+ }
+ }
+
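+    // Additive (Bahdanau-style) attention: scores each source position against
+    // the current hidden state with V * tanh(SourceContext * U + HiddenState * W + B),
+    // softmaxes over positions and returns the weighted sum of the source context.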
+ void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
+ const mblas::Matrix& HiddenState,
+ const mblas::Matrix& SourceContext) {
+ using namespace mblas;
+
+ Prod(h_[0], Temp1_, SourceContext, w_.U_);
+ Prod(h_[1], Temp2_, HiddenState, w_.W_);
+ BroadcastVec(_1 + _2, Temp2_, w_.B_, s_[1]);
+
+ cudaDeviceSynchronize();
+
+ Broadcast(Tanh(_1 + _2), Temp1_, Temp2_);
+
+ Prod(A_, w_.V_, Temp1_, false, true);
+
+ size_t rows1 = SourceContext.Rows();
+ size_t rows2 = HiddenState.Rows();
+ A_.Reshape(rows2, rows1); // due to broadcasting above
+ Element(_1 + w_.C_(0,0), A_);
+
+ mblas::Softmax(A_);
+ Prod(AlignedSourceContext, A_, SourceContext);
+ }
+
+ private:
+ const Weights& w_;
+
+ cublasHandle_t h_[2];
+ cudaStream_t s_[2];
+
+ mblas::Matrix Temp1_;
+ mblas::Matrix Temp2_;
+ mblas::Matrix A_;
+
+ mblas::Matrix Ones_;
+ mblas::Matrix Sums_;
+ };
+
+ template <class Weights>
+ class Softmax {
+ public:
+ Softmax(const Weights& model)
+ : w_(model), filtered_(false)
+ {
+ for(int i = 0; i < 3; ++i) {
+ cudaStreamCreate(&s_[i]);
+ cublasCreate(&h_[i]);
+ cublasSetStream(h_[i], s_[i]);
+ }
+ }
+
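+    // Output layer: three GEMMs (state, embedding, attended context) issued on
+    // separate streams, combined with tanh, projected to the target vocabulary
+    // and normalized with a row-wise softmax.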
+ void GetProbs(mblas::Matrix& Probs,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Embedding,
+ const mblas::Matrix& AlignedSourceContext) {
+ using namespace mblas;
+
+ Prod(h_[0], T1_, State, w_.W1_);
+ Prod(h_[1], T2_, Embedding, w_.W2_);
+ Prod(h_[2], T3_, AlignedSourceContext, w_.W3_);
+
+ BroadcastVec(_1 + _2, T1_, w_.B1_, s_[0]);
+ BroadcastVec(_1 + _2, T2_, w_.B2_, s_[1]);
+ BroadcastVec(_1 + _2, T3_, w_.B3_, s_[2]);
+
+ cudaDeviceSynchronize();
+
+ Element(Tanh(_1 + _2 + _3), T1_, T2_, T3_);
+
+ Prod(Probs, T1_, w_.W4_);
+ BroadcastVec(_1 + _2, Probs, w_.B4_);
+ mblas::Softmax(Probs);
+ }
+
+ void Filter(const std::vector<size_t>& ids) {
+ }
+
+ private:
+ const Weights& w_;
+
+ cublasHandle_t h_[3];
+ cudaStream_t s_[3];
+
+ bool filtered_;
+ mblas::Matrix FilteredWo_;
+ mblas::Matrix FilteredWoB_;
+
+ mblas::Matrix T1_;
+ mblas::Matrix T2_;
+ mblas::Matrix T3_;
+ };
+
+ public:
+ Decoder(const Weights& model)
+ : embeddings_(model.decEmbeddings_),
+ rnn1_(model.decInit_, model.decGru1_),
+ rnn2_(model.decGru2_),
+ alignment_(model.decAlignment_),
+ softmax_(model.decSoftmax_)
+ {}
+
+ void MakeStep(mblas::Matrix& NextState,
+ mblas::Matrix& Probs,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Embeddings,
+ const mblas::Matrix& SourceContext) {
+ GetHiddenState(HiddenState_, State, Embeddings);
+ GetAlignedSourceContext(AlignedSourceContext_, HiddenState_, SourceContext);
+ GetNextState(NextState, HiddenState_, AlignedSourceContext_);
+ GetProbs(Probs, NextState, Embeddings, AlignedSourceContext_);
+ }
+
+ void EmptyState(mblas::Matrix& State,
+ const mblas::Matrix& SourceContext,
+ size_t batchSize = 1) {
+ rnn1_.InitializeState(State, SourceContext, batchSize);
+ }
+
+ void EmptyEmbedding(mblas::Matrix& Embedding,
+ size_t batchSize = 1) {
+ Embedding.Clear();
+ Embedding.Resize(batchSize, embeddings_.GetDim(), 0);
+ }
+
+ void Lookup(mblas::Matrix& Embedding,
+ const std::vector<size_t>& w) {
+ embeddings_.Lookup(Embedding, w);
+ }
+
+ void Filter(const std::vector<size_t>& ids) {
+
+ }
+
+
+ //private:
+
+ void GetHiddenState(mblas::Matrix& HiddenState,
+ const mblas::Matrix& PrevState,
+ const mblas::Matrix& Embedding) {
+ rnn1_.GetNextState(HiddenState, PrevState, Embedding);
+ }
+
+ void GetAlignedSourceContext(mblas::Matrix& AlignedSourceContext,
+ const mblas::Matrix& HiddenState,
+ const mblas::Matrix& SourceContext) {
+ alignment_.GetAlignedSourceContext(AlignedSourceContext, HiddenState, SourceContext);
+ }
+
+ void GetNextState(mblas::Matrix& State,
+ const mblas::Matrix& HiddenState,
+ const mblas::Matrix& AlignedSourceContext) {
+ rnn2_.GetNextState(State, HiddenState, AlignedSourceContext);
+ }
+
+
+ void GetProbs(mblas::Matrix& Probs,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Embedding,
+ const mblas::Matrix& AlignedSourceContext) {
+ softmax_.GetProbs(Probs, State, Embedding, AlignedSourceContext);
+ }
+
+
+ private:
+ mblas::Matrix HiddenState_;
+ mblas::Matrix AlignedSourceContext_;
+
+ Embeddings<Weights::DecEmbeddings> embeddings_;
+ RNNHidden<Weights::DecInit, Weights::DecGRU1> rnn1_;
+ RNNFinal<Weights::DecGRU2> rnn2_;
+ Alignment<Weights::DecAlignment> alignment_;
+ Softmax<Weights::DecSoftmax> softmax_;
+};
diff --git a/src/dl4mt/encoder.h b/src/dl4mt/encoder.h
new file mode 100644
index 00000000..b59adaa4
--- /dev/null
+++ b/src/dl4mt/encoder.h
@@ -0,0 +1,95 @@
+#pragma once
+
+#include "mblas/matrix.h"
+#include "dl4mt/model.h"
+#include "dl4mt/gru.h"
+
+class Encoder {
+ private:
+ template <class Weights>
+ class Embeddings {
+ public:
+ Embeddings(const Weights& model)
+ : w_(model)
+ {}
+
+ void Lookup(mblas::Matrix& Row, size_t i) {
+ using namespace mblas;
+ CopyRow(Row, w_.E_, i);
+ }
+
+ private:
+ const Weights& w_;
+ };
+
+ template <class Weights>
+ class RNN {
+ public:
+ RNN(const Weights& model)
+ : gru_(model) {}
+
+ void InitializeState(size_t batchSize = 1) {
+ State_.Clear();
+ State_.Resize(batchSize, 1024, 0.0);
+ }
+
+ void GetNextState(mblas::Matrix& NextState,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Embd) {
+ gru_.GetNextState(NextState, State, Embd);
+ }
+
+ template <class It>
+ void GetContext(It it, It end,
+ mblas::Matrix& Context, bool invert) {
+ InitializeState();
+
+ size_t n = std::distance(it, end);
+ size_t i = 0;
+ while(it != end) {
+ GetNextState(State_, State_, *it++);
+ if(invert)
+ mblas::PasteRow(Context, State_, n - i - 1, 1024);
+ else
+ mblas::PasteRow(Context, State_, i, 0);
+ ++i;
+ }
+ }
+
+ private:
+ // Model matrices
+ const GRU<Weights> gru_;
+
+ mblas::Matrix State_;
+ };
+
+ public:
+ Encoder(const Weights& model)
+ : embeddings_(model.encEmbeddings_),
+ forwardRnn_(model.encForwardGRU_),
+ backwardRnn_(model.encBackwardGRU_)
+ {}
+
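+  // Builds the annotation matrix: the forward GRU writes columns 0..1023 of
+  // each row, the backward GRU (run over the reversed sentence) writes
+  // columns 1024..2047.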
+ void GetContext(const std::vector<size_t>& words,
+ mblas::Matrix& Context) {
+ std::vector<mblas::Matrix> embeddedWords;
+
+ Context.Resize(words.size(), 2048);
+ for(auto& w : words) {
+ embeddedWords.emplace_back();
+ embeddings_.Lookup(embeddedWords.back(), w);
+ }
+
+ forwardRnn_.GetContext(embeddedWords.cbegin(),
+ embeddedWords.cend(),
+ Context, false);
+ backwardRnn_.GetContext(embeddedWords.crbegin(),
+ embeddedWords.crend(),
+ Context, true);
+ }
+
+ private:
+ Embeddings<Weights::EncEmbeddings> embeddings_;
+ RNN<Weights::EncForwardGRU> forwardRnn_;
+ RNN<Weights::EncBackwardGRU> backwardRnn_;
+};
diff --git a/src/dl4mt/gru.h b/src/dl4mt/gru.h
new file mode 100644
index 00000000..e09d9327
--- /dev/null
+++ b/src/dl4mt/gru.h
@@ -0,0 +1,172 @@
+#pragma once
+
+#include "mblas/matrix.h"
+
+template <class Weights>
+class SlowGRU {
+ public:
+ SlowGRU(const Weights& model)
+ : w_(model) {}
+
+ void GetNextState(mblas::Matrix& NextState,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Context) const {
+ using namespace mblas;
+
+ const size_t cols = State.Cols();
+
+ // @TODO: Optimization
+ // @TODO: Launch streams to perform GEMMs in parallel
+ // @TODO: Join matrices and perform single GEMM --------
+ Prod(RU_, Context, w_.W_);
+ Prod(H_, Context, w_.Wx_);
+ // -----------------------------------------------------
+
+ // @TODO: Join matrices and perform single GEMM --------
+ Prod(Temp1_, State, w_.U_);
+ Prod(Temp2_, State, w_.Ux_);
+ // -----------------------------------------------------
+
+ // @TODO: Organize into one kernel ---------------------
+ BroadcastVec(_1 + _2, RU_, w_.B_); // Broadcasting row-wise
+ Element(Logit(_1 + _2), RU_, Temp1_);
+ Slice(R_, RU_, 0, cols);
+ Slice(U_, RU_, 1, cols);
+
+ BroadcastVec(_1 + _2, H_, w_.Bx1_); // Broadcasting row-wise
+ BroadcastVec(_1 + _2, Temp2_, w_.Bx2_); // Broadcasting row-wise
+
+ Element(Tanh(_1 + _2 * _3), H_, R_, Temp2_);
+ Element((1.0 - _1) * _2 + _1 * _3, U_, H_, State);
+ // -----------------------------------------------------
+
+ Swap(NextState, U_);
+ }
+
+ private:
+ // Model matrices
+ const Weights& w_;
+
+ // reused to avoid allocation
+ mutable mblas::Matrix RU_;
+ mutable mblas::Matrix R_;
+ mutable mblas::Matrix U_;
+ mutable mblas::Matrix H_;
+ mutable mblas::Matrix Temp1_;
+ mutable mblas::Matrix Temp2_;
+};
+
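+// Fused GRU gate kernel: ru/t1 hold the interleaved reset and update gate
+// pre-activations, h/t2 the candidate-state pre-activations; each output cell
+// is (1 - u) * tanh(h + bx1 + r * (t2 + bx2)) + u * state.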
+__global__ void gElementwiseOps(float* out,
+ const float* state,
+ const float* ru,
+ const float* h,
+ const float* t1,
+ const float* t2,
+ const float* b,
+ const float* bx1,
+ const float* bx2,
+ size_t rows, size_t cols) {
+
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ float* rowOut = out + j * cols;
+ const float* rowRu = ru + j * cols * 2;
+ const float* rowT1 = t1 + j * cols * 2;
+
+ const float* rowH = h + j * cols;
+ const float* rowT2 = t2 + j * cols;
+ const float* rowState = state + j * cols;
+
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols) {
+ float ev1 = expf(-(rowRu[i] + b[i] + rowT1[i]));
+ float r = 1.0 / (1.0 + ev1);
+
+ int k = i + cols;
+ float ev2 = expf(-(rowRu[k] + b[k] + rowT1[k]));
+ float u = 1.0 / (1.0 + ev2);
+
+ float hv = rowH[i] + bx1[i];
+ float t2v = rowT2[i] + bx2[i];
+ hv = tanhf(hv + r * t2v);
+ rowOut[i] = (1.0 - u) * hv + u * rowState[i];
+ }
+ }
+ }
+ }
+}
+
+
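+// Same computation as SlowGRU, but the four GEMMs run on separate cuBLAS
+// streams and the gate non-linearities are applied in the single fused
+// kernel above instead of a chain of element-wise calls.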
+template <class Weights>
+class FastGRU {
+ public:
+ FastGRU(const Weights& model)
+ : w_(model) {
+ for(int i = 0; i < 4; ++i) {
+ cudaStreamCreate(&s_[i]);
+ cublasCreate(&h_[i]);
+ cublasSetStream(h_[i], s_[i]);
+ }
+ }
+
+ void GetNextState(mblas::Matrix& NextState,
+ const mblas::Matrix& State,
+ const mblas::Matrix& Context) const {
+ using namespace mblas;
+
+ const size_t cols = State.Cols();
+
+ // @TODO: Optimization
+ // @TODO: Launch streams to perform GEMMs in parallel
+ // @TODO: Join matrices and perform single GEMM --------
+ Prod(h_[0], RU_, Context, w_.W_);
+ Prod(h_[1], H_, Context, w_.Wx_);
+ // -----------------------------------------------------
+
+ // @TODO: Join matrices and perform single GEMM --------
+ Prod(h_[2], Temp1_, State, w_.U_);
+ Prod(h_[3], Temp2_, State, w_.Ux_);
+ // -----------------------------------------------------
+ cudaDeviceSynchronize();
+
+ ElementwiseOps(NextState, State, RU_, H_, Temp1_, Temp2_);
+ }
+
+ void ElementwiseOps(mblas::Matrix& NextState,
+ const mblas::Matrix& State,
+ const mblas::Matrix& RU,
+ const mblas::Matrix& H,
+ const mblas::Matrix& Temp1,
+ const mblas::Matrix& Temp2) const {
+ const size_t rows = State.Rows();
+ const size_t cols = State.Cols();
+ NextState.Resize(rows, cols);
+
+ int blocks = std::min(MAX_BLOCKS, (int)rows);
+ int threads = std::min(MAX_THREADS, (int)cols);
+ gElementwiseOps<<<blocks, threads>>>(NextState.data(), State.data(),
+ RU.data(), H.data(),
+ Temp1.data(), Temp2.data(),
+ w_.B_.data(), w_.Bx1_.data(), w_.Bx2_.data(),
+ rows, cols);
+ cudaStreamSynchronize(0);
+ }
+
+ private:
+ // Model matrices
+ const Weights& w_;
+
+ cublasHandle_t h_[4];
+ cudaStream_t s_[4];
+
+ // reused to avoid allocation
+ mutable mblas::Matrix RU_;
+ mutable mblas::Matrix H_;
+ mutable mblas::Matrix Temp1_;
+ mutable mblas::Matrix Temp2_;
+};
+
+template<class T>
+using GRU = FastGRU<T>; \ No newline at end of file
diff --git a/src/dl4mt/model.h b/src/dl4mt/model.h
new file mode 100644
index 00000000..de790aab
--- /dev/null
+++ b/src/dl4mt/model.h
@@ -0,0 +1,191 @@
+#pragma once
+
+#include <map>
+#include <string>
+
+#include "mblas/matrix.h"
+#include "npz_converter.h"
+
+struct Weights {
+
+ //////////////////////////////////////////////////////////////////////////////
+
+ struct EncEmbeddings {
+ EncEmbeddings(const NpzConverter& model)
+ : E_(model["Wemb"])
+ {}
+
+ const mblas::Matrix E_;
+ };
+
+ struct EncForwardGRU {
+ EncForwardGRU(const NpzConverter& model)
+ : W_(model["encoder_W"]),
+ B_(model("encoder_b", true)),
+ U_(model["encoder_U"]),
+ Wx_(model["encoder_Wx"]),
+ Bx1_(model("encoder_bx", true)),
+ Bx2_(Bx1_.Rows(), Bx1_.Cols(), 0.0),
+ Ux_(model["encoder_Ux"])
+ { }
+
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix Wx_;
+ const mblas::Matrix Bx1_;
+ const mblas::Matrix Bx2_;
+ const mblas::Matrix Ux_;
+ };
+
+ struct EncBackwardGRU {
+ EncBackwardGRU(const NpzConverter& model)
+ : W_(model["encoder_r_W"]),
+ B_(model("encoder_r_b", true)),
+ U_(model["encoder_r_U"]),
+ Wx_(model["encoder_r_Wx"]),
+ Bx1_(model("encoder_r_bx", true)),
+ Bx2_(Bx1_.Rows(), Bx1_.Cols(), 0.0),
+ Ux_(model["encoder_r_Ux"])
+ {}
+
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix Wx_;
+ const mblas::Matrix Bx1_;
+ const mblas::Matrix Bx2_;
+ const mblas::Matrix Ux_;
+ };
+
+ //////////////////////////////////////////////////////////////////////////////
+
+ struct DecEmbeddings {
+ DecEmbeddings(const NpzConverter& model)
+ : E_(model["Wemb_dec"])
+ {}
+
+ const mblas::Matrix E_;
+ };
+
+ struct DecInit {
+ DecInit(const NpzConverter& model)
+ : Wi_(model["ff_state_W"]),
+ Bi_(model("ff_state_b", true))
+ {}
+
+ const mblas::Matrix Wi_;
+ const mblas::Matrix Bi_;
+ };
+
+ struct DecGRU1 {
+ DecGRU1(const NpzConverter& model)
+ : W_(model["decoder_W"]),
+ B_(model("decoder_b", true)),
+ U_(model["decoder_U"]),
+ Wx_(model["decoder_Wx"]),
+ Bx1_(model("decoder_bx", true)),
+ Bx2_(Bx1_.Rows(), Bx1_.Cols(), 0.0),
+ Ux_(model["decoder_Ux"])
+ {}
+
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix Wx_;
+ const mblas::Matrix Bx1_;
+ const mblas::Matrix Bx2_;
+ const mblas::Matrix Ux_;
+ };
+
+ struct DecGRU2 {
+ DecGRU2(const NpzConverter& model)
+ : W_(model["decoder_Wc"]),
+ B_(model("decoder_b_nl", true)),
+ U_(model["decoder_U_nl"]),
+ Wx_(model["decoder_Wcx"]),
+ Bx2_(model("decoder_bx_nl", true)),
+ Bx1_(Bx2_.Rows(), Bx2_.Cols(), 0.0),
+ Ux_(model["decoder_Ux_nl"])
+ {}
+
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix Wx_;
+ const mblas::Matrix Bx2_;
+ const mblas::Matrix Bx1_;
+ const mblas::Matrix Ux_;
+ };
+
+ struct DecAlignment {
+ DecAlignment(const NpzConverter& model)
+ : V_(model("decoder_U_att", true)),
+ W_(model["decoder_W_comb_att"]),
+ B_(model("decoder_b_att", true)),
+ U_(model["decoder_Wc_att"]),
+ C_(model["decoder_c_tt"]) // scalar?
+ {}
+
+ const mblas::Matrix V_;
+ const mblas::Matrix W_;
+ const mblas::Matrix B_;
+ const mblas::Matrix U_;
+ const mblas::Matrix C_;
+ };
+
+ struct DecSoftmax {
+ DecSoftmax(const NpzConverter& model)
+ : W1_(model["ff_logit_lstm_W"]),
+ B1_(model("ff_logit_lstm_b", true)),
+ W2_(model["ff_logit_prev_W"]),
+ B2_(model("ff_logit_prev_b", true)),
+ W3_(model["ff_logit_ctx_W"]),
+ B3_(model("ff_logit_ctx_b", true)),
+ W4_(model["ff_logit_W"]),
+ B4_(model("ff_logit_b", true))
+ {}
+
+ const mblas::Matrix W1_;
+ const mblas::Matrix B1_;
+ const mblas::Matrix W2_;
+ const mblas::Matrix B2_;
+ const mblas::Matrix W3_;
+ const mblas::Matrix B3_;
+ const mblas::Matrix W4_;
+ const mblas::Matrix B4_;
+ };
+
+ Weights(const std::string& npzFile, size_t device = 0)
+ : Weights(NpzConverter(npzFile), device)
+ {}
+
+ Weights(const NpzConverter& model, size_t device = 0)
+ : encEmbeddings_(model),
+ encForwardGRU_(model),
+ encBackwardGRU_(model),
+ decEmbeddings_(model),
+ decInit_(model),
+ decGru1_(model),
+ decGru2_(model),
+ decAlignment_(model),
+ decSoftmax_(model),
+ device_(device)
+ {}
+
+ size_t GetDevice() {
+ return device_;
+ }
+
+ const EncEmbeddings encEmbeddings_;
+ const DecEmbeddings decEmbeddings_;
+ const EncForwardGRU encForwardGRU_;
+ const EncBackwardGRU encBackwardGRU_;
+ const DecInit decInit_;
+ const DecGRU1 decGru1_;
+ const DecGRU2 decGru2_;
+ const DecAlignment decAlignment_;
+ const DecSoftmax decSoftmax_;
+
+ const size_t device_;
+};
diff --git a/src/mblas/base_matrix.h b/src/mblas/base_matrix.h
new file mode 100644
index 00000000..71a74a7d
--- /dev/null
+++ b/src/mblas/base_matrix.h
@@ -0,0 +1,13 @@
+#pragma once
+
+namespace mblas {
+
+class BaseMatrix {
+ public:
+ //virtual ~BaseMatrix() {}
+
+ virtual size_t Rows() const = 0;
+ virtual size_t Cols() const = 0;
+};
+
+} \ No newline at end of file
diff --git a/src/mblas/bind.hpp b/src/mblas/bind.hpp
new file mode 100644
index 00000000..9386f0b1
--- /dev/null
+++ b/src/mblas/bind.hpp
@@ -0,0 +1,511 @@
+#pragma once
+
+
+#if __cplusplus > 199711L
+
+
+#include <type_traits>
+#include <utility>
+#include <thrust/tuple.h>
+#include <thrust/functional.h>
+
+
+namespace thrust
+{
+namespace experimental
+{
+
+
+template<class T>
+struct is_placeholder : thrust::detail::false_type {};
+
+
+template<unsigned int i>
+struct is_placeholder<
+ thrust::detail::functional::actor<
+ thrust::detail::functional::argument<i>
+ >
+> : thrust::detail::true_type {};
+
+
+namespace detail
+{
+namespace bind_detail
+{
+
+
+template<class T>
+using decay_t = typename std::decay<T>::type;
+
+
+template<class _Tp, _Tp... _Ip>
+struct integer_sequence
+{
+ typedef _Tp value_type;
+ static_assert(std::is_integral<_Tp>::value,
+ "std::integer_sequence can only be instantiated with an integral type" );
+ static constexpr size_t size() noexcept { return sizeof...(_Ip); }
+};
+
+
+template<size_t... _Ip>
+using index_sequence = integer_sequence<size_t, _Ip...>;
+
+
+template <class _Tp, _Tp _Sp, _Tp _Ep, class _IntSequence>
+struct make_integer_sequence_impl_unchecked;
+
+
+template <class _Tp, _Tp _Sp, _Tp _Ep, _Tp ..._Indices>
+struct make_integer_sequence_impl_unchecked<_Tp, _Sp, _Ep,
+ integer_sequence<_Tp, _Indices...>>
+{
+ typedef typename make_integer_sequence_impl_unchecked
+ <
+ _Tp, _Sp+1, _Ep,
+ integer_sequence<_Tp, _Indices..., _Sp>
+ >::type type;
+};
+
+
+template <class _Tp, _Tp _Ep, _Tp ..._Indices>
+struct make_integer_sequence_impl_unchecked<_Tp, _Ep, _Ep,
+ integer_sequence<_Tp, _Indices...>>
+{
+ typedef integer_sequence<_Tp, _Indices...> type;
+};
+
+
+template <class _Tp, _Tp _Ep>
+struct make_integer_sequence_impl
+{
+ static_assert(std::is_integral<_Tp>::value,
+ "std::make_integer_sequence can only be instantiated with an integral type" );
+ static_assert(0 <= _Ep, "std::make_integer_sequence input shall not be negative");
+ typedef typename make_integer_sequence_impl_unchecked
+ <
+ _Tp, 0, _Ep, integer_sequence<_Tp>
+ >::type type;
+};
+
+
+template<class _Tp, _Tp _Np>
+using make_integer_sequence = typename make_integer_sequence_impl<_Tp, _Np>::type;
+
+
+template<size_t _Np>
+using make_index_sequence = make_integer_sequence<size_t, _Np>;
+
+
+template<class... _Tp>
+using index_sequence_for = make_index_sequence<sizeof...(_Tp)>;
+
+
+__thrust_hd_warning_disable__
+template<typename F, typename Tuple, size_t... I>
+__host__ __device__
+auto apply_impl(F&& f, Tuple&& t, index_sequence<I...>)
+ -> decltype(
+ std::forward<F>(f)(
+ thrust::get<I>(std::forward<Tuple>(t))...
+ )
+ )
+{
+ return std::forward<F>(f)(
+ thrust::get<I>(std::forward<Tuple>(t))...
+ );
+}
+
+
+template<typename F, typename Tuple>
+__host__ __device__
+auto apply(F&& f, Tuple&& t)
+ -> decltype(
+ apply_impl(
+ std::forward<F>(f),
+ std::forward<Tuple>(t),
+ make_index_sequence<thrust::tuple_size<decay_t<Tuple>>::value>()
+ )
+ )
+{
+ using Indices = make_index_sequence<thrust::tuple_size<decay_t<Tuple>>::value>;
+ return apply_impl(
+ std::forward<F>(f),
+ std::forward<Tuple>(t),
+ Indices()
+ );
+}
+
+
+template<class ArgTuple, class BoundArg>
+__host__ __device__
+auto substitute_arg(ArgTuple&&, BoundArg&& bound_arg,
+ typename thrust::detail::disable_if<
+ is_placeholder<decay_t<BoundArg>>::value
+ >::type* = 0)
+ -> decltype(
+ std::forward<BoundArg>(bound_arg)
+ )
+{
+ return std::forward<BoundArg>(bound_arg);
+}
+
+
+template<unsigned int i>
+struct placeholder
+ : thrust::detail::functional::actor<
+ thrust::detail::functional::argument<i>
+ >
+{};
+
+
+template<class T>
+struct argument_index
+ : thrust::detail::integral_constant<
+ unsigned int, 0
+ >
+{};
+
+
+template<unsigned int i>
+struct argument_index<
+ thrust::detail::functional::actor<
+ thrust::detail::functional::argument<i>
+ >
+>
+ : thrust::detail::integral_constant<
+ unsigned int, i
+ >
+{};
+
+
+template<class ArgTuple, class BoundArg>
+__host__ __device__
+auto substitute_arg(ArgTuple&& arg_tuple, const BoundArg&,
+ typename thrust::detail::enable_if<
+ is_placeholder<decay_t<BoundArg>>::value
+ >::type* = 0)
+ -> decltype(
+ thrust::get<
+ argument_index<BoundArg>::value
+ >(std::forward<ArgTuple>(arg_tuple))
+ )
+{
+ const unsigned int idx = argument_index<BoundArg>::value;
+ return thrust::get<idx>(std::forward<ArgTuple>(arg_tuple));
+}
+
+
+// XXX WAR nvbug 1527140
+// unpack template parameter packs into thrust::tuple manually
+template<class... T>
+struct tuple_war_1527140;
+
+template<>
+struct tuple_war_1527140<>
+{
+ using type = thrust::tuple<>;
+};
+
+template<class T1>
+struct tuple_war_1527140<T1>
+{
+ using type = thrust::tuple<T1>;
+};
+
+template<class T1, class T2>
+struct tuple_war_1527140<T1,T2>
+{
+ using type = thrust::tuple<T1,T2>;
+};
+
+template<class T1, class T2, class T3>
+struct tuple_war_1527140<T1,T2,T3>
+{
+ using type = thrust::tuple<T1,T2,T3>;
+};
+
+template<class T1, class T2, class T3, class T4>
+struct tuple_war_1527140<T1,T2,T3,T4>
+{
+ using type = thrust::tuple<T1,T2,T3,T4>;
+};
+
+template<class T1, class T2, class T3, class T4, class T5>
+struct tuple_war_1527140<T1,T2,T3,T4,T5>
+{
+ using type = thrust::tuple<T1,T2,T3,T4,T5>;
+};
+
+template<class T1, class T2, class T3, class T4, class T5, class T6>
+struct tuple_war_1527140<T1,T2,T3,T4,T5,T6>
+{
+ using type = thrust::tuple<T1,T2,T3,T4,T5,T6>;
+};
+
+template<class T1, class T2, class T3, class T4, class T5, class T6, class T7>
+struct tuple_war_1527140<T1,T2,T3,T4,T5,T6,T7>
+{
+ using type = thrust::tuple<T1,T2,T3,T4,T5,T6,T7>;
+};
+
+template<class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8>
+struct tuple_war_1527140<T1,T2,T3,T4,T5,T6,T7,T8>
+{
+ using type = thrust::tuple<T1,T2,T3,T4,T5,T6,T7,T8>;
+};
+
+template<class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9>
+struct tuple_war_1527140<T1,T2,T3,T4,T5,T6,T7,T8,T9>
+{
+ using type = thrust::tuple<T1,T2,T3,T4,T5,T6,T7,T8,T9>;
+};
+
+template<class T1, class T2, class T3, class T4, class T5, class T6, class T7, class T8, class T9, class T10>
+struct tuple_war_1527140<T1,T2,T3,T4,T5,T6,T7,T8,T9,T10>
+{
+ using type = thrust::tuple<T1,T2,T3,T4,T5,T6,T7,T8,T9,T10>;
+};
+
+template<class... T>
+using tuple = typename tuple_war_1527140<T...>::type;
+
+
+// XXX replace this with the variadic forward_as_tuple() when thrust::tuple's constructor can receive && references
+inline __host__ __device__
+tuple<> forward_as_tuple()
+{
+ return tuple<>();
+}
+
+
+template<class T>
+__host__ __device__
+tuple<T&> forward_as_tuple(T& arg)
+{
+ return tuple<T&>(arg);
+}
+
+
+template<class T>
+__host__ __device__
+tuple<const T&> forward_as_tuple(const T& arg)
+{
+ return tuple<const T&>(arg);
+}
+
+
+template<class T1, class T2>
+__host__ __device__
+tuple<T1&,T2&> forward_as_tuple(T1& arg1, T2& arg2)
+{
+ return tuple<T1&,T2&>(arg1, arg2);
+}
+
+
+template<class T1, class T2>
+__host__ __device__
+tuple<T1&,const T2&> forward_as_tuple(T1& arg1, const T2& arg2)
+{
+ return tuple<T1&,const T2&>(arg1, arg2);
+}
+
+
+template<class T1, class T2>
+__host__ __device__
+tuple<const T1&,T2&> forward_as_tuple(const T1& arg1, T2& arg2)
+{
+ return tuple<const T1&,T2&>(arg1, arg2);
+}
+
+
+template<class T1, class T2>
+__host__ __device__
+tuple<const T1&,const T2&> forward_as_tuple(const T1& arg1, const T2& arg2)
+{
+ return tuple<const T1&,const T2&>(arg1, arg2);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<T1&,T2&,T3&> forward_as_tuple(T1& arg1, T2& arg2, T3& arg3)
+{
+ return tuple<T1&,T2&,T3&>(arg1, arg2, arg3);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<T1&,T2&,const T3&> forward_as_tuple(T1& arg1, T2& arg2, const T3& arg3)
+{
+ return tuple<T1&,T2&,const T3&>(arg1, arg2, arg3);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<T1&,const T2&,T3&> forward_as_tuple(T1& arg1, const T2& arg2, T3& arg3)
+{
+ return tuple<T1&,const T2&, T3&>(arg1, arg2, arg3);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<T1&,const T2&,const T3&> forward_as_tuple(T1& arg1, const T2& arg2, const T3& arg3)
+{
+ return tuple<T1&,const T2&,const T3&>(arg1, arg2, arg3);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<const T1&,T2&,T3&> forward_as_tuple(const T1& arg1, T2& arg2, T3& arg3)
+{
+ return tuple<const T1&,T2&,T3&>(arg1, arg2, arg3);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<const T1&,T2&,const T3&> forward_as_tuple(const T1& arg1, T2& arg2, const T3& arg3)
+{
+ return tuple<const T1&,T2&,const T3&>(arg1, arg2, arg3);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<const T1&,const T2&,T3&> forward_as_tuple(const T1& arg1, const T2& arg2, T3& arg3)
+{
+ return tuple<const T1&,const T2&, T3&>(arg1, arg2, arg3);
+}
+
+
+template<class T1, class T2, class T3>
+__host__ __device__
+tuple<const T1&,const T2&,const T3&> forward_as_tuple(const T1& arg1, const T2& arg2, const T3& arg3)
+{
+ return tuple<const T1&,const T2&,const T3&>(arg1, arg2, arg3);
+}
+
+
+template<class ArgTuple, class BoundArgTuple, size_t... I>
+__host__ __device__
+auto substitute_impl(ArgTuple&& arg_tuple, BoundArgTuple&& bound_arg_tuple, index_sequence<I...>)
+ -> decltype(
+ forward_as_tuple(
+ substitute_arg(
+ std::forward<ArgTuple>(arg_tuple),
+ thrust::get<I>(std::forward<BoundArgTuple>(bound_arg_tuple))
+ )...
+ )
+ )
+{
+ return forward_as_tuple(
+ substitute_arg(
+ std::forward<ArgTuple>(arg_tuple),
+ thrust::get<I>(std::forward<BoundArgTuple>(bound_arg_tuple))
+ )...
+ );
+}
+
+
+template<class ArgTuple, class BoundArgTuple>
+__host__ __device__
+auto substitute(ArgTuple&& arg_tuple, BoundArgTuple&& bound_arg_tuple)
+ -> decltype(
+ substitute_impl(
+ std::forward<ArgTuple>(arg_tuple),
+ std::forward<BoundArgTuple>(bound_arg_tuple),
+ make_index_sequence<thrust::tuple_size<decay_t<BoundArgTuple>>::value>()
+ )
+ )
+{
+ using Indices = make_index_sequence<thrust::tuple_size<decay_t<BoundArgTuple>>::value>;
+ return substitute_impl(std::forward<ArgTuple>(arg_tuple), std::forward<BoundArgTuple>(bound_arg_tuple), Indices());
+}
+
+
+template<class F, class... BoundArgs>
+class bind_expression
+{
+ public:
+ __host__ __device__
+ bind_expression(const F& f, const BoundArgs&... bound_args)
+ : fun_(f),
+ bound_args_(bound_args...)
+ {}
+
+ template<class... OtherArgs>
+ __host__ __device__
+ auto operator()(OtherArgs&&... args) const
+ -> decltype(
+ apply(
+ *std::declval<const F*>(),
+ substitute(
+ forward_as_tuple(std::forward<OtherArgs>(args)...),
+ *std::declval<const tuple<BoundArgs...>*>()
+ )
+ )
+ )
+ {
+ return apply(
+ fun_,
+ substitute(
+ forward_as_tuple(std::forward<OtherArgs>(args)...),
+ bound_args_
+ )
+ );
+ }
+
+ template<class... OtherArgs>
+ __host__ __device__
+ auto operator()(OtherArgs&&... args)
+ -> decltype(
+ apply(
+ *std::declval<F*>(),
+ substitute(
+ forward_as_tuple(std::forward<OtherArgs>(args)...),
+ *std::declval<tuple<BoundArgs...>*>()
+ )
+ )
+ )
+ {
+ return apply(
+ fun_,
+ substitute(
+ forward_as_tuple(std::forward<OtherArgs>(args)...),
+ bound_args_
+ )
+ );
+ }
+
+ private:
+ F fun_;
+ tuple<BoundArgs...> bound_args_;
+};
+
+
+} // end bind_detail
+} // end detail
+
+
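+// Device-compatible analogue of std::bind: stores the callable and its bound
+// arguments and, on invocation, substitutes placeholders (_1, _2, ...) with
+// the call-time arguments before applying the callable.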
+template<class F, class... BoundArgs>
+__host__ __device__
+detail::bind_detail::bind_expression<
+ detail::bind_detail::decay_t<F>,
+ detail::bind_detail::decay_t<BoundArgs>...
+> bind(F&& f, BoundArgs&&... bound_args)
+{
+ using namespace thrust::experimental::detail::bind_detail;
+ return bind_expression<decay_t<F>,decay_t<BoundArgs>...>(std::forward<F>(f), std::forward<BoundArgs>(bound_args)...);
+}
+
+
+} // end experimental
+} // end thrust
+
+#endif // __cplusplus
+
diff --git a/src/mblas/expression.cpp b/src/mblas/expression.cpp
new file mode 100644
index 00000000..df38338e
--- /dev/null
+++ b/src/mblas/expression.cpp
@@ -0,0 +1,69 @@
+
+#include <iostream>
+#include <functional>
+#include <cmath>
+
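+// Toy expression-template experiment: operator+ and operator* build nested
+// Expr1/Expr2 nodes that are only evaluated when converted to float.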
+template <class F, class E>
+class Expr1 {
+ public:
+ Expr1(F f, E e) : f_(f), e_(e) {}
+
+ operator float() {
+ std::cerr << "calc1" << std::endl;
+ return f_(e_);
+ }
+
+ private:
+ F f_;
+ E e_;
+};
+
+template <class F, class E1, class E2>
+class Expr2 {
+ public:
+ Expr2(F f, E1 e1, E2 e2) : f_(f), e1_(e1), e2_(e2) {}
+
+ operator float() {
+ std::cerr << "calc2" << std::endl;
+ return f_(e1_, e2_);
+ }
+
+ private:
+ F f_;
+ E1 e1_;
+ E2 e2_;
+};
+
+template <class F, class E>
+Expr1<F, E> expr(F f, E e) {
+ return Expr1<F, E>(f, e);
+}
+
+template <class F, class E1, class E2>
+Expr2<F, E1, E2> expr(F f, E1 e1, E2 e2) {
+ return Expr2<F, E1, E2>(f, e1, e2);
+}
+
+template <class E1, class E2>
+auto operator+(E1 e1, E2 e2) -> decltype(expr(std::plus<float>(), e1, e2)) {
+ return expr(std::plus<float>(), e1, e2);
+}
+
+template <class E1, class E2>
+auto operator*(E1 e1, E2 e2) -> decltype(expr(std::multiplies<float>(), e1, e2)) {
+ return expr(std::multiplies<float>(), e1, e2);
+}
+
+int main(int argc, char** argv) {
+ auto e = 5.0 + ((expr(exp, expr(log, 3)) + 1.0) * -2.0);
+ std::cerr << "t" << std::endl;
+ float f = e;
+ std::cerr << f << std::endl;
+
+ return 0;
+
+ //Exp exp1 = eTanh(ePlus(eMult(eMinus(_1, _2), _3), eMult(_3, _4)));
+ //Exp exp1 = tanh(((_1 - _2) * _3) + (_2 * _4));
+ //TupleIterator(M1, M2, M3, M4, M5.begin());
+}
+
diff --git a/src/mblas/matrix.h b/src/mblas/matrix.h
new file mode 100644
index 00000000..9c466055
--- /dev/null
+++ b/src/mblas/matrix.h
@@ -0,0 +1,720 @@
+#pragma once
+
+#include <cmath>
+
+#include "base_matrix.h"
+
+#define MAX_THREADS 512
+#define MAX_BLOCKS 65535
+
+#include <cublas_v2.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+
+#include "thrust_functions.h"
+
+namespace lib = thrust;
+namespace iterlib = thrust;
+
+namespace mblas {
+
+using namespace thrust::placeholders;
+
+template <class VecType>
+class TMatrix : public BaseMatrix {
+ public:
+ typedef typename VecType::value_type value_type;
+ typedef typename VecType::iterator iterator;
+ typedef typename VecType::const_iterator const_iterator;
+
+ TMatrix()
+ : rows_(0), cols_(0)
+ {}
+
+ TMatrix(size_t rows, size_t cols)
+ : rows_(rows), cols_(cols), data_(rows_ * cols_)
+ {}
+
+ TMatrix(size_t rows, size_t cols, value_type val)
+ : rows_(rows), cols_(cols), data_(rows_ * cols_, val)
+ {}
+
+ TMatrix(TMatrix&& m)
+ : rows_(m.rows_), cols_(m.cols_), data_(std::move(m.data_)) {}
+
+ TMatrix(const TMatrix& m) = delete;
+
+ value_type operator()(size_t i, size_t j) const {
+ return data_[i * cols_ + j];
+ }
+
+ void Set(size_t i, size_t j, float value) {
+ data_[i * cols_ + j] = value;
+ }
+
+
+ size_t Rows() const {
+ return rows_;
+ }
+
+ size_t Cols() const {
+ return cols_;
+ }
+
+ void Resize(size_t rows, size_t cols) {
+ rows_ = rows;
+ cols_ = cols;
+ data_.resize(rows_ * cols_);
+ }
+
+ void Resize(size_t rows, size_t cols, value_type val) {
+ rows_ = rows;
+ cols_ = cols;
+ data_.resize(rows_ * cols_, val);
+ }
+
+ void Reserve(size_t rows, size_t cols) {
+ data_.reserve(rows * cols);
+ }
+
+ void Reshape(size_t rows, size_t cols) {
+ rows_ = rows;
+ cols_ = cols;
+ }
+
+ void Purge() {
+ Clear();
+ VecType temp;
+ data_.swap(temp);
+ }
+
+ void Clear() {
+ data_.clear();
+ rows_ = 0;
+ cols_ = 0;
+ }
+
+ VecType& GetVec() {
+ return data_;
+ }
+
+ const VecType& GetVec() const {
+ return data_;
+ }
+
+ value_type* data() {
+ return thrust::raw_pointer_cast(data_.data());
+ }
+
+ const value_type* data() const {
+ return thrust::raw_pointer_cast(data_.data());
+ }
+
+ iterator begin() {
+ return data_.begin();
+ }
+
+ iterator end() {
+ return data_.end();
+ }
+
+ const_iterator begin() const{
+ return data_.begin();
+ }
+
+ const_iterator end() const {
+ return data_.end();
+ }
+
+ size_t size() const {
+ return data_.size();
+ }
+
+ private:
+ size_t rows_;
+ size_t cols_;
+ VecType data_;
+};
+
+typedef thrust::device_vector<float> FVec;
+typedef thrust::device_vector<unsigned int> IVec;
+
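+// Lazily creates a thread-local cuBLAS handle on first use and destroys it
+// when the static instance is torn down.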
+class CublasHandler {
+ public:
+
+ ~CublasHandler() {
+ if(handle_ != nullptr) {
+ cublasDestroy(*handle_);
+ delete handle_;
+ handle_ = nullptr;
+ }
+ }
+
+ static cublasHandle_t GetHandle() {
+ if(instance_.handle_ == nullptr) {
+ instance_.CreateHandle();
+ }
+ return *instance_.handle_;
+ }
+
+ static void StaticHandle() {
+ instance_.CreateHandle();
+ }
+
+ private:
+
+ void CreateHandle() {
+ if(handle_ != nullptr) {
+ cublasDestroy(*handle_);
+ delete handle_;
+ }
+ handle_ = new cublasHandle_t;
+ cublasCreate(handle_);
+ }
+
+ static thread_local cublasHandle_t* handle_;
+ static CublasHandler instance_;
+};
+
+CublasHandler CublasHandler::instance_;
+thread_local cublasHandle_t* CublasHandler::handle_ = nullptr;
+
+typedef TMatrix<FVec> Matrix;
+typedef TMatrix<IVec> IMatrix;
+
+template <class M>
+void debug1(const M& m, size_t pos = 0, size_t l = 5) {
+ std::cerr << m.Rows() << " " << m.Cols() << std::endl;
+ for(size_t i = 0; i < m.Rows(); ++i) {
+ for(size_t j = pos; j < m.Cols() && j < pos + l; ++j) {
+ std::cerr << m.GetVec()[i * m.Cols() + j] << " ";
+ }
+ std::cerr << std::endl;
+ if(i == 4)
+ break;
+ }
+}
+
+Matrix& Swap(Matrix& Out, Matrix& In) {
+ size_t iRows = In.Rows();
+ size_t iCols = In.Cols();
+ size_t oRows = Out.Rows();
+ size_t oCols = Out.Cols();
+
+ Out.Reshape(iRows, iCols);
+ In.Reshape(oRows, oCols);
+
+ In.GetVec().swap(Out.GetVec());
+ return Out;
+}
+
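+// Column-wise mean computed with a single SGEMV: In^T times a vector of ones,
+// scaled by 1 / rows.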
+Matrix& Mean(Matrix& Out, const Matrix& In) {
+ size_t m = In.Rows();
+ size_t n = In.Cols();
+
+ Out.Resize(1, n, 0.f);
+ Matrix Ones(1, m, 1.f);
+
+ float alpha = 1.0 / m;
+ float beta = 0.0;
+ cublasSgemv(CublasHandler::GetHandle(), CUBLAS_OP_N, n, m, &alpha, In.data(), n,
+ Ones.data(), 1, &beta, Out.data(), 1);
+ return Out;
+}
+
+Matrix& Transpose(Matrix& Out, const Matrix& In) {
+ size_t m = In.Rows();
+ size_t n = In.Cols();
+
+ Out.Resize(n, m);
+
+ float alpha = 1.0;
+ float beta = 0.0;
+
+ cublasSgeam(CublasHandler::GetHandle(), CUBLAS_OP_T, CUBLAS_OP_T, m, n, &alpha, In.data(), n,
+ &beta, In.data(), n, Out.data(), m);
+
+ return Out;
+}
+
+Matrix& Transpose(Matrix& Out) {
+ Matrix Temp;
+ Transpose(Temp, Out);
+ Swap(Out, Temp);
+ return Out;
+}
+
+Matrix& Copy(Matrix& Out, const Matrix& In) {
+ Out.Resize(In.Rows(), In.Cols());
+ lib::copy(In.begin(), In.end(), Out.begin());
+ return Out;
+}
+
+Matrix& AppendRow(Matrix& Out, const Matrix& In, size_t i) {
+ size_t oldSize = Out.GetVec().size();
+ size_t addSize = In.Cols();
+ Out.Resize(Out.Rows() + 1, In.Cols());
+ Out.GetVec().resize(oldSize + addSize);
+ size_t start = In.Cols() * i;
+ size_t end = In.Cols() * (i + 1);
+ lib::copy(In.begin() + start, In.begin() + end, Out.begin() + oldSize);
+ return Out;
+}
+
+Matrix& AppendRows(Matrix& Out, const Matrix& In) {
+ size_t oldSize = Out.GetVec().size();
+ size_t addSize = In.GetVec().size();
+ Out.Resize(Out.Rows() + In.Rows(), In.Cols());
+ Out.GetVec().resize(oldSize + addSize);
+ lib::copy(In.begin(), In.end(), Out.begin() + oldSize);
+ return Out;
+}
+
+Matrix& PrependRows(Matrix& Out, const Matrix& In) {
+ Out.Resize(Out.Rows() + In.Rows(), In.Cols());
+ Out.GetVec().insert(Out.begin(), In.begin(), In.end());
+ return Out;
+}
+
+Matrix& PasteRow(Matrix& Out,
+ const Matrix& In,
+ const size_t r = 0, const size_t c = 0) {
+ size_t start = r * Out.Cols() + c;
+ lib::copy(In.begin(), In.end(), Out.begin() + start);
+ return Out;
+}
+
+Matrix& CopyRow(Matrix& Out,
+ const Matrix& In,
+ const size_t r = 0, const size_t c = 0) {
+ size_t length = In.Cols() - c;
+ Out.Resize(1, length);
+ size_t start = r * In.Cols() + c;
+ size_t end = start + length;
+ lib::copy(In.begin() + start, In.begin() + end, Out.begin());
+ return Out;
+}
+
+typedef std::pair<size_t, size_t> RowPair;
+typedef std::vector<RowPair> RowPairs;
+typedef thrust::device_vector<RowPair> DeviceRowPairs;
+
+__global__ void gCopyRows(float* out, const float* in, size_t cols,
+ const RowPair* devPairs, size_t numPairs) {
+ for(int bid = 0; bid < numPairs; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < numPairs) {
+ size_t dstId = devPairs[j].first;
+ size_t srcId = devPairs[j].second;
+
+ float* rowOut = out + dstId * cols;
+ const float* rowIn = in + srcId * cols;
+
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols)
+ rowOut[i] = rowIn[i];
+ }
+ }
+ }
+}
+
+Matrix& CopyRows(Matrix& Out,
+ const Matrix& In,
+ const RowPair* devPairs,
+ size_t numPairs) {
+ float* d_out = Out.data();
+ const float* d_in = In.data();
+
+ int threads = std::min(MAX_THREADS, (int)In.Cols());
+  int blocks = std::min(MAX_BLOCKS, (int)numPairs);
+ gCopyRows<<<blocks, threads>>>(d_out, d_in, In.Cols(), devPairs, numPairs);
+ cudaStreamSynchronize(0);
+ return Out;
+}
+
+Matrix& CopyRows(Matrix& Out,
+ const Matrix& In,
+ const RowPairs& pairs) {
+ thrust::device_vector<RowPair> devPairs = pairs;
+ CopyRows(Out, In, thrust::raw_pointer_cast(devPairs.data()), devPairs.size());
+ return Out;
+}
+
+Matrix& Assemble(Matrix& Out,
+ const Matrix& In,
+                 const std::vector<size_t>& indices) {
+  RowPairs rowPairs;
+  for(size_t i = 0; i < indices.size(); i++)
+    rowPairs.emplace_back(i, indices[i]);
+ Out.Resize(rowPairs.size(), In.Cols());
+ CopyRows(Out, In, rowPairs);
+ return Out;
+}
+
+__global__ void gSlice(float* out, const float* in,
+ size_t n, size_t dim,
+ size_t rows, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ float* rowOut = out + j * dim;
+ const float* rowIn = in + j * cols + n * dim;
+
+ for(int tid = 0; tid < dim; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < dim)
+ rowOut[i] = rowIn[i];
+ }
+ }
+ }
+}
+
+Matrix& Slice(Matrix& Out,
+ const Matrix& In,
+ size_t n, size_t dim) {
+
+ Out.Resize(In.Rows(), dim);
+
+ float* d_out = Out.data();
+ const float* d_in = In.data();
+
+ int threads = std::min(MAX_THREADS, (int)dim);
+ int blocks = std::min(MAX_BLOCKS, (int)In.Rows());
+ gSlice<<<blocks, threads>>>(d_out, d_in, n, dim, In.Rows(), In.Cols());
+ cudaStreamSynchronize(0);
+ return Out;
+}
+
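+// Row-major GEMM on top of column-major cuBLAS: computing B^T * A^T with the
+// operands swapped yields (A * B)^T in column-major order, which is exactly
+// A * B laid out row-major.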
+Matrix& Prod(cublasHandle_t handle, Matrix& C, const Matrix& A, const Matrix& B,
+ bool transA = false, bool transB = false) {
+ Matrix::value_type alpha = 1.0;
+ Matrix::value_type beta = 0.0;
+
+ size_t m = A.Rows();
+ size_t k = A.Cols();
+ if(transA)
+ std::swap(m, k);
+
+ size_t l = B.Rows();
+ size_t n = B.Cols();
+ if(transB)
+ std::swap(l, n);
+
+ size_t lda = A.Cols();
+ size_t ldb = B.Cols();
+ size_t ldc = B.Cols();
+
+ if(transB)
+ ldc = B.Rows();
+
+ C.Resize(m, n);
+
+ cublasOperation_t opA = transA ? CUBLAS_OP_T : CUBLAS_OP_N;
+ cublasOperation_t opB = transB ? CUBLAS_OP_T : CUBLAS_OP_N;
+
+ cublasSgemm(handle, opB, opA,
+ n, m, k, &alpha, B.data(), ldb, A.data(), lda, &beta, C.data(), ldc);
+ return C;
+}
+
+Matrix& Prod(Matrix& C, const Matrix& A, const Matrix& B,
+ bool transA = false, bool transB = false) {
+
+ return Prod(CublasHandler::GetHandle(), C, A, B, transA, transB);
+}
+
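+// Row-wise softmax: each block exponentiates one row, accumulates per-thread
+// partial sums in shared memory, tree-reduces them and divides the row by the
+// total. Note there is no max-subtraction, so very large logits can overflow.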
+__global__ void gSoftMax(float* softMaxP, size_t rows, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ extern __shared__ float _share[];
+ float* _sum = _share + blockDim.x;
+ float* sp = softMaxP + j * cols;
+ _sum[threadIdx.x] = 0.0;
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int id = tid + threadIdx.x;
+ if(id < cols) {
+ sp[id] = __expf(sp[id]);
+ _sum[threadIdx.x] += sp[id];
+ }
+ }
+ __syncthreads();
+ int len = blockDim.x;
+ while(len != 1) {
+ __syncthreads();
+ int skip = (len + 1) >> 1;
+ if(threadIdx.x < (len >> 1))
+ _sum[threadIdx.x] += _sum[threadIdx.x + skip];
+ len = (len + 1) >> 1;
+ }
+ __syncthreads();
+ for(int tid = 0; tid < cols; tid += blockDim.x){
+ int id = tid + threadIdx.x;
+ if(id < cols)
+ sp[id] /= _sum[0];
+ }
+ }
+ }
+}
+
+Matrix& Softmax(Matrix& Out) {
+ int blocks = std::min(MAX_BLOCKS, (int)Out.Rows());
+ int threads = std::min(MAX_THREADS, (int)Out.Cols());
+ int shared = sizeof(float) * threads * 2;
+ gSoftMax<<<blocks, threads, shared>>>(Out.data(), Out.Rows(), Out.Cols());
+ cudaStreamSynchronize(0);
+ return Out;
+}
+
+template <class Functor>
+__global__ void gBroadcast(Functor functor,
+ float* out, const float* in1, const float* in2,
+ size_t rows, size_t rows1, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ float* rowOut = out + j * cols;
+
+ const float* rowIn1 = in1 + (j % rows1) * cols;
+ const float* rowIn2 = in2 + (j / rows1) * cols;
+
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols)
+ rowOut[i] = functor(rowIn1[i], rowIn2[i]);
+ }
+ }
+ }
+}
+
+template <class Functor>
+Matrix& Broadcast(Functor functor, Matrix& Out, const Matrix& In, cudaStream_t stream = 0) {
+ size_t rows1 = Out.Rows();
+ size_t rows2 = In.Rows();
+
+ size_t rows = rows1 * rows2;
+ size_t cols = Out.Cols();
+
+ Matrix Temp(rows, cols, 1.0);
+
+ float* d_out = Temp.data();
+ const float* d_in1 = Out.data();
+ const float* d_in2 = In.data();
+
+ int blocks = std::min(MAX_BLOCKS, (int)rows);
+ int threads = std::min(MAX_THREADS, (int)cols);
+ gBroadcast<<<blocks, threads, 0, stream>>>(functor, d_out, d_in1, d_in2,
+ rows, rows1, cols);
+ cudaStreamSynchronize(stream);
+ Swap(Out, Temp);
+ return Out;
+}
+
+template <class Functor>
+Matrix& BroadcastColumn(Functor functor, Matrix& Out, const Matrix& In, cudaStream_t stream = 0) {
+ // @TODO: Make this efficient with special kernel!
+ Matrix InTemp;
+ Transpose(InTemp, In);
+
+ Transpose(Out);
+ Broadcast(functor, Out, InTemp, stream);
+ Transpose(Out);
+ return Out;
+}
+
+template <class Functor>
+__global__ void gBroadcastVecColumn(Functor functor,
+ float* out, const float* in, size_t rows, size_t cols) {
+ for(int bid = 0; bid < cols; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < cols) {
+ for(int tid = 0; tid < rows; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < rows) {
+ float* rowOut = out + i * cols + j;
+ const float* rowIn = in + i;
+ *rowOut = functor(*rowOut, *rowIn);
+ }
+ }
+ }
+ }
+}
+
+template <class Functor>
+Matrix& BroadcastVecColumn(Functor functor, Matrix& Out, const Matrix& In, cudaStream_t stream = 0) {
+ size_t rows = Out.Rows();
+ size_t cols = Out.Cols();
+
+ float* d_out = Out.data();
+ const float* d_in = In.data();
+
+ int blocks = std::min(MAX_BLOCKS, (int)cols);
+ int threads = std::min(MAX_THREADS, (int)rows);
+ gBroadcastVecColumn<<<blocks, threads, 0, stream>>>(functor, d_out, d_in, rows, cols);
+ cudaStreamSynchronize(stream);
+ return Out;
+}
+
+template <class Functor>
+__global__ void gBroadcastVec(Functor functor,
+ float* out, const float* in, size_t rows, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ float* rowOut = out + j * cols;
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols) {
+ rowOut[i] = functor(rowOut[i], in[i]);
+ }
+ }
+ }
+ }
+}
+
+template <class Functor>
+Matrix& BroadcastVec(Functor functor, Matrix& Out, const Matrix& In, cudaStream_t stream = 0) {
+ //Broadcast(functor, Out, In, stream);
+ size_t rows = Out.Rows();
+ size_t cols = Out.Cols();
+
+ float* d_out = Out.data();
+ const float* d_in = In.data();
+
+ int blocks = std::min(MAX_BLOCKS, (int)rows);
+ int threads = std::min(MAX_THREADS, (int)cols);
+ gBroadcastVec<<<blocks, threads, 0, stream>>>(functor, d_out, d_in, rows, cols);
+ cudaStreamSynchronize(stream);
+ return Out;
+}
+
+
+template <class Functor>
+__global__ void gElement(Functor functor, float* out,
+ size_t rows, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ float* rowOut = out + j * cols;
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols)
+          rowOut[i] = functor(rowOut[i]);
+ }
+ }
+ }
+}
+
+template <class Functor>
+__global__ void gElement(Functor functor,
+ float* out, const float* in,
+ size_t rows, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ float* rowOut = out + j * cols;
+ const float* rowIn = in + j * cols;
+
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols)
+          rowOut[i] = functor(rowOut[i], rowIn[i]);
+ }
+ }
+ }
+}
+
+template <class Functor>
+__global__ void gElement(Functor functor,
+ float* out, const float* in1, const float* in2,
+ size_t rows, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ float* rowOut = out + j * cols;
+ const float* rowIn1 = in1 + j * cols;
+ const float* rowIn2 = in2 + j * cols;
+
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols)
+ rowOut[i] = functor(rowOut[i], rowIn1[i], rowIn2[i]);
+ }
+ }
+ }
+}
+
+template <class Functor>
+Matrix& Element(Functor functor, Matrix& Out) {
+ float* d_out = Out.data();
+ int blocks = std::min(MAX_BLOCKS, (int)Out.Rows());
+ int threads = std::min(MAX_THREADS, (int)Out.Cols());
+ gElement<<<blocks, threads>>>(functor, d_out, Out.Rows(), Out.Cols());
+ cudaStreamSynchronize(0);
+ return Out;
+}
+
+template <class Functor>
+Matrix& Element(Functor functor,
+ Matrix& Out, const Matrix& In) {
+ float* d_out = Out.data();
+ const float* d_in = In.data();
+
+ int blocks = std::min(MAX_BLOCKS, (int)Out.Rows());
+ int threads = std::min(MAX_THREADS, (int)Out.Cols());
+ gElement<<<blocks, threads>>>(functor, d_out, d_in, Out.Rows(), Out.Cols());
+ cudaStreamSynchronize(0);
+ return Out;
+}
+
+template <class Functor>
+Matrix& Element(Functor functor,
+ Matrix& Out, const Matrix& In1, const Matrix& In2) {
+
+ float* d_out = Out.data();
+ const float* d_in1 = In1.data();
+ const float* d_in2 = In2.data();
+
+ int blocks = std::min(MAX_BLOCKS, (int)Out.Rows());
+ int threads = std::min(MAX_THREADS, (int)Out.Cols());
+ gElement<<<blocks, threads>>>(functor, d_out, d_in1, d_in2,
+ Out.Rows(), Out.Cols());
+ cudaStreamSynchronize(0);
+ return Out;
+}
+
+template <class Functor>
+__global__ void gPairwiseReduce(Functor functor,
+ float* out, const float* in,
+ size_t rows, size_t cols) {
+ for(int bid = 0; bid < rows; bid += gridDim.x) {
+ int j = bid + blockIdx.x;
+ if(j < rows) {
+ const float* rowIn = in + j * cols * 2;
+ float* rowOut = out + j * cols;
+ for(int tid = 0; tid < cols; tid += blockDim.x) {
+ int i = tid + threadIdx.x;
+ if(i < cols)
+ rowOut[i] = functor(rowIn[i * 2], rowIn[i * 2 + 1]);
+ }
+ }
+ }
+}
+
+template <class Functor>
+Matrix& PairwiseReduce(Functor functor, Matrix& Out) {
+ Matrix Temp(Out.Rows(), Out.Cols() / 2);
+ const float* d_in = Out.data();
+ float* d_out = Temp.data();
+
+ int blocks = std::min(MAX_BLOCKS, (int)Temp.Rows());
+ int threads = std::min(MAX_THREADS, (int)Temp.Cols());
+ gPairwiseReduce<<<blocks, threads>>>(functor, d_out, d_in,
+ Temp.Rows(), Temp.Cols());
+ cudaStreamSynchronize(0);
+ Swap(Out, Temp);
+ return Out;
+}
+
+} \ No newline at end of file
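For orientation, a minimal usage sketch of the element-wise helpers above (hypothetical, not part of this commit; it assumes these functions live in namespace mblas, as the closing brace suggests, and that mblas::Matrix owns device memory). Any copyable functor with a __host__ __device__ call operator can be passed straight to Element():

  #include "mblas/matrix.h"

  // Hypothetical functor: Out[i][j] = Out[i][j] + alpha * In[i][j].
  struct AddScaled {
    float alpha;
    __host__ __device__
    float operator()(float out, float in) const {
      return out + alpha * in;
    }
  };

  void Axpy(mblas::Matrix& Out, const mblas::Matrix& In, float alpha) {
    // Element() launches gElement with up to MAX_BLOCKS rows and MAX_THREADS
    // columns per launch; the kernel grid-strides over the rest.
    mblas::Element(AddScaled{alpha}, Out, In);
  }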
diff --git a/src/mblas/phoenix_functions.h b/src/mblas/phoenix_functions.h
new file mode 100644
index 00000000..314756c6
--- /dev/null
+++ b/src/mblas/phoenix_functions.h
@@ -0,0 +1,42 @@
+#pragma once
+
+#include <cmath>
+#include <boost/phoenix/phoenix.hpp>
+
+namespace mblas
+{
+ template <class T>
+ auto Exp(const T& x) -> decltype(boost::phoenix::bind(exp, x))
+ {
+ return boost::phoenix::bind(exp, x);
+ }
+
+ template <typename T>
+ auto Tanh(const T& x) -> decltype(boost::phoenix::bind(tanh, x)) {
+ return boost::phoenix::bind(tanh, x);
+ }
+
+ template <typename T>
+ auto Log(const T& x) -> decltype(boost::phoenix::bind(log, x)) {
+ return boost::phoenix::bind(log, x);
+ }
+
+  inline float logit(float x) {
+ return 1.0 / (1.0 + exp(-x));
+ }
+
+ template <typename T>
+ auto Logit(const T& x) -> decltype(boost::phoenix::bind(logit, x)) {
+ return boost::phoenix::bind(logit, x);
+ }
+
+
+  inline float max(float x, float y) {
+ return x > y ? x : y;
+ }
+
+ template <typename T1, typename T2>
+ auto Max(const T1& x, const T2& y) -> decltype(boost::phoenix::bind(max, x, y)) {
+ return boost::phoenix::bind(max, x, y);
+ }
+} \ No newline at end of file
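A short host-side sketch (hypothetical, not part of the commit): the phoenix wrappers build lazy functors that mirror the thrust expressions, and the resulting actors are ordinary callables, so they plug into standard algorithms:

  #include <algorithm>
  #include <vector>
  #include <boost/phoenix/phoenix.hpp>
  #include "mblas/phoenix_functions.h"

  void LogitInPlace(std::vector<float>& v) {
    using boost::phoenix::arg_names::arg1;
    // v[i] = 1 / (1 + exp(-v[i])), evaluated through phoenix::bind(logit, arg1)
    std::transform(v.begin(), v.end(), v.begin(), mblas::Logit(arg1));
  }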
diff --git a/src/mblas/strided_iterator.h b/src/mblas/strided_iterator.h
new file mode 100644
index 00000000..a4c9ff6d
--- /dev/null
+++ b/src/mblas/strided_iterator.h
@@ -0,0 +1,160 @@
+#pragma once
+
+#ifdef NO_CUDA
+
+#include <vector>
+#include <functional>
+#include <boost/iterator/counting_iterator.hpp>
+#include <boost/iterator/transform_iterator.hpp>
+#include <boost/iterator/permutation_iterator.hpp>
+
+namespace iterlib = boost;
+namespace func = std;
+
+#else
+
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/iterator/permutation_iterator.h>
+#include <thrust/functional.h>
+
+namespace iterlib = thrust;
+namespace func = thrust;
+
+#endif
+
+template <typename Iterator>
+class row_repeater {
+ public:
+
+ typedef typename iterlib::iterator_difference<Iterator>::type difference_type;
+
+ struct repeater_functor : public func::unary_function<difference_type,difference_type> {
+ difference_type cols;
+
+ repeater_functor(difference_type cols)
+ : cols(cols) {}
+
+#ifndef NO_CUDA
+ __host__ __device__
+#endif
+ difference_type operator()(const difference_type& i) const {
+ return i % cols;
+ }
+ };
+
+ typedef typename iterlib::counting_iterator<difference_type> CountingIterator;
+ typedef typename iterlib::transform_iterator<repeater_functor, CountingIterator> TransformIterator;
+ typedef typename iterlib::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
+
+ typedef PermutationIterator iterator;
+
+ row_repeater(Iterator first, Iterator last, difference_type cols)
+ : first(first), last(last), cols(cols) {}
+
+ iterator begin(void) const {
+ return PermutationIterator(first, TransformIterator(CountingIterator(0), repeater_functor(cols)));
+ }
+
+ iterator end(void) const {
+ return begin() + (last - first) * cols;
+ }
+
+ protected:
+ Iterator first;
+ Iterator last;
+ difference_type cols;
+};
+
+template <typename Iterator>
+class col_repeater {
+ public:
+
+ typedef typename iterlib::iterator_difference<Iterator>::type difference_type;
+
+ struct repeater_functor : public func::unary_function<difference_type,difference_type> {
+ difference_type cols;
+
+ repeater_functor(difference_type cols)
+ : cols(cols) {}
+
+#ifndef NO_CUDA
+ __host__ __device__
+#endif
+ difference_type operator()(const difference_type& i) const {
+ return i / cols;
+ }
+ };
+
+ typedef typename iterlib::counting_iterator<difference_type> CountingIterator;
+ typedef typename iterlib::transform_iterator<repeater_functor, CountingIterator> TransformIterator;
+ typedef typename iterlib::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
+
+ typedef PermutationIterator iterator;
+
+ col_repeater(Iterator first, Iterator last, difference_type cols)
+ : first(first), last(last), cols(cols) {}
+
+ iterator begin(void) const {
+ return PermutationIterator(first, TransformIterator(CountingIterator(0), repeater_functor(cols)));
+ }
+
+ iterator end(void) const {
+ return begin() + (last - first) * cols;
+ }
+
+ protected:
+ Iterator first;
+ Iterator last;
+ difference_type cols;
+};
+
+template <typename Iterator>
+class strided_range
+{
+ public:
+
+ typedef typename iterlib::iterator_difference<Iterator>::type difference_type;
+
+ struct stride_functor : public func::unary_function<difference_type,difference_type>
+ {
+ difference_type stride;
+
+ stride_functor(difference_type stride)
+ : stride(stride) {}
+
+#ifndef NO_CUDA
+ __host__ __device__
+#endif
+ difference_type operator()(const difference_type& i) const
+ {
+ return stride * i;
+ }
+ };
+
+ typedef typename iterlib::counting_iterator<difference_type> CountingIterator;
+ typedef typename iterlib::transform_iterator<stride_functor, CountingIterator> TransformIterator;
+ typedef typename iterlib::permutation_iterator<Iterator,TransformIterator> PermutationIterator;
+
+ // type of the strided_range iterator
+ typedef PermutationIterator iterator;
+
+ // construct strided_range for the range [first,last)
+ strided_range(Iterator first, Iterator last, difference_type stride)
+ : first(first), last(last), stride(stride) {}
+
+ iterator begin(void) const
+ {
+ return PermutationIterator(first, TransformIterator(CountingIterator(0), stride_functor(stride)));
+ }
+
+ iterator end(void) const
+ {
+ return begin() + ((last - first) + (stride - 1)) / stride;
+ }
+
+ protected:
+ Iterator first;
+ Iterator last;
+ difference_type stride;
+};
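For illustration, a hedged sketch of strided_range in use on the CUDA path (under NO_CUDA the boost counting/transform/permutation iterators are substituted, as above). It views one column of a row-major device buffer with the given number of columns and reduces it with thrust:

  #include <thrust/device_vector.h>
  #include <thrust/reduce.h>
  #include "mblas/strided_iterator.h"

  float SumColumn(const thrust::device_vector<float>& data,
                  size_t cols, size_t col) {
    typedef thrust::device_vector<float>::const_iterator It;
    // Visits elements col, col + cols, col + 2*cols, ... i.e. one matrix column.
    strided_range<It> column(data.begin() + col, data.end(), cols);
    return thrust::reduce(column.begin(), column.end(), 0.0f);
  }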
diff --git a/src/mblas/thrust_functions.h b/src/mblas/thrust_functions.h
new file mode 100644
index 00000000..8a813d5d
--- /dev/null
+++ b/src/mblas/thrust_functions.h
@@ -0,0 +1,77 @@
+#pragma once
+
+#include <cmath>
+#include <cublas_v2.h>
+#include <thrust/device_vector.h>
+#include <thrust/functional.h>
+
+namespace thrust
+{
+ namespace detail
+ {
+ namespace functional
+ {
+
+ template<typename T>
+ struct unary_exp : public thrust::unary_function<T,T> {
+ __host__ __device__
+ T operator()(const T &x) const { return expf(x); }
+ };
+
+ template<typename Eval>
+ __host__ __device__
+ actor<composite<unary_operator<unary_exp>, actor<Eval>>>
+ Exp(const actor<Eval> &_1) {
+ return compose(unary_operator<unary_exp>(), _1);
+ }
+
+ template<typename T>
+ struct unary_log : public thrust::unary_function<T,T> {
+ __host__ __device__
+ T operator()(const T &x) const { return logf(x); }
+ };
+
+ template<typename Eval>
+ __host__ __device__
+ actor<composite<unary_operator<unary_log>, actor<Eval>>>
+ Log(const actor<Eval> &_1) {
+ return compose(unary_operator<unary_log>(), _1);
+ }
+
+ template<typename T>
+ struct unary_logit : public thrust::unary_function<T,T> {
+ __host__ __device__
+ T operator()(const T &x) const { return 1.0 / (1.0 + expf(-x)); }
+ };
+
+ template<typename Eval>
+ __host__ __device__
+ actor<composite<unary_operator<unary_logit>, actor<Eval>>>
+ Logit(const actor<Eval> &_1) {
+ return compose(unary_operator<unary_logit>(), _1);
+ }
+
+ template<typename T>
+ struct unary_tanh : public thrust::unary_function<T,T> {
+ __host__ __device__
+ T operator()(const T &x) const { return tanhf(x); }
+ };
+
+ template<typename Eval>
+ __host__ __device__
+ actor<composite<unary_operator<unary_tanh>, actor<Eval>>>
+ Tanh(const actor<Eval> &_1) {
+ return compose(unary_operator<unary_tanh>(), _1);
+ }
+
+ template<typename T1, typename T2>
+ __host__ __device__
+ actor<composite<binary_operator<thrust::maximum>, actor<T1>, actor<T2>>>
+ Max(const actor<T1> &_1, const actor<T2> &_2) {
+ return compose(binary_operator<thrust::maximum>(),
+ make_actor(_1),
+ make_actor(_2));
+ }
+ }
+ }
+} \ No newline at end of file
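These overloads hook into thrust's lazy placeholder machinery, so element-wise updates can be written as expressions rather than hand-rolled kernels. A hypothetical sketch combining them with the Element() helpers from matrix.h (the function name and the Bias argument are illustrative):

  #include "mblas/matrix.h"
  #include "mblas/thrust_functions.h"

  void ApplyNonlinearities(mblas::Matrix& State, const mblas::Matrix& Bias) {
    using namespace thrust::placeholders;        // _1 = output element, _2 = input element
    mblas::Element(Tanh(_1 + _2), State, Bias);  // State = tanh(State + Bias)
    mblas::Element(Logit(_1), State);            // State = 1 / (1 + exp(-State))
  }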
diff --git a/src/plugin/nmt.cu b/src/plugin/nmt.cu
new file mode 100644
index 00000000..39e67635
--- /dev/null
+++ b/src/plugin/nmt.cu
@@ -0,0 +1,259 @@
+#include <cstdlib>
+#include <iostream>
+#include <algorithm>
+#include <string>
+#include <boost/timer/timer.hpp>
+
+#include "nmt.h"
+#include "mblas/matrix.h"
+#include "dl4mt.h"
+#include "common/vocab.h"
+#include "common/states.h"
+
+
+using namespace mblas;
+
+NMT::NMT(const boost::shared_ptr<Weights> model,
+ const boost::shared_ptr<Vocab> src,
+ const boost::shared_ptr<Vocab> trg)
+ : debug_(false), w_(model), src_(src), trg_(trg),
+ encoder_(new Encoder(*w_)), decoder_(new Decoder(*w_)),
+ states_(new States()), firstWord_(true)
+ {
+ for(size_t i = 0; i < trg_->size(); ++i)
+ filteredId_.push_back(i);
+ }
+
+void NMT::PrintState(StateInfoPtr ptr) {
+ std::cerr << *ptr << std::endl;
+}
+
+size_t NMT::GetDevices(size_t maxDevices) {
+ int num_gpus = 0; // number of CUDA GPUs
+ cudaGetDeviceCount(&num_gpus);
+ std::cerr << "Number of CUDA devices: " << num_gpus << std::endl;
+
+ for (int i = 0; i < num_gpus; i++) {
+ cudaDeviceProp dprop;
+ cudaGetDeviceProperties(&dprop, i);
+ std::cerr << i << ": " << dprop.name << std::endl;
+ }
+ return (size_t)std::min(num_gpus, (int)maxDevices);
+}
+
+void NMT::SetDevice() {
+ cudaSetDevice(w_->GetDevice());
+ CublasHandler::StaticHandle();
+}
+
+
+size_t NMT::GetDevice() {
+ return w_->GetDevice();
+}
+
+void NMT::ClearStates() {
+ firstWord_ = true;
+ states_->Clear();
+}
+
+boost::shared_ptr<Weights> NMT::NewModel(const std::string& path, size_t device) {
+ std::cerr << "Got device " << device << std::endl;
+ cudaSetDevice(device);
+ CublasHandler::StaticHandle();
+ boost::shared_ptr<Weights> weights(new Weights(path, device));
+ return weights;
+}
+
+boost::shared_ptr<Vocab> NMT::NewVocab(const std::string& path) {
+ boost::shared_ptr<Vocab> vocab(new Vocab(path));
+ return vocab;
+}
+
+size_t NMT::TargetVocab(const std::string& str) {
+ return (*trg_)[str];
+}
+
+void NMT::CalcSourceContext(const std::vector<std::string>& s) {
+ std::vector<size_t> words(s.size());
+ std::transform(s.begin(), s.end(), words.begin(),
+ [&](const std::string& w) { return (*src_)[w]; });
+ words.push_back((*src_)["</s>"]);
+
+ SourceContext_.reset(new Matrix());
+ Matrix& SC = *boost::static_pointer_cast<Matrix>(SourceContext_);
+ encoder_->GetContext(words, SC);
+}
+
+StateInfoPtr NMT::EmptyState() {
+ Matrix& SC = *boost::static_pointer_cast<Matrix>(SourceContext_);
+ Matrix Empty;
+ decoder_->EmptyState(Empty, SC, 1);
+ std::vector<StateInfoPtr> infos;
+ states_->SaveStates(infos, Empty);
+ return infos.back();
+}
+
+void NMT::FilterTargetVocab(const std::set<std::string>& filter) {
+ filteredId_.clear();
+ filteredId_.resize(trg_->size(), 1); // set all to UNK
+
+ std::vector<size_t> numericFilter;
+ size_t k = 0;
+ for(auto& s : filter) {
+ size_t id = (*trg_)[s];
+ numericFilter.push_back(id);
+ filteredId_[id] = k;
+ k++;
+ }
+ // eol
+ numericFilter.push_back(numericFilter.size());
+ decoder_->Filter(numericFilter);
+}
+
+void NMT::BatchSteps(const Batches& batches, LastWords& lastWords,
+ Scores& probsOut, Scores& unksOut, StateInfos& stateInfos,
+ bool firstWord) {
+ Matrix& sourceContext = *boost::static_pointer_cast<Matrix>(SourceContext_);
+
+ Matrix prevEmbeddings;
+ Matrix nextEmbeddings;
+ Matrix prevStates;
+ Matrix probs;
+ Matrix nextStates;
+
+ if(firstWord) {
+ decoder_->EmptyEmbedding(prevEmbeddings, lastWords.size());
+ }
+ else {
+ // Not the first word
+ decoder_->Lookup(prevEmbeddings, lastWords);
+ }
+
+ states_->ConstructStates(prevStates, stateInfos);
+
+ for(auto& batch : batches) {
+ decoder_->MakeStep(nextStates, nextEmbeddings, probs,
+ batch, prevStates, prevEmbeddings, sourceContext);
+
+ StateInfos tempStates;
+ states_->SaveStates(tempStates, nextStates);
+
+ for(size_t i = 0; i < batch.size(); ++i) {
+ if(batch[i] != 0) {
+ float p = probs(i, filteredId_[batch[i]]);
+ probsOut[i] += log(p);
+ stateInfos[i] = tempStates[i];
+ }
+ if(batch[i] == 1) {
+ unksOut[i]++;
+ }
+ }
+ Swap(nextStates, prevStates);
+ Swap(nextEmbeddings, prevEmbeddings);
+ }
+}
+
+void NMT::OnePhrase(
+ const std::vector<std::string>& phrase,
+ const std::string& lastWord,
+ bool firstWord,
+ StateInfoPtr inputState,
+ float& prob, size_t& unks,
+ StateInfoPtr& outputState) {
+
+ Matrix& sourceContext = *boost::static_pointer_cast<Matrix>(SourceContext_);
+
+ Matrix prevEmbeddings;
+ Matrix nextEmbeddings;
+ Matrix prevStates;
+ Matrix probs;
+ Matrix alignedSourceContext;
+ Matrix nextStates;
+
+ if(firstWord) {
+ decoder_->EmptyEmbedding(prevEmbeddings, 1);
+ }
+ else {
+ // Not the first word
+ std::vector<size_t> ids = { (*trg_)[lastWord] };
+ decoder_->Lookup(prevEmbeddings, ids);
+ }
+
+ std::vector<StateInfoPtr> inputStates = { inputState };
+ states_->ConstructStates(prevStates, inputStates);
+
+ for(auto& w : phrase) {
+ size_t id = (*trg_)[w];
+ std::vector<size_t> nextIds = { id };
+ if(id == 1)
+ unks++;
+
+ decoder_->MakeStep(nextStates, nextEmbeddings, probs,
+ nextIds, prevStates, prevEmbeddings, sourceContext);
+
+ float p = probs(0, filteredId_[id]);
+ prob += log(p);
+
+ Swap(nextStates, prevStates);
+ Swap(nextEmbeddings, prevEmbeddings);
+ }
+
+ std::vector<StateInfoPtr> outputStates;
+ states_->SaveStates(outputStates, prevStates);
+ outputState = outputStates.back();
+}
+
+void NMT::MakeStep(
+ const std::vector<std::string>& nextWords,
+ const std::vector<std::string>& lastWords,
+ std::vector<StateInfoPtr>& inputStates,
+ std::vector<double>& logProbs,
+ std::vector<StateInfoPtr>& outputStates,
+ std::vector<bool>& unks) {
+
+ Matrix& sourceContext = *boost::static_pointer_cast<Matrix>(SourceContext_);
+
+ Matrix lastEmbeddings;
+ if(firstWord_) {
+ firstWord_ = false;
+ // Only empty state in state cache, so this is the first word
+ decoder_->EmptyEmbedding(lastEmbeddings, lastWords.size());
+ }
+ else {
+ // Not the first word
+ std::vector<size_t> lastIds(lastWords.size());
+ std::transform(lastWords.begin(), lastWords.end(), lastIds.begin(),
+ [&](const std::string& w) { return (*trg_)[w]; });
+ decoder_->Lookup(lastEmbeddings, lastIds);
+ }
+
+ Matrix nextEmbeddings;
+ std::vector<size_t> nextIds(nextWords.size());
+ std::transform(nextWords.begin(), nextWords.end(), nextIds.begin(),
+ [&](const std::string& w) { return (*trg_)[w]; });
+
+ Matrix prevStates;
+ states_->ConstructStates(prevStates, inputStates);
+
+ Matrix probs;
+ Matrix nextStates;
+
+ decoder_->MakeStep(nextStates, nextEmbeddings, probs,
+ nextIds, prevStates, lastEmbeddings, sourceContext);
+
+ states_->SaveStates(outputStates, nextStates);
+
+ for(auto id : nextIds) {
+ if(id != 1)
+ unks.push_back(true);
+ else
+ unks.push_back(false);
+ }
+
+ for(size_t i = 0; i < nextIds.size(); ++i) {
+ float p = probs(i, filteredId_[nextIds[i]]);
+ //float p = probs(i, nextIds[i]);
+ logProbs.push_back(log(p));
+ }
+
+}
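The class is designed to be driven from an external decoder as a feature-function plugin; a hypothetical calling sequence, where the function name, the example phrase and the "<s>" placeholder are all illustrative:

  #include <string>
  #include <vector>
  #include "plugin/nmt.h"

  void ScoreOnePhrase(NMT& nmt, const std::vector<std::string>& source) {
    nmt.CalcSourceContext(source);            // encode the source sentence once
    StateInfoPtr start = nmt.EmptyState();    // initial decoder state

    float logProb = 0.0f;
    size_t unks = 0;
    StateInfoPtr next;
    std::vector<std::string> phrase = {"vielen", "dank", "."};
    // firstWord = true, so the last-word argument is ignored on this call.
    nmt.OnePhrase(phrase, "<s>", true, start, logProb, unks, next);
    // logProb now holds the summed log-probability of the phrase, unks counts
    // target words mapped to UNK, and next is the resulting decoder state.
  }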
diff --git a/src/plugin/nmt.h b/src/plugin/nmt.h
new file mode 100644
index 00000000..220360fd
--- /dev/null
+++ b/src/plugin/nmt.h
@@ -0,0 +1,95 @@
+#include <iostream>
+#include <algorithm>
+#include <vector>
+#include <set>
+#include <boost/shared_ptr.hpp>
+
+#include "mblas/base_matrix.h"
+
+class Weights;
+class Vocab;
+class Encoder;
+class Decoder;
+class States;
+
+class StateInfo;
+typedef boost::shared_ptr<StateInfo> StateInfoPtr;
+
+typedef std::vector<size_t> Batch;
+typedef std::vector<Batch> Batches;
+typedef std::vector<StateInfoPtr> StateInfos;
+typedef std::vector<float> Scores;
+typedef std::vector<size_t> LastWords;
+
+
+class NMT {
+ public:
+ NMT(const boost::shared_ptr<Weights> model,
+ const boost::shared_ptr<Vocab> src,
+ const boost::shared_ptr<Vocab> trg);
+
+ const boost::shared_ptr<Weights> GetModel() {
+ return w_;
+ }
+
+ static size_t GetDevices(size_t = 1);
+ void SetDevice();
+ size_t GetDevice();
+
+ void SetDebug(bool debug) {
+ debug_ = debug;
+ }
+
+ static boost::shared_ptr<Weights> NewModel(const std::string& path, size_t device = 0);
+
+ static boost::shared_ptr<Vocab> NewVocab(const std::string& path);
+
+ void CalcSourceContext(const std::vector<std::string>& s);
+
+ StateInfoPtr EmptyState();
+
+ void PrintState(StateInfoPtr);
+
+ void FilterTargetVocab(const std::set<std::string>& filter);
+
+ size_t TargetVocab(const std::string& str);
+
+ void BatchSteps(const Batches& batches, LastWords& lastWords,
+ Scores& probs, Scores& unks, StateInfos& stateInfos,
+ bool firstWord);
+
+ void OnePhrase(
+ const std::vector<std::string>& phrase,
+ const std::string& lastWord,
+ bool firstWord,
+ StateInfoPtr inputState,
+ float& prob, size_t& unks,
+ StateInfoPtr& outputState);
+
+ void MakeStep(
+ const std::vector<std::string>& nextWords,
+ const std::vector<std::string>& lastWords,
+ std::vector<StateInfoPtr>& inputStates,
+ std::vector<double>& logProbs,
+ std::vector<StateInfoPtr>& nextStates,
+ std::vector<bool>& unks);
+
+ void ClearStates();
+
+ private:
+ bool debug_;
+
+ const boost::shared_ptr<Weights> w_;
+ const boost::shared_ptr<Vocab> src_;
+ const boost::shared_ptr<Vocab> trg_;
+
+ boost::shared_ptr<Encoder> encoder_;
+ boost::shared_ptr<Decoder> decoder_;
+
+ boost::shared_ptr<mblas::BaseMatrix> SourceContext_;
+
+ boost::shared_ptr<States> states_;
+ bool firstWord_;
+
+ std::vector<size_t> filteredId_;
+};
diff --git a/src/rescorer/nbest.cpp b/src/rescorer/nbest.cpp
new file mode 100644
index 00000000..e7135fa6
--- /dev/null
+++ b/src/rescorer/nbest.cpp
@@ -0,0 +1,128 @@
+#include "nbest.h"
+
+#include <algorithm>
+
+#include "utils.h"
+#include "vocab.h"
+
+NBest::NBest(
+ const std::string& srcPath,
+ const std::string& nbestPath,
+ const std::shared_ptr<Vocab> srcVocab,
+ const std::shared_ptr<Vocab> trgVocab,
+ const size_t maxBatchSize)
+ : srcVocab_(srcVocab),
+ trgVocab_(trgVocab),
+ maxBatchSize_(maxBatchSize) {
+ ParseInputFile(srcPath);
+ Parse_(nbestPath);
+}
+
+void NBest::ParseInputFile(const std::string& path) {
+ std::ifstream file(path);
+ srcSentences_.clear();
+ std::string line;
+ while (std::getline(file, line).good()) {
+ Trim(line);
+ srcSentences_.push_back(line);
+ }
+}
+
+std::string NBest::GetSentence(const size_t index) const {
+ return srcSentences_[index];
+}
+
+std::vector<std::string> NBest::GetTokens(const size_t index) const {
+ std::vector<std::string> tokens;
+ Split(srcSentences_[index], tokens);
+ return tokens;
+}
+
+std::vector<size_t> NBest::GetEncodedTokens(const size_t index) const {
+ std::vector<std::string> tokens;
+ Split(srcSentences_[index], tokens);
+ return srcVocab_->Encode(tokens, true);
+}
+
+void NBest::Parse_(const std::string& path) {
+ std::ifstream file(path);
+
+ std::string line;
+ size_t lineCount = 0;
+ indexes_.push_back(lineCount);
+
+ while (std::getline(file, line).good()) {
+    Trim(line);
+ std::vector<std::string> fields;
+ Split(line, fields, " ||| ");
+ if (lineCount && (data_.back()[0] != fields[0])) {
+ indexes_.push_back(lineCount);
+ }
+ data_.push_back(fields);
+ ++lineCount;
+ }
+ indexes_.push_back(data_.size());
+}
+
+
+inline std::vector< std::vector< std::string > > NBest::SplitBatch(std::vector<std::string>& batch) const {
+ std::vector< std::vector< std::string > > splittedBatch;
+ for (auto& sentence : batch) {
+ Trim(sentence);
+ std::vector<std::string> words;
+ Split(sentence, words);
+ splittedBatch.push_back(words);
+ }
+ return splittedBatch;
+}
+
+inline Batch NBest::EncodeBatch(const std::vector<std::vector<std::string>>& batch) const {
+ Batch encodedBatch;
+ for (auto& sentence: batch) {
+ encodedBatch.push_back(trgVocab_->Encode(sentence, true));
+ }
+ return encodedBatch;
+}
+
+inline Batch NBest::MaskAndTransposeBatch(const Batch& batch) const {
+ size_t maxLength = 0;
+ for (auto& sentence: batch) {
+ maxLength = std::max(maxLength, sentence.size());
+ }
+ Batch masked;
+ for (size_t i = 0; i < maxLength; ++i) {
+ masked.emplace_back(batch.size(), 0);
+ for (size_t j = 0; j < batch.size(); ++j) {
+ if (i < batch[j].size()) {
+ masked[i][j] = batch[j][i];
+ }
+ }
+ }
+ return masked;
+}
+
+
+Batch NBest::ProcessBatch(std::vector<std::string>& batch) const {
+ return MaskAndTransposeBatch(EncodeBatch(SplitBatch(batch)));
+}
+
+std::vector<Batch> NBest::GetBatches(const size_t index) const {
+ std::vector<Batch> batches;
+ std::vector<std::string> sBatch;
+ for (size_t i = indexes_[index]; i <= indexes_[index+1]; ++i) {
+ if (sBatch.size() == maxBatchSize_ || i == indexes_[index+1]) {
+ batches.push_back(ProcessBatch(sBatch));
+ sBatch.clear();
+ if (i == indexes_[index+1]) {
+ break;
+ }
+ }
+ sBatch.push_back(data_[i][1]);
+ }
+ return batches;
+}
+
+
+size_t NBest::size() const {
+ return indexes_.size() - 1;
+}
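The parser expects Moses-style n-best entries: four fields separated by " ||| " (sentence id, hypothesis, feature scores, total score), with all hypotheses of one source sentence appearing contiguously. An illustrative sample (values made up):

  0 ||| das ist ein Test ||| LM0= -12.31 TM0= -4.50 ||| -7.89
  0 ||| dies ist ein Test ||| LM0= -13.02 TM0= -4.21 ||| -8.01
  1 ||| hallo Welt ||| LM0= -6.70 TM0= -2.12 ||| -4.55

indexes_ records the offset at which each new sentence id starts, so GetBatches(i) can slice out the hypotheses of sentence i and split them into batches of at most maxBatchSize_ target strings.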
diff --git a/src/rescorer/nbest.h b/src/rescorer/nbest.h
new file mode 100644
index 00000000..7c91aeec
--- /dev/null
+++ b/src/rescorer/nbest.h
@@ -0,0 +1,57 @@
+#pragma once
+
+#include <string>
+#include <vector>
+#include <memory>
+#include <cmath>
+
+class Vocab;
+
+typedef std::vector<std::vector<size_t>> Batch;
+
+class NBest {
+ public:
+ NBest(
+ const std::string& srcPath,
+ const std::string& nbestPath,
+ const std::shared_ptr<Vocab> srcVocab,
+ const std::shared_ptr<Vocab> trgVocab,
+ const size_t maxBatchSize=64);
+
+ std::vector<Batch> GetBatches(const size_t index) const;
+
+ size_t GetIndex(const size_t index) const {
+ return indexes_[index];
+ }
+
+ std::vector<std::string> operator[](size_t index) const {
+ return data_[index];
+ }
+
+ size_t size() const;
+
+ std::string GetSentence(const size_t index) const;
+
+ std::vector<std::string> GetTokens(const size_t index) const;
+
+ std::vector<size_t> GetEncodedTokens(const size_t index) const;
+
+ private:
+ void Parse_(const std::string& path);
+ std::vector<std::vector<std::string>> SplitBatch(std::vector<std::string>& batch) const;
+ void ParseInputFile(const std::string& path);
+
+ Batch EncodeBatch(const std::vector<std::vector<std::string>>& batch) const;
+
+ Batch MaskAndTransposeBatch(const Batch& batch) const;
+
+ Batch ProcessBatch(std::vector<std::string>& batch) const;
+ private:
+ std::vector<std::vector<std::string> > data_;
+ std::vector<std::string> srcSentences_;
+ std::shared_ptr<Vocab> srcVocab_;
+ std::shared_ptr<Vocab> trgVocab_;
+ std::vector<size_t> indexes_;
+ const size_t maxBatchSize_;
+
+};
diff --git a/src/rescorer/rescorer.h b/src/rescorer/rescorer.h
new file mode 100644
index 00000000..536045b5
--- /dev/null
+++ b/src/rescorer/rescorer.h
@@ -0,0 +1,90 @@
+
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dl4mt.h"
+#include "vocab.h"
+#include "utils.h"
+#include "states.h"
+#include "nbest.h"
+
+class Rescorer {
+ public:
+ Rescorer(
+ const std::shared_ptr<Weights> model,
+ const std::shared_ptr<NBest> nBest,
+ const std::string& featureName)
+ : model_(model),
+ nbest_(nBest),
+ encoder_(new Encoder(*model)),
+ decoder_(new Decoder(*model)),
+ featureName_(featureName) {
+ }
+
+ void Score(const size_t index) {
+ std::vector<size_t> sIndexes = nbest_->GetEncodedTokens(index);
+
+ encoder_->GetContext(sIndexes, SourceContext_);
+ size_t batchIndex = 0;
+ for(auto& batch: nbest_->GetBatches(index)) {
+ const auto scores = ScoreBatch(batch);
+ for (size_t j = 0; j < batch[0].size(); ++j) {
+ std::cout
+ << (*nbest_)[nbest_->GetIndex(index) + batchIndex + j][0] << " ||| "
+ << (*nbest_)[nbest_->GetIndex(index) + batchIndex + j][1] << " ||| "
+ << (*nbest_)[nbest_->GetIndex(index) + batchIndex + j][2] << " "
+ << featureName_ << "= " << scores[j] << " ||| "
+ << (*nbest_)[nbest_->GetIndex(index) + batchIndex + j][3]
+ << std::endl;
+ }
+ batchIndex += batch[0].size();
+ }
+ }
+
+
+ private:
+ std::vector<float> ScoreBatch(
+ const std::vector<std::vector<size_t>>& batch) {
+ size_t batchSize = batch[0].size();
+
+ decoder_->EmptyState(PrevState_, SourceContext_, batchSize);
+ decoder_->EmptyEmbedding(PrevEmbedding_, batchSize);
+
+ std::vector<float> scores(batch[0].size(), 0.0f);
+ size_t lengthIndex = 0;
+ for (auto& w : batch) {
+ decoder_->MakeStep(State_, Probs_,
+ PrevState_, PrevEmbedding_, SourceContext_);
+
+ for (size_t j = 0; j < w.size(); ++j) {
+ if (batch[lengthIndex][j]) {
+ float p = Probs_(j, w[j]);
+ scores[j] += log(p);
+ }
+ }
+
+ decoder_->Lookup(Embedding_, w);
+
+ mblas::Swap(State_, PrevState_);
+ mblas::Swap(Embedding_, PrevEmbedding_);
+ ++lengthIndex;
+ }
+ return scores;
+ }
+
+ private:
+ std::shared_ptr<Weights> model_;
+ std::shared_ptr<NBest> nbest_;
+ std::shared_ptr<Encoder> encoder_;
+ std::shared_ptr<Decoder> decoder_;
+  const std::string featureName_;
+
+ mblas::Matrix SourceContext_;
+ mblas::Matrix PrevState_;
+ mblas::Matrix PrevEmbedding_;
+ mblas::Matrix Probs_;
+ mblas::Matrix State_;
+ mblas::Matrix Embedding_;
+};
diff --git a/src/rescorer/rescorer_main.cu b/src/rescorer/rescorer_main.cu
new file mode 100644
index 00000000..a5b5fb3a
--- /dev/null
+++ b/src/rescorer/rescorer_main.cu
@@ -0,0 +1,105 @@
+#include <iostream>
+#include <string>
+#include <memory>
+#include <iomanip>
+
+#include <boost/program_options/options_description.hpp>
+#include <boost/program_options/parsers.hpp>
+#include <boost/program_options/variables_map.hpp>
+#include <boost/timer/timer.hpp>
+#include <boost/chrono/duration.hpp>
+
+typedef boost::chrono::duration<double> sec;
+
+#include "nbest.h"
+#include "vocab.h"
+#include "rescorer.h"
+
+void ProgramOptions(int argc, char *argv[],
+ std::string& modelPath,
+ std::string& svPath,
+ std::string& tvPath,
+ std::string& corpusPath,
+ std::string& nbestPath,
+ std::string& fname,
+ size_t& maxBatchSize,
+ size_t& device) {
+ bool help = false;
+
+ namespace po = boost::program_options;
+ po::options_description cmdline_options("Allowed options");
+ cmdline_options.add_options()
+ ("device,d", po::value(&device)->default_value(0),
+ "CUDA Device")
+ ("batch,b", po::value(&maxBatchSize)->default_value(80),
+ "Max batch size")
+ ("model,m", po::value(&modelPath)->required(),
+ "Path to a model")
+ ("source,s", po::value(&svPath)->required(),
+ "Path to a source vocab file.")
+ ("target,t", po::value(&tvPath)->required(),
+ "Path to a target vocab file.")
+ ("input,i", po::value(&corpusPath)->required(),
+ "Path to the input of the nbest file.")
+ ("n-best,n", po::value(&nbestPath)->required(),
+ "Path to an nbest file.")
+ ("feature-name,f", po::value(&fname)->default_value("NMT0"),
+ "Feature name")
+ ("help,h", po::value(&help)->zero_tokens()->default_value(false),
+ "Print this help message and exit.")
+ ;
+ po::variables_map vm;
+ try {
+ po::store(po::command_line_parser(argc, argv).
+ options(cmdline_options).run(), vm);
+ po::notify(vm);
+ } catch (std::exception& e) {
+ std::cout << "Error: " << e.what() << std::endl << std::endl;
+
+ std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl;
+ std::cout << cmdline_options << std::endl;
+ exit(0);
+ }
+
+ if (help) {
+ std::cout << "Usage: " + std::string(argv[0]) + " [options]" << std::endl;
+ std::cout << cmdline_options << std::endl;
+ exit(0);
+ }
+}
+
+int main(int argc, char* argv[]) {
+ std::string modelPath, svPath, tvPath, corpusPath, nbestPath, fname;
+
+ size_t device;
+ size_t maxBatchSize;
+  ProgramOptions(argc, argv, modelPath, svPath, tvPath, corpusPath, nbestPath,
+ fname, maxBatchSize, device);
+ std::cerr << "Using device: " << device << std::endl;
+ cudaSetDevice(device);
+
+ std::cerr << "Loading model: " << modelPath << std::endl;
+ std::shared_ptr<Weights> weights(new Weights(modelPath, device));
+ std::shared_ptr<Vocab> srcVocab(new Vocab(svPath));
+ std::shared_ptr<Vocab> trgVocab(new Vocab(tvPath));
+
+ std::cerr << "Loading nbest list: " << nbestPath << std::endl;
+  std::shared_ptr<NBest> nbest(new NBest(corpusPath, nbestPath, srcVocab, trgVocab, maxBatchSize));
+
+ std::cerr << "Creating rescorer..." << std::endl;
+ std::shared_ptr<Rescorer> rescorer(new Rescorer(weights, nbest, fname));
+
+ boost::timer::cpu_timer timer;
+ std::cerr << std::fixed << std::setprecision(2);
+ std::cerr << "Rescoring..." << std::endl;
+ for (size_t i = 0; i < nbest->size(); ++i) {
+ rescorer->Score(i);
+ std::cerr << ".";
+ if((i + 1) % 50 == 0) {
+ sec seconds = boost::chrono::nanoseconds(timer.elapsed().user);
+ std::cerr << "[" << i + 1 << " / " << seconds.count() << " s = " << (i+1)/seconds.count() << " sent. per s]" << std::endl;
+ }
+ }
+
+ return 0;
+}
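A hypothetical invocation (the executable name is a placeholder; it depends on the CMake target, which is not shown in this excerpt). The rescored n-best list is written to stdout, progress to stderr:

  ./rescorer --model model.npz --source vocab.en.txt --target vocab.de.txt \
             --input test.src --n-best test.nbest --feature-name NMT0 \
             --device 0 --batch 80 > test.rescored.nbest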
diff --git a/src/test/test.cu b/src/test/test.cu
new file mode 100644
index 00000000..e00dd952
--- /dev/null
+++ b/src/test/test.cu
@@ -0,0 +1,102 @@
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <boost/timer/timer.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "mblas/matrix.h"
+#include "bahdanau.h"
+#include "vocab.h"
+
+#include "states.h"
+
+using namespace mblas;
+
+int main(int argc, char** argv) {
+ size_t device = 0;
+
+ if(argc > 1) {
+ if(std::string(argv[1]) == "1")
+ device = 1;
+ else if(std::string(argv[1]) == "2")
+ device = 2;
+ }
+
+ std::cerr << device << std::endl;
+ cudaSetDevice(device);
+
+ std::string source = "thank you .";
+ std::string target = "vielen dank .";
+ //std::string source = "you know , one of the intense pleasures of travel and one of the delights of ethnographic research is the opportunity to live amongst those who have not forgotten the old ways , who still feel their past in the wind , touch it in stones polished by rain , taste it in the bitter leaves of plants .";
+ //std::string target = "wissen sie , eine der intensiven freuden des reisens und eine der freuden der ethnografischen forschung ist die chance zu leben unter jenen , die die alten wege nicht vergessen haben , die immer noch ihre vergangenheit im wind spüren , berühren sie in steine poliert durch regen , schmecken sie in den bitteren blätter der pflanzen .";
+
+ std::cerr << "Loading model" << std::endl;
+ Weights weights("/home/marcinj/Badania/best_nmt/search_model.npz", device);
+ Vocab svcb("/home/marcinj/Badania/best_nmt/vocab/en_de.en.txt");
+ Vocab tvcb("/home/marcinj/Badania/best_nmt/vocab/en_de.de.txt");
+
+ std::cerr << "Creating encoder" << std::endl;
+ Encoder encoder(weights);
+ std::cerr << "Creating decoder" << std::endl;
+ Decoder decoder(weights);
+
+ std::vector<std::string> sourceSplit;
+ boost::split(sourceSplit, source, boost::is_any_of(" "),
+ boost::token_compress_on);
+
+ std::cerr << "Source: " << std::endl;
+ std::vector<size_t> sWords(sourceSplit.size());
+ std::transform(sourceSplit.begin(), sourceSplit.end(), sWords.begin(),
+ [&](const std::string& w) { std::cerr << svcb[w] << ", "; return svcb[w]; });
+ sWords.push_back(svcb["</s>"]);
+ std::cerr << svcb["</s>"] << std::endl;
+
+ typedef std::vector<size_t> Batch;
+
+ std::vector<std::string> targetSplit;
+ boost::split(targetSplit, target, boost::is_any_of(" "),
+ boost::token_compress_on);
+
+ std::cerr << "Target: " << std::endl;
+ size_t bs = 1000;
+ std::vector<std::vector<size_t>> tWordsBatch(targetSplit.size());
+ std::transform(targetSplit.begin(), targetSplit.end(), tWordsBatch.begin(),
+ [&](const std::string& w) { std::cerr << tvcb[w] << ", "; return Batch(bs, tvcb[w]); });
+ tWordsBatch.push_back(Batch(bs, tvcb["</s>"]));
+ std::cerr << tvcb["</s>"] << std::endl;
+
+ mblas::Matrix SourceContext;
+ encoder.GetContext(sWords, SourceContext);
+
+ mblas::Matrix State, NextState;
+ mblas::Matrix Embeddings, NextEmbeddings;
+ mblas::Matrix Probs;
+
+ std::cerr << "Testing" << std::endl;
+ boost::timer::auto_cpu_timer timer;
+ size_t batchSize = tWordsBatch[0].size();
+
+ for(size_t i = 0; i < 1; ++i) {
+ decoder.EmptyState(State, SourceContext, batchSize);
+ decoder.EmptyEmbedding(Embeddings, batchSize);
+
+ float sum = 0;
+ for(auto batch : tWordsBatch) {
+ decoder.MakeStep(NextState, NextEmbeddings, Probs,
+ batch, State, Embeddings, SourceContext);
+
+ for(size_t i = 0; i < 1; ++i) {
+ float p = Probs(i, batch[i]);
+ std:: cerr << log(p) << " ";
+ if(i == 0) {
+ sum += log(p);
+ }
+ }
+
+ mblas::Swap(Embeddings, NextEmbeddings);
+ mblas::Swap(State, NextState);
+ }
+ std::cerr << i << " " << sum << std::endl;
+ }
+}
diff --git a/src/test/test.dl4mt.cu b/src/test/test.dl4mt.cu
new file mode 100644
index 00000000..d0224135
--- /dev/null
+++ b/src/test/test.dl4mt.cu
@@ -0,0 +1,100 @@
+#include <cstdlib>
+#include <iostream>
+#include <fstream>
+#include <algorithm>
+#include <boost/timer/timer.hpp>
+#include <boost/algorithm/string.hpp>
+
+#include "mblas/matrix.h"
+#include "dl4mt.h"
+#include "vocab.h"
+
+using namespace mblas;
+
+int main(int argc, char** argv) {
+ size_t device = 0;
+
+ if(argc > 1) {
+ if(std::string(argv[1]) == "1")
+ device = 1;
+ else if(std::string(argv[1]) == "2")
+ device = 2;
+ }
+
+ std::cerr << device << std::endl;
+ cudaSetDevice(device);
+
+ std::string source = "thank you .";
+ std::string target = "vielen Dank .";
+
+ std::cerr << "Loading model" << std::endl;
+ Weights weights("testmodel/model.npz", device);
+
+ Vocab svcb("testmodel/vocab.en.txt");
+ Vocab tvcb("testmodel/vocab.de.txt");
+
+ std::cerr << "Creating encoder" << std::endl;
+ Encoder encoder(weights);
+
+ std::cerr << "Creating decoder" << std::endl;
+ Decoder decoder(weights);
+
+ std::vector<std::string> sourceSplit;
+ boost::split(sourceSplit, source, boost::is_any_of(" "),
+ boost::token_compress_on);
+
+ std::cerr << "Source: " << std::endl;
+ std::vector<size_t> sWords(sourceSplit.size());
+ std::transform(sourceSplit.begin(), sourceSplit.end(), sWords.begin(),
+ [&](const std::string& w) { std::cerr << svcb[w] << ", "; return svcb[w]; });
+ sWords.push_back(svcb["</s>"]);
+ std::cerr << svcb["</s>"] << std::endl;
+
+ typedef std::vector<size_t> Batch;
+
+ std::vector<std::string> targetSplit;
+ boost::split(targetSplit, target, boost::is_any_of(" "),
+ boost::token_compress_on);
+
+ std::cerr << "Target: " << std::endl;
+ size_t bs = 1000;
+
+ std::vector<std::vector<size_t>> tWordsBatch(targetSplit.size());
+ std::transform(targetSplit.begin(), targetSplit.end(), tWordsBatch.begin(),
+ [&](const std::string& w) { std::cerr << tvcb[w] << ", "; return Batch(bs, tvcb[w]); });
+ tWordsBatch.push_back(Batch(bs, tvcb["</s>"]));
+ std::cerr << tvcb["</s>"] << std::endl;
+
+ mblas::Matrix SourceContext;
+ encoder.GetContext(sWords, SourceContext);
+
+ mblas::Matrix State, NextState;
+ mblas::Matrix Embeddings, NextEmbeddings;
+ mblas::Matrix Probs;
+
+ std::cerr << "Testing" << std::endl;
+ boost::timer::auto_cpu_timer timer;
+ size_t batchSize = tWordsBatch[0].size();
+
+ for(size_t i = 0; i < 1; ++i) {
+ decoder.EmptyState(State, SourceContext, batchSize);
+ decoder.EmptyEmbedding(Embeddings, batchSize);
+
+ float sum = 0;
+ for(auto batch : tWordsBatch) {
+ decoder.MakeStep(NextState, Probs,
+ State, Embeddings, SourceContext);
+ decoder.Lookup(NextEmbeddings, batch);
+ for(size_t i = 0; i < 1; ++i) {
+ float p = Probs(i, batch[i]);
+ if(i == 0) {
+ sum += log(p);
+ }
+ }
+
+ mblas::Swap(Embeddings, NextEmbeddings);
+ mblas::Swap(State, NextState);
+ }
+ std::cerr << i << " " << sum << std::endl;
+ }
+}