Welcome to mirror list, hosted at ThFree Co, Russian Federation.

github.com/marian-nmt/marian.git - Unnamed repository; edit this file 'description' to name the repository.
summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRoman Grundkiewicz <rgrundki@exseed.ed.ac.uk>2018-08-06 12:57:14 +0300
committerRoman Grundkiewicz <rgrundki@exseed.ed.ac.uk>2018-08-06 12:57:14 +0300
commiteb20e9900fe544d3f38b2b15ae4240d0c4132a17 (patch)
treebde75868ca435155153863a2dec7ce4df4dab760
parent1a0c4c1d19e1a7d9aaebe82ceb4b9cc68aa5cc51 (diff)
parent9834e09a88022819262387af714764934c72caf8 (diff)
Merge with master
-rw-r--r--cmake/FindNCCL.cmake11
-rw-r--r--src/3rd_party/cnpy/cnpy.h39
-rw-r--r--src/CMakeLists.txt13
-rw-r--r--src/command/marian_conv.cpp61
-rw-r--r--src/command/marian_main.cpp (renamed from src/command/marian-main.cpp)0
-rw-r--r--src/common/binary.cpp161
-rw-r--r--src/common/binary.h26
-rw-r--r--src/common/config.cpp39
-rw-r--r--src/common/config.h14
-rw-r--r--src/common/config_parser.cpp1
-rw-r--r--src/common/definitions.h19
-rw-r--r--src/common/file_stream.h12
-rw-r--r--src/common/io.cpp165
-rw-r--r--src/common/io.h44
-rw-r--r--src/common/io_item.h36
-rw-r--r--src/common/types.h (renamed from src/tensors/types.h)0
-rw-r--r--src/data/npz_converter.cpp58
-rw-r--r--src/data/npz_converter.h177
-rw-r--r--src/graph/expression_graph.cpp38
-rw-r--r--src/graph/expression_graph.h165
-rw-r--r--src/graph/node_initializers.cpp37
-rw-r--r--src/graph/node_initializers.h5
-rw-r--r--src/graph/node_operators_unary.h2
-rw-r--r--src/graph/parameters.h83
-rw-r--r--src/layers/weight.cpp5
-rw-r--r--src/microsoft/quicksand.cpp5
-rw-r--r--src/models/amun.h66
-rw-r--r--src/models/costs.h6
-rw-r--r--src/models/encoder_decoder.cpp22
-rw-r--r--src/models/encoder_decoder.h11
-rw-r--r--src/models/nematus.h73
-rw-r--r--src/optimizers/optimizers.cpp69
-rw-r--r--src/tensors/allocator.h2
-rw-r--r--src/tensors/cpu/tensor_operators.cpp3
-rw-r--r--src/tensors/gpu/algorithm.cu1
-rw-r--r--src/tensors/gpu/tensor_operators.cu16
-rw-r--r--src/tensors/tensor.h16
-rw-r--r--src/tensors/tensor_operators.h16
-rw-r--r--src/translator/scorers.cpp58
-rw-r--r--src/translator/scorers.h27
40 files changed, 1025 insertions, 577 deletions
diff --git a/cmake/FindNCCL.cmake b/cmake/FindNCCL.cmake
index ab3c55a8..d6100e85 100644
--- a/cmake/FindNCCL.cmake
+++ b/cmake/FindNCCL.cmake
@@ -16,10 +16,19 @@ set(NCCL_LIB_PATHS
/usr/local/cuda/lib64
$ENV{NCCL_DIR}/lib64
$ENV{CUDA_TOOLKIT_ROOT_DIR}/lib64
+ /usr/local/cuda/lib
+ $ENV{NCCL_DIR}/lib
+ $ENV{CUDA_TOOLKIT_ROOT_DIR}/lib
)
find_path(NCCL_INCLUDE_DIR NAMES nccl.h PATHS ${NCCL_INC_PATHS})
-find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})
+
+if (USE_STATIC_LIBS)
+ message(STATUS "Trying to find static NCCL library")
+ find_library(NCCL_LIBRARIES NAMES libnccl_static.a PATHS ${NCCL_LIB_PATHS})
+else (USE_STATIC_LIBS)
+ find_library(NCCL_LIBRARIES NAMES nccl PATHS ${NCCL_LIB_PATHS})
+endif (USE_STATIC_LIBS)
include(FindPackageHandleStandardArgs)
find_package_handle_standard_args(NCCL DEFAULT_MSG NCCL_INCLUDE_DIR NCCL_LIBRARIES)
diff --git a/src/3rd_party/cnpy/cnpy.h b/src/3rd_party/cnpy/cnpy.h
index 54886397..ac427890 100644
--- a/src/3rd_party/cnpy/cnpy.h
+++ b/src/3rd_party/cnpy/cnpy.h
@@ -38,6 +38,10 @@ namespace cnpy {
const char* data() const {
return bytes.data();
}
+
+ size_t size() {
+ return bytes.size();
+ }
};
typedef std::shared_ptr<NpyArray> NpyArrayPtr;
@@ -218,7 +222,8 @@ namespace cnpy {
struct NpzItem : public NpyArray
{
std::string name; //name of item in .npz file (without .npy)
- char type; //type of item
+ char type; // type of item
+
template<typename T>
NpzItem(const std::string& name, const std::vector<T>& data, const std::vector<unsigned int>& dataShape) :
name(name), type(map_type(typeid(T)))
@@ -229,6 +234,26 @@ namespace cnpy {
auto* p = (const char*)data.data();
std::copy(p, p + bytes.size(), bytes.begin());
}
+
+ NpzItem(const std::string& name, const std::string& data, const std::vector<unsigned int>& dataShape) :
+ name(name), type(map_type(typeid(char)))
+ {
+ shape = dataShape;
+ word_size = sizeof(char);
+ bytes.assign(data.data(), data.data() + data.size() + 1);
+ }
+
+ NpzItem(const std::string& name,
+ const std::vector<char>& data,
+ const std::vector<unsigned int>& dataShape,
+ char type_, size_t word_size_) :
+ name(name), type(type_)
+ {
+ shape = dataShape;
+ word_size = word_size_;
+ bytes.resize(data.size());
+ std::copy(data.begin(), data.end(), bytes.begin());
+ }
};
//same as npz_save() except that it saves multiple items to .npz file in a single go, which is required when writing to HDFS
@@ -248,22 +273,22 @@ namespace cnpy {
auto fname = item.name;
//first, form a "file name" by appending .npy to the item's name
fname += ".npy";
-
+
const auto* data = item.bytes.data();
const auto* shape = item.shape.data();
const auto type = item.type;
const auto word_size = item.word_size;
const unsigned int ndims = item.shape.size();
std::vector<char> npy_header = create_npy_header(type,word_size,shape,ndims);
-
+
unsigned long nels = 1;
for (int m=0; m<ndims; m++ ) nels *= shape[m];
int nbytes = nels*word_size + npy_header.size();
-
+
//get the CRC of the data to be added
unsigned int crc = crc32(0L,(unsigned char*)&npy_header[0],npy_header.size());
crc = crc32(crc,(unsigned char*)data,nels*word_size);
-
+
//build the local header
local_header.clear();
local_header += "PK"; //first part of sig
@@ -279,13 +304,13 @@ namespace cnpy {
local_header += (unsigned short) fname.size(); //fname length
local_header += (unsigned short) 0; //extra field length
local_header += fname;
-
+
//write everything
unsigned int local_header_offset = ftell(fp); // this is where this local item will begin in the file. This gets stored in the corresponding global header.
fwrite(&local_header[0],sizeof(char),local_header.size(),fp);
fwrite(&npy_header[0],sizeof(char),npy_header.size(),fp);
fwrite(data,word_size,nels,fp);
-
+
// append to global header
// A concatenation of global headers for all objects gets written to the end of the file.
global_header += "PK"; //first part of sig
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 08ed5399..8708f87f 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -9,6 +9,8 @@ add_library(marian STATIC
common/logging.cpp
common/config.cpp
common/config_parser.cpp
+ common/binary.cpp
+ common/io.cpp
data/vocab.cpp
data/corpus_base.cpp
@@ -102,7 +104,10 @@ set_target_properties(marian_scorer PROPERTIES OUTPUT_NAME marian-scorer)
add_executable(marian_vocab command/marian_vocab.cpp)
set_target_properties(marian_vocab PROPERTIES OUTPUT_NAME marian-vocab)
-set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab)
+add_executable(marian_conv command/marian_conv.cpp)
+set_target_properties(marian_conv PROPERTIES OUTPUT_NAME marian-conv)
+
+set(EXECUTABLES ${EXECUTABLES} marian_train marian_decoder marian_scorer marian_vocab marian_conv)
# marian.zip and marian.tgz
# This combines marian, marian_decoder in a single ZIP or TAR file for
@@ -117,7 +122,8 @@ if(USE_STATIC_LIBS)
"${CMAKE_BINARY_DIR}/marian-decoder"
"${CMAKE_BINARY_DIR}/marian-scorer"
"${CMAKE_BINARY_DIR}/marian-vocab"
- DEPENDS marian_train marian_decoder marian_scorer marian_vocab)
+ "${CMAKE_BINARY_DIR}/marian-conv"
+ DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
add_custom_target(marian_zip DEPENDS "${CMAKE_BINARY_DIR}/marian.zip")
add_custom_command(
@@ -127,7 +133,8 @@ if(USE_STATIC_LIBS)
"marian-decoder"
"marian-scorer"
"marian-vocab"
- DEPENDS marian_train marian_decoder marian_scorer marian_vocab)
+ "marian-conv"
+ DEPENDS marian_train marian_decoder marian_scorer marian_vocab marian_conv)
add_custom_target(marian_tgz DEPENDS "${CMAKE_BINARY_DIR}/marian.tgz")
add_custom_target(philly DEPENDS marian_tgz marian_zip)
diff --git a/src/command/marian_conv.cpp b/src/command/marian_conv.cpp
new file mode 100644
index 00000000..3664a7a2
--- /dev/null
+++ b/src/command/marian_conv.cpp
@@ -0,0 +1,61 @@
+#include "marian.h"
+
+#include <boost/program_options.hpp>
+#include <sstream>
+
+int main(int argc, char** argv) {
+ using namespace marian;
+
+ createLoggers();
+
+ namespace po = boost::program_options;
+ po::options_description desc("Allowed options");
+ // clang-format off
+ desc.add_options()
+ ("from,f", po::value<std::string>()->default_value("model.npz"),
+ "Input model")
+ ("to,t", po::value<std::string>()->default_value("model.bin"),
+ "Output model")
+ ("help,h", "Print this message and exit")
+ ;
+ // clang-format on
+
+ po::variables_map vm;
+ try {
+ po::store(po::parse_command_line(argc, argv, desc), vm);
+ po::notify(vm);
+ } catch(std::exception& e) {
+ std::cerr << "Error: " << e.what() << std::endl << std::endl;
+ std::cerr << "Usage: " << argv[0] << " [options]" << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ exit(1);
+ }
+
+ if(vm.count("help")) {
+ std::cerr << "Usage: " << argv[0] << " [options]" << std::endl << std::endl;
+ std::cerr << desc << std::endl;
+ exit(0);
+ }
+
+ LOG(info, "Outputting {}", vm["to"].as<std::string>());
+
+ YAML::Node config;
+ std::stringstream configStr;
+ marian::io::getYamlFromModel(config,
+ "special:model.yml",
+ vm["from"].as<std::string>());
+ configStr << config;
+
+ auto graph = New<ExpressionGraph>(true, false);
+ graph->setDevice(CPU0);
+
+ graph->load(vm["from"].as<std::string>());
+ graph->forward();
+ graph->save(vm["to"].as<std::string>(), configStr.str());
+
+ //graph->saveBinary(vm["bin"].as<std::string>());
+
+ LOG(info, "Finished");
+
+ return 0;
+}
diff --git a/src/command/marian-main.cpp b/src/command/marian_main.cpp
index 5e428570..5e428570 100644
--- a/src/command/marian-main.cpp
+++ b/src/command/marian_main.cpp
diff --git a/src/common/binary.cpp b/src/common/binary.cpp
new file mode 100644
index 00000000..7f199e15
--- /dev/null
+++ b/src/common/binary.cpp
@@ -0,0 +1,161 @@
+#include "common/binary.h"
+#include "common/definitions.h"
+#include "common/file_stream.h"
+#include "common/definitions.h"
+#include "common/types.h"
+#include "common/io_item.h"
+
+#include <string>
+
+namespace marian {
+namespace io {
+
+namespace binary {
+
+struct Header {
+ size_t nameLength;
+ size_t type;
+ size_t shapeLength;
+ size_t dataLength;
+};
+
+template <typename T>
+const T* get(const void*& current, size_t num = 1) {
+ const T* ptr = (const T*)current;
+ current = (const T*)current + num;
+ return ptr;
+}
+
+void loadItems(const void* current,
+ std::vector<io::Item>& items,
+ bool mapped) {
+
+ size_t binaryFileVersion = *get<size_t>(current);
+ ABORT_IF(binaryFileVersion != BINARY_FILE_VERSION,
+ "Binary file versions do not match: {} (file) != {} (expected)",
+ binaryFileVersion,
+ BINARY_FILE_VERSION);
+
+ size_t numHeaders = *get<size_t>(current);
+ const Header* headers = get<Header>(current, numHeaders);
+
+ items.resize(numHeaders);
+ for(int i = 0; i < numHeaders; ++i) {
+ items[i].type = (Type)headers[i].type;
+ items[i].name = get<char>(current, headers[i].nameLength);
+ items[i].mapped = mapped;
+ }
+
+ for(int i = 0; i < numHeaders; ++i) {
+ size_t len = headers[i].shapeLength;
+ items[i].shape.resize(len);
+ const int* arr = get<int>(current, len);
+ std::copy(arr, arr + len, items[i].shape.begin());
+ }
+
+ // move by offset bytes
+ size_t offset = *get<size_t>(current);
+ get<char>(current, offset);
+
+ for(int i = 0; i < numHeaders; ++i) {
+ if(items[i].mapped) {
+ items[i].ptr = get<char>(current, headers[i].dataLength);
+ } else {
+ size_t len = headers[i].dataLength;
+ items[i].bytes.resize(len);
+ const char* ptr = get<char>(current, len);
+ std::copy(ptr, ptr + len, items[i].bytes.begin());
+ }
+ }
+}
+
+void loadItems(const std::string& fileName,
+ std::vector<io::Item>& items) {
+
+ // Read file into buffer
+ size_t fileSize = boost::filesystem::file_size(fileName);
+ char* ptr = new char[fileSize];
+ InputFileStream in(fileName);
+ in.read(ptr, fileSize);
+
+ // Load items from buffer without mapping
+ loadItems(ptr, items, false);
+
+ // Delete buffer
+ delete[] ptr;
+}
+
+io::Item getItem(const void* current,
+ const std::string& varName) {
+
+ std::vector<io::Item> items;
+ loadItems(current, items);
+
+ for(auto& item : items)
+ if(item.name == varName)
+ return item;
+
+ return io::Item();
+}
+
+io::Item getItem(const std::string& fileName,
+ const std::string& varName) {
+
+ std::vector<io::Item> items;
+ loadItems(fileName, items);
+
+ for(auto& item : items)
+ if(item.name == varName)
+ return item;
+
+ return io::Item();
+}
+
+void saveItems(const std::string& fileName,
+ const std::vector<io::Item>& items) {
+ OutputFileStream out(fileName);
+ size_t pos = 0;
+
+ size_t binaryFileVersion = BINARY_FILE_VERSION;
+ pos += out.write(&binaryFileVersion);
+
+ std::vector<Header> headers;
+ for(const auto& item : items) {
+ headers.push_back(Header{item.name.size() + 1,
+ (size_t)item.type,
+ item.shape.size(),
+ item.size()});
+ }
+
+ size_t headerSize = headers.size();
+ pos += out.write(&headerSize);
+ pos += out.write(headers.data(), headers.size());
+
+ // Write out all names
+ for(const auto& item : items) {
+ pos += out.write(item.name.data(), item.name.size() + 1);
+ }
+ // Write out all shapes
+ for(const auto& item : items) {
+ pos += out.write(item.shape.data(), item.shape.size());
+ }
+
+ // align to next 256-byte boundary
+ size_t nextpos = ((pos + sizeof(size_t)) / 256 + 1) * 256;
+ size_t offset = nextpos - pos - sizeof(size_t);
+
+ pos += out.write(&offset);
+ for(size_t i = 0; i < offset; i++) {
+ char padding = 0;
+ pos += out.write(&padding);
+ }
+
+ // Write out all values
+ for(const auto& item : items) {
+ pos += out.write(item.data(), item.size());
+ }
+}
+
+}
+}
+}
diff --git a/src/common/binary.h b/src/common/binary.h
new file mode 100644
index 00000000..5616c56e
--- /dev/null
+++ b/src/common/binary.h
@@ -0,0 +1,26 @@
+#pragma once
+
+#include "common/io_item.h"
+
+#include <string>
+#include <vector>
+
+// Increase this if binary format changes
+#define BINARY_FILE_VERSION 1
+
+namespace marian {
+namespace io {
+
+namespace binary {
+
+void loadItems(const void* current, std::vector<io::Item>& items, bool mapped = false);
+void loadItems(const std::string& fileName, std::vector<io::Item>& items);
+
+io::Item getItem(const void* current, const std::string& vName);
+io::Item getItem(const std::string& fileName, const std::string& vName);
+
+void saveItems(const std::string& fileName, const std::vector<io::Item>& items);
+
+}
+}
+}
diff --git a/src/common/config.cpp b/src/common/config.cpp
index 4a6540b0..f92a01d5 100644
--- a/src/common/config.cpp
+++ b/src/common/config.cpp
@@ -1,5 +1,4 @@
#include "common/config.h"
-#include "3rd_party/cnpy/cnpy.h"
#include "common/file_stream.h"
#include "common/logging.h"
@@ -47,42 +46,14 @@ void Config::override(const YAML::Node& params) {
void Config::loadModelParameters(const std::string& name) {
YAML::Node config;
- GetYamlFromNpz(config, "special:model.yml", name);
+ io::getYamlFromModel(config, "special:model.yml", name);
override(config);
}
-void Config::GetYamlFromNpz(YAML::Node& yaml,
- const std::string& varName,
- const std::string& fName) {
- yaml = YAML::Load(cnpy::npz_load(fName, varName)->data());
-}
-
-// helper to serialize a YAML::Node to a Yaml string in a 0-terminated character
-// vector
-static std::vector<char> asYamlCharVector(const YAML::Node node) {
- YAML::Emitter out;
- OutputYaml(node, out);
- return std::vector<char>(out.c_str(), out.c_str() + strlen(out.c_str()) + 1);
-}
-
-void Config::AddYamlToNpz(const YAML::Node& yaml,
- const std::string& varName,
- const std::string& fName) {
- // YAML::Node's Yaml representation is saved as a 0-terminated char vector to
- // the NPZ file
- auto yamlCharVector = asYamlCharVector(yaml);
- unsigned int shape = yamlCharVector.size();
- cnpy::npz_save(fName, varName, yamlCharVector.data(), &shape, 1, "a");
+void Config::loadModelParameters(const void* ptr) {
+ YAML::Node config;
+ io::getYamlFromModel(config, "special:model.yml", ptr);
+ override(config);
}
-// same as AddYamlToNpz() but adds to an in-memory NpzItem vector instead
-void Config::AddYamlToNpzItems(const YAML::Node& yaml,
- const std::string& varName,
- std::vector<cnpy::NpzItem>& allItems) {
- auto yamlCharVector = asYamlCharVector(yaml);
- allItems.emplace_back(
- varName,
- yamlCharVector,
- std::vector<unsigned int>{(unsigned int)yamlCharVector.size()});
-}
} // namespace marian
diff --git a/src/common/config.h b/src/common/config.h
index c3745191..d8359b24 100644
--- a/src/common/config.h
+++ b/src/common/config.h
@@ -1,10 +1,10 @@
#pragma once
#include <boost/program_options.hpp>
-#include "3rd_party/cnpy/cnpy.h"
#include "3rd_party/yaml-cpp/yaml.h"
#include "common/config_parser.h"
#include "common/file_stream.h"
+#include "common/io.h"
#include "common/logging.h"
#include "common/utils.h"
#ifndef _WIN32 // TODO: why are these needed by a config parser? Can they be
@@ -114,6 +114,7 @@ public:
YAML::Node getModelParameters();
void loadModelParameters(const std::string& name);
+ void loadModelParameters(const void* ptr);
const std::vector<DeviceId>& getDevices() { return devices_; }
@@ -129,17 +130,6 @@ public:
return out;
}
- static void AddYamlToNpz(const YAML::Node&,
- const std::string&,
- const std::string&);
- static void AddYamlToNpzItems(const YAML::Node&,
- const std::string&,
- std::vector<cnpy::NpzItem>&);
-
- static void GetYamlFromNpz(YAML::Node&,
- const std::string&,
- const std::string&);
-
private:
YAML::Node config_;
std::vector<DeviceId> devices_;
diff --git a/src/common/config_parser.cpp b/src/common/config_parser.cpp
index d40c078c..a4bb64fc 100644
--- a/src/common/config_parser.cpp
+++ b/src/common/config_parser.cpp
@@ -14,7 +14,6 @@
#endif
#endif
-#include "3rd_party/cnpy/cnpy.h"
#include "common/definitions.h"
#include "common/config.h"
diff --git a/src/common/definitions.h b/src/common/definitions.h
index 293d0492..0b71a530 100644
--- a/src/common/definitions.h
+++ b/src/common/definitions.h
@@ -60,6 +60,25 @@ struct DeviceId {
friend bool operator!=(DeviceId id1, DeviceId id2) { return !(id1 == id2); }
};
+// predefine a couple of devices for easier manual use
+const DeviceId CPU0{0, DeviceType::cpu};
+const DeviceId CPU1{1, DeviceType::cpu};
+const DeviceId CPU2{2, DeviceType::cpu};
+const DeviceId CPU3{3, DeviceType::cpu};
+const DeviceId CPU4{4, DeviceType::cpu};
+const DeviceId CPU5{5, DeviceType::cpu};
+const DeviceId CPU6{6, DeviceType::cpu};
+const DeviceId CPU7{7, DeviceType::cpu};
+
+const DeviceId GPU0{0, DeviceType::gpu};
+const DeviceId GPU1{1, DeviceType::gpu};
+const DeviceId GPU2{2, DeviceType::gpu};
+const DeviceId GPU3{3, DeviceType::gpu};
+const DeviceId GPU4{4, DeviceType::gpu};
+const DeviceId GPU5{5, DeviceType::gpu};
+const DeviceId GPU6{6, DeviceType::gpu};
+const DeviceId GPU7{7, DeviceType::gpu};
+
class TensorBase;
typedef Ptr<TensorBase> Tensor;
diff --git a/src/common/file_stream.h b/src/common/file_stream.h
index 4dcfd264..4236492b 100644
--- a/src/common/file_stream.h
+++ b/src/common/file_stream.h
@@ -116,6 +116,12 @@ public:
return stream;
}
+ template <typename T>
+ size_t read(T* ptr, size_t num = 1) {
+ istream_.read((char*)ptr, num * sizeof(T));
+ return num * sizeof(T);
+ }
+
std::string path() { return file_.string(); }
bool empty() { return ifstream_.peek() == std::ifstream::traits_type::eof(); }
@@ -156,6 +162,12 @@ public:
return stream;
}
+ template <typename T>
+ size_t write(const T* ptr, size_t num = 1) {
+ ostream_.write((char*)ptr, num * sizeof(T));
+ return num * sizeof(T);
+ }
+
std::string path() { return file_.string(); }
private:
diff --git a/src/common/io.cpp b/src/common/io.cpp
new file mode 100644
index 00000000..6ec0b3c8
--- /dev/null
+++ b/src/common/io.cpp
@@ -0,0 +1,165 @@
+#include "common/io.h"
+
+#include "3rd_party/cnpy/cnpy.h"
+#include "common/shape.h"
+#include "common/types.h"
+
+#include "common/io_item.h"
+#include "common/binary.h"
+
+
+namespace marian {
+
+namespace io {
+
+bool isNpz(const std::string& fileName) {
+ return fileName.size() >= 4 && fileName.substr(fileName.length() - 4) == ".npz";
+}
+
+bool isBin(const std::string& fileName) {
+ return fileName.size() >= 4 && fileName.substr(fileName.length() - 4) == ".bin";
+}
+
+void getYamlFromNpz(YAML::Node& yaml,
+ const std::string& varName,
+ const std::string& fileName) {
+ auto item = cnpy::npz_load(fileName, varName);
+ if(item->size() > 0)
+ yaml = YAML::Load(item->data());
+}
+
+void getYamlFromBin(YAML::Node& yaml,
+ const std::string& varName,
+ const std::string& fileName) {
+ auto item = binary::getItem(fileName, varName);
+ if(item.size() > 0)
+ yaml = YAML::Load(item.data());
+}
+
+void getYamlFromModel(YAML::Node& yaml,
+ const std::string& varName,
+ const std::string& fileName) {
+ if(io::isNpz(fileName)) {
+ io::getYamlFromNpz(yaml, varName, fileName);
+ }
+ else if(io::isBin(fileName)) {
+ io::getYamlFromBin(yaml, varName, fileName);
+ }
+ else {
+ ABORT("Unknown model file format for file {}", fileName);
+ }
+}
+
+void getYamlFromModel(YAML::Node& yaml,
+ const std::string& varName,
+ const void* ptr) {
+ auto item = binary::getItem(ptr, varName);
+ if(item.size() > 0)
+ yaml = YAML::Load(item.data());
+}
+
+void addMetaToItems(const std::string& meta,
+ const std::string& varName,
+ std::vector<io::Item>& items) {
+ Item item;
+ item.name = varName;
+
+ // increase size by 1 to add \0
+ item.shape = Shape({(int)meta.size() + 1});
+
+ item.bytes.resize(item.shape.elements());
+ std::copy(meta.begin(), meta.end(), item.bytes.begin());
+ // set string terminator
+ item.bytes.back() = '\0';
+
+ item.type = Type::int8;
+
+ items.push_back(item);
+}
+
+void loadItemsFromNpz(const std::string& fileName, std::vector<Item>& items) {
+ auto numpy = cnpy::npz_load(fileName);
+ for(auto it : numpy) {
+
+ Shape shape;
+ if(it.second->shape.size() == 1) {
+ shape.resize(2);
+ shape.set(0, 1);
+ shape.set(1, it.second->shape[0]);
+ } else {
+ shape.resize(it.second->shape.size());
+ for(size_t i = 0; i < it.second->shape.size(); ++i)
+ shape.set(i, it.second->shape[i]);
+ }
+
+ Item item;
+ item.name = it.first;
+ item.shape = shape;
+ item.bytes.swap(it.second->bytes);
+
+ items.emplace_back(std::move(item));
+ }
+}
+
+std::vector<Item> loadItems(const std::string& fileName) {
+ std::vector<Item> items;
+ if(isNpz(fileName)) {
+ loadItemsFromNpz(fileName, items);
+ }
+ else if(isBin(fileName)) {
+ binary::loadItems(fileName, items);
+ }
+ else {
+ ABORT("Unknown model file format for file {}", fileName);
+ }
+
+ return items;
+}
+
+std::vector<Item> loadItems(const void* ptr) {
+ std::vector<Item> items;
+ binary::loadItems(ptr, items, false);
+ return items;
+}
+
+std::vector<Item> mmapItems(const void* ptr) {
+ std::vector<Item> items;
+ binary::loadItems(ptr, items, true);
+ return items;
+}
+
+// @TODO: make cnpy and our wrapper talk to each other in terms of types
+// or implement our own saving routines for npz based on npy, probably better.
+void saveItemsNpz(const std::string& fileName, const std::vector<Item>& items) {
+ std::vector<cnpy::NpzItem> npzItems;
+ for(auto& item : items) {
+ std::vector<unsigned int> shape(item.shape.begin(), item.shape.end());
+ char type = 'f';
+
+ if(item.type == Type::float32)
+ type = cnpy::map_type(typeid(float));
+ else if(item.type == Type::int8)
+ type = cnpy::map_type(typeid(char));
+ else
+ ABORT("Other types not supported yet");
+
+ npzItems.emplace_back(item.name, item.bytes, shape, type, sizeOf(item.type));
+
+ }
+ cnpy::npz_save(fileName, npzItems);
+}
+
+void saveItems(const std::string& fileName, const std::vector<Item>& items) {
+ if(isNpz(fileName)) {
+ saveItemsNpz(fileName, items);
+ }
+ else if(isBin(fileName)) {
+ binary::saveItems(fileName, items);
+ }
+ else {
+ ABORT("Unknown file format for file {}", fileName);
+ }
+}
+
+}
+}
diff --git a/src/common/io.h b/src/common/io.h
new file mode 100644
index 00000000..210360d4
--- /dev/null
+++ b/src/common/io.h
@@ -0,0 +1,44 @@
+#pragma once
+
+#include "3rd_party/yaml-cpp/yaml.h"
+#include "common/io_item.h"
+
+#include <string>
+#include <vector>
+
+// interface for handling model files in marian, both *.npz files and
+// *.bin files have the same way of accessing them and are identified
+// by suffixes (*.npz or *.bin).
+
+// Files with the *.bin suffix are supposed to be memory-mappable for
+// CPU decoding.
+
+namespace marian {
+
+namespace io {
+
+bool isNpz(const std::string& fileName);
+bool isBin(const std::string& fileName);
+
+void getYamlFromModel(YAML::Node& yaml,
+ const std::string& varName,
+ const std::string& fileName);
+
+void getYamlFromModel(YAML::Node& yaml,
+ const std::string& varName,
+ const void* ptr);
+
+void addMetaToItems(const std::string& meta,
+ const std::string& varName,
+ std::vector<io::Item>& items);
+
+std::vector<Item> loadItems(const std::string& fileName);
+
+std::vector<Item> loadItems(const void* ptr);
+
+std::vector<Item> mmapItems(const void* ptr);
+
+void saveItems(const std::string& fileName, const std::vector<Item>& items);
+
+}
+}
diff --git a/src/common/io_item.h b/src/common/io_item.h
new file mode 100644
index 00000000..809ed358
--- /dev/null
+++ b/src/common/io_item.h
@@ -0,0 +1,36 @@
+#pragma once
+
+#include "common/shape.h"
+#include "common/types.h"
+
+#include <string>
+
+namespace marian {
+namespace io {
+
+struct Item {
+ std::vector<char> bytes;
+ const char* ptr{0};
+ bool mapped{false};
+
+ std::string name;
+ Shape shape;
+ Type type{Type::float32};
+
+ const char* data() const {
+ if(mapped)
+ return ptr;
+ else
+ return bytes.data();
+ }
+
+ size_t size() const {
+ if(mapped)
+ return shape.elements() * sizeOf(type);
+ else
+ return bytes.size();
+ }
+};
+
+}
+}
diff --git a/src/tensors/types.h b/src/common/types.h
index fd5f0625..fd5f0625 100644
--- a/src/tensors/types.h
+++ b/src/common/types.h
diff --git a/src/data/npz_converter.cpp b/src/data/npz_converter.cpp
deleted file mode 100644
index d1faad7c..00000000
--- a/src/data/npz_converter.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-// This file is part of the Marian toolkit.
-
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#include "npz_converter.h"
-
-NpzConverter::NpzConverter(const std::string& file)
- : model_(cnpy::npz_load(file)), destructed_(false) {}
-
-NpzConverter::~NpzConverter() {
- if(!destructed_)
- model_.destruct();
-}
-
-void NpzConverter::Destruct() {
- model_.destruct();
- destructed_ = true;
-}
-
-/** TODO: Marcin, what does this function do? Why isn't it a method? */
-mblas::Matrix NpzConverter::operator[](const std::string& key) const {
- typedef blaze::
- CustomMatrix<float, blaze::unaligned, blaze::unpadded, blaze::rowMajor>
- BlazeWrapper;
- mblas::Matrix matrix;
- auto it = model_.find(key);
- if(it != model_.end()) {
- NpyMatrixWrapper np(it->second);
- matrix = BlazeWrapper(np.data(), np.size1(), np.size2());
- } else {
- std::cerr << "Missing " << key << std::endl;
- }
- return std::move(matrix);
-}
-
-mblas::Matrix NpzConverter::operator()(const std::string& key,
- bool transpose) const {
- mblas::Matrix matrix = (*this)[key];
- mblas::Trans(matrix);
- return std::move(matrix);
-}
diff --git a/src/data/npz_converter.h b/src/data/npz_converter.h
deleted file mode 100644
index f133b0a4..00000000
--- a/src/data/npz_converter.h
+++ /dev/null
@@ -1,177 +0,0 @@
-#pragma once
-
-// This file is part of the Marian toolkit.
-
-//
-// Permission is hereby granted, free of charge, to any person obtaining a copy
-// of this software and associated documentation files (the "Software"), to deal
-// in the Software without restriction, including without limitation the rights
-// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-// copies of the Software, and to permit persons to whom the Software is
-// furnished to do so, subject to the following conditions:
-//
-// The above copyright notice and this permission notice shall be included in
-// all copies or substantial portions of the Software.
-//
-// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-// SOFTWARE.
-
-#include "cnpy/cnpy.h"
-#include "tensor.h" //XXX Marcin, is this include actually needed? It appears to not be used.
-
-/**
- * @brief Loads model data stored in a npz file.
- *
- * Use of this class enables such data to later be stored in standard Marian
- * data structures.
- *
- * Note: this class makes use of the 3rd-party class <code>npy</code>.
- */
-class NpzConverter {
- // Private inner classes of the NpzConverter class
-private:
- /**
- * Wraps npy data such that the underlying matrix shape and
- * matrix data are made accessible.
- */
- class NpyMatrixWrapper {
- public:
- /**
- * Constructs a wrapper around an underlying npy data structure,
- * enabling the underlying data to be accessed as a matrix.
- *
- * @param npy the underlying data
- */
- NpyMatrixWrapper(const cnpy::NpyArray& npy) : npy_(npy) {}
-
- /**
- * Returns the total number of elements in the underlying matrix.
- *
- * @return the total number of elements in the underlying matrix
- */
- size_t size() const { return size1() * size2(); }
-
- /**
- * Returns a pointer to the raw data that underlies the matrix.
- *
- * @return a pointer to the raw data that underlies the matrix
- */
- float* data() const { return (float*)npy_.data; }
-
- /**
- * Given the index (i, j) of a matrix element,
- * this operator returns the float value from the underlying npz data
- * that is stored in the matrix.
- *
- * XXX: Marcin, is the following correct? Or do I have the row/column labels
- * swapped?
- *
- * @param i Index of a column in the matrix
- * @param j Index of a row in the matrix
- *
- * @return the float value stored at column i, row j of the matrix
- */
- float operator()(size_t i, size_t j) const {
- return ((float*)npy_.data)[i * size2() + j];
- }
-
- /**
- * Returns the number of columns in the matrix.
- *
- * XXX: Marcin, is this following correct? Or do I have the row/column
- * labels swapped?
- *
- * @return the number of columns in the matrix
- */
- size_t size1() const { return npy_.shape[0]; }
-
- /**
- * Returns the number of rows in the matrix.
- *
- * XXX: Marcin, is this following correct? Or do I have the row/column
- * labels swapped?
- *
- * @return the number of rows in the matrix
- */
- size_t size2() const {
- if(npy_.shape.size() == 1)
- return 1;
- else
- return npy_.shape[1];
- }
-
- private:
- /** Instance of the underlying (3rd party) data structure. */
- const cnpy::NpyArray& npy_;
-
- }; // End of NpyMatrixWrapper class
-
- // Public methods of the NpzConverter class
-public:
- /**
- * Constructs an object that reads npz data from a file.
- *
- * @param file Path to file containing npz data
- */
- NpzConverter(const std::string& file)
- : model_(cnpy::npz_load(file)), destructed_(false) {}
-
- /**
- * Destructs the model that underlies this NpzConverter object,
- * if that data has not already been destructed.
- */
- ~NpzConverter() {
- if(!destructed_)
- model_.destruct();
- }
-
- /**
- * Destructs the model that underlies this NpzConverter object,
- * and marks that data as having been destructed.
- */
- void Destruct() {
- model_.destruct();
- destructed_ = true;
- }
-
- /**
- * Loads data corresponding to a search key into the provided vector.
- *
- * @param key Search key XXX Marcin, what
- * type of thing is "key"? What are we searching for here?
- * @param data Container into which data will be loaded XXX Lane, is there
- * a way in Doxygen to mark and inout variable?
- * @param shape Shape object into which the number of rows and columns of the
- * vectors will be stored
- */
- void Load(const std::string& key,
- std::vector<float>& data,
- marian::Shape& shape) const {
- auto it = model_.find(key);
- if(it != model_.end()) {
- NpyMatrixWrapper np(it->second);
- data.clear();
- data.resize(np.size());
- std::copy(np.data(), np.data() + np.size(), data.begin());
-
- shape = {(int)np.size1(), (int)np.size2()};
-
- } else {
- std::cerr << "Missing " << key << std::endl;
- }
- }
-
- // Private member data of the NpzConverter class
-private:
- /** Underlying npz data */
- cnpy::npz_t model_;
-
- /** Indicates whether the underlying data has been destructed. */
- bool destructed_;
-
-}; // End of NpzConverter class
diff --git a/src/graph/expression_graph.cpp b/src/graph/expression_graph.cpp
index 617bc846..23b5c167 100644
--- a/src/graph/expression_graph.cpp
+++ b/src/graph/expression_graph.cpp
@@ -26,4 +26,42 @@ void ExpressionGraph::checkNan(Tensor t) {
ABORT_IF(throwNaN_, "Not implemented");
// ABORT_IF(throwNaN_ && IsNan(t), "Tensor has NaN");
}
+
+void ExpressionGraph::parametersToItems(std::vector<io::Item>& ioItems,
+ const std::map<std::string, std::string>& nameMap) {
+
+ for(auto p : params()->getMap()) {
+ std::string pName = p.first;
+
+ if(!namespace_.empty()) {
+ if(pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
+ pName = pName.substr(namespace_.size() + 2);
+ }
+
+ auto it = nameMap.find(pName);
+ if(it != nameMap.end())
+ pName = it->second;
+
+ ABORT_IF(p.second->val()->type() != Type::float32,
+ "Only float32 supported at the moment");
+
+ Tensor val = p.second->val();
+
+ io::Item item;
+ item.name = pName;
+ item.shape = val->shape();
+ item.type = val->type();
+
+ // Use the actual memory as this will be aligned and padded.
+ // When memory mapping this is required. Shape keeps track of
+ // tensor size. Saving to *.npz will cut to size.
+ auto mem = val->memory();
+ item.bytes.resize(mem->size());
+ copy(backend_, mem->data<char>(), mem->data<char>() + mem->size(), item.bytes.data());
+
+ ioItems.emplace_back(std::move(item));
+ }
+
+}
+
} // namespace marian
diff --git a/src/graph/expression_graph.h b/src/graph/expression_graph.h
index 8436e6e4..578cff8e 100644
--- a/src/graph/expression_graph.h
+++ b/src/graph/expression_graph.h
@@ -1,7 +1,5 @@
#pragma once
-#include "3rd_party/cnpy/cnpy.h"
-#include "3rd_party/threadpool.h"
#include "common/config.h"
#include "common/definitions.h"
@@ -13,12 +11,10 @@
#include "graph/node_operators.h"
#include "graph/parameters.h"
-#include "3rd_party/cnpy/cnpy.h"
-
-#include <fstream>
#include <map>
#include <unordered_set>
+
namespace marian {
template <class T, typename... Args>
@@ -163,11 +159,6 @@ public:
namespace_ = newNamespace;
}
- void reserveWorkspaceMB(size_t num) {
- size_t bytes = num * 1024 * 1024 - 1;
- tensors_->reserve(bytes);
- }
-
void copyParams(Ptr<ExpressionGraph> graph) {
for(auto p : *graph->params())
param(p->name(), p->shape(), inits::dummy);
@@ -175,12 +166,18 @@ public:
params()->vals()->copyFrom(graph->params()->vals());
}
+ // @TODO: remove this
void forceInit() {
params()->allocateForward();
for(auto v : nodesForward_)
v->init();
}
+ void reserveWorkspaceMB(size_t num) {
+ size_t bytes = num * 1024 * 1024 - 1;
+ tensors_->reserve(bytes);
+ }
+
void reuseWorkspace(Ptr<ExpressionGraph> graph) {
tensors_ = graph->tensors_;
}
@@ -423,65 +420,131 @@ public:
void setThrowNaN(bool throwNaN) { throwNaN_ = throwNaN; }
- void load(const std::string& name, bool markReloaded) {
- using namespace keywords;
-
- LOG(info, "Loading model from {}", name);
+private:
+  // create graph parameters from an array of io::Item elements, for loading
+ void itemsToParameters(const std::vector<io::Item>& ioItems,
+ const std::map<std::string, std::string>& nameMap,
+ bool markReloaded = true) {
setReloaded(false);
+ for(auto& item : ioItems) {
+ std::string pName = item.name;
- auto numpy = cnpy::npz_load(name);
-
- for(auto it : numpy) {
- auto name = it.first;
- // skip over special parameters starting with _
- if(name.substr(0, 8) == "special:")
+ // skip over special parameters starting with "special:"
+ if(pName.substr(0, 8) == "special:")
continue;
- Shape shape;
- if(it.second->shape.size() == 1) {
- shape.resize(2);
- shape.set(0, 1);
- shape.set(1, it.second->shape[0]);
- } else {
- shape.resize(it.second->shape.size());
- for(size_t i = 0; i < it.second->shape.size(); ++i)
- shape.set(i, it.second->shape[i]);
- }
+ auto it = nameMap.find(pName);
+ if(it != nameMap.end())
+ pName = it->second;
- param(name, shape, inits::from_numpy(it.second));
+ param(pName, item.shape, inits::from_item(item));
}
-
if(markReloaded)
setReloaded(true);
}
- // convert all parameters into an array pf cnpy::NpzItem elements, for saving
- void save(std::vector<cnpy::NpzItem>& npzItems) {
- for(auto p : params()->getMap()) {
- std::string pName = p.first;
+public:
- if(!namespace_.empty()) {
- if(pName.substr(0, namespace_.size() + 2) == namespace_ + "::")
- pName = pName.substr(namespace_.size() + 2);
- }
+ void load(const std::string& name,
+ const std::map<std::string, std::string>& nameMap,
+ bool markReloaded = true) {
+ LOG(info, "Loading model from {}", name);
+ itemsToParameters(io::loadItems(name), nameMap, markReloaded);
+ }
- std::vector<float> v;
- p.second->val()->get(v);
+ void load(const std::string& name,
+ bool markReloaded = true) {
- auto& pShape = p.second->shape();
- std::vector<unsigned int> shape(pShape.begin(), pShape.end());
+ // code to test memory mapping
+ //if(io::isBin(name)) {
+ // loadMmap(name, markReloaded);
+ // return;
+ //}
- npzItems.emplace_back(pName, v, shape);
- }
+ std::map<std::string, std::string> emptyNameMap;
+ load(name, emptyNameMap, markReloaded);
}
- void save(const std::string& name) {
+ void load(const void* ptr,
+ const std::map<std::string, std::string>& nameMap,
+ bool markReloaded = true) {
+ LOG(info, "Loading model from buffer at {}", ptr);
+ itemsToParameters(io::loadItems(ptr), nameMap, markReloaded);
+ }
+
+ void load(const void* ptr,
+ bool markReloaded = true) {
+ std::map<std::string, std::string> emptyNameMap;
+ load(ptr, emptyNameMap, markReloaded);
+ }
+
+ void mmap(const void* ptr,
+ const std::map<std::string, std::string>& nameMap,
+ bool markReloaded = true) {
+
+ ABORT_IF(backend_->getDevice().type != DeviceType::cpu || !inferenceOnly_,
+ "Memory mapping only supported for CPU inference mode");
+
+ params_ = New<MappedParameters>();
+ params_->init(backend_);
+
+ LOG(info, "Memory mapping model at {}", ptr);
+ itemsToParameters(io::mmapItems(ptr), nameMap, markReloaded);
+ }
+
+ void mmap(const void* ptr,
+ bool markReloaded = true) {
+ std::map<std::string, std::string> emptyNameMap;
+ mmap(ptr, emptyNameMap, markReloaded);
+ }
+
+ // Code to test memory mapping
+ //char* buf_;
+ //void loadMmap(const std::string& name, bool markReloaded) {
+ // size_t fsize = boost::filesystem::file_size(name);
+ // buf_ = new char[fsize];
+ // InputFileStream in(name);
+ // in.read(buf_, fsize);
+ // mmap(buf_, markReloaded);
+ //}
+
+private:
+ // convert all parameters into an array of io::Item elements, for saving
+ void parametersToItems(std::vector<io::Item>& ioItems,
+ const std::map<std::string, std::string>& nameMap);
+
+public:
+
+ void save(const std::string& name,
+ const std::string& meta,
+ const std::map<std::string, std::string>& nameMap) {
LOG(info, "Saving model to {}", name);
- std::vector<cnpy::NpzItem> npzItems;
- save(npzItems);
- cnpy::npz_save(name, npzItems);
- LOG(info, "Saved {} items.", npzItems.size());
+
+ std::vector<io::Item> ioItems;
+ parametersToItems(ioItems, nameMap);
+ if(!meta.empty())
+ io::addMetaToItems(meta, "special:model.yml", ioItems);
+ io::saveItems(name, ioItems);
+
+ LOG(info, "Saved {} items.", ioItems.size());
+ }
+
+ void save(const std::string& name) {
+ std::map<std::string, std::string> emptyNameMap;
+ save(name, "", emptyNameMap);
+ }
+
+ void save(const std::string& name,
+ const std::string& meta) {
+ std::map<std::string, std::string> emptyNameMap;
+ save(name, meta, emptyNameMap);
}
+
+ void save(const std::string& name,
+ const std::map<std::string, std::string>& nameMap) {
+ save(name, "", nameMap);
+ }
+
};
template <class T, typename... Args>
diff --git a/src/graph/node_initializers.cpp b/src/graph/node_initializers.cpp
index 86a21ca7..99e3ff8c 100644
--- a/src/graph/node_initializers.cpp
+++ b/src/graph/node_initializers.cpp
@@ -125,14 +125,14 @@ NodeInitializer from_sparse_vector(
};
}
-NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np) {
- return [np](Tensor t) {
- size_t size = 1;
- for(size_t dim : np->shape)
- size *= dim;
- t->set((float*)np->data(), (float*)np->data() + size);
- };
-}
+//NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np) {
+// return [np](Tensor t) {
+// size_t size = 1;
+// for(size_t dim : np->shape)
+// size *= dim;
+// t->set((float*)np->data(), (float*)np->data() + size);
+// };
+//}
// move this somewhere else
NodeInitializer from_word2vec(const std::string& file,
@@ -154,6 +154,27 @@ NodeInitializer from_word2vec(const std::string& file,
t->set(embs);
};
}
+
+NodeInitializer from_item(const io::Item& item) {
+ if(item.mapped) {
+ return [item](Tensor t) {
+ // @TODO: implement other types, for now croak loudly.
+ ABORT_IF(t->getBackend()->getDevice().type != DeviceType::cpu, "Memory mapping only works for CPU tensors");
+ ABORT_IF(!matchType<float>(t->type()), "Tensor type and type for mapping do not match");
+ auto mp = New<MemoryPiece>((uint8_t*)item.ptr, t->size() * sizeof(float));
+ t->reset(mp);
+ };
+ }
+ else {
+ return [item](Tensor t) {
+ // @TODO: implement other types, for now croak loudly.
+ ABORT_IF(!matchType<float>(t->type()),
+ "Tensor type and type for mapping do not match");
+ t->set((const float*)item.bytes.data(), (const float*)item.bytes.data() + t->size());
+ };
+ }
+}
+
} // namespace inits
} // namespace marian
diff --git a/src/graph/node_initializers.h b/src/graph/node_initializers.h
index 1820f7af..3a961a2b 100644
--- a/src/graph/node_initializers.h
+++ b/src/graph/node_initializers.h
@@ -1,7 +1,6 @@
// TODO: move to backend, into graph/
#pragma once
-#include "cnpy/cnpy.h"
#include "common/config.h"
#include "tensors/tensor.h"
@@ -64,10 +63,12 @@ void glorot_normal(Tensor t);
NodeInitializer from_vector(const std::vector<float>& v);
NodeInitializer from_vector(const std::vector<size_t>& v);
+NodeInitializer from_item(const io::Item& item);
+
NodeInitializer from_sparse_vector(
std::pair<std::vector<size_t>, std::vector<float>>& v);
-NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np);
+//NodeInitializer from_numpy(const cnpy::NpyArrayPtr& np);
NodeInitializer from_word2vec(const std::string& file,
int dimVoc,
diff --git a/src/graph/node_operators_unary.h b/src/graph/node_operators_unary.h
index cbeded24..b51c8a75 100644
--- a/src/graph/node_operators_unary.h
+++ b/src/graph/node_operators_unary.h
@@ -657,7 +657,7 @@ struct RowsNodeOp : public UnaryNodeOp {
NodeOps forwardOps() {
// @TODO: solve this with a tensor!
- return {NodeOp(CopyRows(val_, child(0)->val(), indices_))};
+ return {NodeOp(CopyRows(val_, child(0)->val(), indices_, graph()->allocator()))};
}
NodeOps backwardOps() {
diff --git a/src/graph/parameters.h b/src/graph/parameters.h
index 1b9472fd..20bf47cf 100644
--- a/src/graph/parameters.h
+++ b/src/graph/parameters.h
@@ -11,7 +11,7 @@
namespace marian {
class Parameters {
-private:
+protected:
/** @brief List of all parameter nodes of this expression graph. */
std::vector<Expr> params_;
std::map<std::string, Expr> named_;
@@ -19,12 +19,15 @@ private:
Ptr<TensorAllocator> vals_;
Ptr<TensorAllocator> grads_;
-public:
- void init(Ptr<Backend> backend) {
- vals_ = New<TensorAllocator>(backend);
- grads_ = New<TensorAllocator>(backend);
+ size_t totalCapacity(Ptr<TensorAllocator> alloc) {
+ size_t sum = 0;
+ for(auto p : params_) {
+ sum += alloc->capacity(p->shape(), Type::float32);
+ }
+ return sum;
}
+public:
auto begin() -> decltype(params_.begin()) { return params_.begin(); }
auto end() -> decltype(params_.begin()) { return params_.end(); }
@@ -42,21 +45,18 @@ public:
size_t size() { return params_.size(); }
- size_t totalCapacity(Ptr<TensorAllocator> alloc) {
- size_t sum = 0;
- for(auto p : params_) {
- sum += alloc->capacity(p->shape(), Type::float32);
- }
- return sum;
- }
-
void add(Expr p, const std::string& name) {
params_.push_back(p);
ABORT_IF(named_.count(name), "Parameter '{}' already exists", name);
named_[name] = p;
}
- void allocateForward() {
+ virtual void init(Ptr<Backend> backend) {
+ vals_ = New<TensorAllocator>(backend);
+ grads_ = New<TensorAllocator>(backend);
+ }
+
+ virtual void allocateForward() {
if(!params_.empty() && vals_->size() == 0) {
vals_->reserveExact(totalCapacity(vals_));
for(auto p : params_) {
@@ -67,7 +67,7 @@ public:
}
}
- void allocateBackward() {
+ virtual void allocateBackward() {
if(!params_.empty() && grads_->size() == 0) {
grads_->reserveExact(totalCapacity(grads_));
for(auto p : params_)
@@ -76,13 +76,13 @@ public:
}
}
- void set_zero_adjoint() { grads()->set(0.f); }
+ virtual void set_zero_adjoint() { grads()->set(0.f); }
- Tensor vals() { return vals_->asTensor(); }
+ virtual Tensor vals() { return vals_->asTensor(); }
- Tensor grads() { return grads_->asTensor(); }
+ virtual Tensor grads() { return grads_->asTensor(); }
- void clear() {
+ virtual void clear() {
params_.clear();
named_.clear();
@@ -90,4 +90,49 @@ public:
grads_->clear();
}
};
+
+class MappedParameters : public Parameters {
+private:
+ Ptr<Backend> backend_;
+
+public:
+ virtual void init(Ptr<Backend> backend) override {
+ backend_ = backend;
+ }
+
+ virtual void allocateForward() override {
+ if(!params_.empty()) {
+ for(auto p : params_) {
+ if(!p->val()) {
+ p->val() = Tensor(new TensorBase(nullptr, p->shape(), Type::float32, backend_));
+ }
+ }
+ }
+ }
+
+ virtual void allocateBackward() override {
+ ABORT("Not implemented for memory-mapped parameters");
+ }
+
+ virtual void set_zero_adjoint() override {
+ ABORT("Not implemented for memory-mapped parameters");
+ }
+
+ virtual Tensor vals() override {
+ ABORT("Not implemented for memory-mapped parameters");
+ return nullptr;
+ }
+
+ virtual Tensor grads() override {
+ ABORT("Not implemented for memory-mapped parameters");
+ return nullptr;
+ }
+
+ virtual void clear() override {
+ params_.clear();
+ named_.clear();
+ }
+};
+
+
} // namespace marian
diff --git a/src/layers/weight.cpp b/src/layers/weight.cpp
index d5cee32d..b5d8a2bc 100644
--- a/src/layers/weight.cpp
+++ b/src/layers/weight.cpp
@@ -3,9 +3,8 @@
namespace marian {
Ptr<WeightingBase> WeightingFactory(Ptr<Options> options) {
- if(options->has("data-weighting"))
- return New<DataWeighting>(options->get<std::string>("data-weighting-type"));
- return nullptr;
+ ABORT_IF(!options->has("data-weighting"), "No data-weighting specified in options");
+ return New<DataWeighting>(options->get<std::string>("data-weighting-type"));
}
Expr DataWeighting::getWeights(Ptr<ExpressionGraph> graph,
diff --git a/src/microsoft/quicksand.cpp b/src/microsoft/quicksand.cpp
index 593bf81d..14a60bf9 100644
--- a/src/microsoft/quicksand.cpp
+++ b/src/microsoft/quicksand.cpp
@@ -40,7 +40,6 @@ private:
public:
BeamSearchDecoder(Ptr<Options> options, Word eos)
: IBeamSearchDecoder(options, eos) {
- // createLoggers();
graph_ = New<ExpressionGraph>(true, true);
graph_->setDevice(DeviceId{0, DeviceType::cpu});
@@ -64,10 +63,12 @@ public:
for(auto& model : models) {
Ptr<Options> modelOpts = New<Options>();
YAML::Node config;
- Config::GetYamlFromNpz(config, "special:model.yml", model);
+ io::GetYamlFromModel(config, "special:model.yml", model);
modelOpts->merge(options_);
modelOpts->merge(config);
+
auto encdec = models::from_options(modelOpts, models::usage::translation);
+
scorers_.push_back(New<ScorerWrapper>(
encdec, "F" + std::to_string(scorers_.size()), 1, model));
}
diff --git a/src/models/amun.h b/src/models/amun.h
index cbc0c628..507ecc88 100644
--- a/src/models/amun.h
+++ b/src/models/amun.h
@@ -40,10 +40,6 @@ public:
bool markedReloaded = true) {
using namespace keywords;
- LOG(info, "Loading model from {}", name);
-
- auto numpy = cnpy::npz_load(name);
-
std::map<std::string, std::string> nameMap
= {{"decoder_U", "decoder_cell1_U"},
{"decoder_Ux", "decoder_cell1_Ux"},
@@ -95,41 +91,12 @@ public:
if(opt<bool>("tied-embeddings-src") || opt<bool>("tied-embeddings-all"))
nameMap["Wemb"] = "Wemb";
- graph->setReloaded(false);
-
- for(auto it : numpy) {
- auto name = it.first;
-
- if(name == "decoder_c_tt")
- continue;
- if(name.substr(0, 8) == "special:")
- continue;
-
- Shape shape;
- if(numpy[name]->shape.size() == 2) {
- shape.resize(2);
- shape.set(0, numpy[name]->shape[0]);
- shape.set(1, numpy[name]->shape[1]);
- } else if(numpy[name]->shape.size() == 1) {
- shape.resize(2);
- shape.set(0, 1);
- shape.set(1, numpy[name]->shape[0]);
- }
-
- std::string pName = name;
- if(nameMap.count(name))
- pName = nameMap[name];
-
- graph->param(pName, shape, inits::from_numpy(numpy[name]));
- }
-
- graph->setReloaded(true);
+ graph->load(name, nameMap);
}
void save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool saveTranslatorConfig = false) {
- LOG(info, "Saving model to {}", name);
std::map<std::string, std::string> nameMap
= {{"decoder_cell1_U", "decoder_U"},
@@ -177,36 +144,7 @@ public:
{"encoder_bi_r_gamma1", "encoder_r_gamma1"},
{"encoder_bi_r_gamma2", "encoder_r_gamma2"}};
- unsigned shape[2];
- std::string mode = "w";
-
- for(auto p : graph->params()->getMap()) {
- std::vector<float> v;
- p.second->val()->get(v);
-
- unsigned dim;
- if(p.second->shape()[0] == 1) {
- shape[0] = p.second->shape()[1];
- dim = 1;
- } else {
- shape[0] = p.second->shape()[0];
- shape[1] = p.second->shape()[1];
- dim = 2;
- }
-
- std::string pName = p.first;
- if(nameMap.count(pName))
- pName = nameMap[pName];
-
- cnpy::npz_save(name, pName, v.data(), shape, dim, mode);
- mode = "a";
- }
-
- float ctt = 0;
- shape[0] = 1;
- cnpy::npz_save(name, "decoder_c_tt", &ctt, shape, 1, mode);
-
- saveModelParameters(name);
+ graph->save(name, getModelParametersAsString(), nameMap);
if(saveTranslatorConfig) {
createAmunConfig(name);
diff --git a/src/models/costs.h b/src/models/costs.h
index 777f5147..730d46ce 100644
--- a/src/models/costs.h
+++ b/src/models/costs.h
@@ -134,6 +134,12 @@ public:
encdec_->load(graph, name, markedReloaded);
}
+ virtual void mmap(Ptr<ExpressionGraph> graph,
+ const void* ptr,
+ bool markedReloaded = true) {
+ encdec_->mmap(graph, ptr, markedReloaded);
+ };
+
virtual void save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool saveTranslatorConfig = false) {
diff --git a/src/models/encoder_decoder.cpp b/src/models/encoder_decoder.cpp
index 9202aa63..9620f86a 100644
--- a/src/models/encoder_decoder.cpp
+++ b/src/models/encoder_decoder.cpp
@@ -87,8 +87,11 @@ Config::YamlNode EncoderDecoder::getModelParameters() {
return modelParams;
}
-void EncoderDecoder::saveModelParameters(const std::string& name) {
- Config::AddYamlToNpz(getModelParameters(), "special:model.yml", name);
+std::string EncoderDecoder::getModelParametersAsString() {
+ auto yaml = getModelParameters();
+ YAML::Emitter out;
+ OutputYaml(yaml, out);
+ return std::string(out.c_str());
}
void EncoderDecoder::load(Ptr<ExpressionGraph> graph,
@@ -97,18 +100,19 @@ void EncoderDecoder::load(Ptr<ExpressionGraph> graph,
graph->load(name, markedReloaded && !opt<bool>("ignore-model-config", false));
}
+void EncoderDecoder::mmap(Ptr<ExpressionGraph> graph,
+ const void* ptr,
+ bool markedReloaded) {
+ graph->mmap(ptr, markedReloaded && !opt<bool>("ignore-model-config", false));
+}
+
void EncoderDecoder::save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool saveTranslatorConfig) {
// ignore config for now
LOG(info, "Saving model weights and runtime parameters to {}", name);
- std::vector<cnpy::NpzItem> npzItems;
- graph->save(npzItems); // model weights
- Config::AddYamlToNpzItems(getModelParameters(), // model runtime parameters
- "special:model.yml",
- npzItems);
- cnpy::npz_save(name, npzItems); // save both jointly
- // LOG(info, "Saved {} items.", npzItems.size());
+
+ graph->save(name, getModelParametersAsString());
if(saveTranslatorConfig)
createDecoderConfig(name);
diff --git a/src/models/encoder_decoder.h b/src/models/encoder_decoder.h
index c6536603..568aec87 100644
--- a/src/models/encoder_decoder.h
+++ b/src/models/encoder_decoder.h
@@ -16,6 +16,11 @@ public:
bool markedReloaded = true)
= 0;
+ virtual void mmap(Ptr<ExpressionGraph> graph,
+ const void* ptr,
+ bool markedReloaded = true)
+ = 0;
+
virtual void save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool saveTranslatorConfig = false)
@@ -71,7 +76,7 @@ protected:
std::set<std::string> modelFeatures_;
Config::YamlNode getModelParameters();
- void saveModelParameters(const std::string& name);
+ std::string getModelParametersAsString();
virtual void createDecoderConfig(const std::string& name);
@@ -94,6 +99,10 @@ public:
const std::string& name,
bool markedReloaded = true);
+ virtual void mmap(Ptr<ExpressionGraph> graph,
+ const void* ptr,
+ bool markedReloaded = true);
+
virtual void save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool saveTranslatorConfig = false);
diff --git a/src/models/nematus.h b/src/models/nematus.h
index 9b877e8f..211626cc 100644
--- a/src/models/nematus.h
+++ b/src/models/nematus.h
@@ -30,82 +30,13 @@ public:
void load(Ptr<ExpressionGraph> graph,
const std::string& name,
bool markedReloaded = true) {
- using namespace keywords;
-
- LOG(info, "Loading model from {}", name);
- auto numpy = cnpy::npz_load(name);
-
- graph->setReloaded(false);
-
- for(auto it : numpy) {
- auto name = it.first;
-
- if(name == "decoder_c_tt")
- continue;
- if(name.substr(0, 8) == "special:")
- continue;
-
- Shape shape;
- if(numpy[name]->shape.size() == 2) {
- shape.resize(2);
- shape.set(0, numpy[name]->shape[0]);
- shape.set(1, numpy[name]->shape[1]);
- } else if(numpy[name]->shape.size() == 1) {
- shape.resize(2);
- shape.set(0, 1);
- shape.set(1, numpy[name]->shape[0]);
- }
-
- std::string pName = name;
- if(nameMap_.count(name))
- pName = nameMap_[name];
-
- graph->param(pName, shape, inits::from_numpy(numpy[name]));
- }
-
- graph->setReloaded(true);
+ graph->load(name, nameMap_);
}
void save(Ptr<ExpressionGraph> graph,
const std::string& name,
bool saveTranslatorConfig = false) {
- LOG(info, "Saving model to {}", name);
-
- unsigned shape[2];
- std::string mode = "w";
-
- if(nameMapRev_.empty())
- for(auto& kv : nameMap_)
- nameMapRev_.insert({kv.second, kv.first});
-
- for(auto p : graph->params()->getMap()) {
- std::vector<float> v;
- p.second->val()->get(v);
-
- unsigned dim;
- if(p.second->shape()[0] == 1) {
- shape[0] = p.second->shape()[1];
- dim = 1;
- } else {
- shape[0] = p.second->shape()[0];
- shape[1] = p.second->shape()[1];
- dim = 2;
- }
-
- std::string pName = p.first;
- if(nameMapRev_.count(pName))
- pName = nameMapRev_[pName];
-
- cnpy::npz_save(name, pName, v.data(), shape, dim, mode);
- mode = "a";
- }
-
- float ctt = 0;
- shape[0] = 1;
- cnpy::npz_save(name, "decoder_c_tt", &ctt, shape, 1, mode);
-
- saveModelParameters(name);
-
+ graph->save(name, getModelParametersAsString(), nameMap_);
if(saveTranslatorConfig) {
createAmunConfig(name);
createDecoderConfig(name);
diff --git a/src/optimizers/optimizers.cpp b/src/optimizers/optimizers.cpp
index 4381419a..fc266325 100644
--- a/src/optimizers/optimizers.cpp
+++ b/src/optimizers/optimizers.cpp
@@ -1,6 +1,7 @@
#include "optimizers.h"
#include "tensors/tensor_operators.h"
+#include "common/io.h"
namespace marian {
@@ -47,19 +48,17 @@ void Adagrad::load(const std::string& name,
std::vector<float> vGt;
size_t totalSize = 0;
- auto numpy = cnpy::npz_load(name);
- for(auto it : numpy) {
- auto name = it.first;
- auto np = it.second;
-
+ // @TODO: use new IO
+ auto items = io::loadItems(name);
+ for(auto item : items) {
// get the size of gt_
- totalSize = np->shape[1];
+ totalSize = item.shape.elements();
// extract data into vectors
- if(name == "adagrad_gt") {
+ if(item.name == "adagrad_gt") {
vGt.resize(totalSize);
std::copy(
- (float*)np->data(), (float*)np->data() + totalSize, vGt.begin());
+ (float*)item.data(), (float*)item.data() + totalSize, vGt.begin());
}
}
@@ -108,9 +107,16 @@ void Adagrad::save(const std::string& name,
vGt.insert(vGt.end(), tmp.begin(), tmp.end());
}
- unsigned int shape[2] = {1, (unsigned int)vGt.size()};
+ io::Item item;
+ item.name = "adagrad_gt";
+ item.shape = Shape({1, (int)vGt.size()});
+ item.type = Type::float32;
+ item.bytes.resize(vGt.size() * sizeOf(item.type));
+ std::copy((char*)vGt.data(),
+ (char*)vGt.data() + vGt.size(),
+ item.bytes.begin());
- cnpy::npz_save(name, "adagrad_gt", vGt.data(), shape, 2, "w");
+ io::saveItems(name, {item});
}
void Adagrad::resetStats() {
@@ -164,24 +170,22 @@ void Adam::load(const std::string& name,
std::vector<float> vVt;
size_t totalSize = 0;
- auto numpy = cnpy::npz_load(name);
- for(auto it : numpy) {
- auto name = it.first;
- auto np = it.second;
+ auto items = io::loadItems(name);
+ for(auto item : items) {
// get the size of mt_ and vt_, they are the same
- totalSize = np->shape[1];
+ totalSize = item.shape.elements();
// extract data into vectors
- if(name == "adam_mt") {
+ if(item.name == "adam_mt") {
vMt.resize(totalSize);
std::copy(
- (float*)np->data(), (float*)np->data() + totalSize, vMt.begin());
+ (float*)item.data(), (float*)item.data() + totalSize, vMt.begin());
}
- if(name == "adam_vt") {
+ if(item.name == "adam_vt") {
vVt.resize(totalSize);
std::copy(
- (float*)np->data(), (float*)np->data() + totalSize, vVt.begin());
+ (float*)item.data(), (float*)item.data() + totalSize, vVt.begin());
}
}
@@ -236,13 +240,26 @@ void Adam::save(const std::string& name,
opt->vt_->get(tmp);
vVt.insert(vVt.end(), tmp.begin(), tmp.end());
}
-
- // the shape is the same for mt_ and vt_
- std::vector<unsigned int> shape{1, (unsigned int)vMt.size()};
-
- cnpy::npz_save(name,
- {cnpy::NpzItem("adam_mt", vMt, shape),
- cnpy::NpzItem("adam_vt", vVt, shape)});
+
+ io::Item itemMt;
+ itemMt.name = "adam_mt";
+ itemMt.shape = Shape({1, (int)vMt.size()});
+ itemMt.type = Type::float32;
+ itemMt.bytes.resize(vMt.size() * sizeOf(itemMt.type));
+ std::copy((char*)vMt.data(),
+ (char*)vMt.data() + vMt.size(),
+ itemMt.bytes.begin());
+
+ io::Item itemVt;
+ itemVt.name = "adam_vt";
+ itemVt.shape = Shape({1, (int)vVt.size()});
+ itemVt.type = Type::float32;
+ itemVt.bytes.resize(vVt.size() * sizeOf(itemVt.type));
+ std::copy((char*)vVt.data(),
+ (char*)vVt.data() + vVt.size(),
+ itemVt.bytes.begin());
+
+ io::saveItems(name, {itemMt, itemVt});
}
void Adam::resetStats() {
diff --git a/src/tensors/allocator.h b/src/tensors/allocator.h
index 0695d25e..43c961ad 100644
--- a/src/tensors/allocator.h
+++ b/src/tensors/allocator.h
@@ -9,9 +9,9 @@
#include <vector>
#include "common/definitions.h"
+#include "common/types.h"
#include "tensors/device.h"
#include "tensors/memory_piece.h"
-#include "tensors/types.h"
namespace marian {
diff --git a/src/tensors/cpu/tensor_operators.cpp b/src/tensors/cpu/tensor_operators.cpp
index 4d5d40dc..b82600e1 100644
--- a/src/tensors/cpu/tensor_operators.cpp
+++ b/src/tensors/cpu/tensor_operators.cpp
@@ -381,7 +381,8 @@ void LogSoftmaxGrad(Tensor grad_, Tensor adj_, Tensor val_) {
void CopyRows(Tensor out_,
const Tensor in_,
- const std::vector<size_t>& indices) {
+ const std::vector<size_t>& indices,
+ Ptr<Allocator> allocator) {
size_t cols = in_->shape()[1];
size_t rows = indices.size();
diff --git a/src/tensors/gpu/algorithm.cu b/src/tensors/gpu/algorithm.cu
index b4a773ec..f559fb42 100644
--- a/src/tensors/gpu/algorithm.cu
+++ b/src/tensors/gpu/algorithm.cu
@@ -26,6 +26,7 @@ template void copy<uint16_t>(Ptr<Backend>, const uint16_t*, const uint16_t*, uin
template void copy<uint32_t>(Ptr<Backend>, const uint32_t*, const uint32_t*, uint32_t*);
template void copy<uint64_t>(Ptr<Backend>, const uint64_t*, const uint64_t*, uint64_t*);
+template void copy<char>(Ptr<Backend>, const char*, const char*, char*);
template void copy<float>(Ptr<Backend>, const float*, const float*, float*);
template void copy<double>(Ptr<Backend>, const double*, const double*, double*);
// clang-format on
diff --git a/src/tensors/gpu/tensor_operators.cu b/src/tensors/gpu/tensor_operators.cu
index 4f11f9f8..6eb5a8dd 100644
--- a/src/tensors/gpu/tensor_operators.cu
+++ b/src/tensors/gpu/tensor_operators.cu
@@ -735,7 +735,7 @@ __global__ void gCopyRows(float* out,
}
}
-void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indices) {
+void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indices, Ptr<Allocator> allocator) {
cudaSetDevice(out->getDevice().no);
size_t cols = in->shape().back();
@@ -744,17 +744,15 @@ void CopyRows(Tensor out, const Tensor in, const std::vector<size_t>& indices) {
int threads = std::min(MAX_THREADS, (int)cols);
int blocks = std::min(MAX_BLOCKS, (int)rowsToCopy);
- size_t* d_indices;
- CUDA_CHECK(cudaMalloc(&d_indices, rowsToCopy * sizeof(size_t)));
- CUDA_CHECK(cudaMemcpy(d_indices,
- indices.data(),
- rowsToCopy * sizeof(size_t),
- cudaMemcpyHostToDevice));
+ auto mp_indices = allocator->alloc<size_t>(rowsToCopy);
+ CudaCopy(indices.data(),
+ indices.data() + indices.size(),
+ mp_indices->data<size_t>());
gCopyRows<<<blocks, threads>>>(
- out->data(), in->data(), cols, d_indices, rowsToCopy);
+ out->data(), in->data(), cols, mp_indices->data<size_t>(), rowsToCopy);
- CUDA_CHECK(cudaFree(d_indices));
+ allocator->free(mp_indices);
}
__global__ void gPasteRows(float* out,
diff --git a/src/tensors/tensor.h b/src/tensors/tensor.h
index e2c6f23f..983691d4 100644
--- a/src/tensors/tensor.h
+++ b/src/tensors/tensor.h
@@ -1,22 +1,20 @@
#pragma once
-#include <iomanip>
-#include <iostream>
-#include <memory>
-#include <sstream>
-
#include "common/definitions.h"
#include "common/shape.h"
+#include "common/types.h"
#include "tensors/backend.h"
#include "tensors/memory_piece.h"
-#include "tensors/types.h"
-
-#include <algorithm>
-
#ifdef CUDA_FOUND
#include "tensors/gpu/algorithm.h"
#endif
+#include <iomanip>
+#include <iostream>
+#include <memory>
+#include <sstream>
+#include <algorithm>
+
namespace marian {
class TensorBase : public std::enable_shared_from_this<TensorBase> {
diff --git a/src/tensors/tensor_operators.h b/src/tensors/tensor_operators.h
index 87f4f27d..78f61888 100644
--- a/src/tensors/tensor_operators.h
+++ b/src/tensors/tensor_operators.h
@@ -14,13 +14,27 @@
#include "tensors/gpu/add.h"
#include "tensors/gpu/element.h"
#include "tensors/gpu/prod.h"
+#include "tensors/gpu/algorithm.h"
#endif
#include "tensors/cpu/add.h"
#include "tensors/cpu/element.h"
+#include <algorithm>
+
namespace marian {
+template <typename InIt, typename OutIt>
+void copy(Ptr<Backend> backend, const InIt beg, const InIt end, OutIt it) {
+#ifdef CUDA_FOUND
+ if(backend->getDevice().type == DeviceType::gpu)
+ gpu::copy(backend, beg, end, it);
+ else
+#endif
+ std::copy(beg, end, it);
+}
+
+
template <class Functor, class... Tensors>
void Element(Functor functor, marian::Tensor out, Tensors... tensors) {
#ifdef CUDA_FOUND
@@ -119,7 +133,7 @@ static inline void Deconcatenate(std::vector<marian::Tensor>& outputs,
DISPATCH4(HighwayForward, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor)
DISPATCH7(HighwayBackward, marian::Tensor, marian::Tensor, marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor, const marian::Tensor)
- DISPATCH3(CopyRows, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
+ DISPATCH4(CopyRows, marian::Tensor, const marian::Tensor, const std::vector<size_t>&, Ptr<Allocator>)
DISPATCH3(PasteRows, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
DISPATCH3(CopyCols, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
DISPATCH3(PasteCols, marian::Tensor, const marian::Tensor, const std::vector<size_t>&)
diff --git a/src/translator/scorers.cpp b/src/translator/scorers.cpp
index 7ac5e2ad..5f126ae3 100644
--- a/src/translator/scorers.cpp
+++ b/src/translator/scorers.cpp
@@ -2,9 +2,9 @@
namespace marian {
-Ptr<Scorer> scorerByType(std::string fname,
+Ptr<Scorer> scorerByType(const std::string& fname,
float weight,
- std::string model,
+ const std::string& model,
Ptr<Config> config) {
Ptr<Options> options = New<Options>();
options->merge(config);
@@ -27,6 +27,32 @@ Ptr<Scorer> scorerByType(std::string fname,
return New<ScorerWrapper>(encdec, fname, weight, model);
}
+Ptr<Scorer> scorerByType(const std::string& fname,
+ float weight,
+ const void* ptr,
+ Ptr<Config> config) {
+ Ptr<Options> options = New<Options>();
+ options->merge(config);
+ options->set("inference", true);
+
+ std::string type = options->get<std::string>("type");
+
+ // @TODO: solve this better
+ if(type == "lm" && config->has("input")) {
+ size_t index = config->get<std::vector<std::string>>("input").size();
+ options->set("index", index);
+ }
+
+ bool skipCost = config->get<bool>("skip-cost");
+ auto encdec = models::from_options(
+ options, skipCost ? models::usage::raw : models::usage::translation);
+
+ LOG(info, "Loading scorer of type {} as feature {}", type, fname);
+
+ return New<ScorerWrapper>(encdec, fname, weight, ptr);
+}
+
+
std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options) {
std::vector<Ptr<Scorer>> scorers;
@@ -54,4 +80,32 @@ std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options) {
return scorers;
}
+
+std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options,
+ const std::vector<const void*>& ptrs) {
+ std::vector<Ptr<Scorer>> scorers;
+
+ std::vector<float> weights(ptrs.size(), 1.f);
+ if(options->has("weights"))
+ weights = options->get<std::vector<float>>("weights");
+
+ size_t i = 0;
+ for(auto ptr : ptrs) {
+ std::string fname = "F" + std::to_string(i);
+ auto modelOptions = New<Config>(*options);
+
+ try {
+ if(!options->get<bool>("ignore-model-config"))
+ modelOptions->loadModelParameters(ptr);
+ } catch(std::runtime_error& e) {
+ LOG(warn, "No model settings found in model file");
+ }
+
+ scorers.push_back(scorerByType(fname, weights[i], ptr, modelOptions));
+ i++;
+ }
+
+ return scorers;
+}
+
} // namespace marian
diff --git a/src/translator/scorers.h b/src/translator/scorers.h
index 402de8b4..8b651ef9 100644
--- a/src/translator/scorers.h
+++ b/src/translator/scorers.h
@@ -68,6 +68,7 @@ class ScorerWrapper : public Scorer {
private:
Ptr<EncoderDecoderBase> encdec_;
std::string fname_;
+ const void* ptr_;
public:
ScorerWrapper(Ptr<models::ModelBase> encdec,
@@ -76,11 +77,22 @@ public:
const std::string& fname)
: Scorer(name, weight),
encdec_(std::static_pointer_cast<EncoderDecoderBase>(encdec)),
- fname_(fname) {}
+ fname_(fname), ptr_{0} {}
+
+ ScorerWrapper(Ptr<models::ModelBase> encdec,
+ const std::string& name,
+ float weight,
+ const void* ptr)
+ : Scorer(name, weight),
+ encdec_(std::static_pointer_cast<EncoderDecoderBase>(encdec)),
+ ptr_{ptr} {}
virtual void init(Ptr<ExpressionGraph> graph) {
graph->switchParams(getName());
- encdec_->load(graph, fname_);
+ if(ptr_)
+ encdec_->mmap(graph, ptr_);
+ else
+ encdec_->load(graph, fname_);
}
virtual void clear(Ptr<ExpressionGraph> graph) {
@@ -119,10 +131,17 @@ public:
virtual std::vector<float> getAlignment() { return encdec_->getAlignment(); }
};
-Ptr<Scorer> scorerByType(std::string fname,
+Ptr<Scorer> scorerByType(const std::string& fname,
float weight,
- std::string model,
+ const std::string& model,
Ptr<Config> config);
std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options);
+
+Ptr<Scorer> scorerByType(const std::string& fname,
+ float weight,
+ const void* ptr,
+ Ptr<Config> config);
+
+std::vector<Ptr<Scorer>> createScorers(Ptr<Config> options, const std::vector<const void*>& ptrs);
} // namespace marian