createMmap: convert ngramized and numberized text files into memory mapped files for training

author: Rico Sennrich <rico.sennrich@gmx.ch> 2015-02-11 17:29:24 +0300
committer: Rico Sennrich <rico.sennrich@gmx.ch> 2015-02-11 17:29:24 +0300
commit: 8a0847e816bef992a161349762618fd90622a895 (patch)
tree: b71c4477cc8578b33ba9d3048bcef4f8c4086fc6
parent: 1479874eb8e501a4b27c3cdf33543e1194cd1be9 (diff)
2 files changed, 147 insertions, 1 deletions
diff --git a/src/Makefile b/src/Makefile
index eea977e..1611ccb 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -138,7 +138,7 @@ RANLIB=ranlib
 
 # Rules
 
-BINS=trainNeuralNetwork testNeuralNetwork prepareNeuralLM testNeuralLM prepareNeuralTM
+BINS=trainNeuralNetwork testNeuralNetwork prepareNeuralLM testNeuralLM prepareNeuralTM createMmap
 LIBS=libnplm.a libnplm.so
 OBJS=util.o model.o
 
@@ -174,6 +174,9 @@ testNeuralLM: testNeuralLM.o $(OBJS)
 prepareNeuralTM: prepareNeuralTM.o $(OBJS)
 	$(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@
 
+createMmap: createMmap.o $(OBJS)
+	$(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@
+
 libnplm.a: neuralLM.o $(OBJS)
 	rm -f $@
 	$(AR) rv $@ $^
diff --git a/src/createMmap.cpp b/src/createMmap.cpp
new file mode 100644
index 0000000..f5f674c
--- /dev/null
+++ b/src/createMmap.cpp
@@ -0,0 +1,143 @@
+#include <iostream>
+#include <vector>
+#include <fstream>
+
+#include <boost/algorithm/string/join.hpp>
+#include <boost/interprocess/managed_shared_memory.hpp>
+#include <boost/interprocess/allocators/allocator.hpp>
+#include <boost/interprocess/managed_mapped_file.hpp>
+#include <boost/interprocess/containers/vector.hpp>
+
+#include <tclap/CmdLine.h>
+
+#include "neuralLM.h"
+#include "util.h"
+
+// take ngramized and numberized file (ready for trainNeuralNetwork)
+// and create mmap_file (for training without loading all data into memory)
+
+using namespace TCLAP;
+using namespace boost;
+using namespace nplm;
+namespace ip = boost::interprocess;
+// Storage types: a vector of ints whose elements are allocated through the segment manager of a managed mapped file.
+typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator;
+typedef ip::vector<int, intAllocator> vec;
+typedef ip::allocator<vec, ip::managed_mapped_file::segment_manager> vecAllocator;
+// Counter type used for line counts and for indices into the flattened data.
+typedef long long int data_size_t; // training data can easily exceed 2G instances
+
+// Count the lines of `filename`, printing a "<count>..." progress marker to
+// stderr before reading every 100000th line (starting at 0). The stream's open
+// state is not checked; 0 is returned when nothing can be read.
+data_size_t getNumLines(const string &filename) {
+  std::ifstream input(filename.c_str());
+  std::string buffer;
+  data_size_t count = 0;
+  for (; std::getline(input, buffer); ++count) {
+    if (count % 100000 != 0) continue;
+    std::cerr << count << "...";
+  }
+  return count;  // `input` is closed by its destructor
+}
+
+// Return the number of fields splitBySpace() produces for the first line of `filename`.
+int getNgramSize(const string &filename) {
+  std::vector<std::string> fields;
+  std::string first_line;
+  std::ifstream input(filename.c_str());
+  std::getline(input, first_line);  // first_line stays empty if the read fails
+  splitBySpace(first_line, fields);
+  return static_cast<int>(fields.size());
+}
+
+// Parse the numberized n-grams in filename_input and store them, flattened
+// line by line, in an ip::vector<int> named "vector" inside a newly created
+// managed mapped file filename_output. Every line must hold exactly ngram_size
+// fields (otherwise the program exits with -1); num_tokens is the number of
+// input lines and determines the size of the vector and of the file.
+void writeMmap(const string &filename_input,
+          const string &filename_output,
+          int ngram_size,
+          data_size_t num_tokens) {
+  {
+    // Scope the mapping so that it is closed before shrink_to_fit() runs below.
+    // File size: the payload plus 1 MiB of slack (presumably for segment bookkeeping).
+    ip::managed_mapped_file mfile(ip::create_only,
+        filename_output.c_str(),
+        num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
+    intAllocator ialloc(mfile.get_segment_manager());
+    vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
+    std::cerr<<"The size of mmaped vec is "<<mMapVec->size() << std::endl;
+
+    std::ifstream training(filename_input.c_str());
+    data_size_t i = 0;
+    std::string line;
+    std::vector<std::string> ngram;
+    while (std::getline(training, line)) {
+      if ((i%100000)==0) {
+        std::cerr<<i<<"...";
+      }
+      splitBySpace(line, ngram);
+      if (ngram.size() != static_cast<std::size_t>(ngram_size))
+      {
+        std::cerr << "Error: expected " << ngram_size << " fields in instance, found " << ngram.size() << std::endl;
+        std::exit(-1);
+      }
+
+      // at() is bounds-checked: more lines than num_tokens raise an exception
+      for (int j=0; j<ngram_size; j++) {
+        mMapVec->at(i*ngram_size+j) = boost::lexical_cast<int>(ngram[j]);
+      }
+      ++i;
+    }
+  }
+  if (!ip::managed_mapped_file::shrink_to_fit(filename_output.c_str()))
+    std::cerr << "Warning: could not shrink " << filename_output << std::endl;
+}
+
+
+// Entry point: parse --input_file/--output_file, measure the input (fields per
+// line, number of lines), then convert it into the memory mapped output file.
+int main(int argc, char *argv[])
+{
+  std::ios::sync_with_stdio(false);
+  std::string input_file, output_file;
+  try
+  {
+    CmdLine cmd("take ngramized and numberized file and create memory mapped file (for training without loading all training data into memory).", ' ', "0.1");
+
+    // The options are printed in reverse order
+    ValueArg<std::string> arg_output_file("", "output_file", "Output training data (memory mapped file).", true, "", "string", cmd);
+    ValueArg<std::string> arg_input_file("", "input_file", "Input training data (numberized n-grams).", true, "", "string", cmd);
+
+    cmd.parse(argc, argv);
+
+    input_file = arg_input_file.getValue();
+    output_file = arg_output_file.getValue();
+
+    std::cerr << "Command line: " << std::endl;
+    std::cerr << boost::algorithm::join(std::vector<std::string>(argv, argv+argc), " ") << std::endl;
+
+    const std::string sep(" Value: ");
+    std::cerr << arg_input_file.getDescription() << sep << arg_input_file.getValue() << std::endl;
+    std::cerr << arg_output_file.getDescription() << sep << arg_output_file.getValue() << std::endl;
+  }
+  catch (const TCLAP::ArgException &e)
+  {
+    std::cerr << "error: " << e.error() <<  " for arg " << e.argId() << std::endl;
+    std::exit(1);
+  }
+
+  std::cerr << "counting number of lines:" << std::endl;
+  const int ngram_size = getNgramSize(input_file);
+  const data_size_t num_tokens = getNumLines(input_file);
+  std::cerr << std::endl;
+  if (ngram_size <= 0 || num_tokens <= 0) {  // unreadable or empty input: stop before creating the output
+    std::cerr << "error: no training instances could be read from " << input_file << std::endl;
+    std::exit(1);
+  }
+  std::cerr << "writing mmap file:" << std::endl;
+  writeMmap(input_file, output_file, ngram_size, num_tokens);
+  std::cerr << std::endl;
+}
author	Rico Sennrich <rico.sennrich@gmx.ch>	2015-02-11 17:29:24 +0300
committer	Rico Sennrich <rico.sennrich@gmx.ch>	2015-02-11 17:29:24 +0300
commit	8a0847e816bef992a161349762618fd90622a895 (patch)
tree	b71c4477cc8578b33ba9d3048bcef4f8c4086fc6
parent	1479874eb8e501a4b27c3cdf33543e1194cd1be9 (diff)