upstream 0.3 (reverting all changes from this fork)

author: Rico Sennrich <rico.sennrich@gmx.ch> 2014-11-17 13:39:49 +0300
committer: Rico Sennrich <rico.sennrich@gmx.ch> 2014-11-17 13:51:09 +0300
commit: 7eb6ea415c1a10d27d36182bf00c01d05e137325 (patch)
tree: 4d927fdb05a4f106ad6d921b7227062208610e2f /src
parent: ba48d701c70e03fe1c1e96ecf5e06591ad4d3e27 (diff)
28 files changed, 2500 insertions, 1013 deletions
diff --git a/src/Activation_function.h b/src/Activation_function.h
index 0264cd1..dcd8651 100644
--- a/src/Activation_function.h
+++ b/src/Activation_function.h
@@ -3,7 +3,7 @@
 
 #include <cmath>
 #include <string>
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 
 #include "util.h"
 
@@ -68,7 +68,6 @@ struct drectifier_functor {
 
 class Activation_function
 {
-    private:
         int size;
 	activation_function_type f;
 
@@ -99,8 +98,10 @@ class Activation_function
         }
 
         template <typename DerivedGOut, typename DerivedGIn, typename DerivedIn, typename DerivedOut>
-	void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output,
-		   const MatrixBase<DerivedIn> &finput, const MatrixBase<DerivedOut> &foutput) const
+	void bProp(const MatrixBase<DerivedGOut> &input, 
+      MatrixBase<DerivedGIn> &output,
+		   const MatrixBase<DerivedIn> &finput,
+       const MatrixBase<DerivedOut> &foutput) const
         {
 	    UNCONST(DerivedGIn, output, my_output);
 
diff --git a/src/Makefile b/src/Makefile
index 1da279c..9e8f1b7 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -1,7 +1,8 @@
 ### Compilation options.
 
 # C++ compiler. Tested with g++ and Intel icpc.
-CXX=g++
+#CXX=/usr/bin/g++
+CXX=/opt/local/bin/g++-mp-4.7
 #CXX=icpc
 
 # Compiler options. Note that -DEIGEN_NO_DEBUG is essential for good performance!
@@ -19,22 +20,26 @@ OS:=$(shell uname -s)
 ### Required libraries. You must install these prior to building.
 
 # Set this to the root directory of Boost (should have a subdirectory named boost):
-BOOST=/home/hieu/workspace/boost/boost_1_55_0.gcc
+
+#BOOST=/usr/usc/boost/1.51.0
 #BOOST=/usr
-#BOOST=/opt/local
+BOOST=/opt/local
 # Where to find Boost header files
 BOOST_INC=$(BOOST)/include
 
 # Set this to the root directory of Eigen (should have a subdirectory named Eigen):
-EIGEN=/home/hieu/workspace/eigen-3
+EIGEN=../3rdparty
 
 ### Optional libraries.
 
 # To disable multithreading, comment out the line below:
-#OMP=1
+OMP=1
 
 # To use the MKL library, uncomment the line below and set it to the MKL root:
 #MKL=/usr/usc/intel/12.1.1/mkl
+# Set to 1 if you want to use the Single Dynamic Library; comment out otherwise.
+# This is required for building the Python extensions, but doesn't work with building a static binary.
+MKL_SINGLE=1
 
 # For Python bindings, set the following and run 'make python/nplm.so'.
 PYTHON_VERSION=2.7
@@ -53,11 +58,13 @@ TCLAP=../3rdparty/tclap/include
 # Where to find Boost libraries
 BOOST_LIB=$(BOOST)/lib
 # On some systems, a suffix is appended for the multithreaded version.
-BOOST_LIB_SUFFIX=
-#BOOST_LIB_SUFFIX=-mt
+#BOOST_LIB_SUFFIX=
+BOOST_LIB_SUFFIX=-mt
 
 BOOST_CFLAGS=-I$(BOOST_INC)
 BOOST_LDFLAGS=
+BOOST_LDLIBS=-lboost_iostreams$(BOOST_LIB_SUFFIX) -lboost_system$(BOOST_LIB_SUFFIX) -lboost_filesystem$(BOOST_LIB_SUFFIX)
+#BOOST_LDLIBS=-lboost_system$(BOOST_LIB_SUFFIX) -lboost_thread$(BOOST_LIB_SUFFIX)
 ifdef USE_CHRONO
   BOOST_CFLAGS+=-DUSE_CHRONO
   BOOST_LDLIBS+=-lboost_system$(BOOST_LIB_SUFFIX) -lboost_chrono$(BOOST_LIB_SUFFIX)
@@ -78,28 +85,41 @@ ifdef OMP
 endif
 
 ifdef MKL
-  MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL
-  MKL_LDLIBS=-Wl,--start-group
-  ifeq ($(ARCH),x86_64)
-    MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64
-    MKL_LDLIBS+=-lmkl_intel_lp64
-  endif
-  ifeq ($(ARCH),i686)
-    MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32
-    MKL_LDLIBS+=-lmkl_intel
-  endif
-
-  ifneq (,$(findstring g++,$(CXX)))
-    MKL_LDLIBS+=-lmkl_gnu_thread
-  endif
-  ifneq (,$(findstring icpc,$(CXX)))
-    MKL_LDLIBS+=-lmkl_intel_thread
-  endif
-
-  #MKL_LDLIBS=-lmkl_rt
-  MKL_LDLIBS+=-lmkl_core -Wl,--end-group
+  ifdef MKL_SINGLE
+    ifeq ($(ARCH),x86_64)
+      MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64
+    endif
+    ifeq ($(ARCH),i686)
+      MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32
+    endif
+    MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL -DMKL_SINGLE
+    MKL_LDLIBS=-lmkl_rt
+
+  else
+
+    MKL_CFLAGS=-I$(MKL)/include -DEIGEN_USE_MKL_ALL
+    MKL_LDLIBS=-Wl,--start-group
+    ifeq ($(ARCH),x86_64)
+      MKL_LDFLAGS=-L$(MKL)/lib/intel64 -Wl,-rpath -Wl,$(MKL)/lib/intel64
+      MKL_LDLIBS+=-lmkl_intel_lp64
+    endif
+    ifeq ($(ARCH),i686)
+      MKL_LDFLAGS=-L$(MKL)/lib/ia32 -Wl,-rpath -Wl,$(MKL)/lib/ia32
+      MKL_LDLIBS+=-lmkl_intel
+    endif
+
+    ifneq (,$(findstring g++,$(CXX)))
+      MKL_LDLIBS+=-lmkl_gnu_thread
+    endif
+    ifneq (,$(findstring icpc,$(CXX)))
+      MKL_LDLIBS+=-lmkl_intel_thread
+    endif
+
+    MKL_LDLIBS+=-lmkl_core -Wl,--end-group
+endif
 endif
 
+
 ifdef STATIC
   LDFLAGS+=-static
 endif
@@ -122,13 +142,13 @@ RANLIB=ranlib
 # Rules
 
 BINS=trainNeuralNetwork testNeuralNetwork prepareNeuralLM testNeuralLM prepareNeuralTM
-LIBS=libneuralLM.a libneuralLM.so
+LIBS=libnplm.a libnplm.so
 OBJS=util.o model.o
 
 all: $(BINS) $(LIBS)
 
 clean:
-	rm -f *.o shared/*.o python/*.o $(BINS) $(LIBS) python/nplm.{cpp,so}
+	rm -f *.o shared/*.o python/*.o $(BINS) $(LIBS) python/nplm.{cpp,so} python/nptm.{cpp,so}
 
 install: all
 	mkdir -p ../bin
@@ -157,19 +177,25 @@ testNeuralLM: testNeuralLM.o $(OBJS)
 prepareNeuralTM: prepareNeuralTM.o $(OBJS)
 	$(CXX) $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@
 
-libneuralLM.a: neuralLM.o $(OBJS)
+libnplm.a: neuralLM.o $(OBJS)
 	rm -f $@
 	$(AR) rv $@ $^
 	$(RANLIB) $@
 
-libneuralLM.so: $(addprefix shared/,neuralLM.o $(OBJS))
+libnplm.so: $(addprefix shared/,neuralLM.o $(OBJS))
 	$(CXX) -shared $(ALL_LDFLAGS) $^ $(ALL_LDLIBS) -o $@
 
-python/nplm.cpp: python/nplm.pyx
-	$(CYTHON) --cplus $^
+%.cpp: %.pyx
+	$(CYTHON) --cplus $^ -o $@
 
 python/nplm.o: python/nplm.cpp
 	$(CXX) -c -fPIC -I. $(ALL_CFLAGS) $(PYTHON_CFLAGS) $< -o $@
 
 python/nplm.so: python/nplm.o $(addprefix shared/,neuralLM.o $(OBJS))
 	$(CXX) -shared $(ALL_LDFLAGS) $(PYTHON_LDFLAGS) $^ $(ALL_LDLIBS) $(PYTHON_LDLIBS) -o $@ 
+
+python/nptm.o: python/nptm.cpp
+	$(CXX) -c -fPIC -I. $(ALL_CFLAGS) $(PYTHON_CFLAGS) $< -o $@
+
+python/nptm.so: python/nptm.o $(addprefix shared/,neuralTM.o $(OBJS))
+	$(CXX) -shared $(ALL_LDFLAGS) $(PYTHON_LDFLAGS) $^ $(ALL_LDLIBS) $(PYTHON_LDLIBS) -o $@ 
diff --git a/src/SoftmaxLoss.h b/src/SoftmaxLoss.h
index de5e043..24f59f5 100644
--- a/src/SoftmaxLoss.h
+++ b/src/SoftmaxLoss.h
@@ -1,7 +1,7 @@
-#ifndef SOFTMAXLOSS_H
+	#ifndef SOFTMAXLOSS_H
 #define SOFTMAXLOSS_H
 
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 #include "multinomial.h"
 #include "util.h"
 
diff --git a/src/USCMatrix.h b/src/USCMatrix.h
index 092bc4e..caa9553 100644
--- a/src/USCMatrix.h
+++ b/src/USCMatrix.h
@@ -1,7 +1,7 @@
 #ifndef USCMATRIX_H
 #define USCMATRIX_H
 
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 #include "maybe_omp.h"
 #include "util.h"
 
diff --git a/src/clipper.h b/src/clipper.h
new file mode 100644
index 0000000..dda5c4d
--- /dev/null
+++ b/src/clipper.h
@@ -0,0 +1,16 @@
+#ifndef CLIPPER_H
+#define CLIPPER_H
+
+namespace nplm {
+  struct Clipper{
+  double operator() (double x) const { 
+    return std::min(0.5, std::max(x,-0.5));
+    //return(x);
+  }
+};
+
+}
+
+#endif
+
+
diff --git a/src/graphClasses.h b/src/graphClasses.h
index 9f9e27c..da5f1af 100644
--- a/src/graphClasses.h
+++ b/src/graphClasses.h
@@ -3,7 +3,7 @@
 
 #include <cstdlib>
 #include "neuralClasses.h"
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 
 namespace nplm
 {
diff --git a/src/model.cpp b/src/model.cpp
index 589a52e..262490f 100644
--- a/src/model.cpp
+++ b/src/model.cpp
@@ -12,26 +12,18 @@ using namespace boost::random;
 namespace nplm
 {
 
-    void model::resize(int ngram_size,
-        int input_vocab_size,
-        int output_vocab_size,
-        int input_embedding_dimension,
-        int num_hidden,
-        int output_embedding_dimension)
+void model::resize(int ngram_size,
+    int input_vocab_size,
+    int output_vocab_size,
+    int input_embedding_dimension,
+    int num_hidden,
+    int output_embedding_dimension)
 {
     input_layer.resize(input_vocab_size, input_embedding_dimension, ngram_size-1);
-    if (num_hidden == 0) {
-        first_hidden_linear.resize(output_embedding_dimension, input_embedding_dimension*(ngram_size-1));
-        first_hidden_activation.resize(output_embedding_dimension);
-        second_hidden_linear.resize(1,1);
-        second_hidden_activation.resize(1);
-    }
-    else {
-        first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
-        first_hidden_activation.resize(num_hidden);
-        second_hidden_linear.resize(output_embedding_dimension, num_hidden);
-        second_hidden_activation.resize(output_embedding_dimension);
-    }
+    first_hidden_linear.resize(num_hidden, input_embedding_dimension*(ngram_size-1));
+    first_hidden_activation.resize(num_hidden);
+    second_hidden_linear.resize(output_embedding_dimension, num_hidden);
+    second_hidden_activation.resize(output_embedding_dimension);
     output_layer.resize(output_vocab_size, output_embedding_dimension);
     this->ngram_size = ngram_size;
     this->input_vocab_size = input_vocab_size;
@@ -42,12 +34,34 @@ namespace nplm
     premultiplied = false;
 }
   
-void model::initialize(mt19937 &init_engine, bool init_normal, double init_range, double init_bias)
+void model::initialize(mt19937 &init_engine,
+    bool init_normal,
+    double init_range,
+    double init_bias,
+    string &parameter_update,
+    double adagrad_epsilon)
 {
-    input_layer.initialize(init_engine, init_normal, init_range);
-    output_layer.initialize(init_engine, init_normal, init_range, init_bias);
-    first_hidden_linear.initialize(init_engine, init_normal, init_range);
-    second_hidden_linear.initialize(init_engine, init_normal, init_range);
+    input_layer.initialize(init_engine,
+        init_normal,
+        init_range,
+        parameter_update,
+        adagrad_epsilon);
+    output_layer.initialize(init_engine,
+        init_normal,
+        init_range,
+        init_bias,
+        parameter_update,
+        adagrad_epsilon);
+    first_hidden_linear.initialize(init_engine,
+        init_normal,
+        init_range,
+        parameter_update,
+        adagrad_epsilon);
+    second_hidden_linear.initialize(init_engine,
+        init_normal,
+        init_range,
+        parameter_update,
+        adagrad_epsilon);
 }
 
 void model::premultiply()
@@ -56,12 +70,7 @@ void model::premultiply()
     // we can multiply them into a single linear layer *if* we are not training
     int context_size = ngram_size-1;
     Matrix<double,Dynamic,Dynamic> U = first_hidden_linear.U;
-    if (num_hidden == 0) {
-        first_hidden_linear.U.resize(output_embedding_dimension, input_vocab_size * context_size);
-    }
-    else {
-        first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
-    }
+    first_hidden_linear.U.resize(num_hidden, input_vocab_size * context_size);
     for (int i=0; i<context_size; i++)
         first_hidden_linear.U.middleCols(i*input_vocab_size, input_vocab_size) = U.middleCols(i*input_embedding_dimension, input_embedding_dimension) * input_layer.W->transpose();
     input_layer.W->resize(1,1); // try to save some memory
@@ -133,6 +142,12 @@ void model::read(const string &filename)
     read(filename, input_words, output_words);
 }
 
+void model::read(const string &filename, vector<string> &words)
+{
+    vector<string> output_words;
+    read(filename, words, output_words);
+}
+
 void model::read(const string &filename, vector<string> &input_words, vector<string> &output_words)
 {
     ifstream file(filename.c_str());
@@ -170,9 +185,13 @@ void model::read(const string &filename, vector<string> &input_words, vector<str
 	else if (line == "\\input_embeddings")
 	    input_layer.read(file);
 	else if (line == "\\hidden_weights 1")
-	    first_hidden_linear.read(file);
+	    first_hidden_linear.read_weights(file);
+	else if (line == "\\hidden_biases 1")
+	    first_hidden_linear.read_biases (file);
 	else if (line == "\\hidden_weights 2")
-	    second_hidden_linear.read(file);
+	    second_hidden_linear.read_weights(file);
+	else if (line == "\\hidden_biases 2")
+	    second_hidden_linear.read_biases (file);
 	else if (line == "\\output_weights")
 	    output_layer.read_weights(file);
 	else if (line == "\\output_biases")
@@ -191,17 +210,22 @@ void model::read(const string &filename, vector<string> &input_words, vector<str
     file.close();
 }
 
-    void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words)
+void model::write(const string &filename, const vector<string> &input_words, const vector<string> &output_words)
 { 
     write(filename, &input_words, &output_words);
 }
 
+void model::write(const string &filename, const vector<string> &words)
+{ 
+    write(filename, &words, NULL);
+}
+
 void model::write(const string &filename) 
 { 
     write(filename, NULL, NULL);
 }
 
-    void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords)
+void model::write(const string &filename, const vector<string> *input_pwords, const vector<string> *output_pwords)
 {
     ofstream file(filename.c_str());
     if (!file) throw runtime_error("Could not open file " + filename);
@@ -236,11 +260,19 @@ void model::write(const string &filename)
     file << endl;
     
     file << "\\hidden_weights 1" << endl;
-    first_hidden_linear.write(file);
+    first_hidden_linear.write_weights(file);
     file << endl;
+
+    file << "\\hidden_biases 1" << endl;
+    first_hidden_linear.write_biases(file);
+    file <<endl;
     
     file << "\\hidden_weights 2" << endl;
-    second_hidden_linear.write(file);
+    second_hidden_linear.write_weights(file);
+    file << endl;
+
+    file << "\\hidden_biases 2" << endl;
+    second_hidden_linear.write_biases(file);
     file << endl;
     
     file << "\\output_weights" << endl;
diff --git a/src/model.h b/src/model.h
index 271b22f..3cce06a 100644
--- a/src/model.h
+++ b/src/model.h
@@ -74,7 +74,10 @@ public:
     void initialize(boost::random::mt19937 &init_engine,
         bool init_normal,
         double init_range,
-        double init_bias);
+        double init_bias,
+        string &parameter_udpate,
+        double adagrad_epsilon);
+
     void set_activation_function(activation_function_type f)
     {
         activation_function = f;
@@ -90,9 +93,11 @@ public:
     // a better solution is needed
 
     void read(const std::string &filename);
+    void read(const std::string &filename, std::vector<std::string> &words);
     void read(const std::string &filename, std::vector<std::string> &input_words, std::vector<std::string> &output_words);
-    void write(const std::string &filename, const std::vector<std::string> &input_words, const std::vector<std::string> &output_words);
     void write(const std::string &filename);
+    void write(const std::string &filename, const std::vector<std::string> &words);
+    void write(const std::string &filename, const std::vector<std::string> &input_words, const std::vector<std::string> &output_words);
 
  private:
     void readConfig(std::ifstream &config_file);
diff --git a/src/multinomial.h b/src/multinomial.h
index 1314fcb..8fccdf4 100644
--- a/src/multinomial.h
+++ b/src/multinomial.h
@@ -52,9 +52,9 @@ public:
       double p = unif_real(eng);
       int s;
       if (q[m] > p)
-	  s = m;
+	  	s = m;
       else
-          s = J[m];
+        s = J[m];
       assert (s >= 0);
       return s;
   }
@@ -125,6 +125,7 @@ private:
       {
 	std::cerr << "warning: multinomial: probability differs from one by " << std::fabs(q[*l_it]-1) << std::endl;
       }
+	  q[*l_it] = 1.0;
     }
   }
 
diff --git a/src/neuralClasses.h b/src/neuralClasses.h
index 1b57763..949e445 100644
--- a/src/neuralClasses.h
+++ b/src/neuralClasses.h
@@ -7,7 +7,7 @@
 #include <vector>
 
 #include <boost/unordered_map.hpp> 
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 #include "maybe_omp.h"
 
 #include "util.h"
@@ -21,16 +21,26 @@
 //#define EIGEN_DONT_PARALLELIZE
 //#define EIGEN_DEFAULT_TO_ROW_MAJOR
 
+using namespace std;
 namespace nplm
 {
 
 // is this cheating?
 using Eigen::Matrix;
+using Eigen::Array;
 using Eigen::MatrixBase;
 using Eigen::Dynamic;
 
 typedef boost::unordered_map<int,bool> int_map;
 
+struct Clipper{
+  double operator() (double x) const { 
+    return std::min(0.5, std::max(x,-0.5));
+    //return(x);
+  }
+};
+
+
 class Linear_layer
 {
     private: 
@@ -38,6 +48,13 @@ class Linear_layer
         Matrix<double,Dynamic,Dynamic> U_gradient;
         Matrix<double,Dynamic,Dynamic> U_velocity;
         Matrix<double,Dynamic,Dynamic> U_running_gradient;
+        Matrix<double,Dynamic,Dynamic> U_running_parameter_update;
+        // Biases
+        Matrix<double,Dynamic,1> b;
+        Matrix<double,Dynamic,1> b_velocity;
+        Matrix<double,Dynamic,1> b_running_gradient;
+        Matrix<double,Dynamic,1> b_running_parameter_update;
+        Matrix<double,Dynamic,1> b_gradient;
 
     friend class model;
 
@@ -49,94 +66,222 @@ class Linear_layer
 	{
 	    U.setZero(rows, cols);
       U_gradient.setZero(rows, cols);
-      U_running_gradient.setZero(rows, cols);
-      U_velocity.setZero(rows, cols);
+      //U_running_gradient.setZero(rows, cols);
+      //U_running_parameter_updates.setZero(rows, cols);
+      //U_velocity.setZero(rows, cols);
+      b.resize(rows);
+      b_gradient.setZero(rows);
+      //b_running_gradient.resize(rows);
+      //b_velocity.resize(rows);
 	}
 
-	void read(std::ifstream &U_file) { readMatrix(U_file, U); }
-	void write(std::ofstream &U_file) { writeMatrix(U, U_file); }
+	void read_weights(std::ifstream &U_file) { readMatrix(U_file, U); }
+	void write_weights(std::ofstream &U_file) { writeMatrix(U, U_file); }
+  void read_biases(std::ifstream &b_file) { readMatrix(b_file, b); }
+  void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
+
 
 	template <typename Engine>
-	void initialize(Engine &engine, bool init_normal, double init_range)
+	void initialize(Engine &engine,
+      bool init_normal,
+      double init_range,
+      string &parameter_update,
+      double adagrad_epsilon)
 	{
+      if (parameter_update == "ADA") {
+        U_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(U.rows(),U.cols())*adagrad_epsilon;
+        b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+      }
+      if (parameter_update == "ADAD") {
+        U_running_gradient.setZero(U.rows(),U.cols());
+        b_running_gradient.setZero(b.size());
+        U_running_parameter_update.setZero(U.rows(),U.cols());
+        b_running_parameter_update.setZero(b.size());
+      }
+
 	    initMatrix(engine, U, init_normal, init_range);
+      initBias(engine, b, init_normal, init_range);
 	}	  
 
 	int n_inputs () const { return U.cols(); }
 	int n_outputs () const { return U.rows(); }
 
-        template <typename DerivedIn, typename DerivedOut>
-	void fProp(const MatrixBase<DerivedIn> &input, const MatrixBase<DerivedOut> &output) const
-        {
-	    UNCONST(DerivedOut, output, my_output);
-	    my_output.leftCols(input.cols()).noalias() = U*input;
-        }
+  template <typename DerivedIn, typename DerivedOut>
+	void fProp(const MatrixBase<DerivedIn> &input,
+      const MatrixBase<DerivedOut> &output) const
+  {
+      UNCONST(DerivedOut, output, my_output);
+      my_output.leftCols(input.cols()).noalias() = U*input;
+      int num_examples = input.cols();
+      for (int example = 0;example < num_examples;example++) 
+      {
+          my_output.leftCols(input.cols()).col(example) += b;
+      }
+  }
 
 	// Sparse input
   template <typename ScalarIn, typename DerivedOut>
-	void fProp(const USCMatrix<ScalarIn> &input, const MatrixBase<DerivedOut> &output_const) const
+	void fProp(const USCMatrix<ScalarIn> &input,
+      const MatrixBase<DerivedOut> &output_const) const
   {
 	    UNCONST(DerivedOut, output_const, output);
 	    output.setZero();
 	    uscgemm(1.0, U, input, output.leftCols(input.cols()));
+      // Each column corresponds to a training example. The bias
+      // vector b is added to every column, one example at a time.
+      int num_examples = input.cols();
+      for (int example = 0;example < num_examples;example++) 
+      {
+          output.leftCols(input.cols()).col(example) += b;
+      }
   }
 
-        template <typename DerivedGOut, typename DerivedGIn>
-	void bProp(const MatrixBase<DerivedGOut> &input, MatrixBase<DerivedGIn> &output) const
-        {
+  template <typename DerivedGOut, typename DerivedGIn>
+	void bProp(const MatrixBase<DerivedGOut> &input,
+      MatrixBase<DerivedGIn> &output) const
+  {
 	    UNCONST(DerivedGIn, output, my_output);
 	    my_output.noalias() = U.transpose()*input;
 	}
 
-      template <typename DerivedGOut, typename DerivedIn>
-      void computeGradient(const MatrixBase<DerivedGOut> &bProp_input, 
-         const MatrixBase<DerivedIn> &fProp_input, 
-         double learning_rate, double momentum, double L2_reg)
+  template <typename DerivedGOut, typename DerivedIn>
+  void computeGradient( const MatrixBase<DerivedGOut> &bProp_input, 
+     const MatrixBase<DerivedIn> &fProp_input, 
+     double learning_rate, double momentum, double L2_reg)
+  {
+      U_gradient.noalias() = bProp_input*fProp_input.transpose();
+      
+      // get the bias gradient for all dimensions in parallel
+      int size = b.size();
+      b_gradient = bProp_input.rowwise().sum();
+      // This used to be multithreaded, but there was no measurable difference
+      if (L2_reg > 0.0)
       {
-	    U_gradient.noalias() = bProp_input*fProp_input.transpose();
-
-	    // This used to be multithreaded, but there was no measureable difference
-	    if (L2_reg > 0.0)
-	    {
-	        U_gradient *= 1 - 2*L2_reg;
-	    }
-	    if (momentum > 0.0)
-	    {
-	        U_velocity = momentum*U_velocity + U_gradient;
-	        U += learning_rate * U_velocity;
-	    }
-	    else
-	    {
-	        U += learning_rate * U_gradient;
-	    }
+          U_gradient -=  2*L2_reg*U;
+          b_gradient -= 2*L2_reg*b;
+      }
+      if (momentum > 0.0)
+      {
+          U_velocity = momentum*U_velocity + U_gradient;
+          U += learning_rate * U_velocity;
+          b_velocity = momentum*b_velocity + b_gradient;
+          b += learning_rate * b_velocity;
+      }
+      else
+      {
+          U += learning_rate * U_gradient;
+          b += learning_rate * b_gradient;
+          /* 
+          //UPDATE CLIPPING
+          U += (learning_rate*U_gradient).array().unaryExpr(Clipper()).matrix();
+          b += (learning_rate*b_gradient).array().unaryExpr(Clipper()).matrix();
+          //GRADIENT CLIPPING
+          //U += learning_rate*(U_gradient.array().unaryExpr(Clipper())).matrix();
+          //b += learning_rate*(b_gradient.array().unaryExpr(Clipper())).matrix();
+          */
+      }
 	}
 
-        template <typename DerivedGOut, typename DerivedIn>
-        void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, 
-				    const MatrixBase<DerivedIn> &fProp_input, 
-				    double learning_rate, double momentum, double L2_reg)
-        {
-            U_gradient.noalias() = bProp_input*fProp_input.transpose();
+  template <typename DerivedGOut, typename DerivedIn>
+  void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input, 
+      const MatrixBase<DerivedIn> &fProp_input, 
+      double learning_rate,
+      double L2_reg)
+  {
+      U_gradient.noalias() = bProp_input*fProp_input.transpose();
 
-	    if (L2_reg != 0)
-	    {
-	        U_gradient *= 1 - 2*L2_reg;
-	    }
+      
+      // get the bias gradient for all dimensions in parallel
+      int size = b.size();
+      b_gradient.noalias() = bProp_input.rowwise().sum();
 
-	    // ignore momentum?
+      if (L2_reg != 0)
+      {
+          U_gradient -=  2*L2_reg*U;
+          b_gradient -= 2*L2_reg*b;
+      }
+
+      // ignore momentum?
+      #pragma omp parallel for
+      for (int col=0; col<U.cols(); col++) {
+        U_running_gradient.col(col) += U_gradient.col(col).array().square().matrix();
+        U.col(col) += learning_rate * (U_gradient.col(col).array() / 
+                  U_running_gradient.col(col).array().sqrt()).matrix();
+        /*
+        //UPDATE CLIPPING
+        U.col(col) += (learning_rate * (U_gradient.col(col).array() / U_running_gradient.col(col).array().sqrt())).
+              unaryExpr(Clipper()).matrix();
+        */
+      }
+      b_running_gradient += b_gradient.array().square().matrix();
+      b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+      /*
+      //UPDATE CLIPPING
+      b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+      */
+  }
 
-	    U_running_gradient.array() += U_gradient.array().square();
-	    U.array() += learning_rate * U_gradient.array() / U_running_gradient.array().sqrt();
-        }
+  template <typename DerivedGOut, typename DerivedIn>
+  void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input, 
+      const MatrixBase<DerivedIn> &fProp_input, 
+      double learning_rate,
+      double L2_reg,
+      double conditioning_constant,
+      double decay)
+  {
+      //cerr<<"decay is "<<decay<<" and conditioning constant is "<<conditioning_constant<<endl;
+      U_gradient.noalias() = bProp_input*fProp_input.transpose();
 
-        template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
-        void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, 
-				  const MatrixBase<DerivedIn> &fProp_input, 
-				  const MatrixBase<DerivedGW> &gradient) const
-        {
-	    UNCONST(DerivedGW, gradient, my_gradient);
-	    my_gradient.noalias() = bProp_input*fProp_input.transpose();
-        }
+      Array<double,Dynamic,1> b_current_parameter_update;
+      
+      // get the bias gradient for all dimensions in parallel
+      int size = b.size();
+      b_gradient.noalias() = bProp_input.rowwise().sum();
+
+      if (L2_reg != 0)
+      {
+          U_gradient -=  2*L2_reg*U;
+          b_gradient -= 2*L2_reg*b;
+      }
+
+      // ignore momentum?
+      #pragma omp parallel for
+      //cerr<<"U gradient is "<<U_gradient<<endl;
+      for (int col=0; col<U.cols(); col++) {
+        Array<double,Dynamic,1> U_current_parameter_update;
+        U_running_gradient.col(col) = decay*U_running_gradient.col(col) + 
+                            (1-decay)*U_gradient.col(col).array().square().matrix();
+        //cerr<<"U running gradient is "<<U_running_gradient.col(col)<<endl;
+        //getchar();
+        U_current_parameter_update = ((U_running_parameter_update.col(col).array()+conditioning_constant).sqrt()/
+                                      (U_running_gradient.col(col).array()+conditioning_constant).sqrt()) *
+                                      U_gradient.col(col).array();
+        //cerr<<"U current parameter update is "<<U_current_parameter_update<<endl;
+        //getchar();
+        //update the running parameter update
+        U_running_parameter_update.col(col) = decay*U_running_parameter_update.col(col) +
+                                          (1.-decay)*U_current_parameter_update.square().matrix();
+        U.col(col) += learning_rate*U_current_parameter_update.matrix();  
+      }
+      b_running_gradient = decay*b_running_gradient + 
+                        (1.-decay)*b_gradient.array().square().matrix();
+      b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+                                   (b_running_gradient.array()+conditioning_constant).sqrt()) *
+                                  b_gradient.array();
+      b_running_parameter_update = decay*(b_running_parameter_update) + 
+                                (1.-decay)*b_current_parameter_update.square().matrix();
+      b += learning_rate*b_current_parameter_update.matrix();
+  }
+
+
+  template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+  void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input, 
+    const MatrixBase<DerivedIn> &fProp_input, 
+    const MatrixBase<DerivedGW> &gradient) const
+  {
+      UNCONST(DerivedGW, gradient, my_gradient);
+      my_gradient.noalias() = bProp_input*fProp_input.transpose();
+  }
 };
 
 class Output_word_embeddings
@@ -149,10 +294,12 @@ class Output_word_embeddings
         Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
         std::vector<double> W_data;
         Matrix<double,Dynamic,1> b;
-        Matrix<double,Dynamic,Dynamic> W_running_gradient;
-        Matrix<double,Dynamic,Dynamic> W_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
         Matrix<double,Dynamic,1> b_running_gradient;
         Matrix<double,Dynamic,1> b_gradient;
+        Matrix<double,Dynamic,1> b_running_parameter_update;
 
     public:
         Output_word_embeddings() { }
@@ -160,8 +307,8 @@ class Output_word_embeddings
 
         void resize(int rows, int cols)
         {
-	    W->setZero(rows, cols);
-	    b.setZero(rows);
+          W->setZero(rows, cols);
+          b.setZero(rows);
         }
     void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
       W = input_W;
@@ -172,8 +319,31 @@ class Output_word_embeddings
     void write_biases(std::ofstream &b_file) { writeMatrix(b, b_file); }
 
     template <typename Engine>
-    void initialize(Engine &engine, bool init_normal, double init_range, double init_bias)
+    void initialize(Engine &engine,
+        bool init_normal,
+        double init_range,
+        double init_bias,
+        string &parameter_update,
+        double adagrad_epsilon)
     {
+
+        W_gradient.setZero(W->rows(),W->cols());
+        b_gradient.setZero(b.size());
+        if (parameter_update == "ADA") {
+          W_running_gradient = Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+          b_running_gradient = Matrix<double,Dynamic,1>::Ones(b.size())*adagrad_epsilon;
+          //W_gradient.setZero(W->rows(),W->cols());
+          //b_gradient.setZero(b.size());
+        }
+        if (parameter_update == "ADAD") {
+          W_running_gradient.setZero(W->rows(),W->cols());
+          b_running_gradient.setZero(b.size());
+          W_gradient.setZero(W->rows(),W->cols());
+          //b_gradient.setZero(b.size());
+          //W_running_parameter_update.setZero(W->rows(),W->cols());
+          b_running_parameter_update.setZero(b.size());
+        }
+
         initMatrix(engine, *W, init_normal, init_range);
         b.fill(init_bias);
     }
@@ -198,8 +368,12 @@ class Output_word_embeddings
         UNCONST(DerivedOutV, output, my_output);
         #pragma omp parallel for
         for (int instance_id = 0; instance_id < samples.cols(); instance_id++)
-            for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
-          my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+        {
+          for (int sample_id = 0; sample_id < samples.rows(); sample_id++)
+          {
+            my_output(sample_id, instance_id) = b(samples(sample_id, instance_id));
+          }
+        }
         USCMatrix<double> sparse_output(W->rows(), samples, my_output);
         uscgemm_masked(1.0, *W, input, sparse_output);
         my_output = sparse_output.values; // too bad, so much copying
@@ -232,15 +406,86 @@ class Output_word_embeddings
           void computeGradient(const MatrixBase<DerivedIn> &predicted_embeddings,
              const MatrixBase<DerivedGOut> &bProp_input,
              double learning_rate,
-             double momentum) //not sure if we want to use momentum here
+             double momentum) //not sure if we want 	to use momentum here
     {
         // W is vocab_size x output_embedding_dimension
         // b is vocab_size x 1
         // predicted_embeddings is output_embedding_dimension x minibatch_size
         // bProp_input is vocab_size x minibatch_size
-
         W->noalias() += learning_rate * bProp_input * predicted_embeddings.transpose();
         b += learning_rate * bProp_input.rowwise().sum();
+
+        /*
+        //GRADIENT CLIPPING
+        W->noalias() += learning_rate * 
+          ((bProp_input * predicted_embeddings.transpose()).array().unaryExpr(Clipper())).matrix();
+        b += learning_rate * (bProp_input.rowwise().sum().array().unaryExpr(Clipper())).matrix();
+        //UPDATE CLIPPING
+        W->noalias() += (learning_rate * 
+        (bProp_input * predicted_embeddings.transpose())).array().unaryExpr(Clipper()).matrix();
+        b += (learning_rate * (bProp_input.rowwise().sum())).array().unaryExpr(Clipper()).matrix();
+        */
+	  }
+
+    template <typename DerivedIn, typename DerivedGOut>
+          void computeGradientAdagrad(
+             const MatrixBase<DerivedIn> &predicted_embeddings,
+             const MatrixBase<DerivedGOut> &bProp_input,
+             double learning_rate) // dense AdaGrad update of W and b; this overload takes no momentum argument
+    {
+        // W is vocab_size x output_embedding_dimension
+        // b is vocab_size x 1
+        // predicted_embeddings is output_embedding_dimension x minibatch_size
+        // bProp_input is vocab_size x minibatch_size
+        W_gradient.setZero(W->rows(), W->cols());
+        b_gradient.setZero(b.size());
+        W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+        b_gradient.noalias() = bProp_input.rowwise().sum();
+        W_running_gradient += W_gradient.array().square().matrix(); // accumulate element-wise squared gradients
+        b_running_gradient += b_gradient.array().square().matrix();
+        W->noalias() += learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt()).matrix(); // step = learning_rate * gradient / sqrt(accumulated squares)
+        b += learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt()).matrix();
+        /*
+        //UPDATE CLIPPING (disabled alternative: the same step passed through Clipper())
+        *W += (learning_rate * (W_gradient.array()/W_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+        b += (learning_rate * (b_gradient.array()/b_running_gradient.array().sqrt())).unaryExpr(Clipper()).matrix();
+        */
+	  }
+
+    template <typename DerivedIn, typename DerivedGOut>
+          void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+             const MatrixBase<DerivedGOut> &bProp_input,
+             double learning_rate,
+             double conditioning_constant,
+             double decay) // dense Adadelta update of W and b; this overload takes no momentum argument
+    {
+        // W is vocab_size x output_embedding_dimension
+        // b is vocab_size x 1
+        // predicted_embeddings is output_embedding_dimension x minibatch_size
+        // bProp_input is vocab_size x minibatch_size
+        Array<double,Dynamic,Dynamic> W_current_parameter_update;
+        Array<double,Dynamic,1> b_current_parameter_update;
+        W_gradient.setZero(W->rows(), W->cols());
+        b_gradient.setZero(b.size());
+        W_gradient.noalias() = bProp_input * predicted_embeddings.transpose();
+        b_gradient.noalias() = bProp_input.rowwise().sum();
+        W_running_gradient = decay*W_running_gradient + // decayed running average of squared gradients
+                            (1.-decay)*W_gradient.array().square().matrix();
+        b_running_gradient = decay*b_running_gradient+
+                            (1.-decay)*b_gradient.array().square().matrix();
+        W_current_parameter_update = ((W_running_parameter_update.array()+conditioning_constant).sqrt()/ // NOTE(review): assumes W_running_parameter_update is already sized like W - confirm in initialize()
+                                     (W_running_gradient.array()+conditioning_constant).sqrt())*
+                                      W_gradient.array(); // step = sqrt(avg sq. update + c) / sqrt(avg sq. gradient + c) * gradient
+        b_current_parameter_update = ((b_running_parameter_update.array()+conditioning_constant).sqrt()/
+                                     (b_running_gradient.array()+conditioning_constant).sqrt())*
+                                     b_gradient.array();
+        W_running_parameter_update = decay*W_running_parameter_update + // decayed running average of squared steps
+                                    (1.-decay)*W_current_parameter_update.square().matrix();
+        b_running_parameter_update = decay*b_running_parameter_update +
+                                    (1.-decay)*b_current_parameter_update.square().matrix();
+        // Apply the step, additionally scaled by learning_rate.
+        *W += learning_rate*W_current_parameter_update.matrix();
+        b += learning_rate*b_current_parameter_update.matrix();
 	  }
 
     // Sparse versions
@@ -264,6 +509,7 @@ class Output_word_embeddings
 			     const MatrixBase<DerivedGOutV> &weights,
 			     double learning_rate, double momentum) //not sure if we want to use momentum here
 	{
+      //cerr<<"in gradient"<<endl;
 	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
 	    uscgemm(learning_rate,
           gradient_output,
@@ -273,27 +519,64 @@ class Output_word_embeddings
           gradient_output,
 		      Matrix<double,Dynamic,1>::Ones(gradient_output.cols()),
           b);
+      /*
+      //IN ORDER TO IMPLEMENT CLIPPING, WE HAVE TO COMPUTE THE GRADIENT
+      //FIRST
+	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
+	    uscgemm(1.0,
+          gradient_output,
+          predicted_embeddings.leftCols(samples.cols()).transpose(),
+          W_gradient);
+	    uscgemv(1.0, 
+          gradient_output,
+		      Matrix<double,Dynamic,1>::Ones(weights.cols()),
+          b_gradient);
+
+      int_map update_map; //stores all the parameters that have been updated
+      for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+	        for (int train_id=0; train_id<samples.cols(); train_id++)
+		          update_map[samples(sample_id, train_id)] = 1;
+
+	    // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+            update_items.push_back(it->first);
+        int num_items = update_items.size();
+
+        //#pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            int update_item = update_items[item_id];
+            //W->row(update_item) += learning_rate * W_gradient.row(update_item);
+            //b(update_item) += learning_rate * b_gradient(update_item);
+            //UPDATE CLIPPING
+            W->row(update_item) += (learning_rate * W_gradient.row(update_item)).array().unaryExpr(Clipper()).matrix();
+            double update = learning_rate * b_gradient(update_item);
+            b(update_item) += std::min(0.5, std::max(update,-0.5));
+            //GRADIENT CLIPPING
+            W_gradient.row(update_item).setZero();
+            b_gradient(update_item) = 0.;
+        }
+        */
+      //cerr<<"Finished gradient"<<endl;
 	}
 
 	template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
         void computeGradientAdagrad(const MatrixBase<DerivedIn> &predicted_embeddings,
 				    const MatrixBase<DerivedGOutI> &samples,
 				    const MatrixBase<DerivedGOutV> &weights,
-				    double learning_rate, double momentum) //not sure if we want to use momentum here
+				    double learning_rate) //not sure if we want to use momentum here
         {
-	    W_gradient.setZero(W->rows(), W->cols());
-	    b_gradient.setZero(b.size());
-	    if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
-	      W_running_gradient.setZero(W->rows(), W->cols());
-	    if (b_running_gradient.size() != b.size())
-	      b_running_gradient.setZero(b.size());
-
+	    //W_gradient.setZero(W->rows(), W->cols());
+	    //b_gradient.setZero(b.size());
+      //FOR CLIPPING, WE DO NOT MULTIPLY THE GRADIENT WITH THE LEARNING RATE
 	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
-	    uscgemm(learning_rate,
+	    uscgemm(1.0,
           gradient_output,
           predicted_embeddings.leftCols(samples.cols()).transpose(),
           W_gradient);
-	    uscgemv(learning_rate, gradient_output,
+	    uscgemv(1.0, 
+          gradient_output,
 		      Matrix<double,Dynamic,1>::Ones(weights.cols()),
           b_gradient);
 
@@ -308,16 +591,98 @@ class Output_word_embeddings
             update_items.push_back(it->first);
         int num_items = update_items.size();
 
-        #pragma omp parallel for
+        //#pragma omp parallel for
         for (int item_id=0; item_id<num_items; item_id++)
         {
             int update_item = update_items[item_id];
-            W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
+            W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix();
             b_running_gradient(update_item) += b_gradient(update_item) * b_gradient(update_item);
-            W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt();
+            W->row(update_item) += learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
             b(update_item) += learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+            /*
+            //UPDATE CLIPPING
+            W->row(update_item) += (learning_rate * (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt())).unaryExpr(Clipper()).matrix();
+            double update = learning_rate * b_gradient(update_item) / sqrt(b_running_gradient(update_item));
+            b(update_item) += Clipper(update);//std::min(0.5, std::max(update,-0.5));
+            */
+            W_gradient.row(update_item).setZero();
+            b_gradient(update_item) = 0.;
         }
+      }
+
+	template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV>
+        void computeGradientAdadelta(const MatrixBase<DerivedIn> &predicted_embeddings,
+				    const MatrixBase<DerivedGOutI> &samples,
+				    const MatrixBase<DerivedGOutV> &weights,
+				    double learning_rate,
+            double conditioning_constant,
+            double decay) // sparse Adadelta update: only rows of W / entries of b whose ids occur in samples are changed
+        {
+          //cerr<<"decay is "<<decay<<" and constant is "<<conditioning_constant<<endl;
+	    // W_gradient and b_gradient are not cleared on entry; every row/entry touched
+	    // by this call is reset to zero at the end of the update loop below.
+	    // NOTE(review): uscgemm/uscgemv presumably accumulate into their last argument - confirm in their definitions.
+	    USCMatrix<double> gradient_output(W->rows(), samples, weights);
+	    uscgemm(1.0,
+          gradient_output,
+          predicted_embeddings.leftCols(samples.cols()).transpose(),
+          W_gradient);
+	    uscgemv(1.0, 
+          gradient_output,
+		      Matrix<double,Dynamic,1>::Ones(weights.cols()),
+          b_gradient);
+
+      int_map update_map; // collects every id that appears in samples
+      for (int sample_id=0; sample_id<samples.rows(); sample_id++)
+	        for (int train_id=0; train_id<samples.cols(); train_id++)
+		          update_map[samples(sample_id, train_id)] = 1;
+
+	    // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+            update_items.push_back(it->first);
+        int num_items = update_items.size();
+        // Each iteration reads and writes only row/entry update_item. NOTE(review): presumably the ids are distinct (map keys) - confirm int_map.
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            Array<double,1,Dynamic> W_current_parameter_update;
+            double b_current_parameter_update;
+
+            int update_item = update_items[item_id];
+            W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ // decayed running average of squared gradients
+                                                (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+            b_running_gradient(update_item) = decay*b_running_gradient(update_item)+
+                                            (1.-decay)*b_gradient(update_item)*b_gradient(update_item);
+            //cerr<<"Output: W gradient is "<<W_gradient.row(update_item)<<endl;
+            //getchar();
+
+            //cerr<<"Output: W running gradient is "<<W_running_gradient.row(update_item)<<endl;
+            //getchar();
+            W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/ // NOTE(review): assumes W_running_parameter_update is already sized like W - confirm in initialize()
+                                         (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+                                         W_gradient.row(update_item).array(); // step = sqrt(avg sq. update + c) / sqrt(avg sq. gradient + c) * gradient
+            b_current_parameter_update = (sqrt(b_running_parameter_update(update_item)+conditioning_constant)/
+                                         sqrt(b_running_gradient(update_item)+conditioning_constant))*
+                                         b_gradient(update_item);
+            //cerr<<"Output: W current parameter update is "<<W_current_parameter_update<<endl;
+            //getchar();
+            //cerr<<"Output: W running parameter update before is "<<W_running_parameter_update.row(update_item)<<endl;
+            //getchar();
+            //cerr<<"the second term is "<<(1.-decay)*W_current_parameter_update.square().matrix()<<endl;
+            W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ // decayed running average of squared steps
+                                                         (1.-decay)*(W_current_parameter_update.square().matrix());
+            b_running_parameter_update(update_item) = decay*b_running_parameter_update(update_item)+
+                                                      (1.-decay)*b_current_parameter_update*b_current_parameter_update;
+            //cerr<<"Output: W running parameter update is "<<W_running_parameter_update.row(update_item)<<endl;
+            //getchar();
+            W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+            b(update_item) += learning_rate*b_current_parameter_update;
+            W_gradient.row(update_item).setZero(); // leave the gradient buffers clean for the next call
+            b_gradient(update_item) = 0.;
+        }
+      }
+
 
 	template <typename DerivedIn, typename DerivedGOutI, typename DerivedGOutV, typename DerivedGW, typename DerivedGb>
     void computeGradientCheck(const MatrixBase<DerivedIn> &predicted_embeddings,
@@ -345,8 +710,9 @@ class Input_word_embeddings
     private:
         Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *W;
         int context_size, vocab_size;
-        Matrix<double,Dynamic,Dynamic> W_running_gradient;
-        Matrix<double,Dynamic,Dynamic> W_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_gradient;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_running_parameter_update;
+        Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> W_gradient;
 
 	friend class model;
 
@@ -354,29 +720,44 @@ class Input_word_embeddings
         Input_word_embeddings() : context_size(0), vocab_size(0) { }
         Input_word_embeddings(int rows, int cols, int context) { resize(rows, cols, context); }
  
-    void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
-      W = input_W;
-    }
+      void set_W(Matrix<double,Dynamic,Dynamic,Eigen::RowMajor> *input_W) {
+        W = input_W;
+      }
 
-        void resize(int rows, int cols, int context)
-        {
-            context_size = context;
-	    vocab_size = rows;
-            W->setZero(rows, cols);
-        }
+      void resize(int rows, int cols, int context)
+      {
+        context_size = context;
+        vocab_size = rows;
+        W->setZero(rows, cols);
+      }
 
         void read(std::ifstream &W_file) { readMatrix(W_file, *W); }
         void write(std::ofstream &W_file) { writeMatrix(*W, W_file); }
 
-	template <typename Engine>
-	void initialize(Engine &engine, bool init_normal, double init_range)
-        {
-            initMatrix(engine,
-                *W,
-                init_normal,
-                init_range);
+      template <typename Engine>
+      void initialize(Engine &engine,   // random engine handed to initMatrix
+          bool init_normal,             // forwarded to initMatrix
+          double init_range,            // forwarded to initMatrix
+          string &parameter_update,     // "ADA" and "ADAD" select extra optimizer state below
+          double adagrad_epsilon)       // starting value of the AdaGrad running sums
+      {
+          W_gradient.setZero(W->rows(),W->cols());
+          // W_gradient is zeroed once above for every update rule; the branches add rule-specific state.
+          if (parameter_update == "ADA") {
+            W_running_gradient =  Matrix<double,Dynamic,Dynamic>::Ones(W->rows(),W->cols())*adagrad_epsilon;
+            // running sums start at adagrad_epsilon, so the later division by their sqrt is never 0/0
+          } 
+        if (parameter_update == "ADAD") {
+          W_running_gradient.setZero(W->rows(),W->cols());
+          // both Adadelta running averages start at zero and are sized like W
+          W_running_parameter_update.setZero(W->rows(),W->cols());
         }
-	
+        initMatrix(engine,
+            *W,
+            init_normal,
+            init_range);
+      }
+
 	int n_inputs() const { return -1; }
 	int n_outputs() const { return W->cols() * context_size; }
 
@@ -436,7 +817,7 @@ class Input_word_embeddings
      const MatrixBase<DerivedIn> &input_words,
      double learning_rate, double momentum, double L2_reg)
   {
-            int embedding_dimension = W->cols();
+      int embedding_dimension = W->cols();
 
 	    // W           is vocab_size                        x embedding_dimension
 	    // input       is ngram_size*vocab_size             x minibatch_size
@@ -453,59 +834,177 @@ class Input_word_embeddings
 	        uscgemm(learning_rate, 
 			USCMatrix<double>(W->rows(), input_words.middleRows(ngram, 1), Matrix<double,1,Dynamic>::Ones(input_words.cols())),
 			bProp_input.block(ngram*embedding_dimension,0,embedding_dimension,input_words.cols()).transpose(),
-      *W);
+      	  	*W);
+	    }
+
+      /*
+      //IF WE WANT TO DO GRADIENT CLIPPING, THEN WE FIRST COMPUTE THE GRADIENT AND THEN
+      //PERFORM CLIPPING WHILE UPDATING
+
+	    for (int ngram=0; ngram<context_size; ngram++)
+	    {
+	      uscgemm(1.0, 
+          USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+          bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+          W_gradient);
 	    }
+      int_map update_map; //stores all the parameters that have been updated
+	    for (int ngram=0; ngram<context_size; ngram++)
+	    {
+        for (int train_id=0; train_id<input_words.cols(); train_id++)
+        {
+          update_map[input_words(ngram,train_id)] = 1;
+        }
+      }
+
+	    // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+        {
+            update_items.push_back(it->first);
+        }
+        int num_items = update_items.size();
+
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            int update_item = update_items[item_id];
+            //UPDATE CLIPPING
+            W->row(update_item) += (learning_rate*
+                W_gradient.row(update_item).array().unaryExpr(Clipper())).matrix();
+            //GRADIENT CLIPPING
+            //W->row(update_item) += learning_rate*
+            //    W_gradient.row(update_item).array().unaryExpr(Clipper()).matrix();
+            //SETTING THE GRADIENT TO ZERO
+            W_gradient.row(update_item).setZero();
+        }
+      */
   }
 
     template <typename DerivedGOut, typename DerivedIn>
     void computeGradientAdagrad(const MatrixBase<DerivedGOut> &bProp_input,
 				    const MatrixBase<DerivedIn> &input_words,
-				    double learning_rate, double momentum, double L2_reg)
+				    double learning_rate,
+            double L2_reg) // NOTE: L2_reg is not referenced anywhere in this function body
     {
             int embedding_dimension = W->cols();
-
-	    W_gradient.setZero(W->rows(), W->cols());
-	    if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
-	        W_running_gradient.setZero(W->rows(), W->cols());
-
+	    // W_gradient is not cleared on entry; the rows touched here are reset at the end of the update loop.
+      /*
+      if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+        W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+      */
 	    for (int ngram=0; ngram<context_size; ngram++)
 	    {
-	        uscgemm(learning_rate, 
+	        uscgemm(1.0, 
 			USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
 			bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
       W_gradient);
 	    }
+      int_map update_map; // collects every word id that appears in input_words, over all context positions
+	    for (int ngram=0; ngram<context_size; ngram++)
+	    {
+        for (int train_id=0; train_id<input_words.cols(); train_id++)
+        {
+          update_map[input_words(ngram,train_id)] = 1;
+        }
+      }
+
+	    // Convert to std::vector for parallelization
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+        {
+            update_items.push_back(it->first);
+        }
+        int num_items = update_items.size();
+        // Each iteration reads and writes only row update_item. NOTE(review): presumably the ids are distinct (map keys) - confirm int_map.
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
+        {
+            int update_item = update_items[item_id];
+            W_running_gradient.row(update_item) += W_gradient.row(update_item).array().square().matrix(); // accumulate squared gradients
+            W->row(update_item) += learning_rate * // step = learning_rate * gradient / sqrt(accumulated squares)
+              (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()).matrix();
+            /*
+            //UPDATE CLIPPING (disabled alternative: the same step passed through Clipper())
+            W->row(update_item) += (learning_rate * 
+              (W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt()))
+                      .unaryExpr(Clipper()).matrix();
+            */
+            W_gradient.row(update_item).setZero(); // leave the gradient buffer clean for the next call
+        }
+    }
 
-            int_map update_map; //stores all the parameters that have been updated
+    template <typename DerivedGOut, typename DerivedIn>
+    void computeGradientAdadelta(const MatrixBase<DerivedGOut> &bProp_input,
+				    const MatrixBase<DerivedIn> &input_words,
+				    double learning_rate,
+            double L2_reg, // NOTE: L2_reg is not referenced anywhere in this function body
+            double conditioning_constant,
+            double decay)
+    {
+      int embedding_dimension = W->cols();
 
-            for (int train_id=0; train_id<input_words.cols(); train_id++)
-            {
-                update_map[input_words(train_id)] = 1;
-            }
+	    // W_gradient is not cleared on entry; the rows touched here are reset at the end of the update loop.
+      /*
+      if (W_running_gradient.rows() != W->rows() || W_running_gradient.cols() != W->cols())
+        W_running_gradient = Ones(W->rows(), W->cols())*adagrad_epsilon;
+      */
+	    for (int ngram=0; ngram<context_size; ngram++)
+	    {
+	        uscgemm(1.0, 
+			USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
+			bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
+      W_gradient);
+	    }
+      int_map update_map; // collects every word id that appears in input_words, over all context positions
+	    for (int ngram=0; ngram<context_size; ngram++)
+	    {
+        for (int train_id=0; train_id<input_words.cols(); train_id++)
+        {
+          update_map[input_words(ngram,train_id)] = 1;
+        }
+      }
 
 	    // Convert to std::vector for parallelization
-            std::vector<int> update_items;
-            for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
-            {
-                update_items.push_back(it->first);
-            }
-            int num_items = update_items.size();
-
-            #pragma omp parallel for
-            for (int item_id=0; item_id<num_items; item_id++)
-            {
-	        int update_item = update_items[item_id];
-                W_running_gradient.row(update_item).array() += W_gradient.row(update_item).array().square();
-                W->row(update_item).array() += learning_rate * W_gradient.row(update_item).array() / W_running_gradient.row(update_item).array().sqrt();
-            }
+        std::vector<int> update_items;
+        for (int_map::iterator it = update_map.begin(); it != update_map.end(); ++it)
+        {
+            update_items.push_back(it->first);
         }
+        int num_items = update_items.size();
 
-        template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
-        void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
-				  const MatrixBase<DerivedIn> &input_words,
-				  int x, int minibatch_size,
-				  const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+        #pragma omp parallel for
+        for (int item_id=0; item_id<num_items; item_id++)
         {
+            // Each iteration reads and writes only row update_item. NOTE(review): presumably the ids are distinct (map keys) - confirm int_map.
+            Array<double,1,Dynamic> W_current_parameter_update;
+            int update_item = update_items[item_id];
+            W_running_gradient.row(update_item) = decay*W_running_gradient.row(update_item)+ // decayed running average of squared gradients
+                                                (1.-decay)*W_gradient.row(update_item).array().square().matrix();
+
+            W_current_parameter_update = ((W_running_parameter_update.row(update_item).array()+conditioning_constant).sqrt()/
+                                         (W_running_gradient.row(update_item).array()+conditioning_constant).sqrt())*
+                                         W_gradient.row(update_item).array(); // step = sqrt(avg sq. update + c) / sqrt(avg sq. gradient + c) * gradient
+
+            //cerr<<"Input: W current parameter update is "<<W_current_parameter_update<<endl;
+            //getchar();
+            W_running_parameter_update.row(update_item) = decay*W_running_parameter_update.row(update_item)+ // decayed running average of squared steps
+                                                         (1.-decay)*W_current_parameter_update.square().matrix();
+
+            W->row(update_item) += learning_rate*W_current_parameter_update.matrix();
+            //cerr<<"Input: After update, W is  "<<W->row(update_item)<<endl;
+            //getchar();
+            W_gradient.row(update_item).setZero(); // leave the gradient buffer clean for the next call
+        }
+
+    }
+
+    template <typename DerivedGOut, typename DerivedIn, typename DerivedGW>
+    void computeGradientCheck(const MatrixBase<DerivedGOut> &bProp_input,
+      const MatrixBase<DerivedIn> &input_words,
+      int x, int minibatch_size,
+      const MatrixBase<DerivedGW> &gradient) const //not sure if we want to use momentum here
+    {
 	    UNCONST(DerivedGW, gradient, my_gradient);
             int embedding_dimension = W->cols();
 	    my_gradient.setZero();
@@ -514,7 +1013,8 @@ class Input_word_embeddings
 			  USCMatrix<double>(W->rows(),input_words.middleRows(ngram, 1),Matrix<double,1,Dynamic>::Ones(input_words.cols())),
 			  bProp_input.block(ngram*embedding_dimension, 0, embedding_dimension, input_words.cols()).transpose(),
         my_gradient);
-        }
+    }
 };
 
 } // namespace nplm
+
diff --git a/src/neuralLM.h b/src/neuralLM.h
index f451e8a..dc66206 100644
--- a/src/neuralLM.h
+++ b/src/neuralLM.h
@@ -2,124 +2,51 @@
 #define NEURALLM_H
 
 #include <vector>
-#include <iostream>
-#include <fstream>
-#include <memory>
-#include <stdexcept>
 #include <cctype>
 #include <cstdlib>
-#include <boost/lexical_cast.hpp>
 #include <boost/shared_ptr.hpp>
 
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 
-#include "param.h"
 #include "util.h"
-#include "model.h"
-#include "propagator.h"
-#include "neuralClasses.h"
 #include "vocabulary.h"
+#include "neuralNetwork.h"
+
+/*
+  To do:
+  - move digit mapping into vocabulary.h
+ */
 
 namespace nplm
 {
 
-class neuralLMShared {
-  public:
-    vocabulary input_vocab, output_vocab;
-    model nn;
-
-    explicit neuralLMShared(const std::string &filename, bool premultiply = false) {
-      std::vector<std::string> input_words, output_words;
-      nn.read(filename, input_words, output_words);
-      input_vocab = vocabulary(input_words);
-      output_vocab = vocabulary(output_words);
-      // this is faster but takes more memory
-      if (premultiply) {
-        nn.premultiply();
-      }
-    }
-};
-
-class neuralLM 
+class neuralLM : public neuralNetwork
 {
-    // Big stuff shared across instances.
-    boost::shared_ptr<neuralLMShared> shared;
-
-    bool normalization;
     char map_digits;
-
-    propagator prop;
-
-    int ngram_size;
-    int width;
-
-    double weight;
-
-
-    std::size_t cache_size;
-    Eigen::Matrix<int,Dynamic,Dynamic> cache_keys;
-    std::vector<double> cache_values;
-    int cache_lookups, cache_hits;
-
-    Eigen::Matrix<int,Eigen::Dynamic,1> ngram; // buffer for lookup_ngram
+    boost::shared_ptr<vocabulary> vocab;
     int start, null;
 
 public:
-    neuralLM(const std::string &filename, bool premultiply = false)
-      : shared(new neuralLMShared(filename, premultiply)),
-        ngram_size(shared->nn.ngram_size), 
-	normalization(false),
-	weight(1.),
-	map_digits(0),
-	width(1),
-	prop(shared->nn, 1),
-        cache_size(0),
-        start(shared->input_vocab.lookup_word("<s>")),
-        null(shared->input_vocab.lookup_word("<null>"))
-    {
-	ngram.setZero(ngram_size);
-	if (cache_size)
-	{
-	  cache_keys.resize(ngram_size, cache_size);
-	  cache_keys.fill(-1);
-	}
-	prop.resize();
+    neuralLM() 
+      : neuralNetwork(),
+        vocab(new vocabulary()),
+	map_digits(0)
+    { 
     }
 
-    void set_normalization(bool value) { normalization = value; }
-    void set_log_base(double value) { weight = 1./std::log(value); }
     void set_map_digits(char value) { map_digits = value; }
 
-    void set_width(int width)
+    void set_vocabulary(const vocabulary &vocab)
     {
-        this->width = width;
-	prop.resize(width);
+        *(this->vocab) = vocab;
+        start = vocab.lookup_word("<s>");
+        null = vocab.lookup_word("<null>");
     }
 
-    const vocabulary &get_vocabulary() const { return shared->input_vocab; }
-
-    int lookup_input_word(const std::string &word) const
-    {
-        if (map_digits)
-	    for (int i=0; i<word.length(); i++)
-	        if (isdigit(word[i]))
-		{
-		    std::string mapped_word(word);
-		    for (; i<word.length(); i++)
-		        if (isdigit(word[i]))
-			    mapped_word[i] = map_digits;
-		    return shared->input_vocab.lookup_word(mapped_word);
-		}
-        return shared->input_vocab.lookup_word(word);
-    }
+    const vocabulary &get_vocabulary() const { return *(this->vocab); }
 
     int lookup_word(const std::string &word) const
     {
-        return lookup_input_word(word);
-    }
-
-    int lookup_output_word(const std::string &word) const
-    {
         if (map_digits)
 	    for (int i=0; i<word.length(); i++)
 	        if (isdigit(word[i]))
@@ -128,133 +55,17 @@ public:
 		    for (; i<word.length(); i++)
 		        if (isdigit(word[i]))
 			    mapped_word[i] = map_digits;
-		    return shared->output_vocab.lookup_word(mapped_word);
+		    return vocab->lookup_word(mapped_word);
 		}
-	return shared->output_vocab.lookup_word(word);
-    }
-
-    Eigen::Matrix<int,Eigen::Dynamic,1> &staging_ngram() { return ngram; }
-    double lookup_from_staging() {
-      return lookup_ngram(ngram);
-    }
-
-    template <typename Derived>
-    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
-    {
-	assert (ngram.rows() == ngram_size);
-	assert (ngram.cols() == 1);
-
-	std::size_t hash;
-	if (cache_size)
-	{
-	    // First look in cache
-	    hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h
-	    cache_lookups++;
-	    if (cache_keys.col(hash) == ngram)
-	    {
-	        cache_hits++;
-		return cache_values[hash];
-	    }
-	}
-
-	// Make sure that we're single threaded. Multithreading doesn't help,
-	// and in some cases can hurt quite a lot
-	int save_threads = omp_get_max_threads();
-	omp_set_num_threads(1);
-	int save_eigen_threads = Eigen::nbThreads();
-	Eigen::setNbThreads(1);
-	#ifdef __INTEL_MKL__
-	int save_mkl_threads = mkl_get_max_threads();
-	mkl_set_num_threads(1);
-	#endif
-
-        prop.fProp(ngram.col(0));
-
-	int output = ngram(ngram_size-1, 0);
-	double log_prob;
-
-	start_timer(3);
-	if (normalization)
-	{
-	    Eigen::Matrix<double,Eigen::Dynamic,1> scores(shared->output_vocab.size());
-            if (prop.skip_hidden)
-                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
-            else
-                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
-	    double logz = logsum(scores.col(0));
-	    log_prob = weight * (scores(output, 0) - logz);
-	}
-	else
-	{
-            if (prop.skip_hidden)
-                log_prob = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, 0);
-            else
-                log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
-	}
-	stop_timer(3);
-
-	if (cache_size)
-	{
-	    // Update cache
-	    cache_keys.col(hash) = ngram;
-	    cache_values[hash] = log_prob;
-	}
-
-	#ifdef __INTEL_MKL__
-	mkl_set_num_threads(save_mkl_threads);
-	#endif
-	Eigen::setNbThreads(save_eigen_threads);
-	omp_set_num_threads(save_threads);
-
-	return log_prob;
-    }
-
-    // Look up many n-grams in parallel.
-    template <typename DerivedA, typename DerivedB>
-    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
-    {
-        UNCONST(DerivedB, log_probs_const, log_probs);
-	assert (ngram.rows() == ngram_size);
-	assert (ngram.cols() <= width);
-
-        prop.fProp(ngram);
-
-	if (normalization)
-	{
-	    Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(shared->output_vocab.size(), ngram.cols());
-            if (prop.skip_hidden)
-                prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
-            else
-                prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
-
-	    // And softmax and loss
-	    Matrix<double,Dynamic,Dynamic> output_probs(shared->nn.output_vocab_size, ngram.cols());
-	    double minibatch_log_likelihood;
-	    SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(shared->nn.ngram_size-1), output_probs, minibatch_log_likelihood);
-	    for (int j=0; j<ngram.cols(); j++)
-	    {
-	        int output = ngram(ngram_size-1, j);
-		log_probs(0, j) = weight * output_probs(output, j);
-	    }
-	}
-	else
-	{
-	    for (int j=0; j<ngram.cols(); j++)
-	    {
-	        int output = ngram(ngram_size-1, j);
-                if (prop.skip_hidden)
-                    log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, output, j);
-                else
-                    log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
-	    }
-	}
+        return vocab->lookup_word(word);
     }
 
     double lookup_ngram(const int *ngram_a, int n)
     {
-	for (int i=0; i<ngram_size; i++)
+        Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
+	for (int i=0; i<m->ngram_size; i++)
 	{
-	    if (i-ngram_size+n < 0)
+	    if (i-m->ngram_size+n < 0)
 	    {
 		if (ngram_a[0] == start)
 		    ngram(i) = start;
@@ -263,10 +74,10 @@ public:
 	    }
 	    else
 	    {
-	        ngram(i) = ngram_a[i-ngram_size+n];
+	        ngram(i) = ngram_a[i-m->ngram_size+n];
 	    }
 	}
-	return lookup_ngram(ngram);
+	return neuralNetwork::lookup_ngram(ngram);
     }
 
     double lookup_ngram(const std::vector<int> &ngram_v)
@@ -274,20 +85,26 @@ public:
         return lookup_ngram(ngram_v.data(), ngram_v.size());
     }
 
-    int get_order() const { return ngram_size; }
-
-    void set_cache(std::size_t cache_size)
+    template <typename Derived>
+    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
     {
-        this->cache_size = cache_size;
-	cache_keys.resize(ngram_size, cache_size);
-	cache_keys.fill(-1); // clears cache
-	cache_values.resize(cache_size);
-	cache_lookups = cache_hits = 0;
+        return neuralNetwork::lookup_ngram(ngram);
+    }
+    
+    template <typename DerivedA, typename DerivedB>
+    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+    {
+        return neuralNetwork::lookup_ngram(ngram, log_probs_const);
     }
 
-    double cache_hit_rate()
+    void read(const std::string &filename)
     {
-        return static_cast<double>(cache_hits)/cache_lookups;
+        std::vector<std::string> words;
+        m->read(filename, words);
+        set_vocabulary(vocabulary(words));
+        resize();
+	// this is faster but takes more memory
+        //m->premultiply();
     }
 
 };
@@ -314,10 +131,13 @@ void makeNgrams(const std::vector<T> &input, std::vector<std::vector<T> > &outpu
   }
 }
 
-inline void preprocessWords(const std::vector<std::string> &words, std::vector< std::vector<int> > &ngrams,
-			    int ngram_size, const vocabulary &vocab, 
-			    bool numberize, bool add_start_stop, bool ngramize)
-{
+inline void preprocessWords(const std::vector<std::string> &words, 
+    std::vector< std::vector<int> > &ngrams,
+	  int ngram_size, 
+    const vocabulary &vocab, 
+	  bool numberize,
+    bool add_start_stop,
+    bool ngramize) {
   int start = vocab.lookup_word("<s>");
   int stop = vocab.lookup_word("</s>");
   
diff --git a/src/neuralNetwork.h b/src/neuralNetwork.h
new file mode 100644
index 0000000..021a425
--- /dev/null
+++ b/src/neuralNetwork.h
@@ -0,0 +1,188 @@
+#ifndef NEURALNETWORK_H
+#define NEURALNETWORK_H
+
+#include <vector>
+#include <boost/shared_ptr.hpp>
+#include <Eigen/Dense>
+
+#include "util.h"
+#include "model.h"
+#include "propagator.h"
+#include "neuralClasses.h"
+
+namespace nplm
+{
+
+class neuralNetwork
+{
+protected:
+    boost::shared_ptr<model> m;
+
+private:
+    bool normalization;
+    double weight;
+
+    propagator prop;
+
+    std::size_t cache_size;
+    Eigen::Matrix<int,Dynamic,Dynamic> cache_keys;
+    std::vector<double> cache_values;
+    int cache_lookups, cache_hits;
+
+public:
+    neuralNetwork() // default-constructed model, cache disabled, hit counters zeroed
+      : m(new model()),
+        normalization(false),
+	weight(1.),
+	prop(*m, 1),
+        cache_size(0), cache_lookups(0), cache_hits(0) // counters were otherwise only set by set_cache()
+    { 
+    }
+
+    void set_normalization(bool value) { normalization = value; }
+    void set_log_base(double value) { weight = 1./std::log(value); }
+
+    // This must be called if the underlying model is resized.
+    void resize() {
+	if (cache_size)
+	{
+	  cache_keys.resize(m->ngram_size, cache_size);
+	  cache_keys.fill(-1);
+	}
+	prop.resize();
+    }
+
+    void set_width(int width)
+    {
+	prop.resize(width);
+    }
+
+    template <typename Derived>
+    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+    {
+	assert (ngram.rows() == m->ngram_size);
+	assert (ngram.cols() == 1);
+
+	std::size_t hash;
+	if (cache_size)
+	{
+	    // First look in cache
+	    hash = Eigen::hash_value(ngram) % cache_size; // defined in util.h
+	    cache_lookups++;
+	    if (cache_keys.col(hash) == ngram)
+	    {
+	        cache_hits++;
+		return cache_values[hash];
+	    }
+	}
+
+	// Make sure that we're single threaded. Multithreading doesn't help,
+	// and in some cases can hurt quite a lot
+	int save_threads = omp_get_max_threads();
+	omp_set_num_threads(1);
+	int save_eigen_threads = Eigen::nbThreads();
+	Eigen::setNbThreads(1);
+	#ifdef __INTEL_MKL__
+	int save_mkl_threads = mkl_get_max_threads();
+	mkl_set_num_threads(1);
+	#endif
+
+        prop.fProp(ngram.col(0));
+
+	int output = ngram(m->ngram_size-1, 0);
+	double log_prob;
+
+	start_timer(3);
+	if (normalization)
+	{
+	    Eigen::Matrix<double,Eigen::Dynamic,1> scores(m->output_vocab_size);
+	    prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+	    double logz = logsum(scores.col(0));
+	    log_prob = weight * (scores(output, 0) - logz);
+	}
+	else
+	{
+	    log_prob = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, 0);
+	}
+	stop_timer(3);
+
+	if (cache_size)
+	{
+	    // Update cache
+	    cache_keys.col(hash) = ngram;
+	    cache_values[hash] = log_prob;
+	}
+
+	#ifdef __INTEL_MKL__
+	mkl_set_num_threads(save_mkl_threads);
+	#endif
+	Eigen::setNbThreads(save_eigen_threads);
+	omp_set_num_threads(save_threads);
+
+	return log_prob;
+    }
+
+    // Look up many n-grams in parallel.
+    template <typename DerivedA, typename DerivedB>
+    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+    {
+        UNCONST(DerivedB, log_probs_const, log_probs);
+	assert (ngram.rows() == m->ngram_size);
+	//assert (ngram.cols() <= prop.get_minibatch_size());
+
+        prop.fProp(ngram);
+
+	if (normalization)
+	{
+	    Eigen::Matrix<double,Eigen::Dynamic,Eigen::Dynamic> scores(m->output_vocab_size, ngram.cols());
+	    prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+
+	    // And softmax and loss
+	    Matrix<double,Dynamic,Dynamic> output_probs(m->output_vocab_size, ngram.cols());
+	    double minibatch_log_likelihood;
+	    SoftmaxLogLoss().fProp(scores.leftCols(ngram.cols()), ngram.row(m->ngram_size-1), output_probs, minibatch_log_likelihood);
+	    for (int j=0; j<ngram.cols(); j++)
+	    {
+	        int output = ngram(m->ngram_size-1, j);
+		log_probs(0, j) = weight * output_probs(output, j);
+	    }
+	}
+	else
+	{
+	    for (int j=0; j<ngram.cols(); j++)
+	    {
+	        int output = ngram(m->ngram_size-1, j);
+	        log_probs(0, j) = weight * prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, output, j);
+	    }
+	}
+    }
+
+    int get_order() const { return m->ngram_size; }
+
+    void read(const std::string &filename)
+    {
+        m->read(filename);
+        resize();
+	// this is faster but takes more memory
+        //m->premultiply();
+    }
+
+    void set_cache(std::size_t cache_size)
+    {
+        this->cache_size = cache_size;
+	cache_keys.resize(m->ngram_size, cache_size);
+	cache_keys.fill(-1); // clears cache
+	cache_values.resize(cache_size);
+	cache_lookups = cache_hits = 0;
+    }
+
+    double cache_hit_rate() // fraction of cache lookups that were hits; 0 when no lookups were made
+    {
+        return cache_lookups ? static_cast<double>(cache_hits)/cache_lookups : 0.;
+    }
+
+};
+
+} // namespace nplm
+
+#endif
diff --git a/src/neuralTM.cpp b/src/neuralTM.cpp
new file mode 100644
index 0000000..630ef58
--- /dev/null
+++ b/src/neuralTM.cpp
@@ -0,0 +1 @@
+#include "neuralTM.h"
diff --git a/src/neuralTM.h b/src/neuralTM.h
new file mode 100644
index 0000000..7476d91
--- /dev/null
+++ b/src/neuralTM.h
@@ -0,0 +1,133 @@
+#ifndef NEURALTM_H
+#define NEURALTM_H
+
+#include <vector>
+#include <cctype>
+#include <cstdlib>
+#include <boost/shared_ptr.hpp>
+
+#include <Eigen/Dense>
+
+#include "util.h"
+#include "vocabulary.h"
+#include "neuralNetwork.h"
+
+namespace nplm
+{
+
+class neuralTM : public neuralNetwork
+{
+    char map_digits;
+    boost::shared_ptr<vocabulary> input_vocab, output_vocab;
+    int start, null;
+
+public:
+    neuralTM() 
+      : neuralNetwork(),
+        map_digits(0),
+        input_vocab(new vocabulary()),
+        output_vocab(new vocabulary())
+    { 
+    }
+
+    void set_map_digits(char value) { map_digits = value; }
+
+    void set_input_vocabulary(const vocabulary &vocab)
+    {
+        *(this->input_vocab) = vocab;
+        start = vocab.lookup_word("<s>");
+        null = vocab.lookup_word("<null>");
+    }
+
+    void set_output_vocabulary(const vocabulary &vocab)
+    {
+        *(this->output_vocab) = vocab;
+    }
+
+    const vocabulary &get_input_vocabulary() const { return *(this->input_vocab); }
+    const vocabulary &get_output_vocabulary() const { return *(this->output_vocab); } // output side, not input_vocab
+
+    int lookup_input_word(const std::string &word) const
+    {
+        if (map_digits)
+	    for (int i=0; i<word.length(); i++)
+	        if (isdigit(word[i]))
+		{
+		    std::string mapped_word(word);
+		    for (; i<word.length(); i++)
+		        if (isdigit(word[i]))
+			    mapped_word[i] = map_digits;
+		    return input_vocab->lookup_word(mapped_word);
+		}
+        return input_vocab->lookup_word(word);
+    }
+
+    int lookup_output_word(const std::string &word) const
+    {
+        if (map_digits)
+	    for (int i=0; i<word.length(); i++)
+	        if (isdigit(word[i]))
+		{
+		    std::string mapped_word(word);
+		    for (; i<word.length(); i++)
+		        if (isdigit(word[i]))
+			    mapped_word[i] = map_digits;
+		    return output_vocab->lookup_word(mapped_word);
+		}
+	return output_vocab->lookup_word(word);
+    }
+
+    double lookup_ngram(const int *ngram_a, int n)
+    {
+        Eigen::Matrix<int,Eigen::Dynamic,1> ngram(m->ngram_size);
+	for (int i=0; i<m->ngram_size; i++)
+	{
+	    if (i-m->ngram_size+n < 0)
+	    {
+		if (ngram_a[0] == start)
+		    ngram(i) = start;
+		else
+		    ngram(i) = null;
+	    }
+	    else
+	    {
+	        ngram(i) = ngram_a[i-m->ngram_size+n];
+	    }
+	}
+	return neuralNetwork::lookup_ngram(ngram);
+    }
+
+    double lookup_ngram(const std::vector<int> &ngram_v)
+    {
+        return lookup_ngram(ngram_v.data(), ngram_v.size());
+    }
+
+    template <typename Derived>
+    double lookup_ngram(const Eigen::MatrixBase<Derived> &ngram)
+    {
+        return neuralNetwork::lookup_ngram(ngram);
+    }
+    
+    template <typename DerivedA, typename DerivedB>
+    void lookup_ngram(const Eigen::MatrixBase<DerivedA> &ngram, const Eigen::MatrixBase<DerivedB> &log_probs_const)
+    {
+        return neuralNetwork::lookup_ngram(ngram, log_probs_const);
+    }
+
+    void read(const std::string &filename)
+    {
+        std::vector<std::string> input_words;
+        std::vector<std::string> output_words;
+        m->read(filename, input_words, output_words);
+        set_input_vocabulary(vocabulary(input_words));
+        set_output_vocabulary(vocabulary(output_words));
+        resize();
+	// this is faster but takes more memory
+        //m->premultiply();
+    }
+
+};
+
+} // namespace nplm
+
+#endif
diff --git a/src/param.h b/src/param.h
index 8e42853..0615690 100644
--- a/src/param.h
+++ b/src/param.h
@@ -1,3 +1,4 @@
+//The framework for obtaining user arguments has been inspired by Sittichai Jiampojamarn's Many-to-Many alignment model (m2m-aligner). https://code.google.com/p/m2m-aligner/
 #pragma once
 
 #include <string>
@@ -18,7 +19,6 @@ struct param
     std::string input_words_file;
     std::string output_words_file;
     std::string model_prefix;
-    std::string init_model;
 
     int ngram_size;
     int vocab_size;
@@ -30,12 +30,15 @@ struct param
     int output_embedding_dimension;
     std::string activation_function;
     std::string loss_function;
+    std::string parameter_update;
 
     int minibatch_size;
     int validation_minibatch_size;
     int num_epochs;
     double learning_rate;
-
+    double conditioning_constant;
+    double decay;
+    double adagrad_epsilon;
     bool init_normal;
     double init_range;
 
@@ -57,3 +60,4 @@ struct param
 };
 
 } // namespace nplm
+
diff --git a/src/prepareNeuralLM.cpp b/src/prepareNeuralLM.cpp
index 94482d0..13a534a 100644
--- a/src/prepareNeuralLM.cpp
+++ b/src/prepareNeuralLM.cpp
@@ -1,9 +1,20 @@
 #include <iostream>
 #include <vector>
 #include <queue>
-#include <boost/unordered_map.hpp>
-#include <tclap/CmdLine.h>
-#include <boost/algorithm/string/join.hpp>
+#include <deque>
+# include <fstream>
+# include <iterator>
+
+# include <boost/unordered_map.hpp>
+# include <boost/algorithm/string/join.hpp>
+# include <boost/interprocess/managed_shared_memory.hpp>
+# include <boost/interprocess/allocators/allocator.hpp>
+# include <boost/interprocess/managed_mapped_file.hpp>
+#include <boost/interprocess/containers/vector.hpp>
+#include <boost/random/mersenne_twister.hpp>
+#include <boost/random/uniform_int_distribution.hpp>
+
+# include <tclap/CmdLine.h>
 
 #include "neuralLM.h"
 #include "util.h"
@@ -12,12 +23,27 @@ using namespace std;
 using namespace TCLAP;
 using namespace boost;
 using namespace nplm;
+using namespace boost::random;
+namespace ip = boost::interprocess;
+
+typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator;
+typedef ip::vector<int, intAllocator> vec;
+typedef ip::allocator<vec, ip::managed_mapped_file::segment_manager> vecAllocator;
+//typedef allocator<ValueType, managed_shared_memory::segment_manager> ShmemAllocator;
+//typedef multimap<int, vec, std::less<int>, ShmemAllocator> MyMap;
+typedef std::vector<vec,vecAllocator> vecvec;
 
-void writeNgrams(const vector<vector<string> > &data, 
-		 int ngram_size, const vocabulary &vocab, 
-		 bool numberize, bool add_start_stop, bool ngramize, 
+typedef long long int data_size_t; // training data can easily exceed 2G instances
+
+template<typename T>
+void writeNgrams(const T &data, 
+		 int ngram_size,
+     const vocabulary &vocab, 
+		 bool numberize,
+     bool add_start_stop,
+     bool ngramize, 
 		 const string &filename)
-{
+	{
     ofstream file(filename.c_str());
     if (!file)
     {
@@ -26,6 +52,7 @@ void writeNgrams(const vector<vector<string> > &data,
     }
 
     vector<vector<int> > ngrams;
+
     for (int i=0; i<data.size(); i++) {
         preprocessWords(data[i], ngrams, ngram_size, vocab, numberize, add_start_stop, ngramize);
 	// write out n-grams
@@ -41,11 +68,233 @@ void writeNgrams(const vector<vector<string> > &data,
     file.close();
 }
 
+// Space efficient version for writing the n-grams.
+// They are not read into memory.
+void writeNgrams(const string &input_filename, 
+		 int ngram_size,
+     const vocabulary &vocab, 
+		 bool numberize,
+     bool add_start_stop,
+     bool ngramize, 
+		 const string &filename,
+     int train_data_size,
+		 vector<float> &sent_weights,
+		 const string &sent_weights_filename)
+{
+    ofstream file(filename.c_str());
+    ofstream output_sent_weights_file(sent_weights_filename.c_str());
+    if (!file)
+    {
+      cerr << "error: could not open " << filename << endl;
+      exit(1);
+    }
+
+    ifstream input_file(input_filename.c_str());
+    vector<vector<int> > ngrams;
+    //for (int i=0; i<train_data.size(); i++) {
+    string line;
+    int counter = 0;
+    cerr<<"Processed ... ";
+    while (getline(input_file,line) && train_data_size-- > 0) {
+            counter++;
+      if ((counter % 100000) == 0) {
+        cerr<<counter<<" training lines ... ";
+      }
+      //stringstream lstr(line);
+      vector<string> lstr_items;
+      splitBySpace(line,lstr_items);
+
+    //for (int i=0; i<data.size(); i++) {
+      preprocessWords(lstr_items,
+          ngrams,
+          ngram_size,
+          vocab,
+          numberize,
+          add_start_stop,
+          ngramize);
+
+	    // write out n-grams
+	    for (int j=0; j<ngrams.size(); j++)
+	    {
+					if (sent_weights.size() != 0) {
+						output_sent_weights_file <<sent_weights[counter-1]<<endl;
+					}	
+	        for (int k=0; k<ngram_size; k++)
+	        {
+	        file << ngrams[j][k] << " ";
+	        }
+	      file << endl;
+	    }
+    }
+    cerr<<endl;
+    input_file.close();
+    file.close();
+    output_sent_weights_file.close();
+}
+
+// Space efficient version for writing the n-grams.
+// They are not read into memory.
+void writeMmapNgrams(const string &input_filename, 
+		 int ngram_size,
+     const vocabulary &vocab, 
+		 bool numberize,
+     bool add_start_stop,
+     bool ngramize, 
+		 const string &filename,
+     unsigned long train_data_size,
+     data_size_t num_tokens,
+     bool randomize)
+{
+    cerr<<"Num tokens is "<<num_tokens<<endl;
+    cerr<<"Training data size is "<<train_data_size<<endl;
+    // Open the memory mapped file and create the allocators
+    ip::managed_mapped_file mfile(ip::create_only,
+        filename.c_str(),
+        num_tokens*ngram_size*sizeof(int)+1024UL*1024UL);
+    intAllocator ialloc(mfile.get_segment_manager());
+    vecAllocator valloc (mfile.get_segment_manager());
+    //vecvec *mMapVecVec= mfile.construct<vecvec>("data")(num_tokens,vec(ialloc),valloc);
+
+    vec *mMapVec= mfile.construct<vec>("vector")(num_tokens*ngram_size,0,ialloc);
+
+    cerr<<"The size of mmaped vec is "<<mMapVec->size()<<endl;
+    // Going over every line in the input file and 
+    // printing the memory mapped ngrams into the 
+    // output file
+    ifstream input_file(input_filename.c_str());
+    //for (int i=0; i<train_data.size(); i++) {
+    string line;
+    int counter = 0;
+    cerr<<"Processed ... ";
+    long int train_ngram_counter = 0;
+    vector<vector<int> > ngrams;
+    while (getline(input_file,line) && train_data_size-- > 0) {
+            counter++;
+      if ((counter % 100000) ==0) {
+        //cerr<<"counter is "<<counter<<endl;
+        cerr<<counter<<" training lines ... ";
+      }
+      //stringstream lstr(line);
+      vector<string> lstr_items;
+      splitBySpace(line,lstr_items);
+
+    //for (int i=0; i<data.size(); i++) {
+      preprocessWords(lstr_items, ngrams,
+          ngram_size,
+          vocab,
+          numberize, 
+          add_start_stop,
+          ngramize);
+      /*
+      cerr<<"line is "<<endl;
+      cerr<<line<<endl;
+      cerr<<"Number of ngrams is "<<ngrams.size()<<endl;
+        if (ngrams.size() ==1 ){
+          cerr<<"The line number was "<<counter<<endl;
+          cerr<<line<<endl;
+        }
+      */
+	    // write out n-grams in mmapped file
+	    for (int j=0; j<ngrams.size(); j++)
+	    {
+        /*
+       for (int k=0; k<ngram_size; k++)
+	        {
+	        cerr << ngrams[j][k] << " ";
+	        }
+	      cerr<< endl; 
+        */
+        for (int k=0; k<ngram_size; k++) {
+          mMapVec->at(train_ngram_counter*ngram_size+k) = ngrams[j][k];
+        }
+        train_ngram_counter++;
+        //cerr<<"Train ngram counter is "<<train_ngram_counter<<endl;
+	    }
+    }
+    cerr<<endl;
+    input_file.close();
+
+    // Shrink the file if it was overused
+    ip::managed_mapped_file::shrink_to_fit(filename.c_str());
+    //now to randomize the items if the randomize flag was set
+    if (randomize == true) {
+      unsigned seed = 1234; //for testing only
+      mt19937 rng(seed);
+       cerr<<"Randomly shuffling data...";
+        data_size_t counter =0;
+        while (counter < num_tokens) {
+          data_size_t upper_limit = counter+5000000;
+          long int vector_size = 5000000;
+          if (counter + 10000000 >= num_tokens) {
+            upper_limit = num_tokens;
+            vector_size = num_tokens - counter;
+          }
+          vector<int> temp(vector_size*ngram_size,0);
+          for (int i=0;i<vector_size;i++){
+           for (int k=0;k<ngram_size;k++) {
+             temp[i*ngram_size+k] = mMapVec->at((i+counter)*ngram_size+k);
+           }
+          }
+          for (data_size_t i=vector_size-1; i>0; i--)
+          {
+            if (i %500000 == 0) {
+              cerr<<"Shuffled "<<num_tokens-1<<" instances...";
+            }
+            data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng);
+            for (int k=0;k<ngram_size;k++) {
+              int temp_val = temp.at(i*ngram_size+k);
+              temp.at(i*ngram_size+k) =
+                temp.at(j*ngram_size+k);
+              temp.at(j*ngram_size+k) = temp_val;
+            }
+          }
+          //Putting it back
+          for (int i=0;i<vector_size;i++){
+           for (int k=0;k<ngram_size;k++) {
+             mMapVec->at((i+counter)*ngram_size+k) = temp[i*ngram_size+k];
+           }
+          }
+          counter = upper_limit;
+        }
+
+      /*
+      for (data_size_t i=num_tokens-1; i>0; i--)
+      {
+        if (i %500000 == 0) {
+          cerr<<"Shuffled "<<num_tokens-1<<" instances...";
+        }
+        data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng);
+        for (int k=0;k<ngram_size;k++) {
+          int temp_val = mMapVec->at(i*ngram_size+k);
+          mMapVec->at(i*ngram_size+k) =
+            mMapVec->at(j*ngram_size+k);
+          mMapVec->at(j*ngram_size+k) = temp_val;
+        }
+      }
+      */
+    cerr<<endl; 
+    }
+}
+
+
 int main(int argc, char *argv[])
 {
+    ios::sync_with_stdio(false);
     int ngram_size, vocab_size, validation_size;
-    bool numberize, ngramize, add_start_stop;
-    string train_text, train_file, validation_text, validation_file, words_file, write_words_file;
+    bool numberize, 
+         ngramize,
+         add_start_stop,
+         mmap_file,
+         randomize;
+
+    string train_text,
+           train_file,
+           validation_text,
+           validation_file,
+           words_file,
+           write_words_file,
+					 sent_weights_text,
+					 output_sent_weights_text;
 
     try
     {
@@ -56,6 +305,10 @@ int main(int argc, char *argv[])
     ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend <s> and append </s>. Default: true.", false, true, "bool", cmd);
+    ValueArg<bool> arg_mmap_file("", "mmap_file", "If true, the training file will be a memory mapped file. \n This is "
+        "needed if the entire training data cannot fit in memory. Default: false.", false, false, "bool", cmd);
+
+    ValueArg<bool> arg_randomize("", "randomize", "If true, Randomly shuffle the training ngrams. It can only be used with mmap_file =1 . Default: false.", false, false, "bool", cmd);
 
     ValueArg<int> arg_vocab_size("", "vocab_size", "Vocabulary size.", false, -1, "int", cmd);
     ValueArg<string> arg_words_file("", "words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
@@ -66,6 +319,10 @@ int main(int argc, char *argv[])
 	ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
 	ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
 	ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
+	//ValueArg<string> arg_sent_weights_text("", "sent_weights_text", "The sentence weights text", false, "", "string", cmd);
+  //ValueArg<string> arg_sent_weights_file("", "sent_weights_file", "The file to write the per ngram weights", false, "", "string", cmd);
+
+
 
 	cmd.parse(argc, argv);
 
@@ -81,6 +338,13 @@ int main(int argc, char *argv[])
 	numberize = arg_numberize.getValue();
 	ngramize = arg_ngramize.getValue();
 	add_start_stop = arg_add_start_stop.getValue();
+  mmap_file = arg_mmap_file.getValue();
+  randomize = arg_randomize.getValue();
+  //sent_weights_text = arg_sent_weights_text.getValue();
+  //output_sent_weights_text = arg_sent_weights_file.getValue();
+  sent_weights_text = "";
+  output_sent_weights_text = "";
+
 
     // check command line arguments
 
@@ -114,6 +378,8 @@ int main(int argc, char *argv[])
 	cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
 	cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
 	cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
+	cerr << arg_mmap_file.getDescription() << sep << arg_mmap_file.getValue() << endl;
+	//cerr << arg_sent_weights_text.getDescription() << sep << arg_sent_weights_text.getValue() << endl;
     }
     catch (TCLAP::ArgException &e)
     {
@@ -130,24 +396,123 @@ int main(int argc, char *argv[])
     // }
 
     // Read in training data and validation data
-    vector<vector<string> > train_data;
-    readSentFile(train_text, train_data);
-    for (int i=0; i<train_data.size(); i++) {
-        // if data is already ngramized, set/check ngram_size
-        if (!ngramize) {
-            if (ngram_size > 0) {
-                if (ngram_size != train_data[i].size()) {
-                    cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
-                }
-            }
-            // else if --ngram_size has not been specified, set it now
-            else {
-                ngram_size=train_data[i].size();
-            }
+    // vector<vector<string> > train_data;
+    // readSentFile(train_text, train_data);
+    // @vaswani: No more reading the entire training file into memory
+    // Reading it per line with file io
+    
+    //for (int i=0; i<train_data.size(); i++) {
+    // Go over every line in the file and 
+    // 1. if the !ngramize then you should check if 
+    // we have the correct number of items per line
+    // 2. build the vocabulary if the words file has not
+    // been specified.
+    // Construct vocabulary
+    vocabulary vocab;
+    int start, stop;
+    // Add start stop if the vocabulary has not been supplied
+    if (words_file == "") {
+      vocab.insert_word("<s>");
+	    vocab.insert_word("</s>");
+	    vocab.insert_word("<null>");
+      // warn user that if --numberize is not set, there will be no vocabulary!
+      if (!numberize) {
+          cerr << "Warning: with --numberize 0 and --words_file == "", there will be no vocabulary!" << endl;
+      }
+    }
+    if (mmap_file == false && randomize == true) {
+      cerr<<"Randomize option can only be used with mmap_file = 1"<<endl;
+      exit(1);
+    }
+    unordered_map<string,int> count; // For keeping word counts if no supplied vocab
+
+    deque<vector<string> > validation_data;
+    int train_data_size=0;
+    cerr<<"Processed ... ";
+    data_size_t num_tokens=0;
+    
+    ifstream training(train_text.c_str());
+
+    string line;
+    while (getline(training,line)) {
+      train_data_size++;
+      //stringstream lstr(line);
+      vector<string> lstr_items;
+      splitBySpace(line,lstr_items);
+      // if data is already ngramized, set/check ngram_size
+      if (!ngramize) {
+          if (ngram_size > 0) {
+              if (ngram_size != lstr_items.size()) {
+                  cerr << "Error: size of training ngrams does not match specified value of --ngram_size!" << endl;
+              }
+          }
+          // else if --ngram_size has not been specified, set it now
+          else {
+              ngram_size=lstr_items.size();
+          }
+      }
+      if ((train_data_size%100000)==0){
+        cerr<<train_data_size<<" lines ... ";
+      }
+      //break;
+      /*
+      if (lstr_items.size() ==1) {
+        cerr<<"line :"<<endl;
+        cerr<<line<<endl;
+        cerr<<"The number of items was 1"<<endl;
+        getchar();
+      }
+      */
+      num_tokens += lstr_items.size()+1;
+      if (words_file == "") {
+         for (int j=0; j<lstr_items.size(); j++) {
+              count[lstr_items[j]] += 1; 
+          }
+      }
+      // Add to validation set if the validation size
+      // has not been specified
+      if (validation_text == "" && validation_size > 0) {
+        //cerr<<"validation size is "<<validation_data.size()<<endl;
+        if (validation_data.size() == validation_size) {
+          //validation_data.erase(validation_data.begin());
+          validation_data.pop_front();
         }
+        validation_data.push_back(lstr_items);
+      }
+    }
+    cerr<<endl;
+    training.close();
+    //cerr<<"validation size is "<<validation_data.size()<<endl;
+    //getchar();
+    if (validation_data.size() < validation_size) {
+      cerr<<"validation size is "<<validation_data.size()<<endl;
+      cerr << "error: requested validation size is greater than training data size" << endl;
+      exit(1);
     }
     
-    vector<vector<string> > validation_data;
+    train_data_size -= validation_size; 
+    cerr<<"Training data size is "<<train_data_size<<endl;
+
+    // The items in the validation data have already been counted
+    // Decrementing the counts of those words before building the vocabulary
+    for(int i=0; i<validation_data.size(); i++){
+      num_tokens -= (validation_data[i].size() +1);
+      for (int j=0; j<validation_data[i].size();j++){
+        count[validation_data[i][j]] -= 1;
+        if (count[validation_data[i][j]] == 0) {
+          count.erase(validation_data[i][j]);
+        }
+      }
+    }
+
+    // Getting the top n frequent words for the vocabulary
+    if (words_file == "") {
+      vocab.insert_most_frequent(count, vocab_size);
+      if (vocab.size() < vocab_size) {
+          cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
+      }
+    }
+    //vector<vector<string> > validation_data;
     if (validation_text != "") {
         readSentFile(validation_text, validation_data);
         for (int i=0; i<validation_data.size(); i++) {
@@ -166,22 +531,37 @@ int main(int argc, char *argv[])
             }
         }
     }
+    // Read the optional per-sentence weights, if a weights file was given.
+    vector<float> sent_weights;
+    if (sent_weights_text != "") {
+      cerr<<"Reading sentence weights from "<<sent_weights_text<<endl;
+      ifstream sent_weights_file(sent_weights_text.c_str());
+      readWeightsFile(sent_weights_file,sent_weights);
+      sent_weights_file.close();
+      // Compare the number of weights read, not the length of the file name.
+      if (sent_weights.size() != train_data_size) {
+        cerr<<"The number of sentence weights does not match the number of training sentences"<<endl;
+      }
+    }
+		
+    /*
     else if (validation_size > 0)
     {
-        // Create validation data
-        if (validation_size > train_data.size())
-	{
-	    cerr << "error: requested validation size is greater than training data size" << endl;
-	    exit(1);
-	}
-	validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
-	train_data.resize(train_data.size() - validation_size);
+      // Create validation data
+      if (validation_size > train_data.size())
+      {
+          cerr << "error: requested validation size is greater than training data size" << endl;
+          exit(1);
+      }
+	    validation_data.insert(validation_data.end(), train_data.end()-validation_size, train_data.end());
+	    train_data.resize(train_data.size() - validation_size);
     }
+    */
 
     // Construct vocabulary
-    vocabulary vocab;
-    int start, stop;
-
+    //vocabulary vocab;
+    //int start, stop;
+    
     // read vocabulary from file
     if (words_file != "") {
         vector<string> words;
@@ -202,12 +582,12 @@ int main(int argc, char *argv[])
         }
 
     }
-
+    /*
     // construct vocabulary to contain top <vocab_size> most frequent words; all other words replaced by <unk>
     else {
-        vocab.insert_word("<s>");
-	vocab.insert_word("</s>");
-	vocab.insert_word("<null>");
+      vocab.insert_word("<s>");
+	    vocab.insert_word("</s>");
+	    vocab.insert_word("<null>");
 
         // warn user that if --numberize is not set, there will be no vocabulary!
         if (!numberize) {
@@ -225,6 +605,7 @@ int main(int argc, char *argv[])
             cerr << "warning: fewer than " << vocab_size << " types in training data; the unknown word will not be learned" << endl;
         }
     }
+    */
 
     // write vocabulary to file
     if (write_words_file != "") {
@@ -236,11 +617,39 @@ int main(int argc, char *argv[])
     if (train_file != "")
     {
         cerr << "Writing training data to " << train_file << endl;
-        writeNgrams(train_data, ngram_size, vocab, numberize, add_start_stop, ngramize, train_file);
+        if (mmap_file == true) {
+          writeMmapNgrams(train_text,
+            ngram_size,
+            vocab,
+            numberize,
+            add_start_stop,
+            ngramize,
+            train_file,
+            train_data_size,
+            num_tokens,
+            randomize);
+        } else {
+          writeNgrams(train_text,
+              ngram_size,
+              vocab,
+              numberize,
+              add_start_stop,
+              ngramize,
+              train_file,
+              train_data_size,
+							sent_weights,
+							output_sent_weights_text);
+        }
     }
     if (validation_file != "")
     {
         cerr << "Writing validation data to " << validation_file << endl;
-        writeNgrams(validation_data, ngram_size, vocab, numberize, add_start_stop, ngramize, validation_file);
+        writeNgrams(validation_data,
+            ngram_size,
+            vocab,
+            numberize,
+            add_start_stop,
+            ngramize,
+            validation_file);
     }
 }
diff --git a/src/prepareNeuralTM.cpp b/src/prepareNeuralTM.cpp
index 8d7cbf8..0c30fd0 100644
--- a/src/prepareNeuralTM.cpp
+++ b/src/prepareNeuralTM.cpp
@@ -14,7 +14,7 @@ using namespace TCLAP;
 using namespace boost;
 using namespace nplm;
 
-void writeNgrams(const vector<vector<string> > &input_data, const vector<vector<string> > &output_data, int ngram_size, const vocabulary &input_vocab, const vocabulary &output_vocab, bool numberize, bool ngramize, const string &filename)
+void writeNgrams(const vector<vector<string> > &data, int source_context_size, int target_context_size, const vocabulary &input_vocab, int source_unk, const vocabulary &output_vocab, bool numberize, const string &filename)
 {
     ofstream file(filename.c_str());
     if (!file)
@@ -23,107 +23,37 @@ void writeNgrams(const vector<vector<string> > &input_data, const vector<vector<
 	exit(1);
     }
 
-    // check that input and output data have the same number of sentences
-    if (input_data.size() != output_data.size()) {
-        cerr << "Error: input and output data files have different number of lines" << endl;
-        exit(1);
-    }
+    int ngram_size = source_context_size + target_context_size + 1;
 
     // for each input and output line
-    int lines=input_data.size();
-    if (numberize) {
-        for (int i=0; i<lines; i++) {
-            // convert each line to a set of ngrams
-            vector<vector<int> > input_ngrams;
-            vector<int> input_nums;
-            for (int j=0; j<input_data[i].size(); j++) {
-                input_nums.push_back(input_vocab.lookup_word(input_data[i][j]));
+    for (int i=0; i<data.size(); i++) {
+        vector<int> nums;
+	if (numberize) {
+            for (int j=0; j<source_context_size; j++) {
+	        nums.push_back(input_vocab.lookup_word(data[i][j], source_unk));
             }
-            makeNgrams(input_nums, input_ngrams, ngram_size-1);
-            
-            vector<vector<int> > output_ngrams;
-            vector<int> output_nums;
-            for (int j=0; j<output_data[i].size(); j++) {
-                output_nums.push_back(output_vocab.lookup_word(output_data[i][j]));
-            }
-            makeNgrams(output_nums, output_ngrams, 1);
-    
-            // print out cross product of input and output ngrams
-            for (int j=0; j < input_ngrams.size(); j++) {
-                for (int k=0; k < output_ngrams.size(); k++) {
-                    int j_prime;
-                    for (j_prime=0; j_prime < input_ngrams[j].size()-1; j_prime++) {
-                        file << input_ngrams[j][j_prime] << " ";
-                    }
-                    file << input_ngrams[j][j_prime];
-                    int k_prime;
-                    for (k_prime=0; k_prime < output_ngrams[k].size(); k_prime++) {
-                        file << " " << output_ngrams[k][k_prime];
-                    }
-                    file << endl;
-                }
+            for (int j=source_context_size; j<ngram_size-1; j++) {
+                nums.push_back(input_vocab.lookup_word(data[i][j]));
             }
-        }
-    }
-
-    else {
-        for (int i=0; i<lines; i++) {
-            // convert each line to a set of ngrams
-            vector<vector<string> > input_ngrams;
-            vector<string> input_words;
-            for (int j=0; j<input_data[i].size(); j++) {
-                int unk = input_vocab.lookup_word("<unk>");
-                // if word is unknown
-                if (input_vocab.lookup_word(input_data[i][j]) == unk) {
-                    input_words.push_back("<unk>");
-                }
-                // if word is known
-                else {
-                    input_words.push_back(input_data[i][j]);
-                }
+	    nums.push_back(output_vocab.lookup_word(data[i][ngram_size-1]));
+	} else {
+            for (int j=0; j<ngram_size-1; j++) {
+	        nums.push_back(lexical_cast<int>(data[i][j]));
             }
-            makeNgrams(input_words, input_ngrams, ngram_size-1);
-            
-            vector<vector<string> > output_ngrams;
-            vector<string> output_words;
-            for (int j=0; j<output_data[i].size(); j++) {
-                int unk = output_vocab.lookup_word("<unk>");
-                // if word is unknown
-                if (output_vocab.lookup_word(output_data[i][j]) == unk) {
-                    output_words.push_back("<unk>");
-                }
-                // if word is known
-                else {
-                    output_words.push_back(output_data[i][j]);
-                }
-            }
-            makeNgrams(output_words, output_ngrams, 1);
-    
-            // print out cross product of input and output ngrams
-            for (int j=0; j < input_ngrams.size(); j++) {
-                for (int k=0; k < output_ngrams.size(); k++) {
-                    int j_prime;
-                    for (j_prime=0; j_prime < input_ngrams[j].size()-1; j_prime++) {
-                        file << input_ngrams[j][j_prime] << " ";
-                    }
-                    file << input_ngrams[j][j_prime];
-                    int k_prime;
-                    for (k_prime=0; k_prime < output_ngrams[k].size(); k_prime++) {
-                        file << " " << output_ngrams[k][k_prime];
-                    }
-                    file << endl;
-                }
-            }
-        }
+	    nums.push_back(lexical_cast<int>(data[i][ngram_size-1]));
+	}
+	for (int k=0; k<nums.size(); k++)
+	  file << nums[k] << " ";
+	file << endl;
     }
     file.close();
 }
     
 int main(int argc, char *argv[])
 {
-    int ngram_size, input_vocab_size, output_vocab_size, validation_size;
-    bool add_start_stop, numberize, ngramize;
-    string input_train_text, output_train_text, train_file, input_validation_text, output_validation_text, validation_file, write_input_words_file, write_output_words_file, input_words_file, output_words_file;
+    int source_context_size, target_context_size, input_vocab_size, output_vocab_size, validation_size;
+    bool numberize;
+    string train_text, train_file, validation_text, validation_file, write_input_words_file, write_output_words_file, input_words_file, output_words_file;
 
     try
     {
@@ -131,45 +61,37 @@ int main(int argc, char *argv[])
 
 	// The options are printed in reverse order
     
-    ValueArg<bool> arg_ngramize("", "ngramize", "If true, convert lines to ngrams. Default: true.", false, true, "bool", cmd);
     ValueArg<bool> arg_numberize("", "numberize", "If true, convert words to numbers. Default: true.", false, true, "bool", cmd);
-    ValueArg<bool> arg_add_start_stop("", "add_start_stop", "If true, prepend (ngram_size-1) start symbols and postpend 1 stop symbol. Default: true.", false, true, "bool", cmd);
     ValueArg<int> arg_input_vocab_size("", "input_vocab_size", "Vocabulary size.", false, -1, "int", cmd);
     ValueArg<int> arg_output_vocab_size("", "output_vocab_size", "Vocabulary size.", false, -1, "int", cmd);
     ValueArg<string> arg_input_words_file("", "input_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
     ValueArg<string> arg_output_words_file("", "output_words_file", "File specifying words that should be included in vocabulary; all other words will be replaced by <unk>.", false, "", "string", cmd);
-    ValueArg<int> arg_ngram_size("", "ngram_size", "Size of n-grams.", true, -1, "int", cmd);
+    ValueArg<int> arg_source_context_size("", "source_context_size", "Size of input context.", true, -1, "int", cmd);
+    ValueArg<int> arg_target_context_size("", "target_context_size", "Size of output context.", true, -1, "int", cmd);
 	ValueArg<string> arg_write_input_words_file("", "write_input_words_file", "Output vocabulary.", false, "", "string", cmd);
 	ValueArg<string> arg_write_output_words_file("", "write_output_words_file", "Output vocabulary.", false, "", "string", cmd);
     ValueArg<int> arg_validation_size("", "validation_size", "How many lines from training data to hold out for validation. Default: 0.", false, 0, "int", cmd);
 	ValueArg<string> arg_validation_file("", "validation_file", "Output validation data (numberized n-grams).", false, "", "string", cmd);
-	ValueArg<string> arg_input_validation_text("", "input_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
-	ValueArg<string> arg_output_validation_text("", "output_validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
+	ValueArg<string> arg_validation_text("", "validation_text", "Input validation data (tokenized). Overrides --validation_size. Default: none.", false, "", "string", cmd);
 	ValueArg<string> arg_train_file("", "train_file", "Output training data (numberized n-grams).", false, "", "string", cmd);
-    ValueArg<string> arg_input_train_text("", "input_train_text", "Input training data (tokenized).", true, "", "string", cmd);
-    ValueArg<string> arg_output_train_text("", "output_train_text", "Input training data (tokenized).", true, "", "string", cmd);
+    ValueArg<string> arg_train_text("", "train_text", "Input training data (tokenized).", true, "", "string", cmd);
 
 	cmd.parse(argc, argv);
 
-	input_train_text = arg_input_train_text.getValue();
-	output_train_text = arg_output_train_text.getValue();
+	train_text = arg_train_text.getValue();
 	train_file = arg_train_file.getValue();
 	validation_file = arg_validation_file.getValue();
-	input_validation_text = arg_input_validation_text.getValue();
-	output_validation_text = arg_output_validation_text.getValue();
-	input_validation_text = arg_input_validation_text.getValue();
-	output_validation_text = arg_output_validation_text.getValue();
+	validation_text = arg_validation_text.getValue();
 	validation_size = arg_validation_size.getValue();
 	write_input_words_file = arg_write_input_words_file.getValue();
 	write_output_words_file = arg_write_output_words_file.getValue();
-	ngram_size = arg_ngram_size.getValue();
+	source_context_size = arg_source_context_size.getValue();
+	target_context_size = arg_target_context_size.getValue();
 	input_vocab_size = arg_input_vocab_size.getValue();
 	output_vocab_size = arg_output_vocab_size.getValue();
 	input_words_file = arg_input_words_file.getValue();
 	output_words_file = arg_output_words_file.getValue();
 	numberize = arg_numberize.getValue();
-	ngramize = arg_ngramize.getValue();
-	add_start_stop = arg_add_start_stop.getValue();
 
     // check command line arguments
 
@@ -188,34 +110,24 @@ int main(int argc, char *argv[])
         exit(1);
     }
 
-    // Notes:
-    // - if --ngramize 0 is set, then
-    // - if --ngram_size is not set, it is inferred from the training file (different from current)
-    // - if --ngram_size is set, it is an error if the training file has a different n-gram size
-    // - if neither --validation_file or --validation_size is set, validation will not be performed.
-    // - if --numberize 0 is set, then --validation_size cannot be used.
-
     cerr << "Command line: " << endl;
     cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
 	
 	const string sep(" Value: ");
-	cerr << arg_input_train_text.getDescription() << sep << arg_input_train_text.getValue() << endl;
-	cerr << arg_output_train_text.getDescription() << sep << arg_output_train_text.getValue() << endl;
+	cerr << arg_train_text.getDescription() << sep << arg_train_text.getValue() << endl;
 	cerr << arg_train_file.getDescription() << sep << arg_train_file.getValue() << endl;
-	cerr << arg_input_validation_text.getDescription() << sep << arg_input_validation_text.getValue() << endl;
-	cerr << arg_output_validation_text.getDescription() << sep << arg_output_validation_text.getValue() << endl;
+	cerr << arg_validation_text.getDescription() << sep << arg_validation_text.getValue() << endl;
 	cerr << arg_validation_file.getDescription() << sep << arg_validation_file.getValue() << endl;
 	cerr << arg_validation_size.getDescription() << sep << arg_validation_size.getValue() << endl;
 	cerr << arg_write_input_words_file.getDescription() << sep << arg_write_input_words_file.getValue() << endl;
 	cerr << arg_write_output_words_file.getDescription() << sep << arg_write_output_words_file.getValue() << endl;
-	cerr << arg_ngram_size.getDescription() << sep << arg_ngram_size.getValue() << endl;
+	cerr << arg_source_context_size.getDescription() << sep << arg_source_context_size.getValue() << endl;
+	cerr << arg_target_context_size.getDescription() << sep << arg_target_context_size.getValue() << endl;
 	cerr << arg_input_vocab_size.getDescription() << sep << arg_input_vocab_size.getValue() << endl;
 	cerr << arg_output_vocab_size.getDescription() << sep << arg_output_vocab_size.getValue() << endl;
 	cerr << arg_input_words_file.getDescription() << sep << arg_input_words_file.getValue() << endl;
 	cerr << arg_output_words_file.getDescription() << sep << arg_output_words_file.getValue() << endl;
 	cerr << arg_numberize.getDescription() << sep << arg_numberize.getValue() << endl;
-	cerr << arg_ngramize.getDescription() << sep << arg_ngramize.getValue() << endl;
-	cerr << arg_add_start_stop.getDescription() << sep << arg_add_start_stop.getValue() << endl;
     }
     catch (TCLAP::ArgException &e)
     {
@@ -223,74 +135,32 @@ int main(int argc, char *argv[])
       exit(1);
     }
 
+    string start(string("<s>")), stop(string("</s>"));
+
     // Read in input training data and validation data
-    vector<vector<string> > input_train_data;
-    readSentFile(input_train_text, input_train_data);
-    if (add_start_stop) {
-      for (int i=0; i<input_train_data.size(); i++) {
-	vector<string> input_train_data_start_stop;
-	addStartStop<string>(input_train_data[i], input_train_data_start_stop, ngram_size, "<s>", "</s>");
-	input_train_data[i]=input_train_data_start_stop;
-      }
-    }
+    vector<vector<string> > train_data;
+    readSentFile(train_text, train_data);
     
-    vector<vector<string> > input_validation_data;
-    if (input_validation_text != "") {
-        readSentFile(input_validation_text, input_validation_data);
-        if (add_start_stop) {
-	  for (int i=0; i<input_validation_data.size(); i++) {
-	    vector<string> input_validation_data_start_stop;
-	    addStartStop<string>(input_validation_data[i], input_validation_data_start_stop, ngram_size, "<s>", "</s>");
-	    input_validation_data[i]=input_validation_data_start_stop;
-	  }
-        }
+    vector<vector<string> > validation_data;
+    if (validation_text != "") {
+        readSentFile(validation_text, validation_data);
     }
     else if (validation_size > 0)
     {
-        if (validation_size > input_train_data.size())
+        if (validation_size > train_data.size())
 	{
-	    cerr << "error: requested input_validation size is greater than training data size" << endl;
+	    cerr << "error: requested validation size is greater than training data size" << endl;
 	    exit(1);
 	}
-	input_validation_data.insert(input_validation_data.end(), input_train_data.end()-validation_size, input_train_data.end());
-	input_train_data.resize(input_train_data.size() - validation_size);
+	validation_data.insert(validation_data.end(), train_data.end() - validation_size, train_data.end());
+	train_data.resize(train_data.size() - validation_size);
     }
 
-    // Read in output training data and validation data
-    vector<vector<string> > output_train_data;
-    readSentFile(output_train_text, output_train_data);
-    if (add_start_stop) {
-      for (int i=0; i<output_train_data.size(); i++) {
-	vector<string> output_train_data_start_stop;
-	addStartStop<string>(output_train_data[i], output_train_data_start_stop, 1, "<s>", "</s>");
-	output_train_data[i]=output_train_data_start_stop;
-      }
-    }
-    
-    vector<vector<string> > output_validation_data;
-    if (output_validation_text != "") {
-        readSentFile(output_validation_text, output_validation_data);
-        if (add_start_stop) {
-	  for (int i=0; i<output_validation_data.size(); i++) {
-	    vector<string> output_validation_data_start_stop;
-	    addStartStop<string>(output_validation_data[i], output_validation_data_start_stop, 1, "<s>", "</s>");
-	    output_validation_data[i]=output_validation_data_start_stop;
-	  }
-        }
-    }
-    else if (validation_size > 0)
-    {
-        if (validation_size > output_train_data.size())
-	{
-	    cerr << "error: requested output_validation size is greater than training data size" << endl;
-	    exit(1);
-	}
-	output_validation_data.insert(output_validation_data.end(), output_train_data.end()-validation_size, output_train_data.end());
-	output_train_data.resize(output_train_data.size() - validation_size);
-    }
+    int ngram_size = source_context_size + target_context_size + 1;
 
     // Construct input vocabulary
     vocabulary input_vocab;
+    int source_unk = input_vocab.insert_word("<source_unk>");
     int input_start = input_vocab.insert_word("<s>");
     int input_stop = input_vocab.insert_word("</s>");
     input_vocab.insert_word("<null>");
@@ -298,7 +168,7 @@ int main(int argc, char *argv[])
     // read input vocabulary from file
     if (input_words_file != "") {
         vector<string> words;
-        readWordsFile(input_words_file,words);
+        readWordsFile(input_words_file, words);
         for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
             input_vocab.insert_word(*it);
         }
@@ -317,9 +187,9 @@ int main(int argc, char *argv[])
     // or construct input vocabulary to contain top <input_vocab_size> most frequent words; all other words replaced by <unk>
     else {
         unordered_map<string,int> count;
-        for (int i=0; i<input_train_data.size(); i++) {
-            for (int j=0; j<input_train_data[i].size(); j++) {
-                count[input_train_data[i][j]] += 1; 
+        for (int i=0; i<train_data.size(); i++) {
+            for (int j=0; j<ngram_size-1; j++) {
+                count[train_data[i][j]] += 1; 
             }
         }
 
@@ -333,12 +203,11 @@ int main(int argc, char *argv[])
     vocabulary output_vocab;
     int output_start = output_vocab.insert_word("<s>");
     int output_stop = output_vocab.insert_word("</s>");
-    output_vocab.insert_word("<null>");
 
     // read output vocabulary from file
     if (output_words_file != "") {
         vector<string> words;
-        readWordsFile(output_words_file,words);
+        readWordsFile(output_words_file, words);
         for(vector<string>::iterator it = words.begin(); it != words.end(); ++it) {
             output_vocab.insert_word(*it);
         }
@@ -357,10 +226,8 @@ int main(int argc, char *argv[])
     // or construct output vocabulary to contain top <output_vocab_size> most frequent words; all other words replaced by <unk>
     else {
         unordered_map<string,int> count;
-        for (int i=0; i<output_train_data.size(); i++) {
-            for (int j=0; j<output_train_data[i].size(); j++) {
-                count[output_train_data[i][j]] += 1; 
-            }
+        for (int i=0; i<train_data.size(); i++) {
+	  count[train_data[i][ngram_size-1]] += 1; 
         }
 
         output_vocab.insert_most_frequent(count, output_vocab_size);
@@ -385,12 +252,12 @@ int main(int argc, char *argv[])
     if (train_file != "")
     {
         cerr << "Writing training data to " << train_file << endl;
-        writeNgrams(input_train_data, output_train_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, train_file);
+        writeNgrams(train_data, source_context_size, target_context_size, input_vocab, source_unk, output_vocab, numberize, train_file);
 
     }
     if (validation_file != "")
     {
         cerr << "Writing validation data to " << validation_file << endl;
-        writeNgrams(input_validation_data, output_validation_data, ngram_size, input_vocab, output_vocab, numberize, ngramize, validation_file);
+        writeNgrams(validation_data, source_context_size, target_context_size, input_vocab, source_unk, output_vocab, numberize, validation_file);
     }
 }
diff --git a/src/propagator.h b/src/propagator.h
index c52a6a9..df8a7c2 100644
--- a/src/propagator.h
+++ b/src/propagator.h
@@ -14,7 +14,7 @@ using Eigen::Dynamic;
 
 class propagator {
     int minibatch_size;
-    const model *pnn;
+    model *pnn;
 
 public:
     Node<Input_word_embeddings> input_layer_node;
@@ -23,24 +23,21 @@ public:
     Node<Linear_layer> second_hidden_linear_node;
     Node<Activation_function> second_hidden_activation_node;
     Node<Output_word_embeddings> output_layer_node;
-    bool skip_hidden;
 
 public:
     propagator () : minibatch_size(0), pnn(0) { }
 
-    propagator (const model &nn, int minibatch_size)
+    propagator (model &nn, int minibatch_size)
       :
         pnn(&nn),
-        // These are const for purposes of querying.  The issue is that it's also used non-const for purposes of training, so X* only takes mutable classes.
-        input_layer_node(const_cast<Input_word_embeddings*>(&nn.input_layer), minibatch_size),
-	first_hidden_linear_node(const_cast<Linear_layer*>(&nn.first_hidden_linear), minibatch_size),
-	first_hidden_activation_node(const_cast<Activation_function*>(&nn.first_hidden_activation), minibatch_size),
-        second_hidden_linear_node(const_cast<Linear_layer*>(&nn.second_hidden_linear), minibatch_size),
-	second_hidden_activation_node(const_cast<Activation_function*>(&nn.second_hidden_activation), minibatch_size),
-	output_layer_node(const_cast<Output_word_embeddings*>(&nn.output_layer), minibatch_size),
+        input_layer_node(&nn.input_layer, minibatch_size),
+	first_hidden_linear_node(&nn.first_hidden_linear, minibatch_size),
+	first_hidden_activation_node(&nn.first_hidden_activation, minibatch_size),
+        second_hidden_linear_node(&nn.second_hidden_linear, minibatch_size),
+	second_hidden_activation_node(&nn.second_hidden_activation, minibatch_size),
+	output_layer_node(&nn.output_layer, minibatch_size),
 	minibatch_size(minibatch_size)
     {
-        skip_hidden = (nn.num_hidden == 0);
     }
 
     // This must be called if the underlying model is resized.
@@ -81,17 +78,17 @@ public:
 	}
 	first_hidden_activation_node.param->fProp(first_hidden_linear_node.fProp_matrix,
 						  first_hidden_activation_node.fProp_matrix);
+  //std::cerr<<"in fprop first hidden activation node fprop is "<<first_hidden_activation_node.fProp_matrix<<std::endl;
+  //std::getchar();
 	stop_timer(1);
     
 
-        if (!skip_hidden) {
 	start_timer(2);
 	second_hidden_linear_node.param->fProp(first_hidden_activation_node.fProp_matrix,
 					       second_hidden_linear_node.fProp_matrix);
 	second_hidden_activation_node.param->fProp(second_hidden_linear_node.fProp_matrix,
 						   second_hidden_activation_node.fProp_matrix);
 	stop_timer(2);
-        }
 
 	// The propagation stops here because the last layer is very expensive.
     }
@@ -100,7 +97,12 @@ public:
     template <typename DerivedIn, typename DerivedOut>
     void bProp(const MatrixBase<DerivedIn> &data,
 	       const MatrixBase<DerivedOut> &output,
-	       double learning_rate, double momentum, double L2_reg) 
+	       double learning_rate,
+         double momentum,
+         double L2_reg,
+         std::string &parameter_update,
+         double conditioning_constant,
+         double decay) 
     {
         // Output embedding layer
 
@@ -110,113 +112,225 @@ public:
 	stop_timer(7);
 	
 	start_timer(8);
-        if (skip_hidden) {
-            output_layer_node.param->computeGradient(first_hidden_activation_node.fProp_matrix,
-                                                    output,
-                                                    learning_rate, momentum);
-        }
-        else {
-	output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
-						 output,
-						 learning_rate, momentum);
-        }
+  if (parameter_update == "SGD") {
+    output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+               output,
+               learning_rate,
+               momentum);
+  } else if (parameter_update == "ADA") {
+    output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix,
+               output,
+               learning_rate);
+  } else if (parameter_update == "ADAD") {
+    //std::cerr<<"Adadelta gradient"<<endl;
+    int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols();
+    output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix,
+               output,
+               1.0/current_minibatch_size,
+               conditioning_constant,
+               decay);
+  } else {
+    std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
+  }
 	stop_timer(8);
 
-	bPropRest(data, learning_rate, momentum, L2_reg);
+	bPropRest(data, 
+      learning_rate,
+      momentum,
+      L2_reg,
+      parameter_update,
+      conditioning_constant,
+      decay);
     }
 
     // Sparse version (for NCE log-likelihood)
     template <typename DerivedIn, typename DerivedOutI, typename DerivedOutV>
     void bProp(const MatrixBase<DerivedIn> &data,
-	       const MatrixBase<DerivedOutI> &samples, const MatrixBase<DerivedOutV> &weights,
-	       double learning_rate, double momentum, double L2_reg) 
+	       const MatrixBase<DerivedOutI> &samples,
+         const MatrixBase<DerivedOutV> &weights,
+	       double learning_rate,
+         double momentum,
+         double L2_reg,
+         std::string &parameter_update,
+         double conditioning_constant,
+         double decay) 
     {
 
         // Output embedding layer
 
         start_timer(7);
-        output_layer_node.param->bProp(samples, weights, 
-				       output_layer_node.bProp_matrix);
+        output_layer_node.param->bProp(samples,
+            weights, 
+				    output_layer_node.bProp_matrix);
 	stop_timer(7);
 	
 
 	start_timer(8);
-        if (skip_hidden) {
-            output_layer_node.param->computeGradient(first_hidden_activation_node.fProp_matrix,
-                                                    samples, weights,
-                                                    learning_rate, momentum);
-        }
-        else {
-            output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
-                                                    samples, weights,
-                                                    learning_rate, momentum);
-        }
+  if (parameter_update == "SGD") {
+    output_layer_node.param->computeGradient(second_hidden_activation_node.fProp_matrix,
+               samples,
+               weights,
+               learning_rate,
+               momentum);
+  } else if (parameter_update == "ADA") {
+    output_layer_node.param->computeGradientAdagrad(second_hidden_activation_node.fProp_matrix,
+               samples,
+               weights,
+               learning_rate);
+  } else if (parameter_update == "ADAD") {
+    int current_minibatch_size = second_hidden_activation_node.fProp_matrix.cols();
+    //std::cerr<<"Adadelta gradient"<<endl;
+    output_layer_node.param->computeGradientAdadelta(second_hidden_activation_node.fProp_matrix,
+               samples,
+               weights,
+               1.0/current_minibatch_size,
+               conditioning_constant,
+               decay);
+  } else {
+    std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
+  }
+
 	stop_timer(8);
 
-	bPropRest(data, learning_rate, momentum, L2_reg);
+	bPropRest(data,
+      learning_rate,
+      momentum,
+      L2_reg,
+      parameter_update,
+      conditioning_constant,
+      decay);
     }
 
 private:
     template <typename DerivedIn>
     void bPropRest(const MatrixBase<DerivedIn> &data,
-		   double learning_rate, double momentum, double L2_reg) 
+		   double learning_rate, double momentum, double L2_reg,
+       std::string &parameter_update,
+       double conditioning_constant,
+       double decay) 
     {
 	// Second hidden layer
 
-        if (skip_hidden) {
-            start_timer(9);
-            first_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
-                                                    first_hidden_activation_node.bProp_matrix,
-                                                    first_hidden_linear_node.fProp_matrix,
-                                                    first_hidden_activation_node.fProp_matrix);
-
-            first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
-                                                first_hidden_linear_node.bProp_matrix);
-            stop_timer(9);
-        }
-        else {
-            start_timer(9);
-            second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
-                                                    second_hidden_activation_node.bProp_matrix,
-                                                    second_hidden_linear_node.fProp_matrix,
-                                                    second_hidden_activation_node.fProp_matrix);
-
-            second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
-                                                second_hidden_linear_node.bProp_matrix);
-            stop_timer(9);
-
-            start_timer(10);
-            second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
-                                                            first_hidden_activation_node.fProp_matrix,
-                                                            learning_rate, momentum, L2_reg);
-            stop_timer(10);
-
-            // First hidden layer
-
-            start_timer(11);
-            first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
-                                                    first_hidden_activation_node.bProp_matrix,
-                                                    first_hidden_linear_node.fProp_matrix,
-                                                    first_hidden_activation_node.fProp_matrix);
-
-            first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
-                                                first_hidden_linear_node.bProp_matrix);
-            stop_timer(11);
-        }
-	
-	start_timer(12);
-	first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
-							input_layer_node.fProp_matrix,
-							learning_rate, momentum, L2_reg);
-	stop_timer(12);
 
-	// Input word embeddings
-	
-	start_timer(13);
-	input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix,
-						data,
-						learning_rate, momentum, L2_reg);
-	stop_timer(13);
+  
+  // All bProp calls are grouped first (BACKPROP section below); all
+  // computeGradient* calls follow in the COMPUTE GRADIENT section.
+  ////////BACKPROP////////////
+        start_timer(9);
+  second_hidden_activation_node.param->bProp(output_layer_node.bProp_matrix,
+                                           second_hidden_activation_node.bProp_matrix,
+                                           second_hidden_linear_node.fProp_matrix,
+                                           second_hidden_activation_node.fProp_matrix);
+
+
+	second_hidden_linear_node.param->bProp(second_hidden_activation_node.bProp_matrix,
+					       second_hidden_linear_node.bProp_matrix);
+	stop_timer(9);
+
+	start_timer(11);
+	first_hidden_activation_node.param->bProp(second_hidden_linear_node.bProp_matrix,
+						  first_hidden_activation_node.bProp_matrix,
+						  first_hidden_linear_node.fProp_matrix,
+						  first_hidden_activation_node.fProp_matrix);
+
+  first_hidden_linear_node.param->bProp(first_hidden_activation_node.bProp_matrix,
+					      first_hidden_linear_node.bProp_matrix);
+	stop_timer(11);
+  //std::cerr<<"First hidden layer node backprop matrix is"<<first_hidden_linear_node.bProp_matrix<<std::endl;
+  //std::getchar();
+  ////COMPUTE GRADIENT/////////
+  if (parameter_update == "SGD") {
+    start_timer(10);
+    second_hidden_linear_node.param->computeGradient(second_hidden_activation_node.bProp_matrix,
+                 first_hidden_activation_node.fProp_matrix,
+                 learning_rate,
+                 momentum,
+                 L2_reg);
+    stop_timer(10);
+
+    // First hidden layer
+
+    
+    start_timer(12);
+    first_hidden_linear_node.param->computeGradient(first_hidden_activation_node.bProp_matrix,
+                input_layer_node.fProp_matrix,
+                learning_rate, momentum, L2_reg);
+    stop_timer(12);
+
+    // Input word embeddings
+    
+    start_timer(13);
+    input_layer_node.param->computeGradient(first_hidden_linear_node.bProp_matrix,
+              data,
+              learning_rate, momentum, L2_reg);
+    stop_timer(13);
+  } else if (parameter_update == "ADA") {
+    start_timer(10);
+    second_hidden_linear_node.param->computeGradientAdagrad(second_hidden_activation_node.bProp_matrix,
+                 first_hidden_activation_node.fProp_matrix,
+                 learning_rate,
+                 L2_reg);
+    stop_timer(10);
+
+    // First hidden layer
+
+    
+    start_timer(12);
+    first_hidden_linear_node.param->computeGradientAdagrad(first_hidden_activation_node.bProp_matrix,
+                input_layer_node.fProp_matrix,
+                learning_rate,
+                L2_reg);
+    stop_timer(12);
+
+    // Input word embeddings
+     
+    start_timer(13);
+    input_layer_node.param->computeGradientAdagrad(first_hidden_linear_node.bProp_matrix,
+              data,
+              learning_rate, 
+              L2_reg);
+    stop_timer(13);
+  } else if (parameter_update == "ADAD") {
+    int current_minibatch_size = first_hidden_activation_node.fProp_matrix.cols();
+    //std::cerr<<"Adadelta gradient"<<endl;
+    start_timer(10);
+    second_hidden_linear_node.param->computeGradientAdadelta(second_hidden_activation_node.bProp_matrix,
+                 first_hidden_activation_node.fProp_matrix,
+                 1.0/current_minibatch_size,
+                 L2_reg,
+                 conditioning_constant,
+                 decay);
+    stop_timer(10);
+    //std::cerr<<"Finished gradient for second hidden linear layer"<<std::endl;
+
+    // First hidden layer
+
+    
+    start_timer(12);
+    first_hidden_linear_node.param->computeGradientAdadelta(first_hidden_activation_node.bProp_matrix,
+                input_layer_node.fProp_matrix,
+                1.0/current_minibatch_size,
+                L2_reg,
+                conditioning_constant,
+                decay);
+    stop_timer(12);
+
+    //std::cerr<<"Finished gradient for first hidden linear layer"<<std::endl;
+    // Input word embeddings
+     
+    start_timer(13);
+    input_layer_node.param->computeGradientAdadelta(first_hidden_linear_node.bProp_matrix,
+              data,
+              1.0/current_minibatch_size, 
+              L2_reg,
+              conditioning_constant,
+              decay);
+    stop_timer(13);
+  
+    //std::cerr<<"Finished gradient for first input layer"<<std::endl;
+  } else {
+    std::cerr<<"Parameter update :"<<parameter_update<<" is unrecognized"<<std::endl;
+  }
 
     }
 };
@@ -224,3 +338,4 @@ private:
 } // namespace nplm
 
 #endif
+
diff --git a/src/python/nplm.pyx b/src/python/nplm.pyx
index 290d56c..9f882ae 100644
--- a/src/python/nplm.pyx
+++ b/src/python/nplm.pyx
@@ -6,7 +6,7 @@ cdef class NeuralLM:
         self.thisptr.set_normalization(normalization)
         self.thisptr.set_log_base(10.)
         if type(map_digits) is str and len(map_digits) == 1:
-            self.thisptr.set_map_digits(map_digits)
+            self.thisptr.set_map_digits((<char *>map_digits)[0])
         if cache_size:
             self.thisptr.set_cache(cache_size)
 
diff --git a/src/python/nptm.pxd b/src/python/nptm.pxd
new file mode 100644
index 0000000..bb185ac
--- /dev/null
+++ b/src/python/nptm.pxd
@@ -0,0 +1,25 @@
+from libcpp.string cimport string
+from libcpp.vector cimport vector
+
+cdef extern from "neuralTM.h":  # declarations for the C++ class nplm::neuralTM
+    cdef cppclass c_neuralTM "nplm::neuralTM":  # Cython-side name is c_neuralTM
+        c_neuralTM()
+        void set_normalization(bint)
+        void set_map_digits(char)
+        void set_log_base(double)
+        void read(string filename) except +  # "except +": C++ exceptions are raised as Python exceptions
+        int get_order()
+        int lookup_input_word(string)
+        int lookup_output_word(string)
+        float lookup_ngram(vector[int])  # overload taking the n-gram as a vector of ints
+        float lookup_ngram(int *, int)  # overload taking a pointer plus an int (words, n in NeuralTM.c_lookup_ngram)
+        void set_cache(int)
+        double cache_hit_rate()
+
+cdef class NeuralTM:  # extension type holding a c_neuralTM pointer; cdef methods are callable from other Cython modules
+    cdef c_neuralTM *thisptr
+    cdef int c_lookup_input_word(self, char *s)
+    cdef int c_lookup_output_word(self, char *s)
+    cdef float c_lookup_ngram(self, int *words, int n)
+    cdef readonly int order  # readonly: Python code can read it but not assign it
+    
diff --git a/src/python/nptm.pyx b/src/python/nptm.pyx
new file mode 100644
index 0000000..61338a1
--- /dev/null
+++ b/src/python/nptm.pyx
@@ -0,0 +1,64 @@
+# distutils: language = c++
+
+cdef class NeuralTM:
+    """Python wrapper around the C++ class nplm::neuralTM (declared in nptm.pxd).
+
+    The instance owns the C++ object behind ``thisptr``: it is allocated in
+    ``__cinit__`` and released in ``__dealloc__``.
+    """
+
+    def __cinit__(self, normalization=False, map_digits=None, cache_size=0):
+        self.thisptr = new c_neuralTM()
+        self.thisptr.set_normalization(normalization)
+        self.thisptr.set_log_base(10.)
+        # map_digits is only forwarded when it is a one-character str.
+        if type(map_digits) is str and len(map_digits) == 1:
+            self.thisptr.set_map_digits((<char *>map_digits)[0])
+        # set_cache is skipped when cache_size is 0 (or otherwise falsy).
+        if cache_size:
+            self.thisptr.set_cache(cache_size)
+
+    def __dealloc__(self):
+        # Free the C++ object created in __cinit__; without this every
+        # NeuralTM instance leaks its model when it is garbage-collected.
+        del self.thisptr
+
+    def read(self, filename):
+        """Call the C++ read() on ``filename``, then store get_order() in ``self.order``."""
+        self.thisptr.read(filename)
+        self.order = self.thisptr.get_order()
+
+    def get_order(self):
+        return self.thisptr.get_order()
+
+    def lookup_input_word(self, s):
+        return self.thisptr.lookup_input_word(s)
+
+    def lookup_output_word(self, s):
+        return self.thisptr.lookup_output_word(s)
+
+    def lookup_ngram(self, words):
+        """Pass ``words`` to the vector[int] overload of the C++ lookup_ngram.
+
+        Raises ValueError when ``words`` is empty.
+        """
+        if len(words) == 0:
+            raise ValueError("ngram is empty")
+        return self.thisptr.lookup_ngram(words)
+
+    def cache_hit_rate(self):
+        return self.thisptr.cache_hit_rate()
+
+    # low-level interface that can be called by other Cython modules
+    cdef int c_lookup_input_word(self, char *s):
+        cdef string ss
+        ss.assign(s)
+        return self.thisptr.lookup_input_word(ss)
+
+    cdef int c_lookup_output_word(self, char *s):
+        cdef string ss
+        ss.assign(s)
+        return self.thisptr.lookup_output_word(ss)
+
+    cdef float c_lookup_ngram(self, int *words, int n):
+        return self.thisptr.lookup_ngram(words, n)
diff --git a/src/testNeuralLM.cpp b/src/testNeuralLM.cpp
index 5805022..834e253 100644
--- a/src/testNeuralLM.cpp
+++ b/src/testNeuralLM.cpp
@@ -2,10 +2,11 @@
 #include <fstream>
 
 #include <boost/algorithm/string/join.hpp>
+//#include <boost/thread/thread.hpp>
 #include <tclap/CmdLine.h>
 
-#include "../3rdparty/Eigen/Core"
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Core>
+#include <Eigen/Dense>
 
 #include "param.h"
 
@@ -18,6 +19,47 @@ using namespace Eigen;
 
 using namespace nplm;
 
+// Appends to `out` one summed lookup_ngram value (log-probability) per sentence; sentence i
+// spans n-gram indices [start[i], start[i+1]). minibatch_size == 0 looks up one n-gram at a
+// time, otherwise up to minibatch_size n-grams per call. Appends nothing if `ngrams` is empty.
+void score(neuralLM &lm, int minibatch_size, vector<int>& start, vector< vector<int> > &ngrams, vector<double> &out) {
+    // Guard `start` too: start.size()-1 is unsigned and wraps around when it is empty.
+    if (ngrams.empty() || start.empty()) return;
+    const int num_sentences = start.size() - 1;
+    const int num_ngrams = ngrams.size();
+    const int ngram_size = ngrams[0].size();
+    if (minibatch_size == 0)
+    {
+        // Score one n-gram at a time. This is how the LM would be queried from a decoder.
+        for (int sent_id=0; sent_id<num_sentences; sent_id++)
+        {
+            double sent_log_prob = 0.0;
+            for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+                sent_log_prob += lm.lookup_ngram(ngrams[j]);
+            out.push_back(sent_log_prob);
+        }
+        return;
+    }
+    // Score a whole minibatch at a time; log_probs gets one entry per n-gram.
+    Matrix<double,1,Dynamic> log_probs(num_ngrams);
+    Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
+    minibatch.setZero();
+    for (int test_id = 0; test_id < num_ngrams; test_id += minibatch_size)
+    {
+        const int current_minibatch_size = minibatch_size<num_ngrams-test_id ? minibatch_size : num_ngrams-test_id;
+        for (int j=0; j<current_minibatch_size; j++)
+            minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size);
+        lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size));
+    }
+    for (int sent_id=0; sent_id<num_sentences; sent_id++)
+    {
+        double sent_log_prob = 0.0;
+        for (int j=start[sent_id]; j<start[sent_id+1]; j++)
+            sent_log_prob += log_probs[j];
+        out.push_back(sent_log_prob);
+    }
+}
+
 int main (int argc, char *argv[]) 
 {
     param myParam;
@@ -78,7 +120,8 @@ int main (int argc, char *argv[])
 
     ///// Create language model
 
-    neuralLM lm(myParam.model_file);
+    neuralLM lm;
+    lm.read(myParam.model_file);
     lm.set_normalization(normalization);
     lm.set_log_base(10);
     lm.set_cache(1048576);
@@ -89,8 +132,6 @@ int main (int argc, char *argv[])
 
     ///// Read test data
 
-    double log_likelihood = 0.0;
-
     ifstream test_file(myParam.test_file.c_str());
     if (!test_file)
     {
@@ -115,44 +156,33 @@ int main (int argc, char *argv[])
     }
     start.push_back(ngrams.size());
 
-    if (minibatch_size == 0)
-    {
-        // Score one n-gram at a time. This is how the LM would be queried from a decoder.
-        for (int sent_id=0; sent_id<start.size()-1; sent_id++)
-	{	  
-	    double sent_log_prob = 0.0;
-	    for (int j=start[sent_id]; j<start[sent_id+1]; j++) 
-	        sent_log_prob += lm.lookup_ngram(ngrams[j]);
-	    cout << sent_log_prob << endl;
-	    log_likelihood += sent_log_prob;
-	}
-    }
-    else
-    {
-	// Score a whole minibatch at a time.
-        Matrix<double,1,Dynamic> log_probs(ngrams.size());
-
-        Matrix<int,Dynamic,Dynamic> minibatch(ngram_size, minibatch_size);
-	minibatch.setZero();
-        for (int test_id = 0; test_id < ngrams.size(); test_id += minibatch_size)
-	{
-	    int current_minibatch_size = minibatch_size<ngrams.size()-test_id ? minibatch_size : ngrams.size()-test_id;
-	    for (int j=0; j<current_minibatch_size; j++)
-	        minibatch.col(j) = Map< Matrix<int,Dynamic,1> > (ngrams[test_id+j].data(), ngram_size);
-	    lm.lookup_ngram(minibatch.leftCols(current_minibatch_size), log_probs.middleCols(test_id, current_minibatch_size));
-	}
+    int num_threads = 1;
+    vector< vector<double> > sent_log_probs(num_threads);
 
-	for (int sent_id=0; sent_id<start.size()-1; sent_id++)
-	{
-	    double sent_log_prob = 0.0;
-	    for (int j=start[sent_id]; j<start[sent_id+1]; j++)
-	        sent_log_prob += log_probs[j];
-	    cout << sent_log_prob << endl;
-	    log_likelihood += sent_log_prob;
-	}
+    /*
+    // Test thread safety
+    boost::thread_group tg;
+    for (int t=0; t < num_threads; t++) {
+      tg.create_thread(boost::bind(score, lm, minibatch_size, boost::ref(start), boost::ref(ngrams), boost::ref(sent_log_probs[t]))); // copy lm
+    }
+    tg.join_all();
+    */
+    score(lm, minibatch_size, start, ngrams, sent_log_probs[0]);
+
+    vector<double> log_likelihood(num_threads);
+    std::fill(log_likelihood.begin(), log_likelihood.end(), 0.0);
+    for (int i=0; i<sent_log_probs[0].size(); i++) {
+        for (int t=0; t<num_threads; t++)
+	    cout << sent_log_probs[t][i] << "\t";
+	cout << endl;
+        for (int t=0; t<num_threads; t++)
+	log_likelihood[t] += sent_log_probs[t][i];
     }
     
-    cerr << "Test log10-likelihood: " << log_likelihood << endl;
+    cerr << "Test log10-likelihood: ";
+    for (int t=0; t<num_threads; t++)
+      cerr << log_likelihood[t] << " ";
+    cerr << endl;
     #ifdef USE_CHRONO
     cerr << "Propagation times:";
     for (int i=0; i<timer.size(); i++)
diff --git a/src/testNeuralNetwork.cpp b/src/testNeuralNetwork.cpp
index 1b4820e..3e1640e 100644
--- a/src/testNeuralNetwork.cpp
+++ b/src/testNeuralNetwork.cpp
@@ -80,7 +80,7 @@ int main (int argc, char *argv[])
     cerr << "Number of test instances: " << test_data_size << endl;
 
     Map< Matrix<int,Dynamic,Dynamic> > test_data(test_data_flat.data(), myParam.ngram_size, test_data_size);
-
+    
     ///// Score test data
 
     int num_batches = (test_data_size-1)/myParam.minibatch_size + 1;
@@ -101,10 +101,7 @@ int main (int argc, char *argv[])
 	prop.fProp(minibatch.topRows(myParam.ngram_size-1));
 
 	// Do full forward prop through output word embedding layer
-        if (prop.skip_hidden)
-            prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
-        else
-            prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+	prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
 
 	// And softmax and loss
 	double minibatch_log_likelihood;
diff --git a/src/trainNeuralNetwork.cpp b/src/trainNeuralNetwork.cpp
index 57323d9..e231c20 100644
--- a/src/trainNeuralNetwork.cpp
+++ b/src/trainNeuralNetwork.cpp
@@ -11,9 +11,13 @@
 #include <boost/lexical_cast.hpp>
 #include <boost/random/mersenne_twister.hpp>
 #include <boost/algorithm/string/join.hpp>
+# include <boost/interprocess/managed_shared_memory.hpp>
+# include <boost/interprocess/allocators/allocator.hpp>
+# include <boost/interprocess/managed_mapped_file.hpp>
+#include <boost/interprocess/containers/vector.hpp>
 
-#include "../3rdparty/Eigen/Dense"
-#include "../3rdparty/Eigen/Sparse"
+#include <Eigen/Dense>
+#include <Eigen/Sparse>
 #include "maybe_omp.h"
 #include <tclap/CmdLine.h>
 
@@ -36,16 +40,24 @@ using namespace boost::random;
 
 using namespace nplm;
 
+namespace ip = boost::interprocess;
 typedef unordered_map<Matrix<int,Dynamic,1>, double> vector_map;
 
+typedef ip::allocator<int, ip::managed_mapped_file::segment_manager> intAllocator;
+typedef ip::vector<int, intAllocator> vec;
+typedef ip::allocator<vec, ip::managed_mapped_file::segment_manager> vecAllocator;
+
+
 typedef long long int data_size_t; // training data can easily exceed 2G instances
 
 int main(int argc, char** argv)
 { 
+    ios::sync_with_stdio(false);
+    bool use_mmap_file, randomize;
     param myParam;
     try {
       // program options //
-      CmdLine cmd("Trains a two-layer neural probabilistic language model.", ' ' , "0.1");
+      CmdLine cmd("Trains a two-layer neural probabilistic language model.", ' ' , "0.3\n","");
 
       // The options are printed in reverse order
 
@@ -60,14 +72,23 @@ int main(int argc, char** argv)
       ValueArg<double> normalization_init("", "normalization_init", "Initial normalization parameter. Default: 0.", false, 0.0, "double", cmd);
       ValueArg<bool> normalization("", "normalization", "Learn individual normalization factors during training. 1 = yes, 0 = no. Default: 0.", false, 0, "bool", cmd);
 
-      ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 25.", false, 25, "int", cmd);
+      ValueArg<bool> mmap_file("", "mmap_file", "Use memory mapped files. This is useful if the entire data cannot fit in memory. prepareNeuralLM can generate memory mapped files", false, 0, "bool", cmd);
+
+      ValueArg<bool> arg_randomize("", "randomize", "Randomize training instances for better training. 1 = yes, 0 = no. Default: 1.", false, true, "bool", cmd);
+
+      ValueArg<int> num_noise_samples("", "num_noise_samples", "Number of noise samples for noise-contrastive estimation. Default: 100.", false, 100, "int", cmd);
 
       ValueArg<double> L2_reg("", "L2_reg", "L2 regularization strength (hidden layer weights only). Default: 0.", false, 0.0, "double", cmd);
 
-      ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 0.01.", false, 0.01, "double", cmd);
+      ValueArg<double> learning_rate("", "learning_rate", "Learning rate for stochastic gradient ascent. Default: 1.", false, 1., "double", cmd);
+
+      ValueArg<double> conditioning_constant("", "conditioning_constant", "Constant to condition the RMS of the expected square of the gradient in ADADELTA. Default: 10E-3.", false, 10E-3, "double", cmd);
 
+      ValueArg<double> decay("", "decay", "Decay for ADADELTA. Default: 0.95", false, 0.95, "double", cmd);
+      ValueArg<double> adagrad_epsilon("", "adagrad_epsilon", "Constant to initialize the L2 squared norm of the gradients with.\
+          Default: 10E-3", false, 10E-3, "double", cmd);
       ValueArg<int> validation_minibatch_size("", "validation_minibatch_size", "Minibatch size for validation. Default: 64.", false, 64, "int", cmd);
-      ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 64.", false, 64, "int", cmd);
+      ValueArg<int> minibatch_size("", "minibatch_size", "Minibatch size (for training). Default: 1000.", false, 1000, "int", cmd);
 
       ValueArg<int> num_epochs("", "num_epochs", "Number of epochs. Default: 10.", false, 10, "int", cmd);
 
@@ -90,21 +111,29 @@ int main(int argc, char** argv)
 
       ValueArg<string> model_prefix("", "model_prefix", "Prefix for output model files." , false, "", "string", cmd);
       ValueArg<string> words_file("", "words_file", "Vocabulary." , false, "", "string", cmd);
+      ValueArg<string> parameter_update("", "parameter_update", "parameter update type.\n Stochastic Gradient Descent(SGD)\n \
+          ADAGRAD(ADA)\n \
+          ADADELTA(ADAD)" , false, "SGD", "string", cmd);
       ValueArg<string> input_words_file("", "input_words_file", "Vocabulary." , false, "", "string", cmd);
       ValueArg<string> output_words_file("", "output_words_file", "Vocabulary." , false, "", "string", cmd);
       ValueArg<string> validation_file("", "validation_file", "Validation data (one numberized example per line)." , false, "", "string", cmd);
       ValueArg<string> train_file("", "train_file", "Training data (one numberized example per line)." , true, "", "string", cmd);
-      ValueArg<string> init_model("", "init_model", "Initialize parameters from existing model (to continue interrupted training)", false, "", "string", cmd);
+
+      ValueArg<string> model_file("", "model_file", "Model file.", false, "", "string", cmd);
+
 
       cmd.parse(argc, argv);
 
       // define program parameters //
+      use_mmap_file = mmap_file.getValue();
+      randomize = arg_randomize.getValue();
+      myParam.model_file = model_file.getValue();
       myParam.train_file = train_file.getValue();
       myParam.validation_file = validation_file.getValue();
       myParam.input_words_file = input_words_file.getValue();
       myParam.output_words_file = output_words_file.getValue();
       if (words_file.getValue() != "")
-	  myParam.input_words_file = myParam.output_words_file = words_file.getValue();
+	      myParam.input_words_file = myParam.output_words_file = words_file.getValue();
 
       myParam.model_prefix = model_prefix.getValue();
 
@@ -112,9 +141,9 @@ int main(int argc, char** argv)
       myParam.vocab_size = vocab_size.getValue();
       myParam.input_vocab_size = input_vocab_size.getValue();
       myParam.output_vocab_size = output_vocab_size.getValue();
-      if (vocab_size.getValue() >= 0)
-	  myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue();
-
+      if (vocab_size.getValue() >= 0) {
+	      myParam.input_vocab_size = myParam.output_vocab_size = vocab_size.getValue();
+      }
       myParam.num_hidden = num_hidden.getValue();
       myParam.activation_function = activation_function.getValue();
       myParam.loss_function = loss_function.getValue();
@@ -125,13 +154,17 @@ int main(int argc, char** argv)
 
       myParam.input_embedding_dimension = input_embedding_dimension.getValue();
       myParam.output_embedding_dimension = output_embedding_dimension.getValue();
-      if (embedding_dimension.getValue() >= 0)
+      if (embedding_dimension.getValue() >= 0) {
 	      myParam.input_embedding_dimension = myParam.output_embedding_dimension = embedding_dimension.getValue();
+      }
 
       myParam.minibatch_size = minibatch_size.getValue();
       myParam.validation_minibatch_size = validation_minibatch_size.getValue();
       myParam.num_epochs= num_epochs.getValue();
       myParam.learning_rate = learning_rate.getValue();
+      myParam.conditioning_constant = conditioning_constant.getValue();
+      myParam.decay = decay.getValue();
+      myParam.adagrad_epsilon = adagrad_epsilon.getValue();
       myParam.use_momentum = use_momentum.getValue();
       myParam.share_embeddings = share_embeddings.getValue();
       myParam.normalization = normalization.getValue();
@@ -140,8 +173,8 @@ int main(int argc, char** argv)
       myParam.L2_reg = L2_reg.getValue();
       myParam.init_normal= init_normal.getValue();
       myParam.init_range = init_range.getValue();
-      myParam.init_model = init_model.getValue();
       myParam.normalization_init = normalization_init.getValue();
+      myParam.parameter_update = parameter_update.getValue();
 
       cerr << "Command line: " << endl;
       cerr << boost::algorithm::join(vector<string>(argv, argv+argc), " ") << endl;
@@ -156,72 +189,70 @@ int main(int argc, char** argv)
       cerr << ngram_size.getDescription() << sep << ngram_size.getValue() << endl;
       cerr << input_vocab_size.getDescription() << sep << input_vocab_size.getValue() << endl;
       cerr << output_vocab_size.getDescription() << sep << output_vocab_size.getValue() << endl;
+      cerr << mmap_file.getDescription() << sep << mmap_file.getValue() << endl;
 
       if (embedding_dimension.getValue() >= 0)
       {
-	  cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl;
+	      cerr << embedding_dimension.getDescription() << sep << embedding_dimension.getValue() << endl;
       }
       else
       {
-	  cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl;
-	  cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl;
+	      cerr << input_embedding_dimension.getDescription() << sep << input_embedding_dimension.getValue() << endl;
+	      cerr << output_embedding_dimension.getDescription() << sep << output_embedding_dimension.getValue() << endl;
       }
       cerr << share_embeddings.getDescription() << sep << share_embeddings.getValue() << endl;
       if (share_embeddings.getValue() && input_embedding_dimension.getValue() != output_embedding_dimension.getValue())
       {
-	  cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl;
-	  exit(1);
+	      cerr << "error: sharing input and output embeddings requires that input and output embeddings have same dimension" << endl;
+	      exit(1);
       }
 
       cerr << num_hidden.getDescription() << sep << num_hidden.getValue() << endl;
 
       if (string_to_activation_function(activation_function.getValue()) == InvalidFunction)
       {
-	 cerr << "error: invalid activation function: " << activation_function.getValue() << endl;
-	  exit(1);
+	      cerr << "error: invalid activation function: " << activation_function.getValue() << endl;
+	      exit(1);
       }
       cerr << activation_function.getDescription() << sep << activation_function.getValue() << endl;
 
       if (string_to_loss_function(loss_function.getValue()) == InvalidLoss)
       {
-	 cerr << "error: invalid loss function: " << loss_function.getValue() << endl;
-	  exit(1);
+	      cerr << "error: invalid loss function: " << loss_function.getValue() << endl;
+	      exit(1);
       }
       cerr << loss_function.getDescription() << sep << loss_function.getValue() << endl;
 
-      if (init_model.getValue() != "") {
-        cerr << init_model.getDescription() << sep << init_model.getValue() << endl;
-      }
-      else {
-          cerr << init_normal.getDescription() << sep << init_normal.getValue() << endl;
-          cerr << init_range.getDescription() << sep << init_range.getValue() << endl;
-      }
+      cerr << init_normal.getDescription() << sep << init_normal.getValue() << endl;
+      cerr << init_range.getDescription() << sep << init_range.getValue() << endl;
 
       cerr << num_epochs.getDescription() << sep << num_epochs.getValue() << endl;
       cerr << minibatch_size.getDescription() << sep << minibatch_size.getValue() << endl;
-      if (myParam.validation_file != "")
-	  cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl;
+      if (myParam.validation_file != "") {
+	     cerr << validation_minibatch_size.getDescription() << sep << validation_minibatch_size.getValue() << endl;
+      }
       cerr << learning_rate.getDescription() << sep << learning_rate.getValue() << endl;
       cerr << L2_reg.getDescription() << sep << L2_reg.getValue() << endl;
 
       cerr << num_noise_samples.getDescription() << sep << num_noise_samples.getValue() << endl;
 
       cerr << normalization.getDescription() << sep << normalization.getValue() << endl;
-      if (myParam.normalization)
-	  cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl;
+      if (myParam.normalization){
+	      cerr << normalization_init.getDescription() << sep << normalization_init.getValue() << endl;
+      }
 
       cerr << use_momentum.getDescription() << sep << use_momentum.getValue() << endl;
       if (myParam.use_momentum)
       {
-	  cerr << initial_momentum.getDescription() << sep << initial_momentum.getValue() << endl;
-	  cerr << final_momentum.getDescription() << sep << final_momentum.getValue() << endl;
+        cerr << initial_momentum.getDescription() << sep << initial_momentum.getValue() << endl;
+        cerr << final_momentum.getDescription() << sep << final_momentum.getValue() << endl;
       }
 
       cerr << num_threads.getDescription() << sep << num_threads.getValue() << endl;
 
       if (unigram_probs_file.getValue() != "")
       {
-	  cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl;
+	      cerr << "Note: --unigram_probs_file is deprecated and ignored." << endl;
       }
     }
     catch (TCLAP::ArgException &e)
@@ -241,13 +272,114 @@ int main(int argc, char** argv)
     /////////////////////////////////////////////////////////////////////////////////////
 
     // Read training data
+
     vector<int> training_data_flat;
-    readDataFile(myParam.train_file, myParam.ngram_size, training_data_flat, myParam.minibatch_size);
-    data_size_t training_data_size = training_data_flat.size() / myParam.ngram_size;
+    vec * training_data_flat_mmap;
+    data_size_t training_data_size; //num_tokens;
+    ip::managed_mapped_file mmap_file;
+    if (use_mmap_file == false) {
+      cerr<<"Reading data from regular text file "<<endl;
+      readDataFile(myParam.train_file, myParam.ngram_size, training_data_flat, myParam.minibatch_size);
+      training_data_size = training_data_flat.size()/myParam.ngram_size;
+    } else {
+      cerr<<"Using mmaped file"<<endl;
+      mmap_file = ip::managed_mapped_file(ip::open_only,myParam.train_file.c_str());
+      training_data_flat_mmap = mmap_file.find<vec>("vector").first;
+      cerr<<"Size of mmaped vector is "<<training_data_flat_mmap->size()<<endl;
+      training_data_size = training_data_flat_mmap->size()/myParam.ngram_size;
+      // Randomly shuffle the training data to improve learning. The mmapped
+      // data is shuffled here, chunk by chunk through a temporary buffer; data
+      // held in a standard STL vector is shuffled separately further below.
+      if (randomize == true) {
+        cerr<<"Randomly shuffling data...";
+        data_size_t counter =0;
+        while (counter < training_data_size) {
+          data_size_t upper_limit = counter+5000000;
+          long int vector_size = 5000000;
+          if (counter + 10000000 >= training_data_size) {
+            upper_limit = training_data_size;
+            vector_size = training_data_size - counter;
+          }
+          vector<int> temp(vector_size*myParam.ngram_size,0);
+          for (int i=0;i<vector_size;i++){
+           for (int k=0;k<myParam.ngram_size;k++) {
+             temp[i*myParam.ngram_size+k] = training_data_flat_mmap->at((i+counter)*myParam.ngram_size+k);
+           }
+          }
+          /*
+          for (data_size_t i=upper_limit; i>counter; i--)
+          {
+            if (i %500000 == 0) {
+              cerr<<"Shuffled "<<training_data_size-1<<" instances...";
+            }
+            data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng);
+            for (int k=0;k<myParam.ngram_size;k++) {
+              int temp_val = training_data_flat_mmap->at(i*myParam.ngram_size+k);
+              training_data_flat_mmap->at(i*myParam.ngram_size+k) =
+                training_data_flat_mmap->at(j*myParam.ngram_size+k);
+              training_data_flat_mmap->at(j*myParam.ngram_size+k) = temp_val;
+            }
+          }
+          */
+          for (data_size_t i=vector_size-1; i>0; i--)
+          {
+            if (i %500000 == 0) {
+              cerr<<"Shuffled "<<training_data_size-1<<" instances...";
+            }
+            data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng);
+            for (int k=0;k<myParam.ngram_size;k++) {
+              int temp_val = temp.at(i*myParam.ngram_size+k);
+              temp.at(i*myParam.ngram_size+k) =
+                temp.at(j*myParam.ngram_size+k);
+              temp.at(j*myParam.ngram_size+k) = temp_val;
+            }
+          }
+          //Putting it back
+          for (int i=0;i<vector_size;i++){
+           for (int k=0;k<myParam.ngram_size;k++) {
+             training_data_flat_mmap->at((i+counter)*myParam.ngram_size+k) = temp[i*myParam.ngram_size+k];
+           }
+          }
+          counter = upper_limit;
+        }
+        /*
+        for (data_size_t i=training_data_size-1; i>0; i--)
+        {
+          if (i %500000 == 0) {
+            cerr<<"Shuffled "<<training_data_size-1<<" instances...";
+          }
+          data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng);
+          for (int k=0;k<myParam.ngram_size;k++) {
+            int temp_val = training_data_flat_mmap->at(i*myParam.ngram_size+k);
+            training_data_flat_mmap->at(i*myParam.ngram_size+k) =
+              training_data_flat_mmap->at(j*myParam.ngram_size+k);
+            training_data_flat_mmap->at(j*myParam.ngram_size+k) = temp_val;
+          }
+        }
+        */
+      cerr<<endl;
+      }
+    }
+    //cerr<<"Num tokens "<<num_tokens<<endl;
+    //data_size_t training_data_size = num_tokens / myParam.ngram_size;
     cerr << "Number of training instances: "<< training_data_size << endl;
-
-    Map< Matrix<int,Dynamic,Dynamic> > training_data(training_data_flat.data(), myParam.ngram_size, training_data_size);
-
+    
+    Matrix<int,Dynamic,Dynamic> training_data;
+    //(training_data_flat.data(), myParam.ngram_size, training_data_size);
+    
+    #ifdef MAP
+    cerr<<"Setting up eigen map"<<endl;
+    if (use_mmap_file == false) {
+      training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size);
+    } else {
+      training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat_mmap->data().get(), myParam.ngram_size, training_data_size);
+    }
+    cerr<<"Created eigen map"<<endl;
+    #else 
+    if (use_mmap_file == false) {
+      training_data = Map< Matrix<int,Dynamic,Dynamic> >(training_data_flat.data(), myParam.ngram_size, training_data_size);
+    }
+    #endif 
     // If neither --input_vocab_size nor --input_words_file is given, set input_vocab_size to the maximum word index
     if (myParam.input_vocab_size == 0 and myParam.input_words_file == "")
     {
@@ -255,16 +387,18 @@ int main(int argc, char** argv)
     }
 
     // If neither --output_vocab_size nor --output_words_file is given, set output_vocab_size to the maximum word index
-    if (myParam.output_vocab_size == 0 and myParam.words_file == "")
+    if (myParam.output_vocab_size == 0 and myParam.output_words_file == "")
     {
         myParam.output_vocab_size = training_data.row(myParam.ngram_size-1).maxCoeff()+1;
     }
-
-    // Randomly shuffle training data to improve learning
-    for (data_size_t i=training_data_size-1; i>0; i--)
-    {
-        data_size_t j = boost::random::uniform_int_distribution<data_size_t>(0, i-1)(rng);
-	training_data.col(i).swap(training_data.col(j));
+    if (use_mmap_file == false && randomize == true) {
+      cerr<<"Randomly shuffling data..."<<endl;
+      // Randomly shuffle training data to improve learning
+      for (data_size_t i=training_data_size-1; i>0; i--)
+      {
+        data_size_t j = uniform_int_distribution<data_size_t>(0, i-1)(rng);
+        training_data.col(i).swap(training_data.col(j));
+      }
     }
 
     // Read validation data
@@ -273,9 +407,9 @@ int main(int argc, char** argv)
     
     if (myParam.validation_file != "")
     {
-	readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat);
-	validation_data_size = validation_data_flat.size() / myParam.ngram_size;
-	cerr << "Number of validation instances: " << validation_data_size << endl;
+      readDataFile(myParam.validation_file, myParam.ngram_size, validation_data_flat);
+      validation_data_size = validation_data_flat.size() / myParam.ngram_size;
+      cerr << "Number of validation instances: " << validation_data_size << endl;
     }
 
     Map< Matrix<int,Dynamic,Dynamic> > validation_data(validation_data_flat.data(), myParam.ngram_size, validation_data_size);
@@ -303,28 +437,41 @@ int main(int argc, char** argv)
     vector<data_size_t> unigram_counts(myParam.output_vocab_size);
     for (data_size_t train_id=0; train_id < training_data_size; train_id++)
     {
-        int output_word = training_data(myParam.ngram_size-1, train_id);
-	unigram_counts[output_word] += 1;
+        int output_word;
+        if (use_mmap_file == false) {
+          output_word = training_data(myParam.ngram_size-1, train_id);
+        } else {
+	      //cerr<<"mmap word is "<<training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1)<<endl;
+          output_word = training_data_flat_mmap->at((train_id+1)*myParam.ngram_size - 1);
+        }
+		//cerr<<"output word is "<<output_word<<endl;
+	    unigram_counts[output_word] += 1;
     }
     multinomial<data_size_t> unigram (unigram_counts);
 
     ///// Create and initialize the neural network and associated propagators.
-
-    model nn(myParam.ngram_size,
-        myParam.input_vocab_size,
-        myParam.output_vocab_size,
-        myParam.input_embedding_dimension,
-	      myParam.num_hidden,
-        myParam.output_embedding_dimension,
-        myParam.share_embeddings);
-
-    if (myParam.init_model != "") {
-      nn.read(myParam.init_model);
-    }
-    else {
-      nn.initialize(rng, myParam.init_normal, myParam.init_range, -log(myParam.output_vocab_size));
+    model nn;
+    // IF THE MODEL FILE HAS BEEN DEFINED, THEN 
+    // LOAD THE NEURAL NETWORK MODEL
+    if (myParam.model_file != ""){
+      nn.read(myParam.model_file);
+      cerr<<"reading the model"<<endl;
+    } else {
+      nn.resize(myParam.ngram_size,
+          myParam.input_vocab_size,
+          myParam.output_vocab_size,
+          myParam.input_embedding_dimension,
+          myParam.num_hidden,
+          myParam.output_embedding_dimension);
+
+      nn.initialize(rng,
+          myParam.init_normal,
+          myParam.init_range,
+          -log(myParam.output_vocab_size),
+          myParam.parameter_update,
+          myParam.adagrad_epsilon);
+      nn.set_activation_function(string_to_activation_function(myParam.activation_function));
     }
-    nn.set_activation_function(string_to_activation_function(myParam.activation_function));
     loss_function_type loss_function = string_to_loss_function(myParam.loss_function);
 
     propagator prop(nn, myParam.minibatch_size);
@@ -360,14 +507,14 @@ int main(int argc, char** argv)
 
     if (myParam.normalization)
     {
-	for (data_size_t i=0;i<training_data_size;i++)
-	{
-	    Matrix<int,Dynamic,1> context = training_data.block(0,i,ngram_size-1,1);
-	    if (c_h.find(context) == c_h.end())
-	    {
-	        c_h[context] = -myParam.normalization_init;
-	    }
-	}
+      for (data_size_t i=0;i<training_data_size;i++)
+      {
+          Matrix<int,Dynamic,1> context = training_data.block(0,i,ngram_size-1,1);
+          if (c_h.find(context) == c_h.end())
+          {
+              c_h[context] = -myParam.normalization_init;
+          }
+      }
     }
 
     for (int epoch=0; epoch<myParam.num_epochs; epoch++)
@@ -403,9 +550,29 @@ int main(int argc, char** argv)
             } 
 
             data_size_t minibatch_start_index = minibatch_size * batch;
-            int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index);
-	    Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
 
+      int current_minibatch_size = min(static_cast<data_size_t>(minibatch_size), training_data_size - minibatch_start_index);
+      #ifdef MAP
+	    Matrix<int,Dynamic,Dynamic> minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+      #else 
+      //ALTERNATIVE OPTION IF YOU'RE NOT USING eigen map interface on the mmapped file
+	    Matrix<int,Dynamic,Dynamic> minibatch;// = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+		//cerr<<"Minibatch start index "<<minibatch_start_index<<endl;
+		//cerr<<"Minibatch size "<<current_minibatch_size<<endl;
+            if (use_mmap_file == true) {
+            minibatch.setZero(ngram_size,current_minibatch_size);
+            //now reading the ngrams from the mmaped file
+              for (int k=0; k<ngram_size; k++){
+                for (data_size_t index = 0 ; index<current_minibatch_size; index++) {
+				  data_size_t current_index = index + minibatch_start_index;
+				  //cerr<<"the value in the mmap file "<<index<<" "<<k<<" is "<<training_data_flat_mmap->at(current_index*ngram_size+k)<<endl;
+                  minibatch(k,index) = training_data_flat_mmap->at(current_index*ngram_size+k);
+                }
+              }
+            } else {
+              minibatch = training_data.middleCols(minibatch_start_index, current_minibatch_size);
+            }
+      #endif 
             double adjusted_learning_rate = current_learning_rate/current_minibatch_size;
             //cerr<<"Adjusted learning rate: "<<adjusted_learning_rate<<endl;
 
@@ -426,103 +593,106 @@ int main(int argc, char** argv)
 
 	    if (loss_function == NCELoss)
 	    {
-	        ///// Noise-contrastive estimation
+	      ///// Noise-contrastive estimation
 
-	        // Generate noise samples. Gather positive and negative samples into matrix.
+	      // Generate noise samples. Gather positive and negative samples into matrix.
 
-	        start_timer(3);
+	      start_timer(3);
 
-		minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1);
-		
-		for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++)
-		    for (int train_id = 0; train_id < current_minibatch_size; train_id++)
-		        minibatch_samples(sample_id, train_id) = unigram.sample(rng);
-	    
-		stop_timer(3);
+        minibatch_samples.block(0, 0, 1, current_minibatch_size) = minibatch.bottomRows(1);
+        
+        for (int sample_id = 1; sample_id < num_noise_samples+1; sample_id++)
+            for (int train_id = 0; train_id < current_minibatch_size; train_id++)
+                minibatch_samples(sample_id, train_id) = unigram.sample(rng);
+          
+        stop_timer(3);
 
-		// Final forward propagation step (sparse)
-		start_timer(4);
-                if (prop.skip_hidden)
-                    prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix,
-                                                    minibatch_samples, scores);
-                else
-                    prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix,
-						    minibatch_samples, scores);
-		stop_timer(4);
+        // Final forward propagation step (sparse)
+        start_timer(4);
+        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix,
+                    minibatch_samples, scores);
+        stop_timer(4);
 
-		// Apply normalization parameters
-		if (myParam.normalization)
-		{
-		    for (int train_id = 0;train_id < current_minibatch_size;train_id++)
-		    {
-			Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1);
-			scores.col(train_id).array() += c_h[context];
-		    }
-		}
+        // Apply normalization parameters
+        if (myParam.normalization)
+        {
+            for (int train_id = 0;train_id < current_minibatch_size;train_id++)
+            {
+          Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1);
+          scores.col(train_id).array() += c_h[context];
+            }
+        }
 
-		double minibatch_log_likelihood;
-		start_timer(5);
-		softmax_loss.fProp(scores.leftCols(current_minibatch_size), 
-				   minibatch_samples,
-				   probs, minibatch_log_likelihood);
-		stop_timer(5);
-		log_likelihood += minibatch_log_likelihood;
+        double minibatch_log_likelihood;
+        start_timer(5);
+        softmax_loss.fProp(scores.leftCols(current_minibatch_size), 
+               minibatch_samples,
+               probs, minibatch_log_likelihood);
+        stop_timer(5);
+        log_likelihood += minibatch_log_likelihood;
+
+        ///// Backward propagation
+
+        start_timer(6);
+        softmax_loss.bProp(probs, minibatch_weights);
+        stop_timer(6);
+        
+        // Update the normalization parameters
+        
+        if (myParam.normalization)
+        {
+          for (int train_id = 0;train_id < current_minibatch_size;train_id++)
+          {
+            Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1);
+            c_h[context] += adjusted_learning_rate * minibatch_weights.col(train_id).sum();
+          }
+        }
 
-		///// Backward propagation
-
-		start_timer(6);
-		softmax_loss.bProp(probs, minibatch_weights);
-		stop_timer(6);
-		
-		// Update the normalization parameters
-		
-		if (myParam.normalization)
-		{
-		    for (int train_id = 0;train_id < current_minibatch_size;train_id++)
-		    {
-			Matrix<int,Dynamic,1> context = minibatch.block(0, train_id, ngram_size-1, 1);
-			c_h[context] += adjusted_learning_rate * minibatch_weights.col(train_id).sum();
-		    }
-		}
-
-		// Be careful of short minibatch
-		prop.bProp(minibatch.topRows(ngram_size-1),
-			   minibatch_samples.leftCols(current_minibatch_size), 
-			   minibatch_weights.leftCols(current_minibatch_size),
-			   adjusted_learning_rate, current_momentum, myParam.L2_reg);
+        // Be careful of short minibatch
+        prop.bProp(minibatch.topRows(ngram_size-1),
+             minibatch_samples.leftCols(current_minibatch_size), 
+             minibatch_weights.leftCols(current_minibatch_size),
+             adjusted_learning_rate, 
+             current_momentum,
+             myParam.L2_reg,
+             myParam.parameter_update,
+             myParam.conditioning_constant,
+             myParam.decay);
 	    }
 	    else if (loss_function == LogLoss)
 	    {
-	        ///// Standard log-likelihood
-	        start_timer(4);
-                if (prop.skip_hidden)
-                    prop.output_layer_node.param->fProp(prop.first_hidden_activation_node.fProp_matrix, scores);
-                else
-                    prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
-		stop_timer(4);
-
-		double minibatch_log_likelihood;
-		start_timer(5);
-		SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), 
-				       minibatch.row(ngram_size-1), 
-				       probs, 
-				       minibatch_log_likelihood);
-		stop_timer(5);
-		log_likelihood += minibatch_log_likelihood;
-
-		///// Backward propagation
-		
-		start_timer(6);
-		SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), 
-				       probs.leftCols(current_minibatch_size), 
-				       minibatch_weights);
-		stop_timer(6);
-		
-		prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size),
-			   minibatch_weights,
-			   adjusted_learning_rate, current_momentum, myParam.L2_reg);
-	    }
-        }
+	      ///// Standard log-likelihood
+	      start_timer(4);
+        prop.output_layer_node.param->fProp(prop.second_hidden_activation_node.fProp_matrix, scores);
+        stop_timer(4);
+
+        double minibatch_log_likelihood;
+        start_timer(5);
+        SoftmaxLogLoss().fProp(scores.leftCols(current_minibatch_size), 
+                   minibatch.row(ngram_size-1), 
+                   probs, 
+                   minibatch_log_likelihood);
+        stop_timer(5);
+        log_likelihood += minibatch_log_likelihood;
+
+        ///// Backward propagation
+        
+        start_timer(6);
+        SoftmaxLogLoss().bProp(minibatch.row(ngram_size-1).leftCols(current_minibatch_size), 
+                   probs.leftCols(current_minibatch_size), 
+                   minibatch_weights);
+        stop_timer(6);
+        
+        prop.bProp(minibatch.topRows(ngram_size-1).leftCols(current_minibatch_size),
+             minibatch_weights,
+             adjusted_learning_rate,
+             current_momentum,
+             myParam.L2_reg,
+             myParam.parameter_update,
+             myParam.conditioning_constant,
+             myParam.decay);
+          }
+      }
 	cerr << "done." << endl;
 
 	if (loss_function == LogLoss)
@@ -573,10 +743,7 @@ int main(int argc, char** argv)
 
 		// Do full forward prop through output word embedding layer
 		start_timer(4);
-                if (prop_validation.skip_hidden)
-                    prop_validation.output_layer_node.param->fProp(prop_validation.first_hidden_activation_node.fProp_matrix, scores);
-                else
-                    prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores);
+		prop_validation.output_layer_node.param->fProp(prop_validation.second_hidden_activation_node.fProp_matrix, scores);
 		stop_timer(4);
 
 		// And softmax and loss. Be careful of short minibatch
@@ -594,7 +761,7 @@ int main(int argc, char** argv)
             cerr << "           perplexity:     "<< exp(-log_likelihood/validation_data_size) << endl;
 
 	    // If the validation perplexity decreases, halve the learning rate.
-            if (epoch > 0 && log_likelihood < current_validation_ll)
+            if (epoch > 0 && log_likelihood < current_validation_ll && myParam.parameter_update != "ADA")
             { 
                 current_learning_rate /= 2;
             }
@@ -604,3 +771,4 @@ int main(int argc, char** argv)
     }
     return 0;
 }
+
diff --git a/src/types.hpp b/src/types.hpp
new file mode 100644
index 0000000..08b010f
--- /dev/null
+++ b/src/types.hpp
@@ -0,0 +1,41 @@
+#ifndef TYPES_HPP
+#define TYPES_HPP
+
+#include <cmath>
+#include <string>
+#include <vector>
+#include <boost/cstdint.hpp>
+#include <limits>
+
+namespace biglm{
+
+typedef double weight_type;
+const weight_type IMPOSSIBLE = -HUGE_VAL;
+
+typedef unsigned long block_type;
+const size_t bits_per_block = (std::numeric_limits<block_type>::digits);
+  //typedef std::size_t size_type;
+typedef boost::uint64_t size_type;
+typedef unsigned char byte_type;
+
+template<typename T>
+struct bytes {
+  static const byte_type *data(const T& key) { return reinterpret_cast<const byte_type *>(&key); }
+  static size_type size(const T& key) { return sizeof(T); }
+};
+
+template<>
+struct bytes<std::string> {
+  static const byte_type *data(const std::string& key) { return reinterpret_cast<const byte_type *>(key.data()); }
+  static size_type size(const std::string& key) { return key.size(); }
+};
+
+template<typename U>
+struct bytes<std::vector<U> > {
+  static const byte_type *data(const std::vector<U>& key) { return reinterpret_cast<const byte_type *>(&key[0]); }
+  static size_type size(const std::vector<U>& key) { return key.size() * sizeof(U); }
+};
+
+} //namespace biglm
+
+#endif
diff --git a/src/util.cpp b/src/util.cpp
index fe022c9..f6a5779 100644
--- a/src/util.cpp
+++ b/src/util.cpp
@@ -2,6 +2,8 @@
 #include <fstream>
 #include <iomanip>
 #include <cmath>
+#include <deque>
+#include <vector>
 
 #include <boost/unordered_map.hpp> 
 #include <boost/algorithm/string.hpp>
@@ -34,6 +36,21 @@ void splitBySpace(const std::string &line, std::vector<std::string> &items)
     boost::split(items, copy, boost::is_any_of(" \t"), boost::token_compress_on);
 }
 
+void readWeightsFile(ifstream &TRAININ, vector<float> &weights) {
+  string line;
+  while (getline(TRAININ, line) && line != "")
+  {
+    vector<string> items;
+    splitBySpace(line, items);
+    if (items.size() != 1)
+    {
+        cerr << "Error: weights file should have only one weight per line" << endl;
+        exit(-1);
+    }
+    weights.push_back(boost::lexical_cast<float>(items[0]));
+  }
+}
+
 void readWordsFile(ifstream &TRAININ, vector<string> &word_list)
 {
   string line;
@@ -87,28 +104,6 @@ void writeWordsFile(const vector<string> &words, const string &filename)
     OUT.close();
 }
 
-void readSentFile(const string &file, vector<vector<string> > &sentences)
-{
-  cerr << "Reading sentences from: " << file << endl;
-
-  ifstream TRAININ;
-  TRAININ.open(file.c_str());
-  if (! TRAININ)
-  {
-    cerr << "Error: can't read from file " << file<< endl;
-    exit(-1);
-  }
-
-  string line;
-  while (getline(TRAININ, line))
-  {
-    vector<string> words;
-    splitBySpace(line, words);
-    sentences.push_back(words);
-  }
-
-  TRAININ.close();
-}
 
 // Read a data file of unknown size into a flat vector<int>.
 // If this takes too much memory, we should create a vector of minibatches.
@@ -193,8 +188,7 @@ int setup_threads(int n_threads)
     Eigen::initParallel();
     Eigen::setNbThreads(n_threads);
 
-    #ifdef __INTEL_MKL__
-    /*
+    #ifdef MKL_SINGLE
     // Set the threading layer to match the compiler.
     // This lets MKL automatically go single-threaded in parallel regions.
     #ifdef __INTEL_COMPILER
@@ -202,7 +196,6 @@ int setup_threads(int n_threads)
     #elif defined __GNUC__
     mkl_set_threading_layer(MKL_THREADING_GNU);
     #endif
-    */
     mkl_set_num_threads(n_threads);
     #endif
     #endif
diff --git a/src/util.h b/src/util.h
index 9c9b255..5854712 100644
--- a/src/util.h
+++ b/src/util.h
@@ -15,7 +15,7 @@
 #include <boost/chrono.hpp>
 #endif
 
-#include "../3rdparty/Eigen/Dense"
+#include <Eigen/Dense>
 
 #include "maybe_omp.h"
 
@@ -43,7 +43,39 @@ void writeWordsFile(const std::vector<std::string> &words, std::ofstream &file);
 void writeWordsFile(const std::vector<std::string> &words, const std::string &filename);
 void readDataFile(const std::string &filename, int &ngram_size, std::vector<int> &data, int minibatch_size=0);
 void readUnigramProbs(const std::string &unigram_probs_file, std::vector<double> &unigram_probs);
-void readSentFile(const std::string &file, std::vector<std::vector<std::string> > &sentences);
+void readWeightsFile(std::ifstream &TRAININ, std::vector<float> &weights);
+//template <typename T> readSentFile(const std::string &file, T &sentences);
+
+
+template <typename T>
+void readSentFile(const std::string &file, T &sentences)
+{
+  std::cerr << "Reading sentences from: " << file << std::endl;
+
+  std::ifstream TRAININ;
+  TRAININ.open(file.c_str());
+  if (! TRAININ)
+  {
+    std::cerr << "Error: can't read from file " << file<< std::endl;
+    exit(-1);
+  }
+
+  std::string line;
+  while (getline(TRAININ, line))
+  {
+    std::vector<std::string> words;
+    splitBySpace(line, words);
+    sentences.push_back(words);
+  }
+
+  TRAININ.close();
+}
+
+inline void intgerize(std::vector<std::string> &ngram,std::vector<int> &int_ngram){
+        int ngram_size = ngram.size();
+        for (int i=0;i<ngram_size;i++)
+        int_ngram.push_back(boost::lexical_cast<int>(ngram[i]));
+}
 
 // Functions that take non-const matrices as arguments
 // are supposed to declare them const and then use this
@@ -84,6 +116,34 @@ void initMatrix(boost::random::mt19937 &engine,
 }
 
 template <typename Derived>
+void initBias(boost::random::mt19937 &engine,
+		const Eigen::MatrixBase<Derived> &p_const,
+		bool init_normal, double range)
+{
+    UNCONST(Derived, p_const, p);
+    if (init_normal == 0)
+     // initialize with uniform distribution in [-range, range]
+    {
+        boost::random::uniform_real_distribution<> unif_real(-range, range); 
+        for (int i = 0; i < p.size(); i++)
+        {
+            p(i) = unif_real(engine);    
+        }
+
+    }
+    else 
+      // initialize with gaussian distribution with mean 0 and stdev range
+    {
+        boost::random::normal_distribution<double> unif_normal(0., range);
+        for (int i = 0; i < p.size(); i++)
+        {
+            p(i) = unif_normal(engine);    
+        }
+    }
+}
+
+
+template <typename Derived>
 void readMatrix(std::ifstream &TRAININ, Eigen::MatrixBase<Derived> &param_const)
 {
     UNCONST(Derived, param_const, param);
diff --git a/src/vocabulary.h b/src/vocabulary.h
index 1f844a7..11fbfcd 100644
--- a/src/vocabulary.h
+++ b/src/vocabulary.h
@@ -19,6 +19,7 @@ class vocabulary {
     std::vector<std::string> m_words;
     boost::unordered_map<std::string, int> m_index;
     int unk;
+
 public:
     vocabulary() 
     { 
@@ -40,7 +41,17 @@ public:
 	if (pos != m_index.end())
 	    return pos->second;
 	else
-	  return unk;
+	    return unk;
+    }
+
+    // lookup word using custom unknown-word id
+    int lookup_word(const std::string &word, int unk) const
+    {
+        boost::unordered_map<std::string, int>::const_iterator pos = m_index.find(word);
+	if (pos != m_index.end())
+	    return pos->second;
+	else
+	    return unk;
     }
 
     int insert_word(const std::string &word)
@@ -77,8 +88,6 @@ public:
     }
 
     const std::vector<std::string> &words() const { return m_words; }
-    
-    const boost::unordered_map<std::string, int>& get_idmap() const { return m_index; }
 };
 
 } // namespace nplm
author	Rico Sennrich <rico.sennrich@gmx.ch>	2014-11-17 13:39:49 +0300
committer	Rico Sennrich <rico.sennrich@gmx.ch>	2014-11-17 13:51:09 +0300
commit	7eb6ea415c1a10d27d36182bf00c01d05e137325 (patch)
tree	4d927fdb05a4f106ad6d921b7227062208610e2f /src
parent	ba48d701c70e03fe1c1e96ecf5e06591ad4d3e27 (diff)